微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

Scrapyrt不使用回调函数

如何解决Scrapyrt不使用回调函数

这些是我的文件

views.py

from uuid import uuid4
from urllib.parse import urlparse
from django.core.validators import URLValidator
from rest_framework.decorators import api_view,renderer_classes
from django.views.decorators.csrf import csrf_exempt
from django.http import JsonResponse
from time import sleep
import os
import json

import requests
    

def is_valid_url(url):
    validate = URLValidator()
    try:
        validate(url) # check if url format is valid
    except ValidationError:
        return False

return True

@csrf_exempt
@api_view(['POST',])
def getProduct(request):

    url = request.POST['url']

    if not url:
        return JsonResponse({'error': 'Missing  args'})
    
    if not is_valid_url(url):
        return JsonResponse({'error': 'URL is invalid'})

    data = {
        "request": {
            "url": str(url),"callback": "start_requests","dont_filter": "false"
        },"spider_name": "GetinfoSpider"
    }

    scrapyrt = 'http://127.0.0.1:9081/crawl.json'

    try:
        #print(str(requests.post(scrapyrt,data = data)))
        r = requests.post(scrapyrt,json = data)
        print(r)
        return JsonResponse({'data': r})
    except Exception as e:
        print(e)
        return JsonResponse({'error': str(e)})

这是我的蜘蛛文件getInfo.py

import scrapy
from scrapy_splash import SplashRequest

from ..items import WallmartItem

import logging


class GetinfoSpider(scrapy.Spider):
    name:str = 'GetinfoSpider'
    allowed_domains = ['www.walmart.com']

    script1:str = '''
        function main(splash,args)
            splash.private_mode_enabled = false
            url = args.url
            assert(splash:go(url))
            assert(splash:wait(1))
            splash:set_viewport_full()
            return splash:html()
        end
    '''


    def start_requests(self,url):

        yield scrapy.Request(str(url),callback=self.parse_item,Meta={
            'splash': {
                'args': {
                    # set rendering arguments here
                    'lua_source': self.script1

                },# optional parameters
                'endpoint': 'render.html',# optional; default is render.json
            }
        })

    def parse_item(self,response):
        logging.log(logging.WARNING,"This is a warning")
        item = WallmartItem()
        #specs = response.css
    
    
        item['title'] = response.xpath('//*[@id="product-overview"]/div/div[3]/div/h1//text()').get(),item['price'] = response.css('span.price-characteristic::attr(content)').getall(),item['deliveryDate'] = response.css('p.no-margin::text').get(),item['pictures'] = response.css('img.prod-alt-image-carousel-image--left::attr(src)').getall(),item['description'] = response.css('div.about-desc ::text').getall(),yield item

在我发送包含视图中的数据字典的请求后,一切似乎都工作正常,

2020-10-21 12:09:28 [scrapy] DEBUG: Crawled (200) <GET https://www.site/robots.txt> (referer: None)
2020-10-21 12:09:29 [scrapy] DEBUG: Crawled (200) <GET https://www.site/ip/Fruit-of-the-Loom-Men-s-and-Big-Men-s-Eversoft-Fleece-Full-Zip-Hoodie-Jacket-up-to-Size-3XL/630234494> (referer: None)
2020-10-21 12:09:29 [scrapy] INFO: Closing spider (finished)
2020-10-21 12:09:29 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1142,'downloader/request_count': 2,'downloader/request_method_count/GET': 2,'downloader/response_bytes': 113270,'downloader/response_count': 2,'downloader/response_status_count/200': 2,'elapsed_time_seconds': 1.577416,'finish_reason': 'finished','finish_time': datetime.datetime(2020,10,21,18,9,29,136179),'log_count/DEBUG': 13,'log_count/INFO': 7,'offsite/filtered': 1,'request_depth_max': 1,'response_received_count': 2,'robotstxt/request_count': 1,'robotstxt/response_count': 1,'robotstxt/response_status_count/200': 1,'scheduler/dequeued': 1,'scheduler/dequeued/memory': 1,'scheduler/enqueued': 1,'scheduler/enqueued/memory': 1,'start_time': datetime.datetime(2020,27,558763)}
2020-10-21 12:09:29 [scrapy] INFO: Spider closed (finished)

没有要解析的项目或信息,我认为问题可能是scrapyrt没有调用我的回调函数,但我仍然不太确定

任何信息将不胜感激

解决方法

整个问题是处理请求的方式,如果有人遇到同样的问题,只需确保URL是您传递给回调的那个

类似这样的东西:

def start_requests(self,request):
    url = request.url
    return SplashRequest(
            url=str(url),callback=self.parse_item,endpoint='execute',args = {
                'lua_source': self.script1
        }
    )

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。