如何解决Scrapyrt不使用回调函数
这些是我的文件:
views.py
import json
import os
from time import sleep
from urllib.parse import urlparse
from uuid import uuid4

import requests
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
from rest_framework.decorators import api_view, renderer_classes
def is_valid_url(url):
    """Return True when *url* is a syntactically valid URL.

    Uses Django's ``URLValidator``; any :class:`ValidationError` means the
    URL is malformed and ``False`` is returned.
    """
    validate = URLValidator()
    try:
        validate(url)  # raises ValidationError when the format is invalid
    except ValidationError:
        # Bug fix: ValidationError was referenced but never imported, so a
        # malformed URL raised NameError instead of returning False.
        # Import is added at the top of the file.
        return False
    return True
@csrf_exempt
@api_view(['POST', ])
def getProduct(request):
    """Forward a product URL to the local Scrapyrt service and relay its reply.

    Expects a POST form field ``url``. Returns a JSON error payload when the
    field is missing or malformed; otherwise the decoded crawl result that
    Scrapyrt produced.
    """
    # .get() avoids a KeyError (HTTP 500) when the 'url' field is absent,
    # so the 'Missing args' branch below can actually run.
    url = request.POST.get('url')
    if not url:
        return JsonResponse({'error': 'Missing args'})
    if not is_valid_url(url):
        return JsonResponse({'error': 'URL is invalid'})
    data = {
        "request": {
            "url": str(url),
            "callback": "start_requests",
            "dont_filter": "false",
        },
        "spider_name": "GetinfoSpider",
    }
    scrapyrt = 'http://127.0.0.1:9081/crawl.json'
    try:
        r = requests.post(scrapyrt, json=data)
        print(r)
        # r is a requests.Response and is not JSON-serializable; relay the
        # decoded JSON body instead of the response object itself.
        return JsonResponse({'data': r.json()})
    except Exception as e:
        print(e)
        return JsonResponse({'error': str(e)})
这是我的蜘蛛文件getInfo.py
import scrapy
from scrapy_splash import SplashRequest
from ..items import WallmartItem
import logging
class GetinfoSpider(scrapy.Spider):
    """Scrape a Walmart product page rendered through Splash.

    Driven by Scrapyrt: the crawl.json endpoint hands ``start_requests`` a
    Request object built from the POSTed payload.
    """

    name: str = 'GetinfoSpider'
    allowed_domains = ['www.walmart.com']

    # Lua script executed by Splash: disable private mode, load the page,
    # wait for it to settle, then hand back the rendered HTML.
    script1: str = '''
function main(splash,args)
splash.private_mode_enabled = false
url = args.url
assert(splash:go(url))
assert(splash:wait(1))
splash:set_viewport_full()
return splash:html()
end
'''

    def start_requests(self, request):
        """Turn the Request object Scrapyrt passes in into a SplashRequest.

        Bug fixes vs. the original:
        - Scrapyrt supplies a Request object, not a bare URL string, so the
          URL must be read from ``request.url``.
        - ``scrapy.Request(..., Meta=...)`` used a capitalized keyword that
          scrapy rejects; SplashRequest with ``endpoint='execute'`` runs the
          Lua script directly instead.
        """
        url = request.url
        yield SplashRequest(
            url=str(url),
            callback=self.parse_item,
            endpoint='execute',
            args={'lua_source': self.script1},
        )

    def parse_item(self, response):
        """Extract the product fields from the rendered page into an item."""
        logging.log(logging.WARNING, "This is a warning")
        item = WallmartItem()
        # Originally one comma-chained line; split into separate statements
        # so each field gets its own scalar/list value instead of tuples.
        item['title'] = response.xpath('//*[@id="product-overview"]/div/div[3]/div/h1//text()').get()
        item['price'] = response.css('span.price-characteristic::attr(content)').getall()
        item['deliveryDate'] = response.css('p.no-margin::text').get()
        item['pictures'] = response.css('img.prod-alt-image-carousel-image--left::attr(src)').getall()
        item['description'] = response.css('div.about-desc ::text').getall()
        yield item
在我把视图中的 data 字典 POST 给 Scrapyrt 之后,从日志看爬虫似乎正常启动并抓取了页面:
2020-10-21 12:09:28 [scrapy] DEBUG: Crawled (200) <GET https://www.site/robots.txt> (referer: None)
2020-10-21 12:09:29 [scrapy] DEBUG: Crawled (200) <GET https://www.site/ip/Fruit-of-the-Loom-Men-s-and-Big-Men-s-Eversoft-Fleece-Full-Zip-Hoodie-Jacket-up-to-Size-3XL/630234494> (referer: None)
2020-10-21 12:09:29 [scrapy] INFO: Closing spider (finished)
2020-10-21 12:09:29 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1142,'downloader/request_count': 2,'downloader/request_method_count/GET': 2,'downloader/response_bytes': 113270,'downloader/response_count': 2,'downloader/response_status_count/200': 2,'elapsed_time_seconds': 1.577416,'finish_reason': 'finished','finish_time': datetime.datetime(2020,10,21,18,9,29,136179),'log_count/DEBUG': 13,'log_count/INFO': 7,'offsite/filtered': 1,'request_depth_max': 1,'response_received_count': 2,'robotstxt/request_count': 1,'robotstxt/response_count': 1,'robotstxt/response_status_count/200': 1,'scheduler/dequeued': 1,'scheduler/dequeued/memory': 1,'scheduler/enqueued': 1,'scheduler/enqueued/memory': 1,'start_time': datetime.datetime(2020,27,558763)}
2020-10-21 12:09:29 [scrapy] INFO: Spider closed (finished)
没有要解析的项目或信息,我认为问题可能是scrapyrt没有调用我的回调函数,但我仍然不太确定
任何信息将不胜感激
解决方法
整个问题出在 start_requests 处理传入参数的方式上:Scrapyrt 传给 start_requests 的是一个 Request 对象而不是 URL 字符串,因此必须先通过 request.url 取出 URL,再把它传给回调。如果有人遇到同样的问题,请按此修改。
类似这样的东西:
def start_requests(self, request):
    """Build the Splash request from the Request object Scrapyrt hands in.

    Scrapyrt passes a Request object rather than a URL string, so the
    target address is taken from ``request.url`` before constructing the
    SplashRequest that runs the spider's Lua script.
    """
    target = str(request.url)
    splash_args = {'lua_source': self.script1}
    return SplashRequest(
        url=target,
        callback=self.parse_item,
        endpoint='execute',
        args=splash_args,
    )
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。