How to fix the "Unexpected token '<'" error from lua_source with Python 3.9 + Scrapy + Splash
I'm trying to practice automatically scraping an article list and each article's content with Scrapy and Splash, but I've run into an error I can't resolve.
After sending some HTML nodes via lua_source, I always get the following error (Splash docker logs):
2020-10-20 11:50:49.420004 [-] Server listening on http://0.0.0.0:8050
2020-10-20 12:15:55.623975 [events] {"path": "/execute","rendertime": 8.717848777770996,"maxRSS": 247656,"load": [0.0,0.04,0.01],"fds": 82,"active": 0,"qsize": 0,"_id": 140574281705680,"method": "POST","timestamp": 1603196155,"user-agent": "Scrapy/2.4.0 (+https://scrapy.org)","args": {"cookies": [],"headers": {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "en","User-Agent": "Scrapy/2.4.0 (+https://scrapy.org)"},"lua_source": "\nfunction main(splash,args)\n assert(splash:go(args.url))\n assert(splash:wait(args.wait))\n assert(splash:wait(args.wait))\n return splash:html()\nend\n","page": 1,"timeout": 90,"url": "https://segmentfault.com/blogs?page=1","wait": 3,"uid": 140574281705680},"status_code": 200,"client_ip": "172.17.0.1"}
2020-10-20 12:15:55.624471 [-] "172.17.0.1" - - [20/Oct/2020:12:15:55 +0000] "POST /execute HTTP/1.1" 200 78667 "-" "Scrapy/2.4.0 (+https://scrapy.org)"
2020-10-20 12:16:03.121159 [events] {"path": "/execute","rendertime": 7.355923414230347,"maxRSS": 281760,"load": [0.29,0.1,0.03],"fds": 73,"_id": 140574661640768,"timestamp": 1603196163,"Referer": "https://segmentfault.com/blogs?page=1","url": "https://segmentfault.com//a/1190000037533517","uid": 140574661640768},"client_ip": "172.17.0.1"}
2020-10-20 12:16:03.121436 [-] "172.17.0.1" - - [20/Oct/2020:12:16:02 +0000] "POST /execute HTTP/1.1" 200 144939 "-" "Scrapy/2.4.0 (+https://scrapy.org)"
2020-10-20 12:16:03.274100 [events] {"path": "/execute","rendertime": 0.01170206069946289,"fds": 56,"args": {"article_content": "<article class=\"article fmt article-content\" data-id=\"1190000037533517\" data-license=\"cc\"> </p> ...(some very long html nodes)...</article>","cookies": [],args)\n splash.images_enabled = false\n js = string.format(\"document.querySelector('textarea#original_content').value=%s\",args.article_content)\n splash:evaljs(js)\n splash:wait(args.wait)\n click_js = string.format(\"document.querySelector('#translate_button').click()\")\n splash:evaljs(click_js)\n splash:wait(args.wait)\n return_js = string.format(\"document.querySelector('textarea#md_content').value\")\n return splash:evaljs(return_js)\nend\n","url": "http://localhost:8080","wait": 1,"status_code": 400,"client_ip": "172.17.0.1","error": {"error": 400,"type": "ScriptError","description": "Error happened while executing Lua script","info": {"type": "JS_ERROR","js_error_type": "SyntaxError","js_error_message": "Unexpected token '<'","js_error": "SyntaxError: Unexpected token '<'","message": "[string \"...\"]:5: JS error: \"SyntaxError: Unexpected token '<'\"","splash_method": "evaljs","source": "[string \"...\"]","line_number": 5,"error": "JS error: \"SyntaxError: Unexpected token '<'\""}}}
2020-10-20 12:16:03.277508 [-] "172.17.0.1" - - [20/Oct/2020:12:16:02 +0000] "POST /execute HTTP/1.1" 400 469 "-" "Scrapy/2.4.0 (+https://scrapy.org)"
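The last log entry suggests the failure happens inside splash:evaljs: the ScriptError is reported on line 5 of the Lua script and the JS engine raises SyntaxError: Unexpected token '<'. For illustration only (the HTML literal below is a shortened stand-in, not the real article), here is a minimal sketch of what string.format hands to evaljs once the raw markup is substituted in:

-- Illustration only: the HTML fragment is a shortened stand-in for args.article_content.
local article_content = '<article class="article fmt article-content">...</article>'
-- string.format splices the markup into the JS source with no quoting or escaping:
local js = string.format(
    "document.querySelector('textarea#original_content').value=%s",
    article_content)
-- js is now roughly:
--   document.querySelector('textarea#original_content').value=<article class="...">...</article>
-- which is not valid JavaScript, so splash:evaljs(js) fails with "Unexpected token '<'".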
My main spider code is:
from scrapy import Spider, Request
from udaskweb.items import UdaskwebItem
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
script = """
function main(splash,args)
assert(splash:go(args.url))
assert(splash:wait(args.wait))
assert(splash:wait(args.wait))
return splash:html()
end
"""
md_script = """
function main(splash, args)
    splash.images_enabled = false
    js = string.format("document.querySelector('textarea#original_content').value=%s", args.article_content)
    splash:evaljs(js)
    splash:wait(args.wait)
    click_js = string.format("document.querySelector('#translate_button').click()")
    splash:evaljs(click_js)
    splash:wait(args.wait)
    return_js = string.format("document.querySelector('textarea#md_content').value")
    return splash:evaljs(return_js)
end
"""
class SegmentSpider(Spider):
    name = 'segment'
    allowed_domains = ['xxxx.com', 'localhost']
    md_url = 'http://localhost:8080'
    start_urls = 'https://xxxx.com/blogs'
    start_domains = 'https://xxxx.com'

    def start_requests(self):
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            url = self.start_urls + "?page=" + str(page)
            yield SplashRequest(url, callback=self.parse, endpoint='execute',
                                args={'lua_source': script, 'wait': 3, 'page': page, 'timeout': 90})

    def parse(self, response):
        item = UdaskwebItem()
        articles = response.css("div.blog-stream").xpath(
            'section[@class="stream-list__item"]//div[@class="summary"]//h2/a/@href').extract()
        item['links'] = articles
        detail_url = self.start_domains + "/" + articles[0]
        yield SplashRequest(detail_url, meta={"item": item}, callback=self.article_detail,
                            endpoint='execute', args={'lua_source': script, 'wait': 3, 'timeout': 90})
    def article_detail(self, response):
        item = response.meta["item"]
        article = response.css("div.card-body")
        article_content = article.xpath(
            './/article[contains(@class,"article-content")]').extract_first()
        # The line below raises the error. At first I thought article_content might simply be
        # too long (it is a long HTML node string), but even after replacing article_content
        # with 'ddddd' the error still occurs.
        yield SplashRequest(self.md_url, callback=self.get_item, endpoint='execute',
                            args={'lua_source': md_script, 'article_content': article_content, 'wait': 1})

    def get_item(self, response):
        print("=================================")
I've spent almost the whole day trying to fix this, but failed.
Can anyone help? Many thanks.
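For reference, a minimal sketch (not a verified fix) of how md_script could hand the HTML to the page as a function argument instead of splicing it into the JS source. It relies on Splash's splash:jsfunc, which converts Lua arguments into JavaScript values, and it additionally calls splash:go(args.url) on the assumption that the localhost:8080 page has to be loaded before its textareas can be queried:

function main(splash, args)
    splash.images_enabled = false
    -- Assumption: load the local translator page first so its elements exist.
    assert(splash:go(args.url))
    assert(splash:wait(args.wait))
    -- jsfunc compiles a JS function; the content is delivered as an ordinary argument,
    -- so markup characters such as '<' never reach the JS parser as source code.
    local set_content = splash:jsfunc([[
        function (content) {
            document.querySelector('textarea#original_content').value = content;
        }
    ]])
    set_content(args.article_content)
    splash:evaljs("document.querySelector('#translate_button').click()")
    assert(splash:wait(args.wait))
    return splash:evaljs("document.querySelector('textarea#md_content').value")
end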