如何解决一段时间后,爬虫引擎卡住了,没有显示任何错误/日志
一段时间后,广泛爬虫(broad crawl)引擎卡住了,不显示任何错误/日志。我已经多次更改设置,例如集成 15 个专用 IP 代理、download_delay、concurrent_items、concurrent_requests、concurrent_requests_per_ip 等,但仍然面临这个问题。
myspider.py
import datetime
import urllib3
from six.moves.urllib.parse import urlsplit
import scrapy
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import CustomLinkExtractor
from ..items import broadCrawlerItem
import collections
from ..extractors import DateExtractor,FaviconExtractor
import extraction
from scrapy.utils.project import get_project_settings
import os
import tldextract
from urllib.parse import urlparse
class FollowAllSpider(CrawlSpider):
    """Broad crawler: follows every extracted link except denied domains,
    and yields an item per page that has a meta description.

    NOTE(review): `parse_item` calls `self.get_title`, `self.get_Meta_keywords`
    and `self.get_Meta_description`, which are not visible in this listing —
    presumably defined elsewhere on the class; confirm before running.
    """

    name = 'follow_all'
    start_urls = [
        'https://news.google.com/topstories',"https://en.wikipedia.org/wiki/Bill_Gates","https://detailed.com/50/","https://www.techlearning.com/news/15-awesome-article-sites-for-students","https://jamesclear.com/articles",'https://en.wikipedia.org/wiki/Donald_Trump','https://en.wikipedia.org/wiki/Elon_Musk'
    ]
    # Domains the link extractor must never follow.
    denylist = ['icann.org','blogspot.com','ganji.com','dihe.cn','google.com','twitter.com','glassdoor.com','glassdoor.ie','youtube.com','wordcamp.com','wordcamp.org','ganchang.cn','aa.com.tr','xinhuanet.com','nasdaq.com','wikipedia.org','wikinews.org','wikimedia.org','indianexpress.com','whatsapp.com','edweek.org','apple.com','facebook.com','reddit.com','linkedin.com','stackoverflow.com','t.co','fzcom.cn','github.com','amazon.com']
    rules = [Rule(CustomLinkExtractor(deny_domains=denylist), process_links='filter_links', follow=True, callback='parse_item')]
    # Running count of pages successfully parsed (class-level; shared by design).
    count = 1

    # Substrings that mark a link as uninteresting (auth pages, policy pages, ...).
    _SKIP_MARKERS = ('privacy', 'forgot', 'password', 'developer',
                     'login', 'twitter.com', 'linkedin.com')

    def filter_links(self, links):
        """Yield only links whose URL contains none of the skip markers.

        BUG FIX: the original condition ended with `or 'linkedin.com'`
        (missing `in url`). A non-empty string is always truthy, so the
        `if` was True for EVERY link and `continue` discarded them all —
        the spider never followed anything beyond start_urls and the crawl
        silently stalled with no error logged.
        """
        for link in links:
            url = link.url.lower()
            if any(marker in url for marker in self._SKIP_MARKERS):
                continue
            yield link

    def parse_item(self, response):
        """Build an item for one crawled page; skip pages without a meta description."""
        items = broadCrawlerItem()
        absolute_url = response.request.url
        # scheme + netloc, e.g. "https://example.com"
        domain = urlsplit(absolute_url)[0] + "://" + urlsplit(absolute_url)[1]
        title = self.get_title(response)
        Meta_keywords = self.get_Meta_keywords(response)
        extracted = extraction.Extractor().extract(response.body, source_url=absolute_url)
        Meta_descripiton = self.get_Meta_description(extracted)
        if not Meta_descripiton:
            print('description not found..!')
            return
        print(f'{self.count} | {domain} | {absolute_url}')
        self.count += 1
        yield items
settings.py
# Scrapy project settings for the broad crawler.
# Scrapy setting names are CASE-SENSITIVE uppercase identifiers; the fixes
# below repair two mangled names that made their settings silently ignored.
BOT_NAME = 'broad_crawler'
SPIDER_MODULES = ['broad_crawler.spiders']
NEWSPIDER_MODULE = 'broad_crawler.spiders'

# BUG FIX: was 'ITEM_PIPELInes' — wrong case, so Scrapy never registered the
# pipeline and items were silently dropped without any error.
ITEM_PIPELINES = {
    'broad_crawler.pipelines.broadCrawlerPipeline': 300,
}

# Concurrency / politeness tuning.
DOWNLOAD_DELAY = 0
CONCURRENT_ITEMS = 15
CONCURRENT_REQUESTS = 20
CONCURRENT_REQUESTS_PER_IP = 8
CONCURRENT_REQUESTS_PER_DOMAIN = 2
REACTOR_THREADPOOL_MAXSIZE = 20
ROBOTSTXT_OBEY = False
DOWNLOAD_TIMEOUT = 8
USER_AGENT = 'my-bot'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_MAXSIZE = 5592405  # ~5.3 MiB cap per response
REDIRECT_ENABLED = False
AJAXCRAWL_ENABLED = True
LOG_LEVEL = 'WARN'

# BUG FIX: were 'REdis_HOST'/'REdis_PORT' — scrapy-redis reads REDIS_HOST and
# REDIS_PORT, so the mangled names were ignored and defaults used instead.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# NOTE(review): SCHEDULER is commented out while SCHEDULER_QUEUE_CLASS,
# DUPEFILTER_CLASS and SCHEDULER_PERSIST still point at scrapy_redis.
# A persistent redis dupefilter combined with the default scheduler means
# fingerprints survive restarts and previously-seen requests are dropped
# forever — a plausible cause of the crawl "hanging" with no errors.
# Either re-enable the scrapy_redis scheduler or disable persistence; verify.
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
# REdis_URL = "redis =//127.0.0.1 =6379"
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。