This article presents a Python example of running Scrapy from a script, shared here for reference. Note that the code below targets the legacy Scrapy 0.x API and Python 2; each crawl is run in a child process because Twisted's reactor cannot be restarted within the same process. The details are as follows:
#!/usr/bin/python
# Note: this script uses the legacy Scrapy 0.x API (scrapy.conf,
# scrapy.project, scrapy.xlib.pydispatch) and Python 2.
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'project.settings')  # must be set before any other Scrapy import

from scrapy import log, signals, project
from scrapy.xlib.pydispatch import dispatcher
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process, Queue

class CrawlerScript():
    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        # Collect every scraped item via the item_passed signal.
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        # Run each crawl in a child process: Twisted's reactor cannot be
        # restarted once stopped, so every crawl needs a fresh process.
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider))
        p.start()
        p.join()
        return queue.get(True)

# Usage: runs spider1 once, then spider2 three times.
if __name__ == "__main__":
    log.start()
    items = []
    crawler = CrawlerScript()
    items.append(crawler.crawl('spider1'))
    for i in range(3):
        items.append(crawler.crawl('spider2'))
    print items
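On modern Scrapy (1.x and later) the multiprocessing workaround is usually unnecessary for a one-shot script: scrapy.crawler.CrawlerProcess manages the reactor itself, and all crawls can be scheduled before a single start() call. Below is a minimal sketch of the same idea on the current API, assuming a Scrapy project where 'spider1' and 'spider2' are placeholder spider names registered in that project.

#!/usr/bin/python
# Minimal sketch: run spiders from a script on modern Scrapy (1.x+).
# 'spider1' and 'spider2' are placeholder spider names from your project.
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

items = []

def collect_item(item, response, spider):
    items.append(item)

process = CrawlerProcess(get_project_settings())
# Schedule all spiders before starting; the reactor can only be started
# once per process, so every crawl must be queued up front.
crawler = process.create_crawler('spider1')
crawler.signals.connect(collect_item, signal=signals.item_scraped)
process.crawl(crawler)
process.crawl('spider2')
process.start()  # blocks until every scheduled crawl finishes
print(items)

Here the item_scraped signal plays the role that item_passed played in the old API: the handler appends each scraped item to a list, so the script can inspect the results after start() returns.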
Hopefully this article is helpful to your Python programming.