def run3_crawlerRunner():
    '''If your application already uses Twisted, it is recommended to use
    CrawlerRunner instead of CrawlerProcess.
    Note that you will also have to shut down the Twisted reactor yourself after
    the spider is finished. This can be achieved by adding callbacks to the
    deferred returned by the CrawlerRunner.crawl method.
    '''
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.project import get_project_settings

    runner = CrawlerRunner(get_project_settings())
    # 'spidername' is the name of one of the spiders of the project.
    d = runner.crawl('spidername')
    # stop reactor when spider closes
    # d.addBoth(lambda _: reactor.stop())
    d.addBoth(spider_closing)  # equivalent to the lambda above
    reactor.run()  # the script will block here until the crawling is finished
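# NOTE: run3_crawlerRunner above attaches spider_closing to the deferred, but this
# section never defines it. A minimal sketch of such a callback (name taken from
# the call site; behaviour assumed to mirror the commented-out lambda) is:
def spider_closing(result):
    """Stop the Twisted reactor once the crawl's deferred fires.

    Attached with addBoth, so it runs on both success and failure; the result
    (or Failure) is returned unchanged to keep the callback chain intact.
    """
    from twisted.internet import reactor
    reactor.stop()
    return result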
def run4_multiple_spider():
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    from scrapy_test1.spiders import myspider1, myspider2
    for s in [myspider1, myspider2]:
        process.crawl(s)
    process.start()
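# A variant of run4_multiple_spider: when CrawlerProcess is given the project
# settings, spiders can also be scheduled by name instead of importing their
# classes. The names 'spider1'/'spider2' below are placeholders, not spiders
# known to exist in this project.
def run4b_multiple_spider_by_name():
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    for name in ['spider1', 'spider2']:
        process.crawl(name)
    process.start()  # blocks until all crawling jobs are finished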
def run5_multiplespider():
    '''using CrawlerRunner'''
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    configure_logging()
    runner = CrawlerRunner()
    from scrapy_test1.spiders import myspider1, myspider2
    for s in [myspider1, myspider2]:
        runner.crawl(s)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until all crawling jobs are finished
def run6_multiplespider():
    '''Run the spiders sequentially by chaining their deferreds.'''
    from twisted.internet import reactor, defer
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        from scrapy_test1.spiders import myspider1, myspider2
        for s in [myspider1, myspider2]:
            yield runner.crawl(s)
        reactor.stop()

    crawl()
    reactor.run()  # the script will block here until the last crawl call is finished
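# Note: the Twisted reactor cannot be restarted, so only one of the reactor-based
# functions (run3/run5/run6) should be called per process; a second reactor.run()
# raises twisted.internet.error.ReactorNotRestartable.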
if __name__ == '__main__':
    # run4_multiple_spider()
    # run5_multiplespider()
    run6_multiplespider()