
python - Running Scrapy spiders through Twisted inlineCallbacks


I am getting ImportError: No module named 'spiders', so I think the environment is not set up correctly at the point where the spider call happens. But I don't fully understand how to get this working properly.

Basically, I want to run a few Scrapy spiders that populate a database, and then my program should do a small computation. This should happen periodically (say, every minute). Since Scrapy already depends on Twisted, I decided to combine the two. The project structure is (simplified):

-Project
 |-src
 |  |- __init__.py
 |  |- spiders.py
 |-bot.py

In spiders.py I have two separate spiders, and they work fine when I launch them from within that file. But now I have added some logic in bot.py and ended up with:

from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import task
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks, returnValue

from src.spiders import first_spider, second_spider

def do_some_stuff(): pass

if __name__ == '__main__':
    runner = CrawlerRunner(get_project_settings())

    @inlineCallbacks
    def cycle():
        yield runner.crawl(first_spider)
        yield runner.crawl(second_spider)
        returnValue(do_some_stuff())

    timeout = 60.0

    l = task.LoopingCall(cycle)
    l.start(timeout)

    reactor.run()

And the error traceback:

2017-04-21 15:32:26 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole']
2017-04-21 15:32:26 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-04-21 15:32:26 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
Unhandled error in Deferred:
2017-04-21 15:32:26 [twisted] CRITICAL: Unhandled error in Deferred:

2017-04-21 15:32:26 [twisted] CRITICAL:
Traceback (most recent call last):
  File "projectpath/venv/lib/python3.5/site-packages/twisted/internet/defer.py", line 1299, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "projectpath/venv/lib/python3.5/site-packages/twisted/python/failure.py", line 393, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "projectpath/bot.py", line 141, in cycle
    yield runner.crawl(first_spider)
ImportError: No module named 'spiders'

Update. The imports in spiders.py:

import hashlib
import json

import pymongo
import scrapy

from scrapy.crawler import CrawlerRunner
from scrapy.exceptions import DropItem
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

Best Answer

So your project structure is:

.
├── bot.py
└── src
    ├── __init__.py
    └── spiders.py

To run it, you should set PYTHONPATH like this:

$ PYTHONPATH=. python3 bot.py
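
If setting PYTHONPATH is inconvenient, a rough alternative (a sketch, not part of the original answer) is to prepend the project root to sys.path at the top of bot.py, before the src.spiders import, so the src package resolves no matter where the script is started from:

# top of bot.py -- sketch; assumes bot.py sits in the project root next to src/
import os
import sys

PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.spiders import first_spider, second_spider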

Here is a functional single-file Scrapy project that runs a crawl cycle every 60 seconds.

# scraper.py
import datetime
import json
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.item import Item, Field
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from twisted.internet import task
from twisted.internet.defer import inlineCallbacks


class JsonWriterPipeline(object):
    # appends every scraped item as one JSON line to the file named in JSON_FILE
    def open_spider(self, spider):
        self.file = open(spider.settings['JSON_FILE'], 'a')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


class QuoteItem(Item):
    text = Field()
    author = Field()
    tags = Field()
    spider = Field()


class QuotesSpiderOne(scrapy.Spider):
    # scrapes page 1 of quotes.toscrape.com
    name = "quotes1"

    def start_requests(self):
        urls = ['http://quotes.toscrape.com/page/1/', ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            item = QuoteItem()
            item['text'] = quote.css('span.text::text').get()
            item['author'] = quote.css('small.author::text').get()
            item['tags'] = quote.css('div.tags a.tag::text').getall()
            item['spider'] = self.name
            yield item


class QuotesSpiderTwo(scrapy.Spider):
    # scrapes page 2 of quotes.toscrape.com
    name = "quotes2"

    def start_requests(self):
        urls = ['http://quotes.toscrape.com/page/2/', ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            item = QuoteItem()
            item['text'] = quote.css('span.text::text').get()
            item['author'] = quote.css('small.author::text').get()
            item['tags'] = quote.css('div.tags a.tag::text').getall()
            item['spider'] = self.name
            yield item


def do_some_stuff():
    print(datetime.datetime.now().strftime("%H:%M:%S"))


@inlineCallbacks
def cycle():
    # crawl both spiders sequentially, then run the post-processing step
    yield runner.crawl(QuotesSpiderOne)
    yield runner.crawl(QuotesSpiderTwo)
    return do_some_stuff()


if __name__ == '__main__':
    settings = dict()
    settings['USER_AGENT'] = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    settings['HTTPCACHE_ENABLED'] = True
    settings['JSON_FILE'] = 'items.jl'
    settings['ITEM_PIPELINES'] = dict()
    # the pipeline is referenced via __main__ because everything lives in this one file
    settings['ITEM_PIPELINES']['__main__.JsonWriterPipeline'] = 800

    configure_logging()
    runner = CrawlerRunner(settings=settings)
    timeout = 60.0

    l = task.LoopingCall(cycle)
    l.start(timeout)

    reactor.run()

Run it with:

$ python3 scraper.py
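
Each scraped quote is appended by JsonWriterPipeline as one JSON line to items.jl; the lines look roughly like this (illustrative placeholder values, the actual content depends on the pages):

{"text": "...", "author": "...", "tags": ["...", "..."], "spider": "quotes1"}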

One advantage of a single-file Scrapy project is that it is easy to build a pyinstaller binary from it.
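
As a rough illustration (not part of the original answer; assuming pyinstaller is installed), the binary could be built with something like:

$ pip install pyinstaller
$ pyinstaller --onefile scraper.py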

Alternative looping logic can be built with reactor.callLater. This variant allows the timeout to be adjusted from within the cycle:

# above code stays the same

@inlineCallbacks
def cycle(runner, timeout):
    yield runner.crawl(QuotesSpiderOne)
    yield runner.crawl(QuotesSpiderTwo)
    do_some_stuff()
    # schedule the next cycle only after this one has finished
    reactor.callLater(timeout, cycle, runner, timeout)


def main():
    settings = dict()
    settings['USER_AGENT'] = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    settings['HTTPCACHE_ENABLED'] = True
    settings['JSON_FILE'] = 'items.jl'
    settings['ITEM_PIPELINES'] = dict()
    settings['ITEM_PIPELINES']['__main__.JsonWriterPipeline'] = 800

    configure_logging()
    runner = CrawlerRunner(settings=settings)
    timeout = 60.0

    reactor.callLater(timeout, cycle, runner, timeout)
    reactor.run()


if __name__ == '__main__':
    main()

Regarding python - Running Scrapy spiders through Twisted inlineCallbacks, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/43544516/
