gpt4 book ai didi

python - 项目中包含多个蜘蛛时,部署到 Scrapinghub 失败

转载 作者:太空宇宙 更新时间:2023-11-03 13:59:45 24 4
gpt4 key购买 nike

我使用 scrapy 创建一个项目并将数据保存到我的 mongodb 中。它可以工作。

这是我的代码:

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import time

# scrapy api imports
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# I import a lots of spider file here.
from Tainan.Madou import Madou
# from ... import ...
# from ... import ...

# Spider Array: add spider into array
CrawlersArray = [ Madou ]

class MoviesSpider(scrapy.Spider):
    """Spider named 'movies' that starts from the Yahoo Taiwan weekly listing.

    Only the spider name, the allowed domains and the start URL are declared
    here; the class defines no callback methods of its own.
    """

    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']


# NOTE: these statements sit at module top level without a ``__main__``
# guard, so they execute whenever this module is imported, not only when
# the file is run as a script.
process = CrawlerProcess(get_project_settings())

# Schedule every spider class listed in CrawlersArray on the same process.
for spider in CrawlersArray:
    process.crawl(spider)

# Start crawling the spiders scheduled above.
process.start()

这是我的 Madou 蜘蛛。我有很多类似 Madou 的蜘蛛;如果不添加 if __name__ == '__main__':,我可以运行我所有的蜘蛛。

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request

from TainanItem import Tainan, MovieReleased
# Madou Theater (麻豆戲院)
class Madou(scrapy.Spider):
    """Scrape the Madou theater listing page and each listed movie's page.

    All scraped state lives in module-level globals that are shared between
    ``parse`` and ``parse_page``. ``parse`` collects the per-movie listing
    data and schedules one detail request per poster; ``parse_page`` appends
    the detail data and yields the single ``tainan`` item once the number of
    handled detail pages equals the number of posters found on the listing.
    """

    name = 'Madou'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['https://movies.yahoo.com.tw/theater_result.html/id=68']

    # The ``global`` statements make the assignments below bind module-level
    # names rather than class attributes, which is what lets the callbacks
    # reach them through their own ``global`` declarations.
    global tainan, movieReleased
    tainan = Tainan()
    movieReleased = MovieReleased()

    global detailDict
    detailDict = {}
    global locationDetail
    locationDetail = {}
    global lonAndLatArray
    global cnNameArray, enNameArray, releasedTimeArray, versionTypeArray, movieStyleArray, moviePhotoArray, movieContentArray, nextPageHrefArray
    global movieDateArray, movieTimeArray, movieStillsArray, movieActorCnArray, movieActorPhotoArray
    # Parallel lists: one entry is appended per movie by the callbacks.
    cnNameArray = []
    enNameArray = []
    versionTypeArray = []
    movieStyleArray = []
    releasedTimeArray = []
    moviePhotoArray = []
    movieContentArray = []
    nextPageHrefArray = []
    movieDateArray = []
    movieTimeArray = []
    movieStillsArray = []
    movieActorCnArray = []
    movieActorPhotoArray = []
    lonAndLatArray = []
    # dataLen: number of posters found by ``parse``;
    # countLen: number of detail pages handled so far by ``parse_page``.
    global dataLen, countLen
    dataLen = 0
    countLen = 0

    def parse(self, response):
        """Handle the theater listing page.

        Fills the static theater fields, appends the listing data of every
        movie to the shared lists, yields one ``Request`` per poster link
        (handled by ``parse_page``) and finally assembles the nested
        ``film`` / ``geometry`` / ``movie`` structures on the shared item.
        """
        global tainan
        global movieReleased, detailDict, locationDetail
        global lonAndLatArray
        global cnNameArray, enNameArray, versionTypeArray, movieStyleArray, releasedTimeArray, moviePhotoArray, movieContentArray
        global movieDateArray, movieTimeArray, movieStillsArray, movieActorCnArray, movieActorPhotoArray
        global nextPageHrefArray
        global dataLen
        tainan['theater'] = 'Madou'
        tainan['theaterCn'] = '麻豆戲院'
        tainan['address'] = '台南縣麻豆鎮興中路106號3樓'
        tainan['phone'] = '06-5722159'
        # Stored as [longitude, latitude] per the variable name; used below
        # as the "coordinates" of a "Point" geometry.
        lonAndLatArray = [float(120.251206), float(23.183880)]

        htmlNodes = response.xpath('//div[@class="release_info_text"]')
        for htmlNode in htmlNodes:
            cnName = htmlNode.xpath('.//div[@class="theaterlist_name"]/a/text()').extract_first()
            enName = htmlNode.xpath('.//div[@class="en"]/a/text()').extract_first()
            versionType = htmlNode.xpath('.//div[@class="tapR"]/text()').extract_first()
            releasedTime = htmlNode.xpath('.//ul[@class="theater_time"]/li/text()').extract()

            cnNameArray.append(cnName)
            enNameArray.append(enName)
            versionTypeArray.append(versionType)
            releasedTimeArray.append(releasedTime)

        # Each detail request gets a lower priority than the previous one --
        # presumably so detail pages are handled in listing order and the
        # parallel lists stay aligned (TODO confirm intent).
        i = 1000
        dataLen = len(response.xpath('//div[@class="release_foto"]'))
        photoNodes = response.xpath('//div[@class="release_foto"]')
        for photoNode in photoNodes:
            contentHref = photoNode.xpath('.//a/@href').extract_first()
            yield Request(contentHref, callback=self.parse_page, priority=i, dont_filter=True)
            i -= 1
            photoHref = photoNode.xpath('.//a/img/@src').extract_first()
            moviePhotoArray.append(photoHref)

        detailDict.update({
            'cnName': cnNameArray,
            'enName': enNameArray,
            'movieContent': movieContentArray,
            'versionType': versionTypeArray,
            'movieStyle': movieStyleArray,
            'releasedTime': releasedTimeArray,
            'moviePhoto': moviePhotoArray,
            'movieDate': movieDateArray,
            'movieTime': movieTimeArray,
            'movieStills': movieStillsArray,
            'movieActorCn': movieActorCnArray,
            'movieActorPhoto': movieActorPhotoArray})

        locationDetail.update({
            'type': "Point",
            'coordinates': lonAndLatArray
        })

        # ``dict(...)`` makes shallow copies: the list objects are shared, so
        # values appended later by ``parse_page`` are visible through the item.
        movieReleased['film'] = dict(detailDict)
        tainan['geometry'] = dict(locationDetail)
        tainan['movie'] = dict(movieReleased)

    def parse_page(self, response):
        """Handle one movie detail page.

        Appends the page's description, date, running time, stills, genre
        and cast data to the shared lists, then yields the shared ``tainan``
        item when ``countLen`` reaches ``dataLen``.
        """
        global movieContentArray, countLen, dataLen
        global movieDateArray, movieTimeArray, movieStillsArray, movieStyleArray, movieActorCnArray, movieActorPhotoArray
        movieContent = response.xpath('//div[@class="gray_infobox_inner"]/span/text()').extract_first()
        # Indexing [0] / [1] raises IndexError if the page has fewer than two
        # matching <span> text nodes.
        movieDate = response.xpath('//*[@class="movie_intro_info_r"]/span/text()')[0].extract()
        movieTime = response.xpath('//*[@class="movie_intro_info_r"]/span/text()')[1].extract()
        movieStills = response.xpath('//ul[@class="trailer_list imglist"]//div[@class="foto"]/img/@src').extract()
        movieStyle = response.xpath('//div[@class="level_name_box"]//div[@class="level_name"]/a/text()').extract()
        movieActorCn = response.xpath('//ul[@class="trailer_list alist starlist"]/li/a//div[@class="fotoinner"]/img/@title').extract()
        movieActorPhoto = response.xpath('//ul[@class="trailer_list alist starlist"]/li/a//div[@class="fotoinner"]/img/@src').extract()
        movieContentArray.append(movieContent)
        movieDateArray.append(movieDate)
        movieTimeArray.append(movieTime)
        movieStillsArray.append(movieStills)
        movieStyleArray.append(movieStyle)
        movieActorCnArray.append(movieActorCn)
        movieActorPhotoArray.append(movieActorPhoto)

        # Emit the item only once, after the last expected detail page.
        countLen += 1
        if countLen == dataLen:
            yield tainan

但是当我想将我的项目部署到 Scrapinghub 时,我收到错误

Exceeded container timeout 60s

我从github上找到了解决方案 https://github.com/scrapinghub/shub/issues/273

我不确定如何使用第一个解决方案,所以我像提问者一样尝试第二个解决方案。

我像这样修复代码:

if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script, so that
    # merely importing the module does not launch a crawler process.
    process = CrawlerProcess(get_project_settings())

    # Schedule every spider class listed in CrawlersArray on the same process.
    for spider in CrawlersArray:
        process.crawl(spider)

    # Start crawling the spiders scheduled above.
    process.start()

它可以成功地将项目部署到Scrapinghub,但是当我运行该项目时,我发现没有任何蜘蛛运行。

为什么?我想不通。

如有任何帮助,我们将不胜感激。提前致谢。

这是我运行项目时的终端信息:

File "/Library/Python/2.7/site-packages/scrapy/spiders/__init__.py", line 90, in parse
raise NotImplementedError
NotImplementedError
2018-03-18 10:40:25 [scrapy.core.engine] INFO: Closing spider (finished)
2018-03-18 10:40:25 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 828,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 87445,
'downloader/response_count': 3,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 3, 18, 2, 40, 25, 666163),
'log_count/DEBUG': 4,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'memusage/max': 53428224,
'memusage/startup': 53424128,
'response_received_count': 1,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'spider_exceptions/NotImplementedError': 1,
'start_time': datetime.datetime(2018, 3, 18, 2, 40, 18, 487308)}
2018-03-18 10:40:25 [scrapy.core.engine] INFO: Spider closed (finished)

尝试修复:

class MoviesSpider(scrapy.Spider):
    """Spider named 'movies' that starts from the Yahoo Taiwan weekly listing."""

    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

    def parse(self, response):
        """Print a marker showing the callback was reached; extracts nothing."""
        print("inside parse")

if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script, so that
    # merely importing the module does not launch a crawler process.
    process = CrawlerProcess(get_project_settings())

    # Schedule every spider class listed in CrawlersArray on the same process.
    for spider in CrawlersArray:
        process.crawl(spider)

    # Start crawling the spiders scheduled above.
    process.start()

日志:

2018-03-18 17:31:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movies.yahoo.com.tw/movie_thisweek.html/> (referer: None)
inside parse
2018-03-18 17:31:34 [scrapy.core.engine] INFO: Closing spider (finished)
2018-03-18 17:31:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:

最佳答案

从日志中可以清楚地看出,您的蜘蛛缺少 parse 回调:

class MoviesSpider(scrapy.Spider):
    """Spider named 'movies' that starts from the Yahoo Taiwan weekly listing."""

    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

    def parse(self, response):
        """Print a marker showing the callback was reached; extracts nothing."""
        print("inside parse")

在解析回调函数中,您解析响应(网页)并返回带有提取数据的字典、Item 对象、Request 对象或这些对象的可迭代对象。这些请求还将包含一个回调(可能相同),然后由 Scrapy 下载,然后由指定的回调处理它们的响应。

更新:整个代码

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import time

# scrapy api imports
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# I import a lots of spider file here.
# from myfile import project

# Spider Array: add spider into array
CrawlersArray = [ ... ]

class MoviesSpider(scrapy.Spider):
    """Spider named 'movies' that starts from the Yahoo Taiwan weekly listing."""

    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

    def parse(self, response):
        """Print a marker showing the callback was reached; extracts nothing."""
        print("inside parse")

if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script, so that
    # merely importing the module does not launch a crawler process.
    process = CrawlerProcess(get_project_settings())

    # Schedule every spider class listed in CrawlersArray on the same process.
    for spider in CrawlersArray:
        process.crawl(spider)

    # Start crawling the spiders scheduled above.
    process.start()

关于python - 部署失败,因为有多个带有 Scrapinghub 的蜘蛛,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49335623/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com