gpt4 book ai didi

python - Scrapy 蜘蛛不包括所有请求的页面

转载 作者:太空宇宙 更新时间:2023-11-04 06:03:47 25 4
gpt4 key购买 nike

我有一个 Yelp 的 Scrapy 脚本,在大多数情况下,它是有效的。基本上我可以为它提供 Yelp 页面列表,它应该返回所有页面的所有评论。到目前为止的脚本如下:

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem

RESTAURANTS = ['sixteen-chicago']

def createRestaurantPageLinks(self, response):
    """Build follow-up requests for the review pages after the first one.

    Reads the total review count from the business page in ``response`` and
    returns one ``Request`` per additional page of reviews, each using
    ``self.parse`` as its callback.

    Args:
        self: The spider whose ``parse`` method handles the paginated pages.
        response: Response for the first (un-paginated) business page.

    Returns:
        A list of ``Request`` objects for ``?start=40``, ``?start=80``, ...;
        empty when no review count is found or all reviews fit on one page.
    """
    reviewsPerPage = 40
    sel = Selector(response)
    countText = sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()
    if not countText:
        # No review count on the page: nothing to paginate.
        return []
    totalReviews = int(countText[0].strip().split(' ')[0])
    # Offsets run 0, 40, 80, ...; offset 0 is the page already fetched.
    # Floor division keeps the count an int on Python 3, and the "- 1"
    # avoids requesting an empty page when the total is an exact multiple
    # of the page size.
    extraPages = max(totalReviews - 1, 0) // reviewsPerPage
    return [
        Request(url=response.url + '?start=' + str(reviewsPerPage * (n + 1)),
                callback=self.parse)
        for n in range(extraPages)
    ]

# Scrapy spider for Yelp business pages (the version posted in the question).
# NOTE(review): indentation was lost in this copy of the snippet, so the
# nesting of the statements below cannot be determined from the text alone.
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
# One start URL per slug in RESTAURANTS.
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

# Callback for responses: builds one YelpReviewItem per matched review node.
def parse(self, response):
requests = []

sel = Selector(response)
# Each <div> matching this class string is treated as one review.
reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
items = []

for review in reviews:
item = YelpReviewItem()
# Venue name comes from the page-level selector, not from the review node.
item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
item['url'] = response.url
items.append(item)
# NOTE(review): if this return sits at function level, the pagination code
# below is unreachable -- confirm the nesting against the original post.
return items

# True only when the URL has no "?start=" query, i.e. it is not one of the
# paginated requests built by createRestaurantPageLinks.
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
return requests

但是,我遇到的问题是这个特定的脚本会抓取每个请求的评论的每一页,除了第一页。如果我注释掉最后一个“if”语句,它只会抓取第一页。我怀疑我所需要的只是一个简单的“else”命令,但我很困惑……非常感谢帮助!

编辑:这是目前基于收到的帮助的代码...

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem

RESTAURANTS = ['sixteen-chicago']

def createRestaurantPageLinks(self, response):
    """Build follow-up requests for the review pages after the first one.

    Reads the total review count from the business page in ``response`` and
    returns one ``Request`` per additional page of reviews, each using
    ``self.parse`` as its callback.

    Args:
        self: The spider whose ``parse`` method handles the paginated pages.
        response: Response for the first (un-paginated) business page.

    Returns:
        A list of ``Request`` objects for ``?start=40``, ``?start=80``, ...;
        empty when no review count is found or all reviews fit on one page.
    """
    reviewsPerPage = 40
    sel = Selector(response)
    countText = sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()
    if not countText:
        # No review count on the page: nothing to paginate.
        return []
    totalReviews = int(countText[0].strip().split(' ')[0])
    # Offsets run 0, 40, 80, ...; offset 0 is the page already fetched.
    # Floor division keeps the count an int on Python 3, and the "- 1"
    # avoids requesting an empty page when the total is an exact multiple
    # of the page size.
    extraPages = max(totalReviews - 1, 0) // reviewsPerPage
    return [
        Request(url=response.url + '?start=' + str(reviewsPerPage * (n + 1)),
                callback=self.parse)
        for n in range(extraPages)
    ]

# Scrapy spider for Yelp business pages (the question's edited version,
# which yields items and requests instead of returning lists).
# NOTE(review): indentation was lost in this copy of the snippet, so the
# nesting of the statements below cannot be determined from the text alone.
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
# One start URL per slug in RESTAURANTS.
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

# Callback for responses: yields review items and, for URLs without
# "?start=", the pagination requests as well.
def parse(self, response):
requests = []

sel = Selector(response)
# Each <div> matching this class string is treated as one review.
reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
# NOTE(review): `items` is assigned but not used anywhere in this version.
items = []

for review in reviews:
item = YelpReviewItem()
# Venue name comes from the page-level selector, not from the review node.
item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
item['url'] = response.url
# NOTE(review): if this yield sits outside the for-loop body, only the last
# review of each page is emitted -- confirm the nesting against the original.
yield item

# True only when the URL has no "?start=" query, i.e. it is not one of the
# paginated requests built by createRestaurantPageLinks.
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
for request in requests:
yield request

正如下面的评论中提到的,按原样运行此代码会抓取每个所需的页面,但它每个页面只返回一条评论,而不是所有评论。

我尝试将 yield item 改为 yield items,但每个被抓取的 URL 都会返回错误消息 ERROR: Spider must return Request, BaseItem or None, got 'list' in <GET http://www.yelp.com/biz/[...]>。

最佳答案

您需要重新组织一下方法。首先在 parse() 方法中解析餐厅页面。然后,返回评论请求并以另一种方法处理响应,例如parse_review():

import re

from scrapy.item import Item, Field
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from yelp2.items import YelpReviewItem


RESTAURANTS = ['sixteen-chicago']

class Yelp2aSpider(Spider):
    """Scrape the reviews of each Yelp business listed in ``RESTAURANTS``.

    ``parse`` handles the business landing page and schedules the remaining
    review pages; ``parse_review`` turns one page of reviews into items.
    """

    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        """Handle a business landing page.

        Yields the review items found on the landing page itself, followed
        by one ``Request`` per additional review page (``?start=40``,
        ``?start=80``, ...) with ``parse_review`` as the callback.

        Args:
            response: Response for one of ``start_urls``.
        """
        # The landing page is never re-requested with a "?start=" offset, so
        # its own reviews have to be extracted here or they would be skipped.
        for item in self.parse_review(response):
            yield item

        reviewsPerPage = 40
        sel = Selector(response)
        countText = sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()
        if not countText:
            # No review count on the page: nothing further to schedule.
            return
        totalReviews = int(countText[0].strip().split(' ')[0])
        # Floor division keeps the page count an int on Python 3; the "- 1"
        # avoids an empty trailing page when the total is an exact multiple
        # of the page size.
        extraPages = max(totalReviews - 1, 0) // reviewsPerPage
        for n in range(extraPages):
            yield Request(url=response.url + '?start=' + str(reviewsPerPage * (n + 1)),
                          callback=self.parse_review)

    def parse_review(self, response):
        """Yield one ``YelpReviewItem`` per review node found in ``response``.

        Args:
            response: Response for any page that contains review markup.
        """
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        for review in reviews:
            item = YelpReviewItem()
            # Venue name is page-level metadata, so it is read from ``sel``
            # rather than from the individual review node.
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item

关于python - Scrapy 蜘蛛不包括所有请求的页面,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/23305502/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com