gpt4 book ai didi

Python Scrapy 不断从下一页按钮获取同一页面链接

转载 作者:行者123 更新时间:2023-12-01 03:01:37 24 4
gpt4 key购买 nike

我正在尝试抓取 amazon.com 上有超过 800 条评论的产品链接,但我不断从下一页按钮获取相同的页面链接,它不断地返回第 2 页,而我应该得到第 3 页,4 等等

我已经设置了一个 IF 条件来溢出并将评论字符串(如 1,020)转换为整数,并根据访问页面来比较是否大于 800

这是代码

# -*- coding: utf-8 -*-
import scrapy
from amazon.items import AmazonItem
from urlparse import urljoin


class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']


def parse(self, response):


SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr

item = AmazonItem()

review_selector = './/*[@class="acs_product-rating__review-count"]/text()'
link_selector = './/*[@class="a-link-normal"]/@href'

if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
item['LINKS'] = url
if url:
yield scrapy.Request(url, callback=self.parse_link, meta={'item': item})


next_page = './/span[@class="pagnRA"]/a[@id="pagnNextLink"]/@href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)
def parse_link(self, response):

item = AmazonItem(response.meta['item'])

catselector = '.cat-link ::text'
defaultcatselector = '.nav-search-label ::text'
cat = response.css(catselector).extract_first()
if cat:
item['CATAGORY'] = cat
else:
item['CATAGORY'] = response.css(defaultcatselector).extract_first()
return item

这是我在递归调用解析函数之前打印下一页链接时的输出

herehere

这是页面下一页选择器的屏幕截图 here我哪里出错了?

最佳答案

将下一页代码块移到循环之外。

class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']


def parse(self, response):


SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr


review_selector = './/*[@class="acs_product-rating__review-count"]/text()'
link_selector = './/*[@class="a-link-normal"]/@href'

if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())


next_page = './/span[@class="pagnRA"]/a[@id="pagnNextLink"]/@href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)

if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)

关于Python Scrapy 不断从下一页按钮获取同一页面链接,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43778461/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com