
python - Scrapy spider only crawls one link per page


I want to scrape all the event data from http://www.nyhistory.org/programs/upcoming-public-programs. The events are paginated, 5 events per page. I created two rules: one that follows the next page and one that follows the event's detail page. So I expect the spider to first enter each event's URL, collect all the data I need there, then move on to the next page, enter each event's URL there, and so on. However, for some reason the spider only visits one event on each page, and it is always the first one. See my code below:

import scrapy
from nyhistory.items import EventItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from datetime import datetime
from w3lib.html import remove_tags
from scrapy.selector import Selector
import re

class NyhistorySpider(CrawlSpider):

    name = "events"

    start_urls = ['http://www.nyhistory.org/programs/upcoming-public-programs',]

    rules = [Rule(LinkExtractor(allow='.*?page=.*',restrict_xpaths='//li[@class="pager-next"]'), follow=True),
             Rule(LinkExtractor(restrict_xpaths='//div[@class="view-content"]/div[contains(@class,"views-row")]'), callback='parse_event_details',follow=True),
             ]

    def parse_event_details(self, response):

        base_url = 'http://www.nyhistory.org'

        item = EventItem()
        item['title'] = response.xpath('//div[@class="views-field-title"]//text()')[2].extract()
        item['eventWebsite'] = response.url

        details_area = response.xpath('//div[@class="body-programs"]')
        details_area_str = " ".join(details_area.extract())
        details_area_str_split = re.split('EVENT DETAILS|LOCATION|PURCHASING TICKETS', details_area_str)
        speakers_names_area = details_area_str_split[1]
        speakersNames = Selector(text=speakers_names_area).xpath('strong').extract()
        try:
            item['speaker1FirstName'] = speakersNames[0].split()[0]
            item['speaker1LastName'] = speakersNames[0].split()[1]
        except:
            item['speaker1FirstName'] = ''
            item['speaker1LastName'] = ''

        description = remove_tags(details_area_str_split[1]).strip()
        item['description'] = description

        try:
            address_line = remove_tags(details_area_str_split[2]).strip()
            item['location'] = address_line.split(',')[0]
            item['city'] = address_line.split(',')[-2].strip()
            item['state'] = address_line.split(',')[-1].split()[0]
            item['zip'] = address_line.split(',')[-1].split()[1]
            item['street'] = address_line.split(',')[1].strip()
        except:
            item['location'] = ''
            item['city'] = ''
            item['state'] = ''
            item['zip'] = ''
            item['street'] = ''

        try:
            item['dateFrom'] = self.date_converter(response.xpath('//span[@class="date-display-single"]/text()').extract_first(default='').rstrip(' - '))
        except:
            try:
                item['dateFrom'] = response.xpath('//span[@class="date-display-single"]/text()').extract()[1].split('|')[0]
            except:
                item['dateFrom'] = ''
        try:
            item['startTime'] = self.time_converter(response.xpath('//span[@class="date-display-start"]/text()')[1].extract())
            # item['endTime'] = self.time_converter(response.xpath('//span[@class="date-display-end"]/text()')[1].extract())
        except:
            try:
                item['startTime'] = self.time_converter(response.xpath('//span[@class="date-display-single"]/text()').extract()[1].split(' | ')[1])
            except:
                item['startTime'] = ''
        item['In_group_id'] = ''
        try:
            item['ticketUrl'] = base_url + response.xpath('//a[contains(@class,"btn-buy-tickets")]/@href').extract_first()
        except:
            item['ticketUrl'] = ''
        item['eventImage'] = response.xpath('//div[@class="views-field-field-speaker-photo-1"]/div/div/img/@src').extract_first(default='')
        item['organization'] = "New York Historical Society"

        yield item

    @staticmethod
    def date_converter(raw_date):
        try:
            raw_date_datetime_object = datetime.strptime(raw_date.replace(',',''), '%a %m/%d/%Y')
            final_date = raw_date_datetime_object.strftime('%d/%m/%Y')
            return final_date
        except:
            raw_date_datetime_object = datetime.strptime(raw_date.replace(',','').replace('th','').strip(), '%a %B %d %Y')
            final_date = raw_date_datetime_object.strftime('%d/%m/%Y')
            return final_date

    @staticmethod
    def time_converter(raw_time):
        raw_time_datetime_object = datetime.strptime(raw_time, '%I:%M %p')
        final_time = raw_time_datetime_object.strftime('%I:%M %p')
        return final_time

Best answer

When you use a CrawlSpider, the rules, as you mentioned, follow the corresponding links "until" you reach the items you actually want to get.

But how does the spider (or rather the rule) know when to stop? That is what the callback and follow attributes are for. If you use callback, you don't need follow (because callback says that link should be processed as a response), and if you use follow, you don't need callback, because it tells the spider to keep looking for new links instead.

You have to define your rules better, specifying which ones should only be followed and which ones should hand the response to a callback, as sketched below.
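For example, here is a minimal sketch of how the two rules could be separated, reusing the XPaths and the parse_event_details callback from the question (not re-verified against the live site): the pagination rule only follows, and the detail rule only calls back.

    rules = [
        # Pagination links: just follow them so every listing page gets crawled.
        Rule(LinkExtractor(allow=r'page=', restrict_xpaths='//li[@class="pager-next"]'),
             follow=True),
        # Event detail links: send each detail page to the item callback;
        # there is no need to follow further links from a detail page.
        Rule(LinkExtractor(restrict_xpaths='//div[@class="view-content"]/div[contains(@class,"views-row")]'),
             callback='parse_event_details',
             follow=False),
    ]

Whether this alone resolves the one-item-per-page behaviour depends on the page markup, but it makes the intent of each rule explicit.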

Regarding "python - Scrapy spider only crawls one link per page", a similar question was found on Stack Overflow: https://stackoverflow.com/questions/47963650/
