
python - Scrapy MultiCSVItemPipeline exports some empty items


I have multiple spiders with different items and I want to export each item to its own csv file. I used the code example from How can scrapy export items to separate csv files per item, but there is a problem.

Right now my spiders only write the 'page' items. All the items are populated in the shell, but the other files stay empty. I have debugged the pipeline, but so far I have not found the error.

This is my spider:

import csv

import scrapy
from BeautifulSoup import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule

from DataSpiders import CSV_PATH
from ScrapingItems import TrierDeItem
from SuperSpider import SuperSpider

HTML_PATH = 'pages/trier.de/'


class TrierDeSpider(scrapy.Spider, SuperSpider):
    name = 'trierDeSpider'

    allowed_domains = ['trier.de']
    denied_domains = []
    start_urls = [
        'https://www.trier.de/rathaus-buerger-in/trier-in-zahlen/',
        'https://trier.de/startseite/',
        'https://www.trier.de/leben-in-trier/',
        'https://www.trier.de/kultur-freizeit/',
        'https://www.trier.de/wirtschaft-arbeit/',
        'https://www.trier.de/bildung-wissenschaft/',
        'https://www.trier.de/bauen-wohnen/',
        'https://www.trier.de/umwelt-verkehr/',
    ]
    # Set the starting point for the spider and start crawling from start_urls
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse', follow=True),)

    def parse(self, response):
        """
        Parse the page body for links. Follow links within the allowed domains by yielding
        new requests and parse each of those pages with the callback parse_page.
        :param response:
        :return:
        """
        for link in LxmlLinkExtractor(allow=self.allowed_domains, deny=self.denied_domains).extract_links(response):
            yield scrapy.Request(response.urljoin(link.url), callback=self.parse_page)

    def parse_page(self, response):
        """
        Parse the current page for information.
        :param response:
        :return:
        """
        trier_de_item = TrierDeItem()
        yield self.parse_general_page_info(response, HTML_PATH)
        # extract the page url
        trier_de_item["url"] = response.url
        # extract the crawling datetime
        trier_de_item["crawling_date_time"] = response.headers['Date']
        # extract page title
        trier_de_item["title"] = response.css('title::text').extract()
        # extract description tags
        trier_de_item["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        trier_de_item["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        # extract all page headers
        trier_de_item["news_title"] = response.xpath('//div[@class="dachzeile"]/text()').extract()
        # extract topic
        trier_de_item["topic"] = response.xpath('//div[@class="topic"]/text()').extract()
        # extract headlines
        trier_de_item['headlines'] = response.xpath('//h1/text()').extract()

        # check if the page contains a table
        table = response.xpath('//table[@class="datentabelle"]').extract()
        if len(table) > 0:
            self.parse_table(response.body, trier_de_item['headlines'][0])
        yield trier_de_item

    @staticmethod
    def parse_table(body_html, title):
        '''
        Parse an HTML page containing a table and save the table to a csv file
        :param body_html:
        :param title:
        :return:
        '''
        title = title.replace('/', '')
        try:
            # Create the filename from the title
            filename = title + '.csv'
            soup = BeautifulSoup(body_html)
            soup.prettify('utf-8')
            content = []
            # find all tables in the html
            tables = soup.findAll('table')
            for table in tables:
                # find each table row
                for row in table.findAll('tr'):
                    # extract each table header and cell and collect the text into a line
                    line = []
                    for header in row.findAll('th'):
                        if ' ' in header.text:
                            line.append('')
                        else:
                            line.append(header.text)
                    for cell in row.findAll('td'):
                        if ' ' in cell.text:
                            line.append('')
                        else:
                            line.append(cell.text)
                    content.append(line)
            # Open a new csv file and write each line to the file
            with open(CSV_PATH + filename, 'wb') as csv_file:
                wr = csv.writer(csv_file)
                for line in content:
                    wr.writerow(line)
        except Exception as e:
            print(e)

The SuperSpider:

import urlparse

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

from DataSpiders import write_html
from DataSpiders.ScrapingItems import PageItem, BaseItem

ALLOWED_FILE_TYPES = ('.pdf', '.csv', '.xls', '.xlsx')


class SuperSpider:
    def __init__(self):
        pass

    def url_join(self, urls, response):
        '''
        Join each URL with the response URL
        :param urls:
        :param response:
        :return:
        '''
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))

        return joined_urls

    def parse_general_page_info(self, response, HTML_PATH):
        page_item = PageItem()
        page_item["url"] = response.url
        # save the response body to disk
        if 'jsp' in response.url:
            url = response.url.split('.jsp')
            write_html(url[0], response.body, HTML_PATH)
        elif '?' in response.url:
            url = response.url.split('?')
            write_html(url[0], response.body, HTML_PATH)
        else:
            write_html(response.url, response.body, HTML_PATH)
        # Search for links that end in any of the allowed file types
        found_files = []
        domain = response.url.split('/')[2]
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith(ALLOWED_FILE_TYPES):
                link = urlparse.urljoin(domain, link)
                found_files.append(link)
        # extract all referring links
        extractor = LxmlLinkExtractor()
        linklist = []
        for link in extractor.extract_links(response):
            # links that contain a file in their url are added to 'found_files' for downloading
            if '?imgUid' in link.url:
                fullpath = link.url
                path = fullpath.split('.de')[1]
                found_files.append(urlparse.urljoin(domain, path))
            else:
                linklist.append(link.url)
        page_item["links"] = linklist
        # add all found files to the page_item
        page_item["file_urls"] = self.url_join(found_files, response)
        # extract page title
        page_item["title"] = response.css('title::text').extract()

        # extract all image urls
        relative_img_urls = response.css("img::attr(src)").extract()
        page_item["image_urls"] = self.url_join(relative_img_urls, response)

        return page_item

    def parse_base_page_information(self, response):
        baseItem = BaseItem()
        baseItem["url"] = response.url
        # extract page title
        baseItem["title"] = response.css('title::text').extract()
        baseItem["crawling_date_time"] = response.headers['Date']
        # extract description tags
        baseItem["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        baseItem["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        baseItem['headlines'] = response.xpath('//h1/text()').extract()
        return baseItem

The ScrapingItems:

from scrapy import Item, Field


class PageItem(Item):
    url = Field()
    title = Field()
    image_urls = Field()
    file_urls = Field()
    links = Field()


class BaseItem(Item):
    url = Field()
    title = Field()
    crawling_date_time = Field()
    description = Field()
    og_description = Field()
    headlines = Field()


class TrierDeItem(BaseItem):
    news_title = Field()
    tag = Field()
    topic = Field()

And the multi-CSV pipeline:

class MultiCSVItemPipeline(object):
    CSVPath = "csv_data/"
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVPath + name + '.csv', 'ab')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item


def item_type(item):
    '''
    Returns the scraping item name
    :param item:
    :return:
    '''
    return type(item).__name__.replace('Item', '').lower()
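
For clarity, item_type derives the lookup key from the item's class name, so the item classes above map onto the SaveTypes entries like this:

# Quick sanity check of the class-name to SaveTypes mapping
# (can be run in a scrapy shell or a small script):
print(item_type(PageItem()))     # -> 'page'
print(item_type(BaseItem()))     # -> 'base'
print(item_type(TrierDeItem()))  # -> 'trierde'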

I have not found a solution yet, but here are some things I already tried without success:

  • Yielding a list of items, which scrapy does not support
  • Yielding only one item and creating two separate parse methods for page_item and trier_item
  • Removing every SaveType except 'trierde'; the spider then wrote nothing at all

Given these attempts, I believe there is some error in the pipeline itself. I appreciate any help anyone can offer.

Additional information: before changing the pipeline to MultiCSV, I was able to save every item to csv.
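
That earlier pipeline is not shown here; a minimal sketch of such a single-exporter setup, assuming Scrapy's stock CsvItemExporter, would look roughly like this:

# Rough sketch of a single-CSV pipeline (an assumption; the original pre-MultiCSV
# pipeline is not part of the question). All items, regardless of type, go into one file.
from scrapy.exporters import CsvItemExporter


class SingleCSVItemPipeline(object):
    def open_spider(self, spider):
        self.file = open('items.csv', 'wb')        # CsvItemExporter expects a binary file object
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item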

Best Answer

After I could not get the problem with the Scrapy exporters solved, I decided to write my own exporter.

The following code may help anyone who wants to export multiple different items, from one or more spiders, into separate csv files. It has worked for me so far, but I am still checking the code for errors. Feel free to reply if you have any ideas for improvement.
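
The snippet assumes Python 2 (it calls reload(sys) and uses types.ListType) and reuses the item_type helper from the pipeline shown in the question. The imports it relies on look roughly like this; note that the dispatcher import path varies with the Scrapy version:

# Imports assumed by the pipeline below (not shown in the original answer)
import csv
import logging
import os
import types

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher  # on newer setups: from pydispatch import dispatcher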

class MultiCSVItemPipeline(object):
    # Subfolder path where the csv files are stored
    CSVPath = "csv_data/"
    # All allowed items
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']
    # List of csv files whose headers have already been checked
    CheckedHeaders = []

    def __init__(self):
        import sys
        reload(sys)
        sys.setdefaultencoding('utf8')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # Check if the csv files exist and create new ones if not
        for file in set(self.SaveTypes):
            f = open(self.CSVPath + file + '.csv', 'a+')
            f.close()

    def spider_closed(self, spider):
        # not needed anymore
        # [e.finish_exporting() for e in self.exporters.values()]
        # [f.close() for f in self.files.values()]
        pass

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            try:
                # Check if the csv file contains a header, but only for files that haven't been checked yet
                if what not in self.CheckedHeaders:
                    self.check_header(what, item)
                self.write_item_to_row(item, what)
            except Exception as e:
                logging.error("########################################################")
                logging.error("Error writing to " + what + ".csv file ")
                logging.error("Error Message: " + e.message)
                logging.error("Error Reason: " + e.reason)
                logging.error("Error Object: " + e.object)
                logging.error("########################################################")
        return item

    def write_item_to_row(self, item, what):
        """
        Write a single item as a row to the csv file
        :param item:
        :param what:
        :return:
        """
        ofile = open(self.CSVPath + what + '.csv', "ab")
        writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        item_dict = item.__dict__['_values']
        row = []
        for k in item_dict:
            d = item_dict[k]
            # If the field is not a list, join the element into a string; either way,
            # strip delimiters and encode as utf-8
            if not isinstance(d, types.ListType):
                value = ''.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            else:
                value = ','.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            row.append(value)
        writer.writerow(row)
        ofile.close()

    def check_header(self, what, item):
        """
        Check if the file contains the header elements and create them if missing
        :param what:
        :param item:
        :return:
        """
        try:
            with open(self.CSVPath + what + '.csv', 'ab+') as csvfile:
                writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                item_dict = item.__dict__['_values']
                # If the file is empty, create a new csv header
                if os.stat(self.CSVPath + what + '.csv').st_size == 0:
                    self.write_csv_header(item_dict, writer)
                else:
                    # Read the first row and check the header elements
                    read_csv = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                    first_row = read_csv.next()
                    # if not all headers are set in the csv file, print a warning
                    if not self.check_key_in_csv_header(item_dict, first_row):
                        # TODO: Add missing header to the csv file
                        logging.warning("Wrong headers for file " + what + ".csv")
                self.CheckedHeaders.append(what)
                csvfile.close()
            return True
        except Exception as e:
            logging.error(e.message)
            return False

    @staticmethod
    def write_csv_header(item_dict, writer):
        """
        Write the header of a csv file.
        The header is written from the keys of the scrapy item
        :param item_dict:
        :param writer:
        :return:
        """
        first_row = []
        for k in item_dict:
            # Join each key into a string, delete delimiters and encode as utf-8
            value = ''.join(k).replace('\t', '').replace('\n', '').encode('utf8')
            first_row.append(value)
        writer.writerow(first_row)

    @staticmethod
    def check_key_in_csv_header(item_dict, row):
        """
        Check, for each item key, whether it is contained in the first line of the csv.
        k (key) stands for each dictionary key of the scrapy item.
        :param item_dict:
        :param row:
        :return:
        """
        for k in item_dict:
            if k not in row:
                return False
        return True
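
Like any item pipeline, it still has to be enabled in the project settings. A minimal sketch, assuming the class lives in a module such as DataSpiders.pipelines (the dotted path is hypothetical; adjust it to the actual project layout):

# settings.py
ITEM_PIPELINES = {
    'DataSpiders.pipelines.MultiCSVItemPipeline': 300,
}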

Regarding python - Scrapy MultiCSVItemPipeline exports some empty items, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/49782869/
