gpt4 book ai didi

python - 使用 Scrapy 通过电子邮件发送项目和日志

转载 作者:太空狗 更新时间:2023-10-30 00:45:38 27 4
gpt4 key购买 nike

我试图让 Scrapy 在爬虫完成或中断时向我发送电子邮件。已经有一个用于发送统计信息的内置扩展,但我想把蜘蛛的错误作为 <spidername>-errors.log 附加,并把抓取到的项目作为 <spidername>-items.json 附加。

我已将回调连接到每个信号,但由于某种原因只有最后一个信号触发:

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    # Bug fix: the cStringIO module exports the class as ``StringIO``;
    # the original ``from cStringIO import cStringIO`` raised ImportError
    # and silently fell back to the slower pure-Python module every time.
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO  # pure-Python fallback (Python 2)
    except ImportError:
        from io import StringIO  # Python 3

class StatusMailer(object):
    """Scrapy extension that e-mails scraped items and spider errors.

    Collects every scraped item (one JSON document per line) and every
    spider error traceback into in-memory buffers, then mails them as
    attachments when the spider closes.
    """

    def __init__(self, recipients, mail, crawler):
        self.recipients = recipients
        self.mail = mail
        # One StringIO buffer per attachment file name, created lazily.
        self.files = defaultdict(StringIO)
        self.encoder = ScrapyJSONEncoder(crawler=crawler)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from crawler settings and hook the signals."""
        recipients = crawler.settings.getlist("STATUSMAILER_RCPTS")

        if not recipients:
            # No recipients configured: disable the extension entirely.
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        # JSON Lines format: one encoded item per line.
        self.files[spider.name + '.json'].write(self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '-errors.log'].write(failure.getTraceback() + '\n')

    def spider_closed(self, spider):
        """Mail all collected buffers as attachments.

        BUG FIX: each buffer must be rewound first -- without seek(0)
        the mail backend reads from the current (end) position and
        every attachment comes out empty.
        """
        for contents in self.files.values():
            contents.seek(0)

        return self.mail.send(
            to=self.recipients,
            subject="Crawler for %s finished" % spider.name,
            body="",
            attachs=[(name, 'text/plain', contents)
                     for name, contents in self.files.items()]
        )

有没有什么方法可以从 Scrapy 中访问导出的项目和蜘蛛的错误(可能在将这些消息打印到控制台之前制作某种钩子(Hook)来拦截这些消息)?

最佳答案

嗯,看起来问题比我想象的要简单得多。完成写入后,您必须“倒回”StringIO 实例:

def spider_closed(self, spider):
    """Rewind every in-memory buffer, then mail them all as attachments."""
    # The mail backend reads from the buffer's current position, so each
    # buffer must be rewound to the start or the attachments are empty.
    for buf in self.files.values():
        buf.seek(0)

    attachments = [
        (name, 'text/plain', buf) for name, buf in self.files.items()
    ]

    return self.mail.send(
        to=self.recipients,
        subject="Crawler for %s finished" % spider.name,
        body="",
        attachs=attachments
    )

对于任何感兴趣的人,这是我的电子邮件扩展:

import gzip
import datetime

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    # Bug fix: the cStringIO module exports the class as ``StringIO``;
    # the original ``from cStringIO import cStringIO`` raised ImportError
    # and silently fell back to the slower pure-Python module every time.
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO  # pure-Python fallback (Python 2)
    except ImportError:
        from io import StringIO  # Python 3

def format_size(size):
    """Render *size* (a byte count) as a human-readable string, e.g. '3.5 MB'.

    Bug fix: the original fell off the end of the loop and returned
    None for sizes of 1024 GB or more; those now format as TB.
    """
    for unit in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, unit)

        size /= 1024.0

    return "%3.1f TB" % size

class GzipCompressor(gzip.GzipFile):
    """In-memory gzip buffer used as a mail-attachment 'compressor'."""

    # Attachment metadata consumed by StatusMailer.spider_closed.
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        # Compress into an in-memory PlainCompressor buffer instead of a
        # real file on disk.
        super(GzipCompressor, self).__init__(fileobj=PlainCompressor(), mode='w')
        # Bypass GzipFile.read (which would decompress): expose the raw
        # compressed bytes accumulated in the wrapped buffer.
        # NOTE(review): relies on GzipFile keeping the wrapped buffer in
        # ``self.fileobj`` -- confirm for the Python version in use.
        self.read = self.fileobj.read

class PlainCompressor(StringIO):
    """No-op 'compressor': a StringIO with the attachment metadata and
    read-from-start behaviour the mailer expects."""

    # Attachment metadata consumed by StatusMailer.spider_closed.
    extension = ''
    mimetype = 'text/plain'

    def read(self, *args, **kwargs):
        # Always rewind before reading so the mail backend receives the
        # whole buffer regardless of the current write position.
        self.seek(0)

        return StringIO.read(self, *args, **kwargs)

    @property
    def size(self):
        # Number of characters written so far (used in the mail summary).
        return len(self.getvalue())

class StatusMailer(object):
    """Scrapy extension that mails scraped items, the request log and
    error tracebacks (optionally gzip-compressed) on spider close."""

    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        # Maps attachment file name -> compressor instance
        # (PlainCompressor or GzipCompressor), created lazily on first write.
        self.files = defaultdict(compressor)

        # Counters reported in the summary mail body.
        self.num_items = 0
        self.num_errors = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from settings; raises NotConfigured to disable."""
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            # Unknown compression scheme: disable rather than guess.
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        # NOTE(review): request_received was removed in later Scrapy
        # releases -- confirm this signal exists in the Scrapy version in use.
        crawler.signals.connect(instance.request_received, signal=signals.request_received)

        return instance

    def item_scraped(self, item, response, spider):
        # Items are written back-to-back with no separator between JSON
        # documents (unlike the earlier '\n'-separated variant above).
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item))
        self.num_items += 1

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def request_received(self, request, spider):
        self.files[spider.name + '.log'].write(str(request) + '\n')

    def spider_closed(self, spider, reason):
        """Compose and send the summary mail with all buffers attached."""
        files = []

        for name, compressed in self.files.items():
            # NOTE(review): gzip buffers are never close()d, so the gzip
            # trailer (CRC/length) is never written and '.gz' attachments
            # may be truncated archives -- confirm against gzip.GzipFile docs.
            files.append((name + compressed.extension, compressed.mimetype, compressed))

        try:
            # NOTE(review): self.files is a defaultdict, so this lookup can
            # never raise KeyError -- when no items were scraped it silently
            # creates an empty '-items.json' entry instead (the except branch
            # is dead code).
            size = self.files[spider.name + '-items.json'].size
        except KeyError:
            size = 0

        body='''Crawl statistics:

- Spider name: {0}
- Spider finished at: {1}
- Number of items scraped: {2}
- Number of errors: {3}
- Size of scraped items: {4}'''.format(
            spider.name,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )

将其添加到您的settings.py:

EXTENSIONS = {
    # 80 is the extension's order value within Scrapy's EXTENSIONS dict.
    'your_package.extensions.StatusMailer': 80
}

并配置它:

# Who receives the status mail; the extension disables itself if empty.
STATUSMAILER_RECIPIENTS = []
# 'gzip' compresses attachments; None/empty keeps them as plain text.
STATUSMAILER_COMPRESSION = 'gzip'
#STATUSMAILER_COMPRESSION = None

# SMTP transport settings consumed by scrapy.mail.MailSender.
MAIL_HOST = 'smtp.gmail.com'
MAIL_PORT = 587
MAIL_USER = ''
# NOTE(review): plain-text credential in settings -- keep out of version control.
MAIL_PASS = ''

关于python - 使用 Scrapy 通过电子邮件发送项目和日志,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/16260753/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com