gpt4 book ai didi

mysql - scrapy 和 mysql

转载 作者:行者123 更新时间:2023-11-29 06:52:00 25 4
gpt4 key购买 nike

我试图让 scrapy 将爬取的数据插入到 mysql 中,我的代码可以很好地爬取并收集缓冲区中的数据,不会出错,但数据库永远不会更新。'运气不好','没有错误'

管道.py

from twisted.enterprise import adbapi
import datetime
import MySQLdb.cursors

class SQLStorePipeline(object):

def __init__(self):
self.dbpool = adbapi.ConnectionPool('MySQLdb', db='craigs',
user='bra', passwd='boobs', cursorclass=MySQLdb.cursors.DictCursor,
charset='utf8', use_unicode=True)

def process_item(self, items, spider):
# run db query in thread pool
query = self.dbpool.runInteraction(self._conditional_insert, items)
query.addErrback(self.handle_error)

return items

def _conditional_insert(self, tx, items):
# create record if doesn't exist.
# all this block run on it's own thread
tx.execute("select * from scraped where link = %s", (items['link'][0], ))
result = tx.fetchone()
if result:
log.msg("Item already stored in db: %s" % items, level=log.DEBUG)
else:
tx.execute(\
"insert into scraped (posting_id, email, location, text, title) "
"values (%s, %s, %s, %s, %s)",
(items['posting_id'][0],
items['email'][1],
items['location'][2],
items['text'][3],
items['title'][4],
)

)
log.msg("Item stored in db: %s" % items, level=log.DEBUG)

def handle_error(self, e):
log.err(e)

抓取代码

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigs.items import CraigsItem

class MySpider(CrawlSpider):
name = "craigs"
f = open("urls.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
rules = [Rule(SgmlLinkExtractor(restrict_xpaths=('/html/body/blockquote[3]/p/a',)), follow=True, callback='parse_profile')]

def parse_profile(self, response):
items = []
img = CraigsItem()
hxs = HtmlXPathSelector(response)
img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
items.append(img)
return items[0]
return img[0]

设置.py

BOT_NAME = 'craigs' 
BOT_VERSION = '1.0'
SPIDER_MODULES = ['craigs.spiders']
NEWSPIDER_MODULE = 'craigs.spiders'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)

最佳答案

管道代码根本没有被调用的原因是因为它还没有被激活。根据 Item Pipelines page in the documentation,通过向 settings.py 添加一个新部分来完成此激活。 .例如

ITEM_PIPELINES = [
'craigs.pipeline.SQLStorePipeline',
]

此外,您的parse_profile 函数应该只返回img。如果单个响应页面会导致多个项目,您只需添加一个项目列表以返回。

关于mysql - scrapy 和 mysql,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14785994/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com