gpt4 book ai didi

python - 在scrapy中使用mysql插入数据

转载 作者:太空宇宙 更新时间:2023-11-04 06:18:30 25 4
gpt4 key购买 nike

这是我的demo,我认为没有错误但是这段代码不能向mysql插入数据!?

设置.py

这是设置代码:

BOT_NAME = 'Scan'
SPIDER_MODULES = ['scan.spiders']
#NEWSPIDER_MODULE = 'scan.spiders'
ITEM_PIPELINES = ['scan.pipelines.MySQLStorePipeline']

这是管道代码,我认为它没有错误:

管道.py

from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.contrib.pipeline.images import ImagesPipeline
import datetime
import MySQLdb
import MySQLdb.cursors


class MySQLStorePipeline(object):

def __init__(self):
self.db = adbapi.ConnectionPool('MySQLdb',
db = 'spider',
host='localhost',
user = 'root',
passwd = '123456',
cursorclass = MySQLdb.cursors.DictCursor,
charset = 'utf8',
use_unicode = True
)

def process_item(self, item, spider):
query = self.db.runInteraction(self._conditional_insert, item)
query.addErrback(self.handle_error)
return item

def _conditional_insert(self, tx, item):
if item.get('url'):
tx.execute(\
"insert into spider (url) "
"values (%s)",(item['link'])
)
#log.msg("Item stored in db: %s" % item, level=log.DEBUG)
def handle_error(self, e):
log.err(e)

这是蜘蛛模块,我认为它没有错误!

蜘蛛.py

# coding=utf-8
from urlparse import urljoin
import simplejson

from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scan.items import ScanItem

class ScanSpider(CrawlSpider):
name = 'Scan'
allowed_domains = ["a.com"]
start_urls = [
"http://www.a.com",
]
rules = (
Rule(SgmlLinkExtractor(allow=(r'http://(.*?)'),deny_domains=(r'qq.com'))
),
Rule(SgmlLinkExtractor(allow=(r'http://www.a.com')), callback="parse_item"),
)

def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = ScanItem()
items = []
#item['title'] = hxs.select('//title/text()').extract()
item['url'] = hxs.select('//a[@href]').re('(\"http://(.*?)\")').extract()
items.append(item)
return items
SPIDER = ScanSpider()

最佳答案

你真的需要读出来http://doc.scrapy.org ,您缺少非常基础的东西。

def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = ScanItem()
items = []
#item['title'] = hxs.select('//title/text()').extract()
item['url'] = hxs.select('//a[@href]').re('(\"http://(.*?)\")').extract()
items.append(item)
return items

在你的 parse_items 中不需要在列表的项目中添加项目你可以简单地返回 item 就像

    def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = ScanItem()
#item['title'] = hxs.select('//title/text()').extract()
item['url'] = hxs.select('//a[@href]').re('(\"http://(.*?)\")').extract()
return item

记住 item['url'] 有一个 urls 的列表

在你的 MySQLStorePipeline

def _conditional_insert(self, tx, item):
if item.get('url'):
tx.execute(\
"insert into spider (url) "
"values (%s)",(item['link'])
)

你试图在数据库中插入 item['link'] 而你从未填充 item['link'] 而只是 item['url' ]

关于python - 在scrapy中使用mysql插入数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/13952731/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com