
python - Why doesn't Scrapy store data into MongoDB?


My main file:

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from urlparse import urljoin  # needed for the relative-link handling in parse_start_url


class Product(scrapy.Item):
    brand = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    name = scrapy.Field()
    date = scrapy.Field()
    heading = scrapy.Field()
    data = scrapy.Field()
    Model_name = scrapy.Field()


class aqaqspider(CrawlSpider):
    name = "mouth_shut_new"
    allowed_domains = ["mouthshut.com"]
    start_urls = [
        "http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=('.*\-page-.*',)),
             callback="parse_start_url",
             follow=True),
    )

    def parse_start_url(self, response):
        products = response.xpath('//div[@id="allreviews"]/ul/li')
        if not products:
            raise CloseSpider("No more products!")

        for product in products:
            item = Product()
            # item['Model_name'] = product.xpath('/html/body/form/div[12]/div/div[5]/div/div[1]/div[3]/ul/li[1]/h1/a/span/text()').extract()
            item['name'] = product.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = product.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield old_item

        # yield Request(url="http://www.mouthshut.com/Product/mobileListing.aspx?cid=925602729&f1=1&view=list&nsort1=0&nsort2=2015-06-01%2016:12:23.000&ntype=3&mpad=1&ran=0.3691624044781373&dcal=Intex%20Aqua%20Xtreme",
        #               headers={"Referer": "http://www.mouthshut.com/mobile-phones.php", "X-Requested-With": "XMLHttpRequest"},
        #               callback=self.parse,
        #               dont_filter=True)

My settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for mouth project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'mouth'

SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'


ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline':300}

MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "mobiles_complaints" # Change in prod
MONGODB_COLLECTION = "Yu_Yureka"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'

My pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import pymongo
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

def process_item(self, item, spider):
    self.collection.insert(dict(item))
    log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
        settings['MONGODB_DATABASE'],
        settings['MONGODB_COLLECTION'],
        settings['MONGODB_HOST'],
        settings['MONGODB_PORT']))
    return item

I ran scrapy crawl mouth_shut_new, but my data is not being stored in the database. The output should show that the data was written to Mongo, along with the collection name. What am I missing?

Best Answer

The process_item() method is indented incorrectly: it sits at module level instead of inside the class, so Scrapy never finds a process_item() on the pipeline and items pass through without being written. It should be:

class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item

Regarding "python - Why doesn't Scrapy store data into MongoDB?", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/30701704/
