gpt4 book ai didi

python - Scrapy 异常 - exceptions.AttributeError : 'unicode' object has no attribute 'select'

转载 作者:太空狗 更新时间:2023-10-30 00:50:51 27 4
gpt4 key购买 nike

我写了一个爬虫(spider),但每次运行它时,都会收到下面这个错误:

Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 607, in _tick
taskObj._oneWorkUnit()
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield it.next()
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 28, in process_spider_output
for x in result:
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/vaibhav/scrapyprog/comparison/eScraperInterface/eScraper/spiders/streetstylestoreSpider.py", line 38, in parse
item['productURL'] = site.select('.//a/@href').extract()
exceptions.AttributeError: 'unicode' object has no attribute 'select'

我的代码是:

from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------

# Spider for streetstylestore.com: parse() reads the product list on each
# category page in start_urls, then requests every product URL and lets
# parsePage2() fill in the remaining item fields from the product page.
class ESpider(CrawlSpider):

name = "streetstylestoreSpider"
allowed_domains = ["streetstylestore.com"]

# Category listing pages used as crawl entry points.
start_urls = [
"http://streetstylestore.com/index.php?id_category=16&controller=category",
"http://streetstylestore.com/index.php?id_category=46&controller=category",
"http://streetstylestore.com/index.php?id_category=51&controller=category",
"http://streetstylestore.com/index.php?id_category=61&controller=category",
"http://streetstylestore.com/index.php?id_category=4&controller=category"
]


# Callback for a category page: builds one EscraperItem per product <li>
# and yields a Request for the product page with the item in request.meta.
def parse(self, response):

# NOTE: items is appended to below but never returned or read again.
items = []
hxs = HtmlXPathSelector(response)
# NOTE: the trailing .extract() turns every matched <li> into a unicode
# string, so `site` in the loop is plain text and has no .select() method.
# This is what raises the AttributeError shown in the traceback.
sites = hxs.select('//ul[@id="product_list"]/li').extract()

for site in sites:

item = EscraperItem()
item['currency'] = 'INR'
item['productSite'] = ["http://streetstylestore.com"]
item['productURL'] = site.select('.//a/@href').extract()
item['productImage'] = site.select('.//a/img/@src').extract()
item['productTitle'] = site.select('.//a/@title').extract()
# NOTE(review): both price queries run on hxs (the whole response), not on
# site, so they are not limited to the current <li> - confirm intent.
# Each text is stripped, cut after the last 'Rs' and has commas removed.
productMRP = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//span[@class="old_price"]/text()').extract()]
productPrice = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//p[@class="price"]/text()').extract()]
item['productPrice'] = productMRP + productPrice

items.append(item)
# Follow the first extracted link; an empty productURL list would raise
# IndexError here.
secondURL = item['productURL'][0]
request = Request(secondURL,callback=self.parsePage2)
# Hand the partially filled item over to parsePage2 via request.meta.
request.meta['item'] = item
yield request


# Callback for a product page: completes the item received through
# response.meta['item'] and returns it.
def parsePage2(self, response):

temp = []
item = response.meta['item']
hxs = HtmlXPathSelector(response)

# Keep only the detail texts containing 'In Stock ' (note the trailing space).
availability = [i for i in hxs.select('//div[@class="details"]/p/text()').extract() if 'In Stock ' in i]

if availability:
item['availability'] = True
else:
item['availability'] = False

# A non-empty match for the attribute_list div marks the item as having variants.
hasVariants = hxs.select('//div[@class="attribute_list"]').extract()

if hasVariants:
item['hasVariants'] = True
else:
item['hasVariants'] = False

# Breadcrumb link texts: first entry is used as category, second as sub-category.
category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()
if category:
productCategory = [category[0]]
# NOTE: category is already non-empty here, so len(category) >= 1 is always
# true; with exactly one breadcrumb link category[1] raises IndexError and
# the else branch directly below is never reached.
if len(category) >= 1:
productSubCategory = [category[1]]
else:
productSubCategory = ['']
else:
productCategory = ['']
productSubCategory = ['']

item['productCategory'] = productCategory
item['productSubCategory'] = productSubCategory

# Collect the thumbnail URLs with "medium" replaced by "large".
for i in hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract():
temp.append(i.replace("medium","large"))

# Description is the short-description paragraph texts joined with spaces.
item['productDesc'] = " ".join([i for i in hxs.select('//div[@id="short_description_content"]/p/text()').extract()])
# Listing image(s) + thumbnail srcs + thumbnail link hrefs + the "large" variants.
item['productImage'] = item['productImage'] + hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract() + hxs.select('//div[@id="thumbs_list"]/ul/li/a/@href').extract() + temp
# De-duplicate through a set; the resulting order is not the collection order.
item['image_urls'] = list(set(item['productImage']))

return item

谁能告诉我我的代码有什么问题...

最佳答案

不要对要存入 sites 的结果调用 .extract() —— extract() 返回的是文本(unicode 字符串),而此时您还不需要文本,需要的是可以继续调用 .select() 的选择器对象。这一行...

sites = hxs.select('//ul[@id="product_list"]/li').extract()

...应该是这样的:

sites = hxs.select('//ul[@id="product_list"]/li')

关于python - Scrapy 异常 - exceptions.AttributeError : 'unicode' object has no attribute 'select' ,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/17268175/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com