
python - Scraper not scraping the field "Description"


I have a web scraper that was coded for me using Scrapy.

I would like to add one extra field scraped from the website the spider crawls.

The column heading "Description" is already created in the CSV output, but nothing is ever scraped into it.

# -*- coding: utf-8 -*-
import csv
import re

import scrapy
from pydispatch import dispatcher
from scrapy import signals
from scrapy.signalmanager import SignalManager


class Rapid7(scrapy.Spider):
    name = 'vulns'
    allowed_domains = ['rapid7.com']
    main_url = 'https://www.rapid7.com/db/?q=&type=nexpose&page={}'
    # start_urls = ['https://www.rapid7.com/db/vulnerabilities']
    keys = ['Published', 'CVEID', 'Added', 'Modified', 'Related', 'Severity', 'CVSS',
            'Created', 'Solution', 'References', 'Description', 'URL']

    def __init__(self):
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)

    def start_requests(self):
        for i in range(1, 10):
            url = self.main_url.format(i)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Skip listing pages that show an error message or contain no results.
        flag = True
        temp = response.xpath('//div[@class="vulndb__intro-content"]/p/text()').extract_first()
        if temp:
            if temp.strip() == 'An error occurred.':
                flag = False
        temp = [i for i in response.xpath('//*[@class="results-info"]/parent::div/p/text()').extract() if i.strip()]
        if len(temp) == 1:
            flag = False
        if flag:
            for article in response.xpath('//*[@class="vulndb__results"]/a/@href').extract():
                yield scrapy.Request(response.urljoin(article), callback=self.parse_article, dont_filter=True)

    def parse_article(self, response):
        item = dict()
        item['Published'] = item['Added'] = item['Modified'] = item['Related'] = item['Severity'] = item['Description'] = ''
        r = response.xpath('//h1[text()="Related Vulnerabilities"]/..//a/@href').extract()
        temp = response.xpath('//meta[@property="og:title"]/@content').extract_first()
        item['CVEID'] = ''
        # Pull the CVE id out of the page title.
        try:
            temp2 = re.search(r'(CVE-.*-\d*)', temp).groups()[0]
            if ":" in temp2:
                raise KeyError
        except:
            try:
                temp2 = re.search(r'(CVE-.*):', temp).groups()[0]
            except:
                temp2 = ''
        if temp2:
            item['CVEID'] = temp2.replace(': Important', "").replace(')', '')
        # Summary table (Published, Added, Modified, Severity, CVSS, ...).
        table = response.xpath('//section[@class="tableblock"]/div')
        for row in table:
            header = row.xpath('header/text()').extract_first()
            data = row.xpath('div/text()').extract_first()
            item[header] = data
        temp = [i for i in response.xpath('//div[@class="vulndb__related-content"]//text()').extract() if i.strip()]
        for ind, i in enumerate(temp):
            if "CVE" in i:
                temp[ind] = i.replace(' ', '')

        item['Related'] = ", ".join(temp) if temp else ""
        temp2 = [i for i in response.xpath('//h4[text()="Solution(s)"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['Solution'] = ", ".join(temp2) if temp2 else ''
        temp3 = [i for i in response.xpath('//h4[text()="References"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['References'] = ", ".join(temp3) if temp3 else ''
        temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['Description'] = ", ".join(temp4) if temp4 else ''
        item['URL'] = response.request.url
        # Emit the item with every expected key present, in a fixed order.
        new_item = dict()
        for key in self.keys:
            if key not in list(item.keys()):
                new_item[key] = ''
            else:
                new_item[key] = item[key]
        yield new_item

    def _close(self):
        print("Done Scraping")

Thanks.

"It looks like your post is mostly code; please add more details." Sorry. :(

Best Answer

Try replacing your temp4:

temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]

with:

temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/p/text()').extract() if i.strip()]

After <h4>Description</h4> there are no <ul><li> tags, only <p>.
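A minimal sketch of the corrected extraction inside parse_article, assuming the page layout described above (the exact rapid7.com markup may differ or have changed); the fallback to the old <ul><li> XPath is an assumption, not something confirmed by the question:

        # Description lives in <p> tags next to the <h4>Description</h4> heading,
        # unlike Solution(s)/References, which use <ul><li>.
        temp4 = [i for i in response.xpath(
            '//h4[text()="Description"]/parent::*/p/text()').extract() if i.strip()]
        if not temp4:
            # Assumed fallback in case some pages do use a list.
            temp4 = [i for i in response.xpath(
                '//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['Description'] = ", ".join(temp4) if temp4 else ''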

Regarding python - Scraper not scraping the field "Description", a similar question was found on Stack Overflow: https://stackoverflow.com/questions/59127552/
