
python - Find a word on a website and get links to the pages it is on


I want to crawl some websites and check whether the word "katalog" appears there. If it does, I want to retrieve the links to all tabs/subpages where the word occurs. Is that possible?
I tried to follow this tutorial, but the resulting wordlist.csv is empty, even though the word "katalog" does appear on the website.
https://www.phooky.com/blog/find-specific-words-on-web-pages-with-scrapy/

wordlist = [
    "katalog",
    "downloads",
    "download"
]

def find_all_substrings(string, sub):
    starts = [match.start() for match in re.finditer(re.escape(sub), string)]
    return starts

class WebsiteSpider(CrawlSpider):

    name = "webcrawler"
    allowed_domains = ["www.reichelt.com/"]
    start_urls = ["https://www.reichelt.com/"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    crawl_count = 0
    words_found = 0

    def check_buzzwords(self, response):

        self.__class__.crawl_count += 1

        crawl_count = self.__class__.crawl_count

        url = response.url
        contenttype = response.headers.get("content-type", "").decode('utf-8').lower()
        data = response.body.decode('utf-8')

        for word in wordlist:
            substrings = find_all_substrings(data, word)
            print("substrings", substrings)
            for pos in substrings:
                ok = False
                if not ok:
                    self.__class__.words_found += 1
                    print(word + ";" + url + ";")
        return Item()

    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
How can I find all instances of a word on a website and get the links to the pages where the word occurs?

Best Answer

The main problem is the incorrect allowed_domains - it must not include the path /:

    allowed_domains = ["www.reichelt.com"]
Another problem may be that the tutorial is already 3 years old (it links to the Scrapy 1.5 documentation, while the latest version is 2.5.0).
It also contains some useless lines of code.
It gets contenttype but never uses it to decode response.body. Your URL uses iso8859-1 for the original language and utf-8 for ?LANGUAGE=PL - but you can simply use response.text, which decodes automatically.
It also sets ok = False and then checks it, which is completely useless.
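
As a small aside on the encoding point (a self-contained sketch, not part of the original answer): Scrapy detects the page charset from the headers or <meta> tags itself, which is why response.text is enough here.

from scrapy.http import HtmlResponse

# offline demonstration: a body encoded in iso8859-1, declared in its <meta> tag
html = '<html><head><meta charset="iso-8859-1"></head><body>katalog für downloads</body></html>'
response = HtmlResponse(url='https://www.reichelt.com/', body=html.encode('iso-8859-1'))

print(response.encoding)   # the charset Scrapy detected for this page
print(response.text)       # already a decoded str - no response.body.decode() needed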

Minimal working code - you can copy it into a single file and run it as python script.py without creating a project.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import re

wordlist = [
    "katalog",
    "catalog",
    "downloads",
    "download",
]

def find_all_substrings(string, sub):
    return [match.start() for match in re.finditer(re.escape(sub), string)]

class WebsiteSpider(CrawlSpider):

    name = "webcrawler"

    allowed_domains = ["www.reichelt.com"]
    start_urls = ["https://www.reichelt.com/"]
    #start_urls = ["https://www.reichelt.com/?LANGUAGE=PL"]

    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    #crawl_count = 0
    #words_found = 0

    def check_buzzwords(self, response):
        print('[check_buzzwords] url:', response.url)

        #self.crawl_count += 1

        #content_type = response.headers.get("content-type", "").decode('utf-8').lower()
        #print('content_type:', content_type)
        #data = response.body.decode('utf-8')

        data = response.text

        for word in wordlist:
            print('[check_buzzwords] check word:', word)
            substrings = find_all_substrings(data, word)
            print('[check_buzzwords] substrings:', substrings)

            for pos in substrings:
                #self.words_found += 1
                # only display
                print('[check_buzzwords] word: {} | pos: {} | sub: {} | url: {}'.format(word, pos, data[pos-20:pos+20], response.url))
                # send to file
                yield {'word': word, 'pos': pos, 'sub': data[pos-20:pos+20], 'url': response.url}

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(WebsiteSpider)
c.start()

EDIT:
I added data[pos-20:pos+20] to the yielded data to show where the substring was found. Sometimes it is inside a URL, like .../elements/adw_2018/catalog/..., or somewhere else, like <img alt="catalog" - so using regex is not necessarily a good idea. It might be better to use an xpath or css selector to search for the text only in specific places or in links.
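
For example, a hedged sketch of such a callback (assuming the same WebsiteSpider and wordlist as above) that searches only text nodes and link hrefs instead of the raw HTML:

    def check_buzzwords(self, response):
        for word in wordlist:
            # visible text nodes containing the word (skips tag names and attributes)
            texts = response.xpath('//body//text()[contains(., $w)]', w=word).getall()
            # hrefs of links containing the word
            hrefs = response.xpath('//a[contains(@href, $w)]/@href', w=word).getall()
            for match in texts + hrefs:
                yield {'word': word, 'match': match.strip(), 'url': response.url}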

EDIT:
A version that searches for links containing the words from the list. It uses response.xpath to find all links and then checks whether the word is in the href - so it does not need regex.
The problem may be that it treats a link with -downloads- (with an s) as a link both for the word download and for downloads, so it would need a more complex check (e.g. using regex) to treat it only as a link for the word downloads (see the word-boundary sketch after the code below).
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

wordlist = [
    "katalog",
    "catalog",
    "downloads",
    "download",
]

class WebsiteSpider(CrawlSpider):

    name = "webcrawler"

    allowed_domains = ["www.reichelt.com"]
    start_urls = ["https://www.reichelt.com/"]

    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    def check_buzzwords(self, response):
        print('[check_buzzwords] url:', response.url)

        links = response.xpath('//a[@href]')

        for word in wordlist:

            for link in links:
                url = link.attrib.get('href')
                if word in url:
                    print('[check_buzzwords] word: {} | url: {} | page: {}'.format(word, url, response.url))
                    # send to file
                    yield {'word': word, 'url': url, 'page': response.url}

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(WebsiteSpider)
c.start()
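
As mentioned above, a plain `word in url` check cannot tell download and downloads apart. One possible whole-word check (a sketch, assuming word-boundary matching is what is wanted; the paths below are just illustrative) uses a regex instead:

import re

def href_has_word(url, word):
    # \b is a word boundary, so "download" matches ".../download/" but not ".../downloads/"
    return re.search(r'\b{}\b'.format(re.escape(word)), url) is not None

print(href_has_word('/elements/adw_2018/downloads/', 'download'))        # False
print(href_has_word('/elements/adw_2018/download/file.pdf', 'download')) # True

# inside check_buzzwords, replace `if word in url:` with `if href_has_word(url, word):`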

Regarding "python - Find a word on a website and get links to the pages it is on", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/68193300/
