gpt4 book ai didi

python - InvalidSchema(无效的 URL 协议): No connection adapters were found for

转载 作者:行者123 更新时间:2023-12-01 03:00:34 25 4
gpt4 key购买 nike

我在 python 2.7 中使用 beautiful soup 和 requests 包来抓取网络新闻。当我调试下面的代码时,我收到错误。

#encoding:utf-8

import re
import socket
import requests
import httplib
import urllib2
from bs4 import BeautifulSoup

#headers = ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')
# Fetch the site's home page once, when the module is loaded. Crawler.getNextUrls()
# parses `response.text` from this module-level object to find article links.
response = requests.get('http://www.mhi.com.my/')

class Crawler(object):
    """Collect links to ``.html`` pages on www.mhi.com.my.

    The start URL is stored on the instance; link extraction itself is done
    on the module-level ``response`` object (the home page fetched when the
    module is loaded).
    """

    def __init__(self, url):
        """Remember the start URL.

        url: address that ``getNextUrls`` requests with urllib2.
        """
        self.url = url

    def getNextUrls(self):
        """Return the ``<a>`` elements whose href matches the site's .html pattern.

        Returns a list of BeautifulSoup ``<a>`` elements, NOT URL strings.
        """
        urls = []
        request = urllib2.Request(self.url)
        request.add_header('User-Agent',
                           'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')
        try:
            # NOTE(review): the result of this request is never used below;
            # parsing is done on the module-level `response` instead.
            html = urllib2.urlopen(request)
        except (socket.timeout, urllib2.URLError, httplib.BadStatusLine):
            # Best-effort fetch: these failures are deliberately ignored.
            pass
        # Parse the home page text that was fetched at module load.
        soup = BeautifulSoup(response.text, 'lxml')
        # Raw string so the backslashes reach the regex engine unchanged.
        pattern = r'http://www\.mhi\.com\.my/.*\.html'
        links = soup.find_all('a', href=re.compile(pattern))
        for link in links:
            # NOTE: this appends the whole <a> element rather than its href
            # string. It is kept as posted because this is the behaviour the
            # question and traceback on this page are about; the accepted
            # answer replaces it with urls.append(link.get('href')).
            urls.append(link)
        return urls

def getNews(url):
    """Print ``url``, fetch it, and return the text of its ``p.para`` elements.

    url: address of the news page. It is passed straight to ``requests.get``,
        so it has to be a URL string; the traceback on this page shows what
        happens when an ``<a>`` element is passed instead.

    Returns the concatenated text of every element matched by the CSS
    selector ``p.para``.
    """
    print(url)
    xinwen = ''
    request = requests.get(url)
    # NOTE(review): `request` here is what requests.get() returned, not a
    # urllib2.Request; the add_header/urlopen calls below look like leftovers
    # from a urllib2-based version and presumably fail once a valid URL gets
    # this far - confirm before relying on this function.
    request.add_header('User-Agent',
                       'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')
    try:
        html = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        # NOTE(review): after this branch `html` is unbound, so the
        # BeautifulSoup call below raises instead of returning ''.
        print(e.code)

    soup = BeautifulSoup(html, 'html.parser')
    for news in soup.select('p.para'):
        xinwen += news.get_text().decode('utf-8')
    return xinwen

class News(object):
    """Plain record describing one news item.

    title: title of the news
    time: published time of the news
    content: content of the news
    type: type of the news

    The original notes also listed a ``source`` ("from where") field, but the
    constructor neither accepts nor stores one.
    """

    def __init__(self, title, time, content, type):
        """Store the four fields unchanged as instance attributes."""
        self.title = title
        self.time = time
        self.content = content
        # `type` shadows the builtin inside this method only; the parameter
        # name is kept so keyword callers keep working.
        self.type = type

# Driver: crawl the home page and append the text of every linked news page
# to the output file, one page per line.
# The handle is opened with `with` so it is closed even when getNews() raises
# (as it does in the traceback on this page). The name `file` is kept to
# match that traceback, although it shadows the Python 2 builtin.
with open('C:/MyFold/kiki.json', 'a') as file:
    url = "http://www.mhi.com.my"
    print(url)
    s = Crawler(url)
    for newsUrl in s.getNextUrls():
        file.write(getNews(newsUrl))
        file.write("\n")
        print("---------------------------")

这是返回的错误信息:

C:\Python27\python.exe C:/MyFold/CodeTest/file1.py
http://www.mhi.com.my
Traceback (most recent call last):
File "C:/MyFold/CodeTest/file1.py", line 74, in <module>
file.write(getNews(newsUrl))
File "C:/MyFold/CodeTest/file1.py", line 42, in getNews
request = requests.get(url)
File "C:\Python27\lib\site-packages\requests\api.py", line 70, in get
return request('get', url, params=params, **kwargs)
File "C:\Python27\lib\site-packages\requests\api.py", line 56, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 488, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 603, in send
adapter = self.get_adapter(url=request.url)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 685, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for '<a class="glow" href="http://www.mhi.com.my/akhbar2016.html" style="text-decoration: none;"></a>'
<a class="glow" href="http://www.mhi.com.my/akhbar2016.html" style="text-decoration: none;"></a>

是我的循环有问题吗?谁能帮我解决一下吗?

最佳答案

在您的 Crawler 类中,函数 getNextUrls() 返回的是一个 <a> 元素列表:

[<a class="glow" href="http://www.mhi.com.my/akhbar2016.html" style="text-decoration: none;"></a>]

当您遍历这个列表时,会把整个 <a> 元素传给函数 getNews,但该函数的参数应该是一个 URL 字符串。

您可以修改函数 getNextUrls():

把

urls.append(link)

改为

urls.append(link.get('href'))

这样函数 getNextUrls 返回的就是 URL 字符串列表,而不是 <a> 元素列表:

['http://www.mhi.com.my/akhbar2016.html']

关于python - 无效架构 : No connection adapters were found for,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43882033/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com