gpt4 book ai didi

python - AttributeError : 'str' object has no attribute 'findAll' , 使用 BeautifulSoup 从 Youtube 抓取数据时无输出

转载 作者:太空宇宙 更新时间:2023-11-04 01:50:31 26 4
gpt4 key购买 nike

我正在尝试从 YouTube 上抓取热门视频,但无法从 YouTube 页面获取 href 标签。我的代码和预期输出如下。如果改用 url = 'https://www.youtube.com/watch?v=tL8AOS9ZRMg'(单个视频的地址),并把 for link in youtubelinks: 这一部分注释掉、再修正缩进,就能正确提取数据。需要导入 BeautifulSoup、urllib、ssl、os。我想把结果转换为 JSON 格式并保存。

我的代码如下

import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import ast
import json
import os
from urllib.request import Request, urlopen

# For ignoring SSL certificate errors
# NOTE(review): ctx is configured here but is never passed to urlopen() below,
# so these settings do not appear to affect the request.

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Input from user

#url = input('Enter Youtube Video Url- ')
#url = 'https://www.youtube.com/watch?v=MxnkDj8PIxQ'
url = 'https://www.youtube.com/feed/trending'

# Making the website believe that you are accessing it using a mozilla browser
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()

# Creating a BeautifulSoup object of the html page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')
video_details = {}
other_details = {}

# All the trending youtube links: the first three matching anchors,
# de-duplicated through a set and turned into absolute URLs.
anchors = soup.select('a[href^="/watch?v="]')[:3]
youtubelinks = list({"https://www.youtube.com" + anchor['href'] for anchor in anchors})

for link in youtubelinks:
    # NOTE: `link` is a plain URL string here, not a parsed page. The first
    # findAll() call below is what raises
    # "AttributeError: 'str' object has no attribute 'findAll'".
    for span in link.findAll('span', attrs={'class': 'watch-title'}):
        # Store text, not bytes: json.dump() cannot serialise bytes objects.
        video_details['TITLE'] = span.text.strip()

    for script in link.findAll('script', attrs={'type': 'application/ld+json'}):
        channel_description = json.loads(script.text.strip())
        video_details['CHANNEL_NAME'] = channel_description['itemListElement'][0]['item']['name']

    for div in link.findAll('div', attrs={'class': 'watch-view-count'}):
        video_details['NUMBER_OF_VIEWS'] = div.text.strip()

    for button in link.findAll('button', attrs={'title': 'I like this'}):
        video_details['LIKES'] = button.text.strip()

    for button in link.findAll('button', attrs={'title': 'I dislike this'}):
        video_details['DISLIKES'] = button.text.strip()

    for span in link.findAll('span', attrs={'class': 'yt-subscription-button-subscriber-count-branded-horizontal yt-subscriber-count'}):
        video_details['NUMBER_OF_SUBSCRIPTIONS'] = span.text.strip()

    hashtags = []
    for span in link.findAll('span', attrs={'class': 'standalone-collection-badge-renderer-text'}):
        for a in link.findAll('a', attrs={'class': 'yt-uix-sessionlink'}):
            hashtags.append(a.text.strip())
    video_details['HASH_TAGS'] = hashtags

# Save the prettified trending page (bytes, hence binary mode).
with open('output_file.html', 'wb') as file:
    file.write(html)

# Save the collected details as UTF-8 JSON.
with open('data.json', 'w', encoding='utf8') as outfile:
    json.dump(video_details, outfile, ensure_ascii=False, indent=4)

print('----------Extraction of data is complete. Check json file.----------')

我的预期结果

{ 1. {
"TITLE": "A",
"CHANNEL_NAME": "B",
"NUMBER_OF_VIEWS": "8,945 views",
"LIKES": "71",
"DISLIKES": "6",
"NUMBER_OF_SUBSCRIPTIONS": "13.3K",
"HASH_TAGS": [
"#A",
"#B",
"#C"
]
}

2.{
"TITLE": "D",
"CHANNEL_NAME": "E",
"NUMBER_OF_VIEWS": "8,945 views",
"LIKES": "71K",
"DISLIKES": "6K",
"NUMBER_OF_SUBSCRIPTIONS": "1.3M",
"HASH_TAGS": [
"#M",
"#F",
"#G"
]
}
}

最佳答案

您收到该错误的原因是:youtubelinks 中的每个 link 只是一个 URL 字符串,您没有先请求该链接并把返回的页面解析成 BeautifulSoup 对象,而字符串本身没有 findAll 方法。我创建了一个函数 get_soup(url),并在循环中对每个链接调用它。

from bs4 import BeautifulSoup
import ssl
import json
import ast
import json
import os
from urllib.request import Request, urlopen

# Build an SSL context with hostname checking and certificate verification
# switched off, for ignoring SSL certificate errors.
# NOTE(review): nothing in the visible code passes ctx to urlopen(), so these
# settings appear to have no effect on the requests made below — confirm.

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def get_soup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup object.

    The request is sent with a 'Mozilla/5.0' User-Agent header and the
    response body is parsed with the 'html.parser' parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed page as a BeautifulSoup object.

    Raises:
        Any exception raised by urlopen() (for example
        urllib.error.URLError) is propagated to the caller.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urlopen(req) as response:
        webpage = response.read()
    return BeautifulSoup(webpage, 'html.parser')

url = 'https://www.youtube.com/feed/trending'
soup = get_soup(url)
html = soup.prettify('utf-8')

# NOTE(review): a single dict is shared by every video, so with more than one
# link only the values of the last video processed would survive.
video_details = {}

# All the trending youtube links (only the first matching anchor is kept).
youtubelinks = [
    "https://www.youtube.com" + anchor['href']
    for anchor in soup.select('a[href^="/watch?v="]')[:1]
]

for link in youtubelinks:
    # `link` is only a URL string; fetch and parse the video page before
    # searching it. This is the step whose absence caused the AttributeError.
    video_soup = get_soup(link)

    for span in video_soup.findAll('span', attrs={'class': 'watch-title'}):
        video_details['TITLE'] = span.text.strip()

    for script in video_soup.findAll('script', attrs={'type': 'application/ld+json'}):
        channel_description = json.loads(script.text.strip())
        video_details['CHANNEL_NAME'] = channel_description['itemListElement'][0]['item']['name']

    for div in video_soup.findAll('div', attrs={'class': 'watch-view-count'}):
        video_details['NUMBER_OF_VIEWS'] = div.text.strip()

    for button in video_soup.findAll('button', attrs={'title': 'I like this'}):
        video_details['LIKES'] = button.text.strip()

    for button in video_soup.findAll('button', attrs={'title': 'I dislike this'}):
        video_details['DISLIKES'] = button.text.strip()

    for span in video_soup.findAll('span', attrs={'class': 'yt-subscription-button-subscriber-count-branded-horizontal yt-subscriber-count'}):
        video_details['NUMBER_OF_SUBSCRIPTIONS'] = span.text.strip()

    hashtags = []
    for span in video_soup.findAll('span', attrs={'class': 'standalone-collection-badge-renderer-text'}):
        # NOTE(review): the inner search runs over the whole page rather than
        # over `span`, so every 'yt-uix-sessionlink' anchor text is collected,
        # not only hashtags. Kept as-is so the printed result stays the same;
        # searching within `span` would narrow it.
        for a in video_soup.findAll('a', attrs={'class': 'yt-uix-sessionlink'}):
            hashtags.append(a.text.strip())
    video_details['HASH_TAGS'] = hashtags

print(video_details)

# Save the prettified trending page (bytes, hence binary mode).
with open('output_file.html', 'wb') as file:
    file.write(html)

# Save the collected details as UTF-8 JSON.
with open('data.json', 'w', encoding='utf8') as outfile:
    json.dump(video_details, outfile, ensure_ascii=False, indent=4)

print('----------Extraction of data is complete. Check json file.----------')

输出:

{'LIKES': '11,114', 'CHANNEL_NAME': 'World Rugby', 'DISLIKES': '293', 'NUMBER_OF_SUBSCRIPTIONS': '614K', 'NUMBER_OF_VIEWS': '634,395 views', 'TITLE': 'HIGHLIGHTS: Japan v Ireland - Rugby World Cup 2019', 'HASH_TAGS': ['GB', '', 'Review', '#1 on Trending', '', 'World Rugby', 'Sign in', 'Sign in', 'Sign in', 'Sign in', 'https://youtube.com/user/worldrugby', 'https://youtube.com/user/worldrugby', 'http://www.rugbyworldcup.com', 'https://twitter.com/rugbyworldcup', 'https://www.facebook.com/rugbyworldcup', 'http://www.instagram.com/rugbyworldcup', 'http://giphy.com/worldrugby', 'https://www.tiktok.com/@rugbyworldcup...', 'https://www.snapchat.com/add/rugbywor...', 'Sports', 'Extended Highlights: New Zealand v South Africa\n  \n\n     - Duration: 8:51.\n  \nWorld Rugby\n869,064 viewsNew', '8:51', "Schmidt and Best's post match press conference| Japan v Ireland\n  \n\n     - Duration: 12:00.\n  \nWorld Rugby\n48,365 viewsNew", '12:00', 'Liverpool players react to their FIFA 20 ratings | Van Dijk with Salah, Mane, Firmino and more\n  \n\n     - Duration: 5:52.\n  \nLiverpool FC\n2,178,177 viewsNew', '5:52', "35th America's Cup Race 7 NZL vs. USA | AMERICA'S CUP\n  \n\n     - Duration: 23:23.\n  \nAmerica's Cup\n152,003 views", '23:23', "Guy's maiden voyage on his hydrofoil boat | Guy Martin Proper\n  \n\n     - Duration: 7:09.\n  \nGuy Martin Proper\n66,941 viewsNew", '7:09', "Furious Boris Johnson humiliates Jeremy Corbyn, rages at Labour's Brexit LIES and gets long APPLAUSE\n  \n\n     - Duration: 7:32.\n  \nProductiehuisEU\n394,890 viewsNew", '7:32', "KOREA vs. 
BRAZIL - Highlights | Women's Volleyball World Cup 2019\n  \n\n     - Duration: 8:49.\n  \nVolleyball World\n145,837 viewsNew", '8:49', "Jonah Lomu's 15 unforgettable Rugby World Cup tries\n  \n\n     - Duration: 6:00.\n  \nWorld Rugby\n995,979 views", '6:00', 'Extended Highlights: France v Argentina\n  \n\n     - Duration: 8:35.\n  \nWorld Rugby\n347,394 viewsNew', '8:35', 'What Martin Johnson did just before the 2003 World Cup final || Rugby World Cup Memories - Neil Back\n  \n\n     - Duration: 8:58.\n  \nRugbyPass Official\n95,379 views', '8:58', "Ireland's Shock reaction to Japan Loss\n  \n\n     - Duration: 12:04.\n  \nRugbyPass Official\n6,045 viewsNew", '12:04', 'Bodybuilder Tries Rugby, Gets SMASHED\n  \n\n     - Duration: 15:17.\n  \nJuji & Tom\n2,138,650 views', '15:17', 'EXTENDED HIGHLIGHTS | Matchday One: Japan vs Russia\n  \n\n     - Duration: 23:38.\n  \nWorld Rugby\n338,672 viewsNew', '23:38', 'My Story: Ruaridh McConnochie\n  \n\n     - Duration: 7:24.\n  \nEngland Rugby\n20,312 viewsNew', '7:24', 'Japan head coach speaks after historic victory over Ireland\n  \n\n     - Duration: 1:21.\n  \nWorld Rugby\n52,472 viewsNew', '1:21', 'HIGHLIGHTS: Argentina v Tonga - Rugby World Cup 2019\n  \n\n     - Duration: 2:56.\n  \nWorld Rugby\n195,221 viewsNew', '2:56', 'Extended Highlights: Russia v Samoa - Rugby World Cup 2019\n  \n\n     - Duration: 23:11.\n  \nWorld Rugby\n222,043 viewsNew', '23:11', 'Argentina vs Tonga (28-12) | Rugby World Cup 2019 Highlights\n  \n\n     - Duration: 3:16.\n  \nITV\n16,274 viewsNew', '3:16', "Guy competes with the British America's Cup team | Guy Martin Proper\n  \n\n     - Duration: 9:29.\n  \nGuy Martin Proper\n40,810 viewsNew", '9:29', 'Irish Rugby TV: Ireland v New Zealand 2018 GUINNESS Series Highlights\n  \n\n     - Duration: 7:13.\n  \nIrish Rugby TV\n777,015 views', '7:13', '', 'History']}
----------Extraction of data is complete. Check json file.----------

关于python - AttributeError : 'str' object has no attribute 'findAll' , 使用 BeautifulSoup 从 Youtube 抓取数据时无输出,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58145590/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com