
python - Getting the links of YouTube search results


I am trying to get the links of the videos that appear in the search results for a particular query on YouTube. I am using the BeautifulSoup and requests Python libraries, and this is what I have done:

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

base = "https://www.youtube.com/results?search_query="
query = "mickey+mouse"
r = requests.get(base + query)
page = r.text
soup = bs(page, 'html.parser')

# each search result is an <a> tag with the class 'yt-uix-tile-link'
vids = soup.find_all('a', attrs={'class': 'yt-uix-tile-link'})

videolist = []
for v in vids:
    tmp = 'https://www.youtube.com' + v['href']
    videolist.append(tmp)

pd.DataFrame(videolist).to_excel(<PATH>, header=False, index=False)

This fetches the search results and saves the links of the first 20 videos (the ones that appear on the page) to an Excel file. However, I would like to get 400 or 500 links related to the same query. How can I do that? I know how to get all the links from a specific channel, but how do I get the links for a specific search query?

Best answer

User dk1 (on Code Review) has created almost exactly what you want, except that it exports to CSV rather than Excel:

#!/usr/bin/python
# http://docs.python-requests.org/en/latest/user/quickstart/
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/

import csv
import re
import requests
import time
from bs4 import BeautifulSoup

# scrapes the channel title
def getTitle():
    d = soup.find_all("h1", "branded-page-header-title")
    for i in d:
        name = i.text.strip().replace('\n', ' ').replace(',', '').encode("utf-8")
        f.write(str(name) + ',')
        print(f'\t\t{name}')

# scrapes the subscriber and view counts
def getStats():
    b = soup.find_all("li", "about-stat ")  # trailing space is required.
    for i in b:
        value = i.b.text.strip().replace(',', '')
        name = i.b.next_sibling.strip().replace(',', '')
        f.write(value + ',')
        print(f'\t\t{name} = {value}')

# scrapes the channel description
def getDescription():
    c = soup.find_all("div", "about-description")
    for i in c:
        description = i.text.strip().replace('\n', ' ').replace(',', '').encode("utf-8")
        f.write(str(description) + ',')
        #print(f'\t\t{description}')

# scrapes all the external links
def getLinks():
    a = soup.find_all("a", "about-channel-link ")  # trailing space is required.
    for i in a:
        url = i.get('href')
        f.write(url + ',')
        print(f'\t\t{url}')

# scrapes the related channels
def getRelated():
    s = soup.find_all("h3", "yt-lockup-title")
    for i in s:
        t = i.find_all(href=re.compile("user"))
        for i in t:
            url = 'https://www.youtube.com' + i.get('href')
            rCSV.write(url + '\n')
            print(f'\t\t{i.text}, {url}')

f = open("youtube-scrape-data.csv", "w+")
rCSV = open("related-channels.csv", "w+")
visited = []
base = "https://www.youtube.com/results?search_query="
q = ['search+query+here']
page = "&page="
features = "html.parser"
count = 1
pagesToScrape = 20

for query in q:
    while count <= pagesToScrape:
        scrapeURL = base + str(query) + page + str(count)
        print(f'Scraping {scrapeURL} \n')
        r = requests.get(scrapeURL)
        soup = BeautifulSoup(r.text, features)
        users = soup.find_all("div", "yt-lockup-byline")
        for each in users:
            a = each.find_all(href=re.compile("user"))
            for i in a:
                url = 'https://www.youtube.com' + i.get('href') + '/about'
                if url in visited:
                    print(f'\t{url} has already been scraped\n\n')
                else:
                    r = requests.get(url)
                    soup = BeautifulSoup(r.text, features)
                    f.write(url + ',')
                    print(f'\t{url}')
                    getTitle()
                    getStats()
                    getDescription()
                    getLinks()
                    getRelated()
                    f.write('\n')
                    print('\n')
                    visited.append(url)
                    time.sleep(3)
        count += 1
        time.sleep(3)
        print('\n')
    count = 1
    print('\n')
f.close()
rCSV.close()
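
To collect the video links themselves (the asker's original goal) rather than channel data, the same page-based pagination can be grafted onto the asker's first snippet. Here is a minimal sketch, assuming the legacy &page=N query parameter still paginates results and that the static HTML still contains the yt-uix-tile-link anchors (modern YouTube renders results with JavaScript, so neither is guaranteed); the output filename youtube_links.xlsx is just a placeholder:

from bs4 import BeautifulSoup as bs
import requests
import time
import pandas as pd

base = "https://www.youtube.com/results?search_query="
query = "mickey+mouse"
pages_to_scrape = 25  # at roughly 20 results per page, this targets ~500 links
videolist = []

for count in range(1, pages_to_scrape + 1):
    # assumption: the legacy "&page=N" parameter still paginates search results
    r = requests.get(base + query + "&page=" + str(count))
    soup = bs(r.text, 'html.parser')
    for v in soup.find_all('a', attrs={'class': 'yt-uix-tile-link'}):
        videolist.append('https://www.youtube.com' + v['href'])
    time.sleep(3)  # be polite between requests

# drop duplicates repeated across pages, preserving order
videolist = list(dict.fromkeys(videolist))
pd.DataFrame(videolist).to_excel('youtube_links.xlsx', header=False, index=False)

If the pages stop yielding new links before the target count is reached, YouTube has simply run out of static results for the query; the deduplication step keeps the exported list clean either way.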

For this question on python - getting the links of YouTube search results, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/50713219/
