
python-3.x - How to use threads with Selenium for web scraping?


My main goal is to scrape as many profile links as possible on Khan Academy, and then scrape some specific data from each profile.
My goal in this question is to use threading to make my script run faster.
So I will present my code in two parts: the first part without threading, and the second part with threading.
Here is the original code, without threading:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming')
r.html.render(sleep=5)
soup=BeautifulSoup(r.html.html,'html.parser')

#find course steps links
courses_links = soup.find_all(class_='link_1uvuyao-o_O-nodeStyle_cu2reh-o_O-nodeStyleIcon_4udnki')
list_courses={}

for links in courses_links:
    courses = links.extract()
    link_course = courses['href']
    title_course = links.find(class_='nodeTitle_145jbuf')
    span_title_course = title_course.span
    text_span = span_title_course.text.strip()
    final_link_course = 'https://www.khanacademy.org'+link_course
    list_courses[text_span] = final_link_course
#print(list_courses)

# my goal is to loop the below script with each "course link" that I got above with list_courses
for courses_step in list_courses.values():
    driver = webdriver.Chrome()
    driver.get(courses_step)
    while True:
        try:
            showmore = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME,'button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203')))
            showmore.click()
        except TimeoutException:
            break
        except StaleElementReferenceException:
            break

    soup = BeautifulSoup(driver.page_source,'html.parser')
    #find the profile links
    profiles = soup.find_all(href=re.compile("/profile/kaid"))
    profile_list = []
    for links in profiles:
        links_no_list = links.extract()
        text_link = links_no_list['href']
        text_link_nodiscussion = text_link[:-10]
        final_profile_link = 'https://www.khanacademy.org'+text_link_nodiscussion
        profile_list.append(final_profile_link)

    #remove duplicates
    profile_list = list(set(profile_list))

    #print the number of profiles we got
    print('in this link:')
    print(courses_step)
    print('we have this number of profiles:')
    print(len(profile_list))

    #create the csv file
    filename = "khanscrapetry1.csv"
    f = open(filename, "w")
    headers = "link, date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date\n"
    f.write(headers)

    #for each profile link, scrape the specific data and store it in the csv
    for link in profile_list:
        #to avoid scraping the same profile multiple times
        #print each profile link we are about to scrape
        print("Scraping ", link)
        driver.get(link)
        #wait for the content to load; if the profile does not exist, skip it
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH ,'//*[@id="widget-list"]/div[1]/div[1]')))
        except TimeoutException:
            continue
        soup = BeautifulSoup(driver.page_source,'html.parser')
        user_info_table = soup.find('table', class_='user-statistics-table')
        if user_info_table is not None:
            dates, points, videos = [tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
        else:
            dates = points = videos = 'NA'

        user_socio_table = soup.find_all('div', class_='discussion-stat')
        data = {}
        for gettext in user_socio_table:
            category = gettext.find('span')
            category_text = category.text.strip()
            number = category.previousSibling.strip()
            data[category_text] = number

        full_data_keys = ['questions','votes','answers','flags raised','project help requests','project help replies','comments','tips and thanks'] #might change answers to answer because when it's 1 it's putting NA instead
        for header_value in full_data_keys:
            if header_value not in data.keys():
                data[header_value] = 'NA'

        user_calendar = soup.find('div', class_='streak-calendar-scroll-container')
        if user_calendar is not None:
            last_activity = user_calendar.find('span', class_='streak-cell filled')
            try:
                last_activity_date = last_activity['title']
            except TypeError:
                last_activity_date = 'NA'
        else:
            last_activity_date = 'NA'
        f.write(link + "," + dates + "," + points.replace("," , "") + "," + videos + "," + data['questions'] + "," + data['votes'] + "," + data['answers'] + "," + data['flags raised'] + "," + data['project help requests'] + "," + data['project help replies'] + "," + data['comments'] + "," + data['tips and thanks'] + "," + last_activity_date + "\n")
This code should work fine. The problem is that it takes far too much time.
Here is the script with threading:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import concurrent.futures

session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming')
r.html.render(sleep=5)
soup=BeautifulSoup(r.html.html,'html.parser')

#find course steps links
courses_links = soup.find_all(class_='link_1uvuyao-o_O-nodeStyle_cu2reh-o_O-nodeStyleIcon_4udnki')
list_courses={}

for links in courses_links:
    courses = links.extract()
    link_course = courses['href']
    title_course = links.find(class_='nodeTitle_145jbuf')
    span_title_course = title_course.span
    text_span = span_title_course.text.strip()
    final_link_course = 'https://www.khanacademy.org'+link_course
    list_courses[text_span] = final_link_course

#that's my driver function
def showmore(url, timeout):
    driver = webdriver.Chrome()
    driver.get(url)
    while True:
        try:
            showmore = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.CLASS_NAME,'button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203')))
            showmore.click()
        except TimeoutException:
            break
        except StaleElementReferenceException:
            break

#that's my pool
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    #do this in my pool
    future_to_url = {executor.submit(showmore, url, 20): url for url in list_courses.values()}
As you can see, the second script does not do everything yet: I still need to add the whole data-scraping and CSV-writing part.
My questions are: how do I create threads for the scraping and writing parts, and how should I order those threads?
More broadly: how can I make my script run as fast as possible?

Best Answer

To answer your "broader" question: you should use asyncio combined with requests or a similar package. A decent guide can be found here. Threads are not built for running asynchronous HTTP requests.

I can't show you how to write the code with asyncio, because I hardly know how to use it myself and it would probably take hundreds of lines of code to finish.
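For orientation only, the general fetch pattern looks roughly like the sketch below. It assumes aiohttp and a list of plain profile URLs; it does not reproduce any of the rendering or parsing from your scripts, so treat it as a starting point rather than a solution.

import asyncio
import aiohttp

async def fetch(session, url):
    # request one page and return its HTML text
    async with session.get(url) as response:
        return await response.text()

async def fetch_all(urls):
    # run all requests concurrently over one connection pool
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# hypothetical list of profile URLs gathered earlier
profile_urls = ['https://www.khanacademy.org/profile/kaid_example']
pages = asyncio.run(fetch_all(profile_urls))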

If you want a quick way to improve performance with the code you already have, you should run your Selenium browser in headless mode:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.headless = True
driver = webdriver.Chrome('YOUR_CHROMEDRIVER_PATH_HERE', chrome_options=options)
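If you do keep the ThreadPoolExecutor approach from your question, one possible way to order the threads is sketched below: each worker creates its own headless driver and returns the profile links it finds, and all CSV writing happens in the main thread as the futures complete. The scrape_course function and its elided body are illustrative assumptions, not a drop-in replacement for your code, and list_courses is the dict built in your first script.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import concurrent.futures

def make_headless_driver():
    # one headless Chrome instance per worker thread
    options = Options()
    options.headless = True
    return webdriver.Chrome(chrome_options=options)

def scrape_course(url, timeout):
    driver = make_headless_driver()
    try:
        driver.get(url)
        profile_list = []
        # ... click "Show more" in a loop (timeout would feed the WebDriverWait)
        #     and collect the profile links, exactly as in the non-threaded script ...
        return profile_list
    finally:
        driver.quit()

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {executor.submit(scrape_course, url, 20): url for url in list_courses.values()}
    for future in concurrent.futures.as_completed(future_to_url):
        profile_list = future.result()
        # scrape each profile and write its row to the CSV here, in the main
        # thread, so no two threads ever touch the file at the same time

Writing from a single thread sidesteps the need for a lock around the shared CSV file; if the workers wrote to it directly, you would need a threading.Lock instead.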

Regarding python-3.x - How to use threads with Selenium for web scraping?, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/55033633/
