
python - How to make data extraction from a webpage with selenium more robust and efficient?


I want to extract all option chain data from a Yahoo Finance webpage; for simplicity, I take only the put option chain data. First, load all the packages used in the program:

import time 
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

A function that writes a company's put option chain data to a directory:

def write_option_chain(code):
    browser = webdriver.Chrome()
    browser.maximize_window()
    url = "https://finance.yahoo.com/quote/{}/options?p={}".format(code, code)
    browser.get(url)
    # wait for the expiration-date dropdown to appear, then pause for the page to settle
    WebDriverWait(browser, 10).until(EC.visibility_of_element_located((By.XPATH, ".//select/option")))
    time.sleep(25)
    date_elem = browser.find_elements_by_xpath(".//select/option")
    time_span = len(date_elem)
    print('{} option chains exist in {}'.format(time_span, code))
    df_all = pd.DataFrame()
    for item in range(1, time_span):
        element_date = browser.find_element_by_xpath('.//select/option[{}]'.format(item))
        print("parsing {}'s put option chain on {} now".format(code, element_date.text))
        element_date.click()
        # wait for the puts table of the selected expiration date to load
        WebDriverWait(browser, 10).until(EC.visibility_of_all_elements_located((By.XPATH, ".//table[@class='puts W(100%) Pos(r) list-options']//td")))
        time.sleep(11)
        put_table = browser.find_element_by_xpath(".//table[@class='puts W(100%) Pos(r) list-options']")
        put_table_string = put_table.get_attribute('outerHTML')
        df_put = pd.read_html(put_table_string)[0]
        df_all = df_all.append(df_put)
    browser.close()
    browser.quit()
    df_all.to_csv('/tmp/{}.csv'.format(code))
    print('{} option chain written into csv file'.format(code))

To test write_option_chain with a list of symbols:

nas_list = ['aapl','adbe','adi','adp','adsk']
for item in nas_list:
    try:
        write_option_chain(code=item)
    except:
        print("check what happens to {} ".format(item))
        continue
    time.sleep(5)

The output shows:

# many lines omitted for simplicity
18 option chains exist in aapl
parsing aapl's put option chain on August 27, 2021 now
check what happens to aapl
check what happens to adbe
12 option chains exist in adi
parsing adi's put option chain on December 17, 2021 now
adi option chain written into csv file
11 option chains exist in adp
parsing adp's put option chain on August 27, 2021 now
adp option chain written into csv file
check what happens to adsk

To summarize the output above:

1. Only the put option chain data for adi and adp was written to the desired directory.
2. Only partial option chain data was obtained for aapl and adbe.
3. The options webpage for adsk could not be opened at all.
4. The run took nearly 20 minutes.

How can the data extraction with selenium be made more robust and efficient?

Best Answer

If something other than selenium can be used, then the best throughput is achieved with asyncio and the aiohttp package from PyPI, since a large number of concurrent URL GET requests have to be made (which makes this a better choice than multithreading). For even better performance (not done here), the code could be split into URL fetching (pure I/O) and dataframe processing (CPU-intensive), with a multiprocessing pool used for the latter; a sketch of that split is shown after the asyncio version below.

import asyncio
import aiohttp
from bs4 import BeautifulSoup
import pandas as pd
import time

async def process_code(session, code):
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}') as resp:
        status = resp.status
        if status != 200:
            raise Exception('status returned =', status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    df_all = pd.DataFrame()
    df_tables = await asyncio.gather(*(process_date(session, code, date) for date in dates))
    for df_table in df_tables:
        if df_table is not None:
            df_all = df_all.append(df_table)
    df_all.to_csv('/tmp/{}.csv'.format(code))

async def process_date(session, code, date):
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}') as resp:
        status = resp.status
        if status != 200:
            raise Exception('status returned =', status)
        code_page = await resp.text()
    soup = BeautifulSoup(code_page, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        return None

async def main():
    nas_list = ['aapl','adbe','adi','adp','adsk']
    # Connection: keep-alive required to prevent ClientPayloadError on some websites:
    t = time.time()
    async with aiohttp.ClientSession(headers = {'Connection': 'keep-alive', 'user-agent': 'my-application'}) as session:
        await asyncio.gather(*(process_code(session, code) for code in nas_list))
    print('Elapsed time:', time.time() - t)

# Test if we are running under iPython or Jupyter Notebook:
try:
    __IPYTHON__
except NameError:
    asyncio.get_event_loop().run_until_complete(main())
else:
    asyncio.get_running_loop().create_task(main())
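
The I/O versus CPU split mentioned above was not done in the answer; the following is a minimal sketch of how process_date could hand the CPU-bound HTML/dataframe parsing to a process pool while the GET request stays in the event loop. The helper name parse_puts_table, the executor wiring, and the pool size are illustrative assumptions, not part of the original answer.

import asyncio
from concurrent.futures import ProcessPoolExecutor

import aiohttp
from bs4 import BeautifulSoup
import pandas as pd

def parse_puts_table(code_page):
    # CPU-bound work: parse the HTML and build the dataframe in a worker process
    soup = BeautifulSoup(code_page, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        return None

async def process_date(session, executor, code, date):
    # pure I/O stays in the event loop
    async with session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}') as resp:
        if resp.status != 200:
            raise Exception('status returned =', resp.status)
        code_page = await resp.text()
    # hand the raw HTML string to the process pool and await the resulting dataframe
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, parse_puts_table, code_page)

Under this sketch, process_code would pass along a single executor created once, for example ProcessPoolExecutor(max_workers=4) inside main(), so the relatively expensive pd.read_html calls no longer block the event loop.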

Here is the multithreaded version:

from multiprocessing.pool import ThreadPool
from functools import partial
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def process_code(session, pool, code):
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    dates = [elem['value'] for elem in soup.find('select').find_all('option')]
    df_all = pd.DataFrame()
    for df_table in pool.imap(partial(process_date, session, code), dates):
        if df_table is not None:
            df_all = df_all.append(df_table)
    df_all.to_csv('/tmp/{}.csv'.format(code))

def process_date(session, code, date):
    code_page = session.get(f'https://finance.yahoo.com/quote/{code}/options?date={date}&p={code}')
    soup = BeautifulSoup(code_page.content, 'lxml')
    table = soup.find('table', class_='puts W(100%) Pos(r) list-options')
    try:
        return pd.read_html(str(table))[0]
    except ValueError:
        return None

t = time.time()
nas_list = ['aapl','adbe','adi','adp','adsk']
with requests.Session() as session:
    headers = {'User-Agent': 'my-application'}
    session.headers = headers
    pool = ThreadPool(100)
    pool.map(partial(process_code, session, pool), nas_list)
print('Elapsed time:', time.time() - t)
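
A note on running either version with a current pandas release: DataFrame.append was removed in pandas 2.0, so on newer installs the per-date frames can be collected in a list and concatenated once instead, for example:

frames = [t for t in df_tables if t is not None]
df_all = pd.concat(frames) if frames else pd.DataFrame()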

Regarding python - How to make data extraction from a webpage with selenium more robust and efficient?, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/68887536/
