python - Selenium Python webscraper is really slow


I'm new to web scraping. I've put together something that works, but it takes hours to collect everything I need. I've read a bit about using parallel processes to work through the URLs, but I don't know how to do that or how to fit it into what I already have. Any help would be much appreciated!
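To frame the question: "parallel processes" here usually means giving each worker its own browser, because a single WebDriver instance cannot be shared between threads or processes. Below is a minimal, hypothetical sketch of that pattern; the function name scrape_one_url and the worker count of 4 are illustrative assumptions, not something from the original post.

# Hypothetical sketch: one independent Firefox instance per worker process.
from concurrent.futures import ProcessPoolExecutor
from selenium import webdriver

def scrape_one_url(url):
    driver = webdriver.Firefox()        # each worker starts its own browser
    try:
        driver.set_page_load_timeout(20)
        driver.get(url)
        return url, driver.page_source  # parse the returned HTML afterwards
    finally:
        driver.quit()                   # always release the browser

if __name__ == '__main__':
    urls = [...]  # the deduplicated vacancy links collected earlier
    with ProcessPoolExecutor(max_workers=4) as pool:
        results = list(pool.map(scrape_one_url, urls))

The accepted answer below suggests an even cheaper route: skip the browser entirely when the pages render without JavaScript.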

Here is my code. It's still quite messy; I'm still learning :)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import random
import pprint
import itertools
import csv
import pandas as pd

start_url = "https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO"

driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
driver.get(start_url)
driver.find_element_by_xpath('//*[@id="form_save"]').click() #accepts cookies

wait = WebDriverWait(driver, random.randint(1500,3200)/1000.0)
j = random.randint(1500,3200)/1000.0
time.sleep(j)
num_jobs = int(driver.find_element_by_xpath('/html/body/div[3]/div/main/div[2]/div[3]/div/header/h2/span').text)
num_pages = int(num_jobs/102)

urls = []
list_of_links = []

for i in range(num_pages+1):
    try:
        elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="search-results-container"]//article/job/a')))
        for element in elements:
            list_of_links.append(element.get_attribute('href'))

        j = random.randint(1500,3200)/1000.0
        time.sleep(j)

        if 'page=3' not in driver.current_url:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[6]/a').click()
        else:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[5]/a').click()

        url = driver.current_url
        if url not in urls:
            print(url)
            urls.append(url)
        else:
            break

    except:
        continue


set_list_of_links = list(set(list_of_links))
print(len(set_list_of_links), "results")
driver.close()

def grouper(n, iterable):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def remove_empty_lists(l):
    keep_going = True
    prev_l = l
    while keep_going:
        new_l = remover(prev_l)
        # are they identical objects?
        if new_l == prev_l:
            keep_going = False
        # set prev to new
        prev_l = new_l
    # return the result
    return new_l


def remover(l):
    newlist = []
    for i in l:
        if isinstance(i, list) and len(i) != 0:
            newlist.append(remover(i))
        if not isinstance(i, list):
            newlist.append(i)
    return newlist

vacatures = []
chunks = grouper(100, set_list_of_links)
chunk_count = 0

for chunk in chunks:
    chunk_count += 1
    print(chunk_count)
    j = random.randint(1500,3200)/1000.0
    time.sleep(j)

    for url in chunk:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)

        try:
            driver.get(url)
            driver.find_element_by_xpath('//*[@id="form_save"]').click() #accepts cookies

            vacature = []
            vacature.append(url)

            j = random.randint(1500,3200)/1000.0
            time.sleep(j)

            elements = driver.find_elements_by_tag_name('dl')
            p_elements = driver.find_elements_by_tag_name('p')
            li_elements = driver.find_elements_by_tag_name('li')

            for element in elements:
                if "Salaris:" not in element.text:
                    vacature.append(element.text)

            running_text = list()
            for p in p_elements:
                running_text.append(p.text)

            text = [''.join(running_text)]

            remove_ls = ['vacatures', 'carrièretips', 'help', 'inloggen', 'inschrijven', 'Bezoek website', 'YouTube',
                         'Over Nationale Vacaturebank', 'Werken bij de Persgroep', 'Persberichten', 'Autotrack', 'Tweakers',
                         'Tweakers Elect', 'ITBanen', 'Contact', 'Carrière Mentors', 'Veelgestelde vragen',
                         'Vacatures, stages en bijbanen', 'Bruto Netto Calculator', 'Salariswijzer', 'Direct vacature plaatsen',
                         'Kandidaten zoeken', 'Bekijk de webshop', 'Intermediair', 'Volg ons op Facebook']

            for li in li_elements:
                if li.text not in remove_ls:
                    text.append(li.text)

            text = ''.join(text)
            vacature.append(text)

            vacatures.append(vacature)

            driver.close()

        except TimeoutException as ex:
            isrunning = 0
            print("Exception has been thrown. " + str(ex))
            driver.close()

        except NoSuchElementException:
            continue

Best answer

The Python Selenium webdriver is not thread-safe, which means a single browser instance cannot correctly handle asynchronous calls from multiple threads. Try scraping the site with requests and bs4 + lxml instead; it is much faster than Selenium. This answer may help.
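A minimal sketch of the requests + bs4 + lxml approach the answer suggests, assuming the vacancy pages are plain server-rendered HTML; the function name fetch_vacancy is a hypothetical example, and the dl tags mirror what the Selenium version reads:

import requests
from bs4 import BeautifulSoup

session = requests.Session()  # reuses the TCP connection across requests

def fetch_vacancy(url):
    response = session.get(url, timeout=20)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')  # lxml is the fastest parser bs4 supports
    # Collect the same kind of elements the Selenium version reads:
    return [dl.get_text(' ', strip=True) for dl in soup.find_all('dl')]

Because no browser has to start up and render the page, each fetch typically takes milliseconds instead of seconds. The caveat is that this only works when the pages do not need JavaScript to produce their content.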

Regarding "python - Selenium Python webscraper is really slow", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/53083033/
