gpt4 book ai didi

python - 无法从网站下载 pdf 文件

转载 作者:行者123 更新时间:2023-12-04 07:39:00 26 4
gpt4 key购买 nike

嗨,我有以下代码,我想从网站 “https://www.journal-officiel.gouv.fr/balo/recherche/resultats?parutionDateStart=2021-05-17&parutionDateEnd=2021-05-17&_token=0oP3_cJ2xZ10SbEEGoNdP6vUpAIv5nBkrTZptI0Nzd8” 下载 pdf 文件
这是我用来下载文件的脚本,但没有下载任何 pdf。
虽然它没有给出任何错误,但它每次都创建一个空白文件夹

# Drive a real browser only to obtain the tokenized search URL; the
# actual scraping below is done with urllib/requests.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
driver = webdriver.Chrome(executable_path='C:\\Users\\u6080267\\Documents\\chromedriver.exe')
driver.get("https://www.journal-officiel.gouv.fr/balo/recherche/")
# First anchor whose href carries the _token query parameter leads to
# the results page.
link = driver.find_element_by_xpath("//a[contains(@href,'token')]")
link.click()

# Results-page URL including the _token parameter, reused by get_urls().
url1 = driver.current_url
import urllib.request
from bs4 import BeautifulSoup
import os
import urllib
from datetime import datetime
import requests
# Browser-like User-Agent; the site may reject the default urllib agent.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'}
def get_urls(url):
    """Collect BALO document links from a search-results page.

    Fetches the results page and returns every anchor whose href
    contains "/balo/document".

    Parameters:
        url: results-page URL to scrape. When empty/falsy, falls back
             to the module-level ``url1`` captured by the selenium
             session above (preserves the original call ``get_urls('')``).

    Returns:
        dict mapping document name (the text after the last '=' in the
        href) to the absolute document URL. Duplicates keep the first
        URL seen.
    """
    # Backward-compatible fallback to the tokenized URL from selenium.
    if not url:
        url = url1
    req = urllib.request.Request(url, None, HEADERS)
    opener = urllib.request.build_opener()
    content = opener.open(req).read()
    soup = BeautifulSoup(content, "html.parser")
    urls = {}
    for anchor in soup.findAll('a', href=True):  # walk all links on the page
        href = anchor.get('href')
        if "/balo/document" in href:
            # Document name = text after the last '=' in the link.
            name = href[href.rindex("=") + 1:]
            if name not in urls:
                urls[name] = "https://www.journal-officiel.gouv.fr/" + href
    return urls
def download(urls, path):
    """Download each document as ``<name>.pdf`` into *path*.

    Each document URL serves an HTML page; the real PDF location is
    carried in an element named "pdfURL" (programmatic access requires
    the consent form), which is extracted and fetched.

    Parameters:
        urls: mapping of document name -> document-page URL
              (as produced by get_urls).
        path: existing directory to write the PDFs into (becomes the
              process working directory).
    """
    os.chdir(path)
    for name, url in urls.items():
        try:
            res = requests.get(url, allow_redirects=True)
            # The document page embeds the actual PDF path in an
            # element named "pdfURL".
            soup = BeautifulSoup(res.content, "html.parser")
            for pdfurl in soup.findAll(attrs={"name": "pdfURL"}):
                downloadurl = "https://www.journal-officiel.gouv.fr/" + pdfurl.get('value')
                res = requests.get(downloadurl)
                # Context manager guarantees the file handle is closed.
                with open(name + ".pdf", 'wb') as fh:
                    fh.write(res.content)
                print("Downloaded", name + ".pdf")
        except Exception as e:
            # Best-effort: report the failure and continue with the
            # remaining documents.
            print("Failed to download", name, ", because of", e)

def main():
    """Create a dated output folder and download the day's BALO PDFs."""
    # One folder per run, named after today's date (YYYY-MM-DD).
    # os.path.join keeps this portable instead of hard-coding '\\'.
    pathToStoreFiles = os.path.join(os.getcwd(), datetime.today().strftime('%Y-%m-%d'))
    # exist_ok avoids a crash when the script is re-run the same day.
    os.makedirs(pathToStoreFiles, exist_ok=True)
    urls = get_urls('')  # empty string -> fall back to the selenium URL
    download(urls, pathToStoreFiles)


if __name__ == "__main__":
    main()

最佳答案

显然,您没有拿到正确的 pdf 下载链接。此外,您把抓取做得比实际需要的复杂得多——根本不需要 selenium 这样的重型工具:带 _token 的查询 URL 用 bs4 就能轻松获取。
然后用该 URL 获取结果页面的 HTML,并从中解析出各个 pdf 链接。
下面演示如何下载前 50 个文件。

"""Download BALO PDFs without selenium.

The search page itself links to the tokenized results URL, so a plain
requests.Session + BeautifulSoup pass is enough: grab the token URL,
list the first 50 results, and fetch each PDF.
"""
import time

import requests
from bs4 import BeautifulSoup

main_url = "https://www.journal-officiel.gouv.fr"
search_path = "/balo/recherche/"


def wait_a_bit(wait_for: float = 1.5):
    """Pause between downloads to avoid hammering the server."""
    time.sleep(wait_for)


with requests.Session() as connection:
    connection.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"

    # The last "aide-link" anchor on the search page carries the
    # results URL including the _token query parameter.
    search_url = (
        BeautifulSoup(connection.get(f"{main_url}{search_path}").text, "lxml")
        .find_all("a", class_="aide-link")[-1]["href"]
    )

    # Lazily build absolute PDF links for the first 50 results.
    pdf_links = (
        f'{main_url}{link.find("a")["href"]}' for link in
        BeautifulSoup(
            connection.get(f"{main_url}{search_url}&limit=50").text, "lxml"
        )
        .select(".download-link")
    )

    for pdf_link in pdf_links:
        print(f"Fetching {pdf_link}")
        pdf_file = connection.get(pdf_link).content
        # Name each file after the document id (last URL path segment).
        with open(f'{pdf_link.rsplit("/")[-1]}.pdf', "wb") as output:
            output.write(pdf_file)
        wait_a_bit()
输出:
Fetching https://www.journal-officiel.gouv.fr/balo/document/202105172101761-59
Fetching https://www.journal-officiel.gouv.fr/balo/document/202105172101784-59
Fetching https://www.journal-officiel.gouv.fr/balo/document/202105172101798-59
Fetching https://www.journal-officiel.gouv.fr/balo/document/202105172101801-59
Fetching https://www.journal-officiel.gouv.fr/balo/document/202105172101810-59

and more ...
所有文件都保存在脚本的当前目录中:
202105172101675-59.pdf
202105172101686-59.pdf
202105172101687-59.pdf
202105172101688-59.pdf
202105172101697-59.pdf
...

关于python - 无法从网站下载 pdf 文件,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/67588889/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com