
python - How can I get the link after a redirect when using BeautifulSoup?


I want to get the link that an article page's download link redirects to.

For example: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/

On that article page there is the following download link: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=

Opening this link directly does not redirect to the real download link; it only works when followed from the article page.
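For example, fetching the link directly with requests (a minimal sketch; the User-Agent header is just a plain browser string) never lands on the real file host:

import requests

# Minimal check: requesting the download link directly, without coming
# from the article page, does not end up at the real download URL.
link = ('https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
        'yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=')
r = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
print(r.url)  # stays on scanlibs.com instead of the file host

My full script so far: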

# coding=utf-8

import requests
import sys
from bs4 import BeautifulSoup


def urlopen(url):
    '''
    Use requests instead of urllib.request.urlopen.
    Return the page HTML as text.
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text

def generate_pages(subTitle, fromPage, toPage):
    '''
    Return the list of category page URLs for the given range.
    '''
    pages = []
    if fromPage > 0 and fromPage <= toPage:  # allow a single-page range
        for i in range(fromPage, toPage + 1):
            pages.append('https://scanlibs.com/category/books' + subTitle + '/page/' + str(i))
    return pages



def get_book_sites_of_one_page(page):
    '''
    Get the book page URLs listed on one category page.
    input:  category page URL
    output: list of book page URLs
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    for link in linkList[::2]:  # take every other link; the listing repeats each book's link
        if 'href' in link.attrs:
            bookSites.append(link.attrs['href'])
    return bookSites


def get_book_urls(bookSite):
    '''
    Given one book page, find the download URLs on it
    and return them as a list.
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    linkList = soup.findAll('a', {'target': '_blank'})
    for link in linkList[::2]:
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs


def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle, fromPage, toPage)

    for page in pages:
        bookSites.extend(get_book_sites_of_one_page(page))

    for bookSite in bookSites:
        bookURLs += get_book_urls(bookSite)

    for bookURL in bookURLs:
        print(bookURL)

    #with open(filename, 'w') as f:
    #    f.write('\n'.join(bookURLs))


def main():
    if len(sys.argv) == 4:
        '''
        python getUrl.py 1 100 programming
        pages 1 to 100, in the programming category
        '''
        subTitle = str(sys.argv[3])
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 3:
        '''
        python getUrl.py 1 100
        pages 1 to 100, all categories
        '''
        subTitle = ''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 2:
        '''
        python getUrl.py 10
        only download books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 1:
        fromPage = 1
        # custom page range
        toPage = 2
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)
    else:
        print("Error: too many arguments")



if __name__ == '__main__':
    main()

Thanks for your help!

Best Answer

This site checks whether the referer is set before redirecting. You can easily bypass that by sending the original URL as the referer header. You can also see that the referer is used as a URL parameter in the final download link.

import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')
relative_link = soup.find('a', {'id': 'download'})['href']  # get the relative link
download_redirect_link = url + relative_link
headers = {
    "referer": url
}
r2 = s.get(download_redirect_link, headers=headers)
print(r2.url)  # the URL after the redirect

Output

https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
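Folding the same trick back into the asker's script, get_book_urls could resolve the final URL directly. This is a hedged sketch: it reuses the asker's urlopen helper and the id='download' selector from the answer, and the link concatenation assumes the book page URL ends with a slash, as in the example above.

def get_book_urls(bookSite):
    '''
    Sketch: find the download anchor on a book page and follow it
    with the book page itself as the referer.
    '''
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    link = soup.find('a', {'id': 'download'})  # selector from the answer above
    if link is None or 'href' not in link.attrs:
        return []
    # the referer header makes the site perform the real redirect
    r = requests.get(bookSite + link.attrs['href'],
                     headers={"referer": bookSite, "User-Agent": "Mozilla/5.0"})
    return [r.url]  # final URL after the redirect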

Regarding "python - How can I get the link after a redirect when using BeautifulSoup?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/54212278/
