
python - Webscrape multiple web pages into a single csv


I would like to be able to scrape data from multiple Newegg web pages into a single csv file.

I can currently scrape the data from a single Newegg web page into a csv file, but I would like to scrape multiple pages in one run.

from bs4 import BeautifulSoup
import requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

#my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48?PageSize=96'
my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-1?PageSize=96'

#opening up connection and grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

#html parsing
page_soup = soup(page_html, "html.parser")

#grabs each product
containers = page_soup.findAll("div", {"class":"item-container"})

filename = "99FINAL.csv"
f = open(filename, "w")
headers = "Brand, Title, Shipping, Price\n"

f.write(headers)

# only for information, not used in url
page = 0

while True:
    print('---', page, '---')
    r = requests.get(my_url)
    soup = BeautifulSoup(r.content, "html.parser")
    # String substitution for HTML

    for container in containers:
        brand_container = container.findAll("a", {"class":"item-title"})
        brand = brand_container[0].text

        title_container = container.findAll("a", {"class":"item-title"})
        title = title_container[0].text

        shipping_container = container.findAll("li", {"class":"price-ship"})
        shipping = shipping_container[0].text.strip()

        price = container.findAll("li", {"class":"price-current"})
        price = price[0]
        pricing_container = price.findAll("strong")
        pricing = pricing_container[0].text

        cents_container = price.findAll("sup")
        centing = cents_container[0].text

        print("brand: " + brand.partition(' ')[0])
        print("title: " + title)
        print("shipping: " + shipping)
        print("pricing: " + pricing)
        print("centing: " + centing)

        f.write(brand.partition(' ')[0] + "," + title.replace(",","|") + "," + shipping.partition(' ')[0] + "," + pricing.replace(",","") + centing + "\n")
        break

    # link to next page
    next_page = soup.find("button", {"title": "Next"})
    if next_page:
        my_url = next_page.get("href")
        page += 1
    else:
        break  # exit `while True`
f.close()

The script scrapes the first page successfully, but breaks when it tries to switch to the next page.

The error I get is: "MissingSchema: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?"
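The immediate cause is that on this page the "Next" control is a <button> element rather than a link, so it carries no href attribute and next_page.get("href") returns None. A minimal sketch of the failure, using hypothetical stand-in markup rather than the live site's HTML:

from bs4 import BeautifulSoup

# Hypothetical stand-in for the pager markup: a <button> has no href attribute
pager = BeautifulSoup('<button title="Next">Next</button>', 'html.parser')

next_page = pager.find("button", {"title": "Next"})
print(next_page.get("href"))  # -> None

# my_url = next_page.get("href") therefore becomes None, and on the next loop
# iteration requests.get(None) raises:
#   MissingSchema: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?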

Best Answer

You can use a while loop to keep scraping until a page is loaded on which the "Next" button is disabled:

from bs4 import BeautifulSoup as soup
import requests, csv, re

def get_products(d):
    # <strong> holds the dollars and <sup> the cents; if the price node is
    # missing, the getattr fallback returns a lambda that yields an empty list
    full_price = [i.text for i in getattr(d.find('li', {'class':'price-current'}), 'find_all', lambda _: [])(re.compile('strong|sup'))]
    # brand comes from the logo image's title attribute; fall back to 'N/A' when absent
    return [getattr(d.find('a', {'class':'item-brand'}), 'img', {'title':'N/A'})['title'],
            d.find('a', {'class':'item-title'}).text,
            *full_price,
            d.find('li', {'class':'price-ship'}).text]

rows, count, header = [], 1, ['brand', 'title', 'pricing', 'centing', 'shipping']
while True:
    d = soup(requests.get(f'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-{count}?PageSize=96').text, 'html.parser')
    rows.extend([get_products(i) for i in d.find_all('div', {'class':re.compile('item-container')})])
    # the last page renders the "Next" button with a `disabled` attribute
    if 'disabled' in d.find('button', {'title':'Next'}).attrs:
        break
    count += 1

with open('graphics_cards.csv', 'w', newline='') as f:  # newline='' avoids blank rows on Windows
    write = csv.writer(f)
    write.writerows([header, *rows])
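The stop condition assumes Newegg marks the last page by adding a disabled attribute to the "Next" button, which BeautifulSoup exposes through .attrs. A minimal sketch of that check, again using hypothetical stand-in markup rather than the live site's HTML:

from bs4 import BeautifulSoup

# Hypothetical pager markup for a middle page and for the last page
mid_page = BeautifulSoup('<button title="Next">Next</button>', 'html.parser')
last_page = BeautifulSoup('<button title="Next" disabled>Next</button>', 'html.parser')

print('disabled' in mid_page.find('button', {'title': 'Next'}).attrs)   # False -> fetch the next page
print('disabled' in last_page.find('button', {'title': 'Next'}).attrs)  # True  -> stop the loop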

Output (first five products):

brand,title,pricing,centing,shipping
ASUS,ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card,509,.99,"
Free Shipping
"
MSI,MSI Radeon RX 580 DirectX 12 RX 580 ARMOR 8G OC 8GB 256-Bit GDDR5 PCI Express x16 HDCP Ready CrossFireX Support Video Card,194,.99,"
Free Shipping
"
GIGABYTE,"GIGABYTE GeForce GTX 1660 GAMING OC 6G Graphics Card, 3 x WINDFORCE Fans, 6GB 192-Bit GDDR5, GV-N1660GAMING OC-6GD Video Card",229,.99,"
Free Shipping
"
Sapphire Tech,SAPPHIRE NITRO+ Radeon RX Vega 64 DirectX 12 100410NT+SR 8GB 2048-Bit HBM2 PCI Express 3.0 Video Card,399,.99,"
Free Shipping
"
ASUS,ASUS Radeon RX 580 O4G Dual-fan OC Edition GDDR5 DP HDMI DVI VR Ready AMD Graphics Card (DUAL-RX580-O4G),189,.99,"
Free Shipping
"

Regarding python - Webscrape multiple web pages into a single csv, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/56521011/
