gpt4 book ai didi

python - 使用 Python requests 库抓取页面

转载 作者:太空宇宙 更新时间:2023-11-04 05:42:38 25 4
gpt4 key购买 nike

我在网页抓取方面遇到了一些问题,这是我的代码:

from bs4 import BeautifulSoup

import requests
import re
import csv
import argparse

def save_csv_file(filename, array):
    """Write a header row followed by every row of ``array`` to ``filename`` as CSV.

    Args:
        filename: Path of the CSV file to create (an existing file is overwritten).
        array: Iterable of rows; each row is an iterable of cell values.
    """
    header = ["item_name", "item_price", "item_category"]

    # NOTE(review): binary mode is what Python 2's csv module expects; under
    # Python 3 this would have to be text mode with newline=''.
    with open(filename, 'wb') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        out.writerows(array)

def process_data(name, price, category):
    """Convert three optional parsed elements into a cleaned 3-tuple.

    Args:
        name: Falsy value (e.g. ``None``) or an object with a ``.text`` attribute.
        price: Same contract as ``name``.
        category: Same contract as ``name``.

    Returns:
        ``(item_name, item_price, item_category)``. A falsy argument yields
        ``'NA'``; otherwise the element's text is used. Every space, CR-LF
        pair and newline is removed, and each value is UTF-8 encoded.
    """
    # Read every .text first, in argument order, before any clean-up happens.
    raw_texts = [node.text if node else 'NA' for node in (name, price, category)]

    cleaned = []
    for text in raw_texts:
        # Same removal sequence for each field: spaces, then CR-LF, then LF.
        for unwanted in (" ", "\r\n", "\n"):
            text = text.replace(unwanted, "")
        cleaned.append(text.encode("utf-8"))

    return tuple(cleaned)

def do_scrap(filename, url, payload, headers):
    """POST ``payload`` to ``url``, extract the result links and save them as CSV.

    Args:
        filename: Path of the CSV file written when at least one record is found.
        url: Address the POST request is sent to.
        payload: Form data passed to ``requests.post``.
        headers: HTTP headers sent with the request.

    Prints the page title, the number of matching elements and a final summary.
    If the response status is not 200, only an error line is printed.
    """
    # Request the URL with parameters and headers
    response = requests.post(url, payload, headers=headers, allow_redirects=True)

    # Anything other than 200 is reported and nothing else is attempted.
    # (print(...) with one argument prints the same text under Python 2.)
    if response.status_code != 200:
        print("Code error: %s" % response.status_code)
        return

    # Parse the raw response body with bs4
    soup = BeautifulSoup(response.content, "html.parser")

    # Print document title
    print(soup.head.find('title').text)

    # Every <a class="result-link"> is treated as one record
    result_links = soup.find_all("a", {"class": "result-link"})
    print("Found %s records." % len(result_links))

    # One (name, price, category) tuple per result link; the "category"
    # column is filled from the merchant-name element.
    items = [
        process_data(
            link.find("h3", {"class": "result-title"}),
            link.find("p", {"class": "price text-truncate"}),
            link.find("p", {"class": "merchant-name text-truncate"})
        )
        for link in result_links
    ]

    if items:
        # Save the collected rows to the CSV file
        save_csv_file(filename=filename, array=items)

        # Print end of job info
        print("\n%s records downloaded and saved to %s." % (len(items), filename))

if __name__ == '__main__':

    # Command-line interface: a mandatory product name and an optional category.
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--product", required=True, help="Product name")
    ap.add_argument("-c", "--category", default="", help="Product category")
    args = vars(ap.parse_args())

    product = args['product']
    # Parsed for completeness; it is not sent with the request below.
    category = args['category']

    # Search endpoint and the CSV file named after the product.
    url = "http://www.kelkoo.co.uk/ctl/do/search"
    filename = "%s_co_uk_kelkoo_data.csv" % product

    # Form fields submitted with the search.
    payload = {
        'siteSearchQuery': product,
        'from': 'colibri',
    }

    # Browser-like request headers.
    headers = {
        'Host': 'www.kelkoo.co.uk',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
    }

    do_scrap(filename=filename, url=url, payload=payload, headers=headers)

发出这个请求之后,我得到的结果与直接在浏览器中打开下面这个地址得到的结果不同:

www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri

是什么导致了这个问题?是不是跟页面重定向之类的有关?

最佳答案

我可以看到多种因素会导致您得到不同的结果:

  • 您发起的是 POST 请求而不是 GET 请求。请查阅 requests.get 的 params 参数。
  • 他们使用 javascript 修改页面。

关于python - 使用 Python requests 库抓取页面,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/33332695/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com