gpt4 book ai didi

python - Scrapy FormRequest 参数不起作用,而是显示所有结果

转载 作者:行者123 更新时间:2023-12-05 00:44:51 31 4
gpt4 key购买 nike

我正在使用 Scrapy FormRequest 抓取此网页 https://researchgrant.gov.sg/eservices/advanced-search/?keyword=&source=sharepoint&type=project&status=open&_pp_projectstatus=&_pp_hiname=&_pp_piname=&_pp_source=sharepoint&_pp_details=#project 。我的代码如下。设置参数 _pp_hiname=ab 和 _pp_piname=pua 后,response.text 中应该只返回 1 个结果,但它却以 HTML 代码形式返回了所有结果。参数显然不起作用,但我看不出有什么问题。

def start_requests(self):
    """Yield the initial search request for the MVC grid endpoint.

    The mvcgrid endpoint ignores search filters placed in the POST body;
    they must be sent as URL query parameters.  Only ``name`` (the grid
    identifier) belongs in the form data.
    """
    import urllib.parse

    # Search filters: these travel in the query string, not the body.
    query = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': '1',
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': 'pua',
        '_pp_source': '',
        '_pp_details': '',
    }
    url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + urllib.parse.urlencode(query)

    # AJAX-style POST; the grid name is the only value the endpoint
    # reads from the request body.
    yield scrapy.FormRequest(
        url,
        callback=self.parse_item,
        method='POST',
        formdata={'name': 'advancesearchawardedprojectsp'},
        headers={'X-Requested-With': 'XMLHttpRequest'},
    )

def parse_item(self, response):
    """Dump the raw response body to stdout for inspection."""
    body = response.text
    print(body)

应该只有 1 个条目:
actual result

但显然它显示了所有条目:
shows all result

最新更新:
class ToScrapeCSSSpiderSG(scrapy.Spider):
    """Search the SG research-grant MVC grid and visit each project page.

    The mvcgrid endpoint ignores filters sent in the POST body, so the
    search arguments are URL-encoded into the query string; only the
    grid name goes in the form data.
    """

    name = "toscrapesg-css"

    # POST body: the endpoint only reads the grid identifier from it.
    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    # Search filters -- these must travel in the URL query string.
    # NOTE: ``page`` is mutated in place for pagination, so the spider
    # is not safe to run twice in one process without resetting it.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp'
    }

    def start_requests(self):
        """Yield the request for the first result page."""
        query = urllib.parse.urlencode(self.args)
        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query
        yield scrapy.FormRequest(url, callback=self.parse_item, method='POST',
                                 formdata=self.params,
                                 headers={'X-Requested-With': 'XMLHttpRequest'})

    def parse_item(self, response):
        """Follow every project link on the page, then queue the next page."""
        for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
            for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
                link = row.xpath('td[1]/a/@href').extract_first()
                # Plain navigation: a simple GET Request, not a FormRequest.
                yield scrapy.Request(link, callback=self.parse_product)

        # The "Next page" anchor only carries an onclick when more pages exist.
        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
        if onclick:
            self.args['page'] += 1
            query = urllib.parse.urlencode(self.args)
            url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query
            yield scrapy.FormRequest(url, callback=self.parse_item, method='POST',
                                     formdata=self.params,
                                     headers={'X-Requested-With': 'XMLHttpRequest'})

    def parse_product(self, response):
        """Print the project title from a project detail page."""
        text = response.xpath('//span[contains(@id,"ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle")]/text()').extract()
        print(text)

控制台消息:
enter image description here

最佳答案

它发送的 POST 请求 body 中应该仅包含 name=advancesearchawardedprojectsp,其他参数应该作为查询字符串放在 URL 中。

所以url应该

https://researchgrant.gov.sg/eservices/mvcgrid?keyword=&source=sharepoint&type=project&status=open&page=1&_pp_projectstatus=&_pp_hiname=ab&_pp_piname=pua&_pp_source=&_pp_details

您可以使用 urllib.parse.urlencode(args) 来实现这一点。

它给了我一个结果。

import urllib.parse

def start_requests(self):
    """Yield the initial search request.

    The search filters go in the URL query string; the POST body only
    carries the MVC grid name.
    """
    form_body = {
        'name': 'advancesearchawardedprojectsp',
    }

    # Ordered pairs keep the encoded query identical to the dict form.
    filters = (
        ('keyword', ''),
        ('source', 'sharepoint'),
        ('type', 'project'),
        ('status', 'open'),
        ('page', '1'),
        ('_pp_projectstatus', ''),
        ('_pp_hiname', 'ab'),
        ('_pp_piname', 'pua'),
        ('_pp_source', ''),
        ('_pp_details', ''),
    )

    query = urllib.parse.urlencode(filters)
    url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query

    yield scrapy.FormRequest(
        url,
        callback=self.parse_item,
        method='POST',
        formdata=form_body,
        headers={'X-Requested-With': 'XMLHttpRequest'},
    )

编辑:示例改为加载下一页,并通过检查 Next Page 按钮来决定何时停止。

编辑:现在它可以把结果保存到 csv 文件中。
import scrapy
import urllib.parse

class MySpider(scrapy.Spider):
    """Scrape awarded-project search results from researchgrant.gov.sg.

    Pagination state lives in the class-level ``args`` dict, whose
    ``page`` entry is incremented in place as result pages are consumed.
    """

    name = 'myspider'

    # POST body: the grid endpoint only reads its name from here.
    params = {
        'name': 'advancesearchawardedprojectsp'
    }

    # Search filters; these must be sent in the URL query string.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
    }

    def _grid_request(self):
        """Build the POST request for the current ``args['page']``."""
        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + urllib.parse.urlencode(self.args)
        return scrapy.FormRequest(url, callback=self.parse_item, method='POST',
                                  formdata=self.params,
                                  headers={'X-Requested-With': 'XMLHttpRequest'})

    def start_requests(self):
        """Yield the request for the first result page."""
        yield self._grid_request()

    def parse_item(self, response):
        """Follow each project link, then queue the next result page."""
        rows = response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr')
        for row in rows:
            detail_url = row.xpath('.//a/@href').get()
            yield scrapy.Request(detail_url, callback=self.parse_product)

        # A "Next page" link with an onclick handler means more pages remain.
        if response.xpath('//a[@aria-label="Next page"]/@onclick').get():
            self.args['page'] += 1
            yield self._grid_request()

    def parse_product(self, response):
        """Extract one project's fields from its detail page."""
        fields = {
            'id': '//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjIdExt"]/text()',
            'title': '//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle"]/text()',
            'pi': '//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblLeadPIName"]/text()',
            'hi': '//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblHostInstName"]/text()',
            'date': '//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_dtPickerStartDate"]/text()',
        }
        yield {key: response.xpath(xp).get() for key, xp in fields.items()}

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

# Ad-hoc runner: lets the spider execute as a plain script, without a
# full Scrapy project (`scrapy crawl ...`).
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# Feed export settings: write scraped items as CSV (json/xml also work).
'FEED_FORMAT': 'csv', # csv, json, xml
'FEED_URI': 'output.csv', # output file path
})
c.crawl(MySpider)
c.start()  # blocks until the crawl finishes

关于python - Scrapy FormRequest 参数不起作用,而是显示所有结果,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59259699/

31 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com