gpt4 book ai didi

python - 如何使用 beautifulsoup 从页面中抓取数据

转载 作者:行者123 更新时间:2023-12-04 17:09:05 26 4
gpt4 key购买 nike

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the location ("address") and website link of every investor linked
# from the "who invested in female founders" list on signal.nfx.com, then
# print them as a two-column DataFrame.
#
# NOTE(review): the sample output for this script shows every row after the
# first as empty — presumably the pages are rendered client-side, so the
# static HTML fetched here lacks most of the data. Verify before relying on it.
baseurl = 'https://signal.nfx.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

# Send the same browser-like headers on the list page as on the profile pages.
r = requests.get(
    'https://signal.nfx.com/investor-lists/top-who-invested-in-female-founders-investors',
    headers=headers,
)
soup = BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('div', class_='pr3')

productlinks = []  # absolute URLs of the investor profile pages
p = []             # one address per profile ('' when not found)
u = []             # one website link per profile ('' when not found)

for links in tra:
    for anchor in links.find_all('a', href=True):
        # baseurl ends with "/" and the href may start with one; normalise
        # so the joined URL never contains "//" after the host.
        comp = baseurl.rstrip('/') + '/' + anchor['href'].lstrip('/')
        productlinks.append(comp)

for profile_url in productlinks:
    r = requests.get(profile_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # soup.find() returns None when nothing matches, so ".text" raises
    # AttributeError; only that case is treated as "value missing".
    try:
        address = soup.find('span', class_='ml1').text
    except AttributeError:
        address = ''
    p.append(address)

    try:
        website = soup.find('a', class_='ml1 subheader lower-subheader').text
    except AttributeError:
        website = ''
    u.append(website)

df = pd.DataFrame({"address": p, "link": u})
print(df)

这是我的输出:只得到了一条 address 和 link,其余各行都是空值,然后程序就结束了。我想从页面中抓取数据,但拿不到页面的完整信息。你能帮我找到所有的地址和链接吗?下面是我要抓取信息的页面之一: https://signal.nfx.com/investors/aaleen-anjum

     address             link
0 Toronto, Ontario twosmallfish.vc
1
2
3
4
5
6
7
8
9
10
11

最佳答案

可以直接通过该网站的 API 获取数据。例如,下面的代码可以获取投资者列表:

import requests
import pandas as pd

# Download every investor on the "who-invested-in-female-founders" list from
# the Signal GraphQL endpoint, following the cursor in pageInfo page by page,
# and collect the flattened records in the DataFrame `results`.
url = "https://signal-api.nfx.com/graphql"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
payload = {
    "operationName": "vclInvestors",
    "variables": {
        "slug": "who-invested-in-female-founders",
        "order": [{}],
        "after": "",
    },
    "query": "query vclInvestors($slug: String!, $after: String) {\n list(slug: $slug) {\n id\n slug\n investor_count\n vertical {\n id\n display_name\n kind\n __typename\n }\n location {\n id\n display_name\n __typename\n }\n stage\n firms {\n id\n name\n slug\n __typename\n }\n scored_investors(first: 8, after: $after) {\n pageInfo {\n hasNextPage\n hasPreviousPage\n endCursor\n __typename\n }\n record_count\n edges {\n node {\n ...investorListInvestorProfileFields\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n}\n\nfragment investorListInvestorProfileFields on InvestorProfile {\n id\n person {\n id\n first_name\n last_name\n name\n slug\n is_me\n is_on_target_list\n __typename\n }\n image_urls\n position\n min_investment\n max_investment\n target_investment\n is_preferred_coinvestor\n firm {\n id\n name\n slug\n __typename\n }\n investment_locations {\n id\n display_name\n location_investor_list {\n id\n slug\n __typename\n }\n __typename\n }\n investor_lists {\n id\n stage_name\n slug\n vertical {\n id\n display_name\n __typename\n }\n __typename\n }\n __typename\n}\n",
}

frames = []  # one DataFrame per fetched page
count = 0    # rows collected so far
hasNextPage = True
after = ''

while hasNextPage:
    # Assignment, not comparison: the cursor must be written into the
    # request variables, otherwise every request returns the first page
    # and the loop never terminates.
    payload['variables']['after'] = after

    response = requests.post(url, headers=headers, json=payload, timeout=30)
    response.raise_for_status()  # fail loudly on an HTTP error status
    jsonData = response.json()

    investor_list = jsonData['data']['list']
    scored = investor_list['scored_investors']

    # Flatten the nested edge/node dicts into dotted column names.
    df = pd.json_normalize(scored['edges'])
    frames.append(df)

    count += len(df)
    tot = investor_list['investor_count']

    print(f'{count} of {tot}')

    hasNextPage = scored['pageInfo']['hasNextPage']
    after = scored['pageInfo']['endCursor']

# DataFrame.append was removed in pandas 2.0; concatenate all pages once.
results = pd.concat(frames, sort=False, ignore_index=True)

输出:

# Print the first two rows of `results`, rendered as a plain-text table.
print(results.head(2).to_string())
__typename node.__typename node.id node.person.id node.person.first_name node.person.last_name node.person.name node.person.slug node.person.is_me node.person.is_on_target_list node.person.__typename node.image_urls node.position node.min_investment node.max_investment node.target_investment node.is_preferred_coinvestor node.firm.id node.firm.name node.firm.slug node.firm.__typename node.investment_locations node.investor_lists node.firm
0 InvestorProfileEdge InvestorProfile 19676 87099 Aaleen Anjum Aaleen Anjum aaleen-anjum False False Person [https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMWp2QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--1dc8054880c588f1fd59361ebd5d8526f841049d/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/0, https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMW52QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--df77fc9ad679d550ce8e2472e47150cb9fc610e6/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/1, https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMXJ2QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--1f58605b9a843b9ee1e820d63d154aea24936f84/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/2, https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMXZ2QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--2a200a001411bbff92bd9deb68b4a54215ee0863/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/3] analyst 150000 1000000 250000 False 4445 Two Small Fish Ventures two-small-fish-ventures Firm [] [{'id': '6141', 'stage_name': 'Pre-Seed', 'slug': 'ai-pre-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '3', 
'stage_name': 'Seed', 'slug': 'ai-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6144', 'stage_name': 'Pre-Seed', 'slug': 'blockchain-pre-seed', 'vertical': {'id': '7', 'display_name': 'Blockchain', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '7', 'stage_name': 'Seed', 'slug': 'blockchain-seed', 'vertical': {'id': '7', 'display_name': 'Blockchain', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5406', 'stage_name': 'Other Lists', 'slug': 'british-columbia', 'vertical': {'id': '9678', 'display_name': 'British Columbia', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6169', 'stage_name': 'Pre-Seed', 'slug': 'consumer-health-pre-seed', 'vertical': {'id': '11', 'display_name': 'Consumer Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '11', 'stage_name': 'Seed', 'slug': 'consumer-health-seed', 'vertical': {'id': '11', 'display_name': 'Consumer Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6145', 'stage_name': 'Pre-Seed', 'slug': 'cryptocurrency-pre-seed', 'vertical': {'id': '13', 'display_name': 'Cryptocurrency', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '13', 'stage_name': 'Seed', 'slug': 'cryptocurrency-seed', 'vertical': {'id': '13', 'display_name': 'Cryptocurrency', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6203', 'stage_name': 'Pre-Seed', 'slug': 'cybersecurity-pre-seed', 'vertical': {'id': '57799', 'display_name': 'Cybersecurity', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5554', 'stage_name': 'Seed', 'slug': 'cybersecurity-seed', 'vertical': {'id': '57799', 'display_name': 'Cybersecurity', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6172', 'stage_name': 'Pre-Seed', 'slug': 'developer-tools-pre-seed', 'vertical': {'id': '15', 'display_name': 'Developer Tools', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '15', 'stage_name': 
'Seed', 'slug': 'developer-tools-seed', 'vertical': {'id': '15', 'display_name': 'Developer Tools', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6205', 'stage_name': 'Pre-Seed', 'slug': 'digital-health-pre-seed', 'vertical': {'id': '57801', 'display_name': 'Digital Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5644', 'stage_name': 'Seed', 'slug': 'digital-health-seed', 'vertical': {'id': '57801', 'display_name': 'Digital Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6207', 'stage_name': 'Pre-Seed', 'slug': 'direct-to-consumer-dtc-pre-seed', 'vertical': {'id': '57803', 'display_name': 'Direct-to-Consumer (DTC)', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5734', 'stage_name': 'Seed', 'slug': 'direct-to-consumer-dtc-seed', 'vertical': {'id': '57803', 'display_name': 'Direct-to-Consumer (DTC)', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '171', 'stage_name': 'Other Lists', 'slug': 'diverse', 'vertical': {'id': '24242', 'display_name': 'Diverse Investors', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6148', 'stage_name': 'Pre-Seed', 'slug': 'enterprise-pre-seed', 'vertical': {'id': '20', 'display_name': 'Enterprise', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '20', 'stage_name': 'Seed', 'slug': 'enterprise-seed', 'vertical': {'id': '20', 'display_name': 'Enterprise', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '172', 'stage_name': 'Other Lists', 'slug': 'female', 'vertical': {'id': '24241', 'display_name': 'Female Investors', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6162', 'stage_name': 'Pre-Seed', 'slug': 'saas-pre-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '47', 'stage_name': 'Seed', 'slug': 'saas-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '169', 'stage_name': 
'Other Lists', 'slug': 'who-invested-in-diverse-founders', 'vertical': {'id': '24244', 'display_name': 'Investors who invested in diverse founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '170', 'stage_name': 'Other Lists', 'slug': 'who-invested-in-female-founders', 'vertical': {'id': '24243', 'display_name': 'Investors who invested in female founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '187', 'stage_name': 'Other Lists', 'slug': 'who-were-founders', 'vertical': {'id': '24387', 'display_name': 'Investors who were founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}] NaN
1 InvestorProfileEdge InvestorProfile 13187 29548 Aamir Virani Aamir Virani aamir-virani False False Person [https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBeTJHQVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--a7cd75f799cb3eb96a06cbd6b67d287971185953/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/0] angel 1 100000 25000 False NaN NaN NaN NaN [{'id': '7500', 'display_name': 'California', 'location_investor_list': None, '__typename': 'Tag'}, {'id': '7502', 'display_name': 'Texas', 'location_investor_list': None, '__typename': 'Tag'}, {'id': '7498', 'display_name': 'United States', 'location_investor_list': None, '__typename': 'Tag'}] [{'id': '6141', 'stage_name': 'Pre-Seed', 'slug': 'ai-pre-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '3', 'stage_name': 'Seed', 'slug': 'ai-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6170', 'stage_name': 'Pre-Seed', 'slug': 'consumer-internet-pre-seed', 'vertical': {'id': '12', 'display_name': 'Consumer Internet', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '12', 'stage_name': 'Seed', 'slug': 'consumer-internet-seed', 'vertical': {'id': '12', 'display_name': 'Consumer Internet', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6152', 'stage_name': 'Pre-Seed', 'slug': 'hardware-pre-seed', 'vertical': {'id': '28', 'display_name': 'Hardware', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '28', 'stage_name': 'Seed', 'slug': 'hardware-seed', 'vertical': {'id': '28', 'display_name': 'Hardware', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6156', 'stage_name': 'Pre-Seed', 'slug': 'iot-pre-seed', 'vertical': {'id': '34', 'display_name': 'IoT', 
'__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '33', 'stage_name': 'Seed', 'slug': 'iot-seed', 'vertical': {'id': '34', 'display_name': 'IoT', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6179', 'stage_name': 'Pre-Seed', 'slug': 'local-services-pre-seed', 'vertical': {'id': '35', 'display_name': 'Local Services', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '34', 'stage_name': 'Seed', 'slug': 'local-services-seed', 'vertical': {'id': '35', 'display_name': 'Local Services', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6185', 'stage_name': 'Pre-Seed', 'slug': 'parenting-families-pre-seed', 'vertical': {'id': '43', 'display_name': 'Parenting/Families', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '42', 'stage_name': 'Seed', 'slug': 'parenting-families-seed', 'vertical': {'id': '43', 'display_name': 'Parenting/Families', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6199', 'stage_name': 'Pre-Seed', 'slug': 'real-estate-proptech-pre-seed', 'vertical': {'id': '45', 'display_name': 'Real Estate/PropTech', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '44', 'stage_name': 'Seed', 'slug': 'real-estate-proptech-seed', 'vertical': {'id': '45', 'display_name': 'Real Estate/PropTech', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6161', 'stage_name': 'Pre-Seed', 'slug': 'robotics-pre-seed', 'vertical': {'id': '47', 'display_name': 'Robotics', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '46', 'stage_name': 'Seed', 'slug': 'robotics-seed', 'vertical': {'id': '47', 'display_name': 'Robotics', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6162', 'stage_name': 'Pre-Seed', 'slug': 'saas-pre-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '47', 'stage_name': 'Seed', 'slug': 'saas-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 
'InvestorList'}, {'id': '188', 'stage_name': 'Other Lists', 'slug': 'san-francisco-bay-area', 'vertical': {'id': '22992', 'display_name': 'San Francisco Bay Area', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6187', 'stage_name': 'Pre-Seed', 'slug': 'smb-software-pre-seed', 'vertical': {'id': '51', 'display_name': 'SMB Software', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '50', 'stage_name': 'Seed', 'slug': 'smb-software-seed', 'vertical': {'id': '51', 'display_name': 'SMB Software', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '169', 'stage_name': 'Other Lists', 'slug': 'who-invested-in-diverse-founders', 'vertical': {'id': '24244', 'display_name': 'Investors who invested in diverse founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '170', 'stage_name': 'Other Lists', 'slug': 'who-invested-in-female-founders', 'vertical': {'id': '24243', 'display_name': 'Investors who invested in female founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '187', 'stage_name': 'Other Lists', 'slug': 'who-were-founders', 'vertical': {'id': '24387', 'display_name': 'Investors who were founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}] NaN

关于python - 如何使用 beautifulsoup 从页面中抓取数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/69866449/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com