
python - Scraping AngelList profile descriptions with Python BeautifulSoup


Newbie scraper here!

I am currently stuck with a tedious and boring task: I have to copy/paste certain content from AngelList and save it in Excel. I have used scrapers before to automate boring tasks like this, but this one is quite hard and I have not found a way to automate it. Here is the website link:

https://angel.co/people/all

Please apply the filters Location -> USA and Market -> Online Dating. There will be around 550 results (note that the URL does not change when the filters are applied).

With the filters applied, I have successfully scraped the URLs of all the profiles, so I have an Excel file containing the 550 profile URLs.

The next step is to go to each profile and scrape certain information. These are the fields I am currently looking for:

  1. Name
  2. Description
  3. Investments
  4. Founder
  5. Advisor
  6. Locations
  7. Markets
  8. What I am looking for

I have tried many solutions so far, but none of them has worked. Import.io, Data Miner, and Data Scraper were not much help.

Can anyone suggest VBA code, Python code, or any tool that would help me automate this scraping task?

Complete code for the solution:

Here is the final code, with comments. If anyone still has problems, please comment below and I will try to help.

from bs4 import BeautifulSoup
import urllib2
import json
import csv

def fetch_page(url):
    opener = urllib2.build_opener()
    # changing the user agent as the default one is banned
    opener.addheaders = [('User-Agent', 'Mozilla/43.0.1')]
    return opener.open(url).read()


# Create the output CSV file.
f = open('angle_profiles.csv', 'w')
# Row headers
f.write("URL" + "," + "Name" + "," + "Founder" + "," + "Advisor" + "," + "Employee" + "," + "Board Member" + ","
        + "Customer" + "," + "Locations" + "," + "Markets" + "," + "Investments" + "," + "What_iam_looking_for" + "\n")

# The URLs to iterate over were saved in 'profiles_links.csv'; read them one by one...
index = 1
with open("profiles_links.csv") as f2:

    for row in map(str.strip, f2):
        url = row
        print "@ Index: ", index
        index += 1

        # Skip URLs that fail to load (e.g. with a 404 error) and continue with the rest.
        try:
            html = fetch_page(url)
        except Exception:
            print "Error (e.g. 404) @: ", url
            continue

        bs = BeautifulSoup(html, "html.parser")

        # Extract info from the page with these tags...
        name = bs.select(".profile-text h1")[0].get_text().strip()

        #description = bs.select('div[data-field="bio"]')[0]['data-value']

        founder = map(lambda link: link.get_text().strip(), bs.select('.role_founder a'))

        advisor = map(lambda link: link.get_text().strip(), bs.select('.role_advisor a'))

        employee = map(lambda link: link.get_text().strip(), bs.select('.role_employee a'))

        board_member = map(lambda link: link.get_text().strip(), bs.select('.role_board_member a'))

        customer = map(lambda link: link.get_text().strip(), bs.select('.role_customer a'))

        class_wrapper = bs.body.find('div', attrs={'data-field': 'tags_interested_locations'})
        count = 1
        locations = {}

        if class_wrapper is not None:
            for span in class_wrapper.find_all('span'):
                locations[count] = span.text
                count += 1

        class_wrapper = bs.body.find('div', attrs={'data-field': 'tags_interested_markets'})
        count = 1
        markets = {}
        if class_wrapper is not None:
            for span in class_wrapper.find_all('span'):
                markets[count] = span.text
                count += 1

        what_iam_looking_for = ' '.join(map(lambda p: p.get_text().strip(), bs.select('div.criteria p')))

        user_id = bs.select('.profiles-show .profiles-show')[0]['data-user_id']

        # Investments are loaded via a separate request; the response is in JSON format.
        json_data = fetch_page("https://angel.co/startup_roles/investments?user_id=%s" % user_id)

        investment_records = json.loads(json_data)

        investments = map(lambda x: x['company']['company_name'], investment_records)

        # Make sure every variable is a string.
        name2 = str(name); founder2 = str(founder); advisor2 = str(advisor); employee2 = str(employee)
        board_member2 = str(board_member); customer2 = str(customer); locations2 = str(locations); markets2 = str(markets)
        what_iam_looking_for2 = str(what_iam_looking_for); investments2 = str(investments)

        # Replace any comma with ' -' so the CSV doesn't mistake it for a column separator...
        name = name2.replace(",", " -")
        founder = founder2.replace(",", " -")
        advisor = advisor2.replace(",", " -")
        employee = employee2.replace(",", " -")
        board_member = board_member2.replace(",", " -")
        customer = customer2.replace(",", " -")
        locations = locations2.replace(",", " -")
        markets = markets2.replace(",", " -")
        what_iam_looking_for = what_iam_looking_for2.replace(",", " -")
        investments = investments2.replace(",", " -")

        # Strip the Python 2 unicode prefix u' left over from stringifying the lists.
        name = name.replace("u'", "")
        founder = founder.replace("u'", "")
        advisor = advisor.replace("u'", "")
        employee = employee.replace("u'", "")
        board_member = board_member.replace("u'", "")
        customer = customer.replace("u'", "")
        locations = locations.replace("u'", "")
        markets = markets.replace("u'", "")
        what_iam_looking_for = what_iam_looking_for.replace("u'", "")
        investments = investments.replace("u'", "")

        # Write the information to the file... Note: \n moves to the next row.
        f.write(url + "," + name + "," + founder + "," + advisor + "," + employee + "," + board_member + ","
                + customer + "," + locations + "," + markets + "," + investments + "," + what_iam_looking_for + "\n")
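Side note: the manual comma replacement above is a workaround. Python's built-in csv module quotes fields automatically, so commas inside values cannot be mistaken for column separators. A minimal sketch of what the header and row writing could look like instead, in the same Python 2 style and using the stringified variables (name2, founder2, ...) built in the loop:

import csv

out = open('angle_profiles.csv', 'wb')   # 'wb' is what the csv module expects on Python 2
writer = csv.writer(out)
writer.writerow(["URL", "Name", "Founder", "Advisor", "Employee", "Board Member",
                 "Customer", "Locations", "Markets", "Investments", "What_iam_looking_for"])
# ... then, inside the scraping loop, one row per profile; quoting is handled for you:
writer.writerow([url, name2, founder2, advisor2, employee2, board_member2,
                 customer2, locations2, markets2, investments2, what_iam_looking_for2])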

Feel free to test the above code with the following links:

https://angel.co/idg-ventures?utm_source=people
https://angel.co/douglas-feirstein?utm_source=people
https://angel.co/andrew-heckler?utm_source=people
https://angel.co/mvklein?utm_source=people
https://angel.co/rajs1?utm_source=people
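To run it, save these URLs in profiles_links.csv (one URL per line, next to the script) and run the script with Python 2; the results are written to angle_profiles.csv.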

Happy coding :)

Best Answer

For my recipe you will need to install BeautifulSoup using pip or easy_install.
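For example, the package is published on PyPI as beautifulsoup4:

pip install beautifulsoup4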

from bs4 import BeautifulSoup
import urllib2
import json

def fetch_page(url):
    opener = urllib2.build_opener()
    # changing the user agent as the default one is banned
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    return opener.open(url).read()


html = fetch_page("https://angel.co/davidtisch")

# or load from local file
#html = open('page.html', 'r').read()

bs = BeautifulSoup(html, "html.parser")
name = bs.select(".profile-text h1")[0].get_text().strip()

description = bs.select('div[data-field="bio"]')[0]['data-value']

founder = map(lambda link: link.get_text().strip(), bs.select('.role_founder a'))

advisor = map(lambda link: link.get_text().strip(), bs.select('.role_advisor a'))

locations = map(lambda link: link.get_text().strip(), bs.select('div[data-field="tags_interested_locations"] a'))

markets = map(lambda link: link.get_text().strip(), bs.select('div[data-field="tags_interested_markets"] a'))

what_iam_looking_for = ' '.join(map(lambda p: p.get_text().strip(), bs.select('div.criteria p')))

user_id = bs.select('.profiles-show .profiles-show')[0]['data-user_id']

# investments are loaded using separate request and response is in JSON format
json_data = fetch_page("https://angel.co/startup_roles/investments?user_id=%s" % user_id)

investment_records = json.loads(json_data)

investments = map(lambda x: x['company']['company_name'], investment_records)
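A side note: this recipe is written for Python 2 (urllib2, print statements, list-returning map). On Python 3, urllib2's functionality lives in urllib.request, and the map(...) calls need a list(...) wrapper, since map returns a lazy iterator there. A minimal Python 3 sketch of the fetch_page helper, with everything else unchanged:

import urllib.request

def fetch_page(url):
    # same idea: swap out the default user agent, which is banned
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urllib.request.urlopen(req).read()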

Regarding python - Scraping AngelList profile descriptions with Python BeautifulSoup, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/41076801/
