gpt4 book ai didi

python - 解析向下滚动的整个网页的html代码

转载 作者:太空宇宙 更新时间:2023-11-04 10:23:14 24 4
gpt4 key购买 nike

from bs4 import BeautifulSoup
import urllib,sys
# Python 2 idiom: reload sys so that setdefaultencoding can be called, then
# make UTF-8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")
# Download the page body with one plain HTTP request (no browser involved).
r = urllib.urlopen('https://twitter.com/ndtv').read()
# Parse the downloaded markup; no parser is named here, so BeautifulSoup
# chooses one itself.
soup = BeautifulSoup(r)

这段代码拿不到我想要的、滚动到页面底部后才会加载出来的完整网页,只能得到其中的一部分。

编辑:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,sys,requests
# Python 2 idiom: reload sys so that setdefaultencoding can be called, then
# make UTF-8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")

class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition that passes once a locator matches more than ``count`` elements.

    Instances are callables meant to be passed to ``WebDriverWait.until``,
    which invokes them with the driver.
    """

    def __init__(self, locator, count):
        """Remember what to look for and how many matches must be exceeded.

        Args:
            locator: ``(by, value)`` pair, e.g. ``(By.CSS_SELECTOR, "li")``.
            count: Number of matching elements that has to be exceeded.
        """
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        """Return True if ``driver`` finds more than ``self.count`` matches.

        Args:
            driver: Object exposing ``find_elements(by, value)``.

        Returns:
            True when the number of matches is strictly greater than
            ``self.count``; False otherwise, or when the lookup hit a stale
            element reference.
        """
        try:
            # Use the public driver API instead of the private
            # ``EC._find_elements`` helper, which is not part of Selenium's
            # documented interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    """Open ``url`` in Firefox, scroll until no new tweets load, return the HTML.

    Args:
        url: Address of the page to load.

    Returns:
        The driver's ``page_source`` after the last scroll.

    Raises:
        TimeoutException: If no tweet item becomes visible during the
            initial 10-second wait.
    """
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")
    driver = webdriver.Firefox()
    try:
        driver.maximize_window()
        driver.get(url)
        # initial wait for the tweets to load
        wait = WebDriverWait(driver, 10)
        wait.until(EC.visibility_of_element_located(tweet_locator))
        # scroll down to the last tweet until there is no more tweets loaded
        while True:
            tweets = driver.find_elements(*tweet_locator)
            if not tweets:
                # Nothing left to scroll to; avoid IndexError on tweets[-1].
                break
            number_of_tweets = len(tweets)
            print(number_of_tweets)
            driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    tweet_locator, number_of_tweets))
            except TimeoutException:
                # No additional items appeared within the wait: stop scrolling.
                break
        return driver.page_source
    finally:
        # Always end the browser session, even when a step above raised.
        driver.quit()


url = 'https://twitter.com/thecoolstacks'
# using selenium browser
html_source = return_html_code(url)
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup_selenium = BeautifulSoup(html_source, "html.parser")
print(soup_selenium)
text_tweet = []
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type': 'tweet'})
for tweet in alltweets_selenium:
    # Text of tweet
    html_tweet = tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    # Skip items that carry no text paragraph instead of failing on [0].
    if html_tweet:
        text_tweet.append(''.join(html_tweet[0].find_all(text=True)))
# NOTE(review): placed after the loop so the collected list is printed once;
# the original indentation was lost -- confirm.
print(text_tweet)

预期输出:

import requests
from bs4 import BeautifulSoup

url = 'https://twitter.com/thecoolstacks'
# Plain HTTP fetch: returns only the markup the server sends for the first
# request, without anything loaded later by scrolling.
req = requests.get(url)
soup = BeautifulSoup(req.content, "html.parser")
alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
# Show the first tweet item as the reference for the expected structure.
print(alltweets[0])

最佳答案

我仍然建议坚持使用 Twitter API。

或者,您可以通过以下方式解决 selenium 的问题:

实现:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition that passes once a locator matches more than ``count`` elements.

    Instances are callables meant to be passed to ``WebDriverWait.until``,
    which invokes them with the driver.
    """

    def __init__(self, locator, count):
        """Remember what to look for and how many matches must be exceeded.

        Args:
            locator: ``(by, value)`` pair, e.g. ``(By.CSS_SELECTOR, "li")``.
            count: Number of matching elements that has to be exceeded.
        """
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        """Return True if ``driver`` finds more than ``self.count`` matches.

        Args:
            driver: Object exposing ``find_elements(by, value)``.

        Returns:
            True when the number of matches is strictly greater than
            ``self.count``; False otherwise, or when the lookup hit a stale
            element reference.
        """
        try:
            # Use the public driver API instead of the private
            # ``EC._find_elements`` helper, which is not part of Selenium's
            # documented interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False


url = "https://twitter.com/ndtv"
tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)

# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located(tweet_locator))

# scroll down to the last tweet until there is no more tweets loaded
while True:
    tweets = driver.find_elements(*tweet_locator)
    if not tweets:
        # Nothing left to scroll to; avoid IndexError on tweets[-1].
        break
    number_of_tweets = len(tweets)

    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])

    try:
        wait.until(wait_for_more_than_n_elements_to_be_present(
            tweet_locator, number_of_tweets))
    except TimeoutException:
        # No additional items appeared within the wait: stop scrolling.
        break
# The driver is intentionally left open here: the parsing step that follows
# still reads driver.page_source.

这会一直向下滚动,直到该 channel 中现有的推文全部加载完毕为止。


这是 HTML 解析片段,提取推文:

# BeautifulSoup is used below but is not among the Selenium imports above.
from bs4 import BeautifulSoup

# Grab the fully scrolled markup, then end the browser session.
page_source = driver.page_source
driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(page_source, "html.parser")
for tweet in soup.select("div.tweet div.content"):
    # Skip content blocks without a <p> instead of failing on None.text.
    if tweet.p is not None:
        print(tweet.p.text)

它打印:

Father's Day Facebook post by arrested cop Suhas Gokhale's son got nearly 10,000 likes http://goo.gl/aPqlxf  pic.twitter.com/JUqmdWNQ3c
#HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp …
Why these Kashmiri boys may miss their IIT dream http://goo.gl/9LVKfK  pic.twitter.com/gohX21Gibi
...

关于python - 解析向下滚动的整个网页的html代码,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30982176/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com