gpt4 book ai didi

python - 在 Python 中对 URL 网页进行截图的更好方法

转载 作者:行者123 更新时间:2023-12-03 21:22:02 25 4
gpt4 key购买 nike

问题描述

目前正在从事一个项目,该项目需要我浏览一个 url 并截取网页的屏幕截图。

在查看了各种资源后,我找到了 3 种方法。我将提到我目前使用的所有 3 种方法。

方法 - 1:PhantomJS

from selenium import webdriver
import time
import sys

# Time how long PhantomJS takes to load a page and save a screenshot of it.
# The target host is taken from the first command-line argument.
print('Without Headless')
_start = time.time()
br = webdriver.PhantomJS()
try:
    br.get('http://' + sys.argv[1])
    br.save_screenshot('screenshot-phantom.png')
finally:
    # quit has to be *called*: the bare attribute reference `br.quit` does
    # nothing and leaves the PhantomJS process running. The finally clause
    # also shuts the driver down when loading or saving fails.
    br.quit()
_end = time.time()
print('Total time for non-headless {}'.format(_end - _start))

方法 - 2:无头 (headless) 浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# This snippet uses sys.argv and time.time(), so it needs both modules in
# scope to run on its own.
import sys
import time

# Time how long headless Chrome takes to load a page and save a screenshot.
# The target host is taken from the first command-line argument.
print('Headless')
_start = time.time()
options = Options()
options.add_argument("--headless")  # Runs Chrome in headless mode.
options.add_argument('--no-sandbox')  # Bypass OS security model
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
# NOTE(review): newer Selenium releases deprecate `chrome_options` and
# `executable_path` in favour of `options` / a Service object - confirm
# against the installed Selenium version.
driver = webdriver.Chrome(chrome_options=options, executable_path='/usr/bin/chromedriver')
try:
    driver.get('http://' + sys.argv[1])
    driver.save_screenshot('screenshot-headless.png')
finally:
    # Always shut the browser down, even if loading or saving fails.
    driver.quit()
_end = time.time()
print('Total time for headless {}'.format(_end - _start))

方法 - 3 :PyQT
import argparse
import sys
import logging
import sys
import time
import os

import urlparse
from selenium import webdriver
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Screenshot(QWebView):
    """QtWebKit view that loads a URL and renders the whole page to an image.

    Each instance creates its own ``QApplication`` and tracks page-load
    completion through the view's ``loadFinished`` signal.
    """

    def __init__(self):
        # The QApplication is created before the QWebView initialiser runs,
        # because Qt widgets need an application object to exist first.
        self.app = QApplication(sys.argv)
        QWebView.__init__(self)
        self._loaded = False
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        """Load *url*, render the full page and save it as *output_file*.

        The image is written into the ``data`` directory located next to
        this source file. ``http://`` is prepended when *url* carries no
        http/https scheme. Any exception raised while loading, rendering or
        saving is logged and not propagated.
        """
        _logger.info('Received url {}'.format(url))
        _start = time.time()
        try:
            # Only add a scheme when the caller did not supply one. The
            # previous slice comparisons (url[0:3] == 'http') could never
            # match, so every URL got a second 'http://' prefix.
            if not url.startswith(('http://', 'https://')):
                url = 'http://' + url
            self.url = url
            self.load(QUrl(url))
            self.wait_load(url)
            # set to webpage size
            frame = self.page().mainFrame()
            self.page().setViewportSize(frame.contentsSize())
            # render image
            image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
            painter = QPainter(image)
            frame.render(painter)
            painter.end()
            _logger.info('Saving screenshot {} for {}'.format(output_file, url))
            target = os.path.join(
                os.path.dirname(os.path.realpath(__file__)), 'data', output_file)
            # QImage.save reports failure through its boolean return value
            # instead of raising, so check it explicitly.
            if not image.save(target):
                _logger.error('Error in capturing screenshot {} - {}'.format(
                    url, 'could not write {}'.format(target)))
        except Exception as e:
            _logger.error('Error in capturing screenshot {} - {}'.format(url, e))
        _end = time.time()
        _logger.info('Time took for processing url {} - {}'.format(url, _end - _start))

    def wait_load(self, url, delay=1, retry_count=60):
        """Pump Qt events until the page has loaded or the retries run out.

        Sleeps *delay* seconds between polls and gives up after
        *retry_count* polls. The loaded flag is reset before returning so
        the instance can be reused for another URL.
        """
        # process app events until page loaded
        while not self._loaded and retry_count:
            _logger.info('wait_load for url {} retry_count {}'.format(url, retry_count))
            self.app.processEvents()
            time.sleep(delay)
            retry_count -= 1
        # Report expiry only when the page really did not finish loading.
        if not self._loaded:
            _logger.info('wait_load for url {} expired'.format(url))
        self._loaded = False

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: mark the current page as loaded.

        *result* is ignored, so a load reported as finished always ends the
        wait regardless of the value passed.
        """
        self._loaded = True

面临的问题:

这3种方法在使用时,都因为一个或其他错误而卡住。这里问了一个这样的问题 Error Question on Stackoverflow .
那么,在这 3 种用 Python 对网页进行截图的方法中,哪一种更高效,并且适用于大规模部署?

最佳答案

取自 https://gist.github.com/fabtho/13e4a2e7cfbfde671b8fa81bbe9359fb并用 Python 3 重写
这种方法在技术上可行,但效果看起来不太好,因为许多网站的"接受 cookie"弹出窗口会出现在每一张屏幕截图中。因此,根据您要截图的网站,您可能希望在开始截图之前先使用 selenium 移除这些弹出窗口。

from io import BytesIO

from PIL import Image
from selenium import webdriver

# Take a full-page screenshot by scrolling the page one viewport at a time,
# grabbing a screenshot at every step and stitching the slices vertically.

# Set to 0 to silence progress output and skip the per-slice debug files.
verbose = 1

browser = webdriver.Chrome(executable_path='C:/yourpath/chromedriver.exe')
try:
    browser.get('http://stackoverflow.com/questions/37906704/taking-a-whole-page-screenshot-with-selenium-marionette-in-python')

    # from here http://stackoverflow.com/questions/1145850/how-to-get-height-of-entire-document-with-javascript
    js = 'return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);'

    scrollheight = browser.execute_script(js)

    if verbose > 0:
        print(scrollheight)

    slices = []
    offset = 0
    while offset < scrollheight:
        if verbose > 0:
            print(offset)

        browser.execute_script("window.scrollTo(0, %s);" % offset)
        img = Image.open(BytesIO(browser.get_screenshot_as_png()))
        # Advance by the height of the slice that was just captured.
        offset += img.size[1]
        slices.append(img)

        if verbose > 0:
            browser.get_screenshot_as_file('%s/screen_%s.png' % ('/tmp', offset))
            print(scrollheight)

    # The canvas is as wide as the first slice and as tall as the sum of all
    # slice heights accumulated in `offset`.
    # NOTE(review): that sum can exceed `scrollheight`, so the last slice may
    # repeat content already captured - confirm whether this is acceptable.
    screenshot = Image.new('RGB', (slices[0].size[0], offset))
    offset = 0
    for img in slices:
        screenshot.paste(img, (0, offset))
        offset += img.size[1]

    screenshot.save('screenshot.png')
finally:
    # Always shut the browser down, even if capturing or stitching fails.
    browser.quit()

关于 python - 在 Python 中对 URL 网页进行截图的更好方法,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/51000899/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com