gpt4 book ai didi

java - gwt 应用程序的爬虫花费太多时间

转载 作者:塔克拉玛干 更新时间:2023-11-03 02:45:40 25 4
gpt4 key购买 nike

我有一个 GWT 应用程序,需要针对 SEO 进行优化(让 Google 能够抓取内容)。我尝试过多种方案,但都不能满足我们的需求(返回 HTML 页面耗时太长),尝试过的方案如下:

  1. 我尝试使用 HtmlUnit 作为 headless 浏览器按需抓取页面,获取 HTML 内容大约需要 15 秒(对这段耗时进行分析后发现,80% 的时间花在等待后台 JavaScript 的循环上:"while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks)")
  2. 另一种做法是在 Google 请求之前预先抓取页面,然后在 Google 请求时提供已保存的快照(但这种方案很不方便,因为内容变化非常频繁,而且 Google 可能会将其视为 "CLOAKING"(伪装))

有什么建议吗?

用于抓取的代码:

/**
 * Servlet filter that answers requests carrying an {@code _escaped_fragment_}
 * query parameter with a static HTML snapshot of the corresponding
 * {@code #!} page, rendered by a headless HtmlUnit browser. All other
 * requests are passed down the filter chain untouched.
 *
 * <p>The filter keeps no per-request state in fields: every snapshot is
 * rendered with its own {@link WebClient}, which is closed as soon as the
 * snapshot has been produced.
 */
public class CrawlFilter implements Filter {

    /**
     * Ajax controller that reports every request as one to be processed
     * synchronously, regardless of the page, the request or the
     * {@code async} flag.
     *
     * <p>Declared {@code static} so that this serializable controller does not
     * carry a hidden reference to the enclosing filter instance.
     */
    private static class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
            return true;
        }
    }

    private static final Logger log = Logger.getLogger(CrawlFilter.class.getName());

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This is the form found when the token is the first (or only)
     * query parameter, i.e. there are no query parameters in front of it.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This is the form found when the token follows already existing
     * query parameters and is therefore preceded by {@code &}.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
    private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

    /** Timeout, in milliseconds, handed to the JavaScript engine's event-loop pump. */
    private static final long _pumpEventLoopTimeoutMillis = 30000;
    /** Timeout, in milliseconds, of each wait for background JavaScript. */
    private static final long _jsTimeoutMillis = 1000;
    /** Pause, in milliseconds, between two consecutive background-JavaScript checks. */
    private static final long _pageWaitMillis = 200;
    /** Maximum number of additional background-JavaScript checks after the first one. */
    private static final int _maxLoopChecks = 2;

    /**
     * Serves a rendered snapshot when the query string contains the escaped
     * fragment token; otherwise delegates to the rest of the chain.
     *
     * @param request     the incoming request; cast to {@code HttpServletRequest}
     * @param response    the outgoing response; cast to {@code HttpServletResponse}
     * @param filterChain the chain to continue with for non-crawler requests
     * @throws IOException      if rendering the page or writing the response fails
     * @throws ServletException if the downstream chain fails
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
            FilterChain filterChain) throws IOException, ServletException {
        final HttpServletRequest httpRequest = (HttpServletRequest) request;
        final String queryString = httpRequest.getQueryString();

        // Not a crawler request: nothing to do here.
        if (queryString == null || !queryString.contains(ESCAPED_FRAGMENT_FORMAT1)) {
            filterChain.doFilter(request, response);
            return;
        }

        final HttpServletResponse httpResponse = (HttpServletResponse) response;
        final String requestURI = httpRequest.getRequestURI();
        final int port = httpRequest.getServerPort();
        final String scheme = httpRequest.getScheme();

        // The page is fetched from this same server through the loopback address.
        final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
        final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
        final WebRequest webRequest = new WebRequest(urlWithHashFragment);

        log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());

        // Render first, so that a rendering failure surfaces before anything
        // has been written to the response.
        final StringBuilder snapshot = renderPage(webRequest);

        httpResponse.setContentType("text/html;charset=UTF-8");
        final PrintWriter out = httpResponse.getWriter();
        out.println(snapshot);
        out.flush();
        out.close();

        log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
    }

    /**
     * Nothing to release: each {@link WebClient} is closed by the request that
     * created it.
     */
    @Override
    public void destroy() {
    }

    @Override
    public void init(FilterConfig config) throws ServletException {
    }

    /**
     * Loads the given request in a fresh headless browser, gives background
     * JavaScript a bounded amount of time to finish, and returns the resulting
     * DOM as XML preceded by a short banner linking to the requested URL.
     *
     * @param webRequest the request for the {@code #!} page to snapshot
     * @return the banner followed by the page serialized with {@code asXml()}
     * @throws IOException if the page cannot be loaded
     */
    private StringBuilder renderPage(WebRequest webRequest) throws IOException {
        // A client per call: the filter instance is shared between request
        // threads, so the client must not live in a field.
        final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
        try {
            webClient.getCache().clear();
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setRedirectEnabled(false);
            webClient.setAjaxController(new SyncAllAjaxController());
            webClient.setCssErrorHandler(new SilentCssErrorHandler());

            final HtmlPage page = webClient.getPage(webRequest);
            webClient.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);

            waitForBackgroundJavaScript(webClient, page);

            if (webClient.getJavaScriptEngine().isScriptRunning()) {
                log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
                webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
            }

            final String staticSnapshotHtml = page.asXml();

            // NOTE(review): the link target is the URL that was rendered, which
            // doFilter builds on the loopback host - confirm this is intended.
            final StringBuilder stringBuilder = new StringBuilder();
            stringBuilder.append("<hr />\n");
            stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
            // The fragment originates from the URL-decoded query string, so it
            // must be escaped before being placed inside an attribute value.
            stringBuilder.append(escapeHtmlAttribute(String.valueOf(webRequest.getUrl())));
            stringBuilder.append("\">this link</a> for the interactive application.<br></h3></center>");
            stringBuilder.append("<hr />");
            stringBuilder.append(staticSnapshotHtml);
            return stringBuilder;
        } finally {
            // Always release the browser windows, even when loading failed.
            webClient.closeAllWindows();
        }
    }

    /**
     * Waits for background JavaScript of the given client, re-checking at most
     * {@code _maxLoopChecks} more times with a short pause between checks.
     * Stops early when no job is reported as pending or when the current
     * thread is interrupted.
     */
    private void waitForBackgroundJavaScript(WebClient webClient, HtmlPage page) {
        int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
        int loopCount = 0;

        while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks) {
            ++loopCount;
            waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);

            if (waitForBackgroundJavaScript == 0) {
                log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
                break;
            }

            synchronized (page) {
                log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
                try {
                    page.wait(_pageWaitMillis);
                } catch (InterruptedException e) {
                    log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
                    // Keep the interrupt visible to the caller and stop waiting.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
    }

    /**
     * Escapes the characters that are significant inside a quoted HTML
     * attribute value ({@code & < > " '}).
     */
    private static String escapeHtmlAttribute(String value) {
        final StringBuilder escaped = new StringBuilder(value.length() + 16);
        for (int i = 0; i < value.length(); i++) {
            final char c = value.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Maps from the query string that contains _escaped_fragment_ to one that
     * doesn't, but is instead followed by a hash fragment. It also unescapes any
     * characters that were escaped by the crawler. If the query string does not
     * contain _escaped_fragment_, it is not modified.
     *
     * @param queryString the raw query string of the crawler request
     * @return A modified query string followed by a hash fragment if applicable.
     *         The non-modified query string otherwise.
     * @throws UnsupportedEncodingException if UTF-8 decoding is not supported
     */
    private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        // Prefer the "&"-prefixed form so the separator is stripped along with the token.
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_LENGTH2;

        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_LENGTH1;
        }

        if (index == -1) {
            return queryString;
        }

        final StringBuilder queryStringSb = new StringBuilder();
        if (index > 0) {
            // Parameters in front of the token stay in the query string.
            queryStringSb.append("?");
            queryStringSb.append(queryString.substring(0, index));
        }
        queryStringSb.append("#!");
        queryStringSb.append(URLDecoder.decode(queryString.substring(index + length), "UTF-8"));
        return queryStringSb.toString();
    }
}

最佳答案

我建议让 HtmlUnit 离线生成静态 html。您可以控制更新频率。

然后,让您的 servlet 过滤器拦截爬虫请求并返回已生成的静态 html。

关于java - gwt 应用程序的爬虫花费太多时间,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/19715598/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com