
java - Many memory errors when running my web crawler for several days


I am developing a web crawler application. When I run the program, I receive the following error messages:

(screenshot of the memory error messages attached to the original post)

I run into these errors after the program has been running for more than 3 hours. I tried to allocate more memory by changing the eclipse.ini setting to 2048 MB of RAM, as described in this topic, but I still get the same errors after 3 hours or less. I need to run the program uninterrupted for 2-3 days or more in order to analyze the results.

Can you tell me what I am missing here that causes these errors?

These are my classes:

seeds.txt

http://www.stanford.edu
http://www.archive.org

WebCrawler.java

 package pkg.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;


public class WebCrawler {

public static Queue <LinkNodeLight> queue = new PriorityBlockingQueue <> (); // priority queue
public static final int n_threads = 5; // amount of threads
private static Set<String> processed = new LinkedHashSet <> (); // set of processed urls
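/* note: this set and the queue above grow for the lifetime of the crawl, so heap usage rises steadily during a multi-day run */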
private PrintWriter out; // output file
private PrintWriter err; // error file
private static Integer cntIntra = new Integer (0); // counters for intra- links in the queue
private static Integer cntInter = new Integer (0); // counters for inter- links in the queue
private static Integer dub = new Integer (0); // amount of skipped urls

public static void main(String[] args) throws Exception {
System.out.println("Running web crawler: " + new Date());

WebCrawler webCrawler = new WebCrawler();
webCrawler.createFiles();
try (Scanner in = new Scanner(new File ("seeds.txt"))) {
while (in.hasNext()) {
webCrawler.enque(new LinkNode (in.nextLine().trim()));
}
} catch (IOException e) {
e.printStackTrace();
return;
}
webCrawler.processQueue();
webCrawler.out.close();
webCrawler.err.close();
}

public void processQueue(){
/* run in threads */
Runnable r = new Runnable() {
@Override
public void run() {
/* queue may be empty but process is not finished, that's why we need to check if any links are being processed */
while (true) {
LinkNode link = deque();
if (link == null)
continue;
link.setStartTime(new DateTime());
boolean process = processLink(link);
link.setEndTime(new DateTime());
if (!process)
continue;
/* print the data to the csv file */
if (link.getStatus() != null && link.getStatus().equals(LinkNodeStatus.OK)) {
synchronized(out) {
out.println(getOutputLine(link));
out.flush();
}
} else {
synchronized(err) {
err.println(getOutputLine(link));
err.flush();
}
}
}
}
};
/* run n_threads threads which perform dequeue and process */
LinkedList <Thread> threads = new LinkedList <> ();
for (int i = 0; i < n_threads; i++) {
threads.add(new Thread(r));
threads.getLast().start();
}
for (Thread thread : threads) {
try {
thread.join();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}


/* returns true if link was actually processed */
private boolean processLink(LinkNode inputLink) {
String url = getUrlGeneralForm(inputLink);
boolean process = true;
synchronized (processed) {
if (processed.contains(url)) {
process = false;
synchronized (dub) {dub++;}
} else
processed.add(url);
}
/* start processing only if the url has not been processed yet and is not currently being processed */
if (process) {
System.out.println("Processing url " + url);
List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
for (LinkNodeLight outputLink : outputLinks) {
String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
/* add the new link to the queue only if it has not been processed yet */
process = true;
synchronized (processed) {
if (processed.contains(getUrlGeneralForumOutput)) {
process = false;
synchronized (dub) {dub++;}
}
}
if (process) {
enque(outputLink);
}
}
return true;
}
return false;
}

void enque(LinkNodeLight link){
link.setEnqueTime(new DateTime());
/* the add method requires implicit priority */
synchronized (queue) {
if (link.interLinks)
synchronized (cntInter) {cntInter++;}
else
synchronized (cntIntra) {cntIntra++;}
//queue.add(link, 100 - (int)(link.getWeight() * 100.f));
queue.add(link);
}
}


/**
* Picks an element from the queue
* @return top element from the queue or null if the queue is empty
*/
LinkNode deque(){
/* link must be checked */
LinkNode link = null;
synchronized (queue) {
link = (LinkNode) queue.poll();
if (link != null) {
link.setDequeTime(new DateTime());
if (link.isInterLinks())
synchronized (cntInter) {cntInter--;}
else
synchronized (cntIntra) {cntIntra--;}
}
}
return link;
}

private void createFiles() {
/* create output file */
try {
out = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLS.csv", false)));
out.println(generateHeaderFile());
} catch (IOException e) {
System.err.println(e);
}
/* create error file */
try {
err = new PrintWriter(new BufferedWriter(new FileWriter("CrawledURLSERROR.csv", false)));
err.println(generateHeaderFile());
} catch (IOException e) {
System.err.println(e);
}
}
/**
* formats the string so it can be valid entry in csv file
* @param s
* @return
*/
private static String format(String s) {
// replace " by ""
String ret = s.replaceAll("\"", "\"\"");
// put string into quotes
return "\"" + ret + "\"";
}
/**
* Creates the line that needs to be written in the outputfile
* @param link
* @return
*/
public static String getOutputLine(LinkNode link){
StringBuilder builder = new StringBuilder();
builder.append(link.getParentLink()!=null ? format(link.getParentLink().getUrl()) : "");
builder.append(",");
builder.append(link.getParentLink()!=null ? link.getParentLink().getIpAdress() : "");
builder.append(",");
builder.append(link.getParentLink()!=null ? link.getParentLink().linkProcessingDuration() : "");
builder.append(",");
builder.append(format(link.getUrl()));
builder.append(",");
builder.append(link.getDomain());
builder.append(",");
builder.append(link.isInterLinks());
builder.append(",");
builder.append(Util.formatDate(link.getEnqueTime()));
builder.append(",");
builder.append(Util.formatDate(link.getDequeTime()));
builder.append(",");
builder.append(link.waitingInQueue());
builder.append(",");
builder.append(queue.size());
/* Inter and intra links in queue */
builder.append(",");
builder.append(cntIntra.toString());
builder.append(",");
builder.append(cntInter.toString());
builder.append(",");
builder.append(dub);
builder.append(",");
builder.append(new Date ());
/* URL size*/
builder.append(",");
builder.append(link.getSize());
/* HTML file
builder.append(",");
builder.append(link.getFileName());*/
/* add HTTP error */
builder.append(",");
if (link.getParseException() != null) {
if (link.getParseException() instanceof HttpStatusException)
builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
if (link.getParseException() instanceof SocketTimeoutException)
builder.append("Time out");
if (link.getParseException() instanceof MalformedURLException)
builder.append("URL is not valid");
if (link.getParseException() instanceof UnsupportedMimeTypeException)
builder.append("Unsupported mime type: " + ((UnsupportedMimeTypeException)link.getParseException()).getMimeType());
}
return builder.toString();

}

/**
* generates the Header for the file
* @param link
* @return
*/
private String generateHeaderFile(){
StringBuilder builder = new StringBuilder();
builder.append("Seed URL");
builder.append(",");
builder.append("Seed IP");
builder.append(",");
builder.append("Process Duration");
builder.append(",");
builder.append("Link URL");
builder.append(",");
builder.append("Link domain");
builder.append(",");
builder.append("Link IP");
builder.append(",");
builder.append("Enque Time");
builder.append(",");
builder.append("Deque Time");
builder.append(",");
builder.append("Waiting in the Queue");
builder.append(",");
builder.append("QueueSize");
builder.append(",");
builder.append("Intra in queue");
builder.append(",");
builder.append("Inter in queue");
builder.append(",");
builder.append("Dublications skipped");
/* time was printed, but no header was */
builder.append(",");
builder.append("Time");
/* URL size*/
builder.append(",");
builder.append("Size bytes");
/* HTTP errors */
builder.append(",");
builder.append("HTTP error");
return builder.toString();

}



String getUrlGeneralForm(LinkNodeLight link){
String url = link.getUrl();
if (url.endsWith("/")){
url = url.substring(0, url.length() - 1);
}
return url;
}


private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
if (inputLink.hasParseException()) {
return outputLinks;
} else {
return URLWeight.weight(inputLink, outputLinks);
}
}
}

HTMLParser.java

package pkg.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;


public class HTMLParser {

private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
private static HashMap <String, Integer> filecounter = new HashMap<> ();


public static List<LinkNodeLight> parse(LinkNode inputLink){
List<LinkNodeLight> outputLinks = new LinkedList<>();
try {
inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
String url = inputLink.getUrl();
if (inputLink.getIpAdress() != null) {
/* String.replace returns a new string, so the result must be assigned back to url */
url = url.replace(URLWeight.getHostName(url), inputLink.getIpAdress());
}
Document parsedResults = Jsoup
.connect(url)
.timeout(READ_TIMEOUT_IN_MILLISSECS)
.get();
inputLink.setSize(parsedResults.html().length());
/* IP address moved here in order to speed up the process */
inputLink.setStatus(LinkNodeStatus.OK);
inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));
if (true) {
/* save the file to the html */
String filename = parsedResults.title();//digestBig.toString(16) + ".html";
if (filename.length() > 24) {
filename = filename.substring(0, 24);
}
filename = filename.replaceAll("[^\\w\\d\\s]", "").trim();
filename = filename.replaceAll("\\s+", " ");

if (!filecounter.containsKey(filename)) {
filecounter.put(filename, 1);
} else {
Integer tmp = filecounter.remove(filename);
filecounter.put(filename, tmp + 1);
}
filename = filename + "-" + (filecounter.get(filename)).toString() + ".html";
filename = Paths.get("downloads", filename).toString();
inputLink.setFileName(filename);
/* use md5 of url as file name */
try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
out.println("<!--" + inputLink.getUrl() + "-->");
out.print(parsedResults.html());
out.flush();
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
String tag;
Elements tagElements;
List<LinkNode> result;


tag = "a[href";
tagElements = parsedResults.select(tag);
result = toLinkNodeObject(inputLink, tagElements, tag);
outputLinks.addAll(result);


tag = "area[href";
tagElements = parsedResults.select(tag);
result = toLinkNodeObject(inputLink, tagElements, tag);
outputLinks.addAll(result);
} catch (IOException e) {
inputLink.setParseException(e);
inputLink.setStatus(LinkNodeStatus.ERROR);
}

return outputLinks;
}


static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
List<LinkNode> links = new LinkedList<>();
for (Element element : tagElements) {

if(isFragmentRef(element)){
continue;
}

String absoluteRef = String.format("abs:%s", tag.contains("[") ? tag.substring(tag.indexOf("[") + 1, tag.length()) : "href");
String url = element.attr(absoluteRef);

if(url!=null && url.trim().length()>0) {
LinkNode link = new LinkNode(url);
link.setTag(element.tagName());
link.setParentLink(parentLink);
links.add(link);
}
}
return links;
}

static boolean isFragmentRef(Element element){
String href = element.attr("href");
return href!=null && (href.trim().startsWith("#") || href.startsWith("mailto:"));
}

}

Util.java

package pkg.crawler;

import java.util.Date;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;


public class Util {

private static DateTimeFormatter formatter;
static {



formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");


}


public static String linkToString(LinkNode inputLink){


return String.format("%s\t%s\t%s\t%s\t%s\t%s",
inputLink.getUrl(),
inputLink.getWeight(),
formatDate(inputLink.getEnqueTime()),
formatDate(inputLink.getDequeTime()),
differenceInMilliSeconds(inputLink.getDequeTime(), inputLink.getEnqueTime()), // deque time goes first so the waiting duration comes out positive
inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl()
);
}

public static String linkToErrorString(LinkNode inputLink){

return String.format("%s\t%s\t%s\t%s\t%s\t%s",
inputLink.getUrl(),
inputLink.getWeight(),
formatDate(inputLink.getEnqueTime()),
formatDate(inputLink.getDequeTime()),
inputLink.getParentLink()==null?"":inputLink.getParentLink().getUrl(),
inputLink.getParseException().getMessage()
);
}


public static String formatDate(DateTime date){
return formatter.print(date);
}

public static long differenceInMilliSeconds(DateTime dequeTime, DateTime enqueTime){
return (dequeTime.getMillis()- enqueTime.getMillis());
}

public static int differenceInSeconds(Date enqueTime, Date dequeTime){
return (int)((dequeTime.getTime()/1000) - (enqueTime.getTime()/1000));
}

public static int differenceInMinutes(Date enqueTime, Date dequeTime){
return (int)((dequeTime.getTime()/60000) - (enqueTime.getTime()/60000));
}

}

URLWeight.java

package pkg.crawler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

public class URLWeight {

public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {

List<LinkNodeLight> interLinks = new LinkedList<>();
List<LinkNodeLight> intraLinks = new LinkedList<>();

for (LinkNodeLight link : links) {
if (isIntraLink(sourceLink, link)) {
intraLinks.add(link);
link.setInterLinks(false);
} else {
interLinks.add(link);
link.setInterLinks(true);
}
}

/* NOTE: the return/weighting logic is missing from the posted code; as a minimal completion,
   return the intra-site links first, followed by the inter-site links */
List<LinkNodeLight> result = new LinkedList<>(intraLinks);
result.addAll(interLinks);
return result;
}

static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link){

String parentDomainName = getHostName(sourceLink.getUrl());

String childDomainName = getHostName(link.getUrl());
return parentDomainName.equalsIgnoreCase(childDomainName);
}

public static String getHostName(String url) {
if(url == null){
// System.out.println("Deneme");
return "";

}

String domainName = new String(url);

int index = domainName.indexOf("://");
if (index != -1) {

domainName = domainName.substring(index + 3);
}
for (int i = 0; i < domainName.length(); i++)
if (domainName.charAt(i) == '?' || domainName.charAt(i) == '/') {
domainName = domainName.substring(0, i);
break;
}

/*if (index != -1) {

domainName = domainName.substring(0, index);
}*/

/* have to keep www in order to do replacements with IP */
//domainName = domainName.replaceFirst("^www.*?\\.", "");

return domainName;
}
public static String getDomainName(String url) {
String [] tmp= getHostName(url).split("\\.");
if (tmp.length == 0)
return "";
return tmp[tmp.length - 1];
}


}

PingTaskManager.java

package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class PingTaskManager {

private static ExecutorService executor = Executors.newFixedThreadPool(100);

public static void ping (LinkNode e) {
executor.submit(new PingTaks(e));
}


}

class PingTaks implements Runnable {
private LinkNode link;
public PingTaks( LinkNode link ) {
this.link = link; // store the link so run() can use it
}

@Override
public void run() {
/* link.ping(); */
}


}

LinkNodeStatus.java

package pkg.crawler;

public enum LinkNodeStatus {
OK,
ERROR

}

LinkNodeLight.java

package pkg.crawler;

import org.joda.time.DateTime;

public class LinkNodeLight implements Comparable<LinkNodeLight> {
protected String url;
protected float weight;
protected DateTime enqueTime;
protected boolean interLinks;

public String getUrl() {
return url;
}

public float getWeight() {
return weight;
}

public void setWeight(float weight) {
this.weight = weight;
}

public DateTime getEnqueTime() {
return enqueTime;
}


public LinkNodeLight(String url) {
this.url = url;
}


public void setEnqueTime(DateTime enqueTime) {
this.enqueTime = enqueTime;
}

@Override
public int compareTo(LinkNodeLight link) {

if (this.weight < link.weight) return 1;
else if (this.weight > link.weight) return -1;
return 0;

}
}

LinkNode.java

package pkg.crawler;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;



import org.joda.time.DateTime;


public class LinkNode extends LinkNodeLight{
public LinkNode(String url) {
super(url);
}

private String tag;
private LinkNode parentLink;
private IOException parseException = null; // initialize parse Exception with null
private float weight;
private DateTime dequeTime;
private DateTime startTime;
private DateTime endTime;
private LinkNodeStatus status;
private String ipAdress;
private int size;
private String filename;
private String domain;

public DateTime getStartTime() {
return startTime;
}

public void setStartTime(DateTime startTime) {
this.startTime = startTime;
}

public DateTime getEndTime() {
return endTime;
}

public void setEndTime(DateTime endTime) {
this.endTime = endTime;
}

public DateTime getDequeTime() {
return dequeTime;
}

public String getTag() {
return tag;
}

public LinkNode getParentLink() {
return parentLink;
}

public Exception getParseException() {
return parseException;
}

public boolean hasParseException(){
return parseException!=null;
}


public void setDequeTime(DateTime dequeTime) {
this.dequeTime = dequeTime;
}

public void setTag(String tag) {
this.tag = tag;
}

public void setParentLink(LinkNode parentLink) {
this.parentLink = parentLink;
}

public void setParseException(IOException parseException) {
this.parseException = parseException;
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}

LinkNode link = (LinkNode) o;

if (url != null ? !url.equals(link.url) : link.url != null) {
return false;
}

return true;
}

@Override
public int hashCode() {
return url != null ? url.hashCode() : 0;
}

public long waitingInQueue(){
return Util.differenceInMilliSeconds( dequeTime,enqueTime );
}

public long linkProcessingDuration(){
return Util.differenceInMilliSeconds( endTime,startTime );
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder("LinkNode{");
sb.append("url='").append(url).append('\'');
sb.append(", score=").append(weight);
sb.append(", enqueTime=").append(enqueTime);
sb.append(", dequeTime=").append(dequeTime);
sb.append(", tag=").append(tag);
if(parentLink!=null) {
sb.append(", parentLink=").append(parentLink.getUrl());
}
sb.append('}');
return sb.toString();
}

public void setStatus(LinkNodeStatus status) {
this.status = status;
}

public LinkNodeStatus getStatus(){
if (status == null) {
status = LinkNodeStatus.ERROR;
}
return status;
}

// check server link is it exist or not
/* this method gives fake errors
public LinkNodeStatus ping () {

boolean reachable = false;
String sanitizeUrl = url.replaceFirst("^https", "http");

try {
HttpURLConnection connection = (HttpURLConnection) new URL(sanitizeUrl).openConnection();
connection.setConnectTimeout(1000);
connection.setRequestMethod("HEAD");
int responseCode = connection.getResponseCode();
System.err.println(url + " " + responseCode);
reachable = (200 <= responseCode && responseCode <= 399);
} catch (IOException exception) {
}
return reachable?LinkNodeStatus.OK: LinkNodeStatus.ERROR;
}*/


public String getIpAdress() {
return ipAdress;
}

public void setIpAdress(String ipAdress) {
this.ipAdress = ipAdress;
}

/* methods for controlling url size */
public void setSize(int size) {
this.size = size;
}

public int getSize() {
return this.size;
}

public void setFileName(String filename) {
this.filename = filename;
}

public String getFileName() {
return this.filename;
}

public String getDomain() {
return domain;
}

public void setDomain(String domain) {
this.domain = domain;
}
}

Accepted answer

I tried to allocate memory by changing eclipse.ini setting to 2048 MB of ram as it was answered in this topic but still get the same errors after 3 hours or less.

I hate to repeat myself (*), but in eclipse.ini you are setting the memory for Eclipse itself; it has nothing to do with the memory available to your crawler.

When running from the command line, you need to launch it with java -Xmx2G pkg.crawler.WebCrawler.

When launching from Eclipse, you need to add -Xmx2G to the run configuration (under "VM arguments", not "Program arguments").
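As a quick sanity check, you can print the heap limit the JVM actually received at startup; if -Xmx2G is being applied, this should report roughly 2048 MB. This is a minimal sketch, not part of the original answer: the HeapCheck class name is just for illustration, and the same line can go at the top of WebCrawler.main.

public class HeapCheck {
    public static void main(String[] args) {
        // Runtime.maxMemory() reports the maximum heap the JVM will attempt to use
        long maxBytes = Runtime.getRuntime().maxMemory();
        System.out.println("Max heap available to this JVM: " + (maxBytes / (1024 * 1024)) + " MB");
    }
}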


(*) The link points to a deleted question; a certain amount of reputation is required to view it.

Regarding "java - Many memory errors when running my web crawler for several days", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/30947742/
