
Java threads - waiting for all child threads before continuing


Some background:

I'm working on a project in which a servlet launches a crawler over a large number of text files on the file system. I'm thinking of dividing the load across multiple threads, for example:

The crawler enters a directory and finds, say, 3 files and 6 directories. It starts processing the files and launches a new crawler on its own thread for each of the other directories. So, from my creator class, I create one crawler on the base directory. That crawler evaluates the workload and, if it deems it necessary, spawns another crawler on another thread.

My crawler class looks like this:

package com.fujitsu.spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;

public class DocumentSpider implements Runnable, Serializable {

    private static final long serialVersionUID = 8401649393078703808L;
    private Spidermode currentMode = null;
    private String URL = null;
    private String[] terms = null;
    private float score = 0;

    private ArrayList<SpiderDataPair> resultList = null;

    public enum Spidermode {
        FILE, DIRECTORY
    }

    public DocumentSpider(String resourceURL, Spidermode mode, ArrayList<SpiderDataPair> resultList) {
        currentMode = mode;
        setURL(resourceURL);
        this.setResultList(resultList);
    }

    @Override
    public void run() {
        try {
            if (currentMode == Spidermode.FILE) {
                doCrawlFile();
            } else {
                doCrawlDirectory();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("SPIDER @ " + URL + " HAS FINISHED.");
    }

    public Spidermode getCurrentMode() {
        return currentMode;
    }

    public void setCurrentMode(Spidermode currentMode) {
        this.currentMode = currentMode;
    }

    public String getURL() {
        return URL;
    }

    public void setURL(String uRL) {
        URL = uRL;
    }

    public void doCrawlFile() throws Exception {
        File target = new File(URL);

        if (target.isDirectory()) {
            throw new Exception(
                    "This URL points to a directory while the spider is in FILE mode. Please change this spider to FILE mode.");
        }

        procesFile(target);
    }

    public void doCrawlDirectory() throws Exception {
        File baseDir = new File(URL);

        if (!baseDir.isDirectory()) {
            throw new Exception(
                    "This URL points to a FILE while the spider is in DIRECTORY mode. Please change this spider to DIRECTORY mode.");
        }

        File[] directoryContent = baseDir.listFiles();

        // Spawn a child spider on a new thread for every entry in this directory.
        for (File f : directoryContent) {
            if (f.isDirectory()) {
                DocumentSpider spider = new DocumentSpider(f.getPath(), Spidermode.DIRECTORY, this.resultList);
                spider.terms = this.terms;
                (new Thread(spider)).start();
            } else {
                DocumentSpider spider = new DocumentSpider(f.getPath(), Spidermode.FILE, this.resultList);
                spider.terms = this.terms;
                (new Thread(spider)).start();
            }
        }
    }

    public void procesDirectory(String target) throws IOException {
        File base = new File(target);
        File[] directoryContent = base.listFiles();

        for (File f : directoryContent) {
            if (f.isDirectory()) {
                procesDirectory(f.getPath());
            } else {
                procesFile(f);
            }
        }
    }

    public void procesFile(File target) throws IOException {
        // try-with-resources so the reader is closed even if reading fails.
        try (BufferedReader br = new BufferedReader(new FileReader(target))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] words = line.split(" ");
                for (String currentWord : words) {
                    for (String a : terms) {
                        if (a.equalsIgnoreCase(currentWord)) {
                            score += 1f;
                        }
                        if (currentWord.toLowerCase().contains(a.toLowerCase())) {
                            score += 1f;
                        }
                    }
                }
            }
        }

        resultList.add(new SpiderDataPair(this, URL));
    }

    public String[] getTerms() {
        return terms;
    }

    public void setTerms(String[] terms) {
        this.terms = terms;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }

    public ArrayList<SpiderDataPair> getResultList() {
        return resultList;
    }

    public void setResultList(ArrayList<SpiderDataPair> resultList) {
        this.resultList = resultList;
    }

}

The problem I'm facing is that in my root crawler I have a result list with entries from every crawler, which I want to process further. The operations that process the data in this list are invoked from the servlet (or, in this example, from the main method). However, those operations are always invoked before all the crawlers have finished their processing, so they start too early and work with incomplete data.

I tried to solve this with the join method, but unfortunately I can't seem to figure it out.

package com.fujitsu.spider;

import java.util.ArrayList;

import com.fujitsu.spider.DocumentSpider.Spidermode;

public class Main {

    public static void main(String[] args) throws InterruptedException {
        ArrayList<SpiderDataPair> results = new ArrayList<SpiderDataPair>();
        String[] terms = {"SERVER", "CHANGE", "MO"};

        DocumentSpider spider1 = new DocumentSpider("C:\\Users\\Mark\\workspace\\Spider\\Files", Spidermode.DIRECTORY, results);
        spider1.setTerms(terms);

        DocumentSpider spider2 = new DocumentSpider("C:\\Users\\Mark\\workspace\\Spider\\File2", Spidermode.DIRECTORY, results);
        spider2.setTerms(terms);

        Thread t1 = new Thread(spider1);
        Thread t2 = new Thread(spider2);

        t1.start();
        t1.join();

        t2.start();
        t2.join();

        for (SpiderDataPair d : spider1.getResultList()) {
            System.out.println("PATH -> " + d.getFile() + " SCORE -> " + d.getSpider().getScore());
        }

        for (SpiderDataPair d : spider2.getResultList()) {
            System.out.println("PATH -> " + d.getFile() + " SCORE -> " + d.getSpider().getScore());
        }
    }

}


I really want to understand this topic, so any help would be greatly appreciated!

Best Answer

You need to make a couple of changes to your code:

In the spider:

// Requires java.util.List and java.util.LinkedList imports.
List<Thread> threads = new LinkedList<Thread>();
for (File f : directoryContent) {
    if (f.isDirectory()) {
        DocumentSpider spider = new DocumentSpider(f.getPath(), Spidermode.DIRECTORY, this.resultList);
        spider.terms = this.terms;
        Thread thread = new Thread(spider);
        threads.add(thread);
        thread.start();
    } else {
        DocumentSpider spider = new DocumentSpider(f.getPath(), Spidermode.FILE, this.resultList);
        spider.terms = this.terms;
        Thread thread = new Thread(spider);
        threads.add(thread);
        thread.start();
    }
}
// Block until every child spider has finished before this spider returns.
for (Thread thread : threads) thread.join();

The idea is to create a new thread for each spider and start it. Once they are all running, you wait for each of them to complete before the spider itself completes. That way, every spider thread keeps running until all of its work is done (and therefore the top-level thread runs until all child threads, and their children, have finished).
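For reference, the same "wait for all children" effect can also be achieved with an ExecutorService instead of managing threads by hand. Below is a minimal sketch under some assumptions: it presumes the join fix above has been applied (so a DIRECTORY spider's run() does not return until its subtree is done), and the SpiderPool class name and the pool size of 4 are illustrative choices, not part of the original answer.

package com.fujitsu.spider;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import com.fujitsu.spider.DocumentSpider.Spidermode;

public class SpiderPool {

    // Submits one DocumentSpider per entry under baseDir, then blocks until
    // every submitted spider has finished, mirroring the join loop above.
    public static void crawlAndWait(File baseDir, String[] terms,
            ArrayList<SpiderDataPair> results) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(4); // pool size is an arbitrary choice
        List<Future<?>> futures = new ArrayList<Future<?>>();

        for (File f : baseDir.listFiles()) {
            Spidermode mode = f.isDirectory() ? Spidermode.DIRECTORY : Spidermode.FILE;
            DocumentSpider spider = new DocumentSpider(f.getPath(), mode, results);
            spider.setTerms(terms);
            futures.add(pool.submit(spider)); // DocumentSpider is a Runnable
        }

        for (Future<?> future : futures) {
            future.get(); // waits for this spider; rethrows any exception it threw
        }
        pool.shutdown();
    }
}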

You also need to change your runner so that it runs the two spiders in parallel rather than one after the other, like this:

Thread t1 = new Thread(spider1);
Thread t2 = new Thread(spider2);
t1.start();
t2.start();
t1.join();
t2.join();
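With both threads started before either join() is called, the two spiders run concurrently; the main thread then blocks on the joins until both have finished, so the result lists are complete by the time they are printed. In the original runner, t1.join() came before t2.start(), which meant spider2 could not even begin until spider1 was completely done.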

On Java threads - waiting for all child threads before continuing, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/32062111/
