
java - How to annotate multiple Stanford CoreNLP CoreDocuments more efficiently?


I am annotating a large number of strings into CoreDocuments via Stanford CoreNLP. The StanfordCoreNLP pipeline has built-in multithreaded annotation to speed up the process, but as far as I can tell, CoreDocument objects cannot use that functionality in the version I am running, stanford-corenlp-full-2018-10-05.

Since I could not get the pipeline to annotate collections of CoreDocuments, I instead tried to optimize the individual annotations by running them inside multithreaded methods. The multithreaded environment itself gives me no problems: I receive all results as expected, and the only drawback is the time consumption. I tried about 7 different implementations; these were the 3 fastest:

// ForkJoinPool is initialized in the main method of my application
private static ForkJoinPool executor = new ForkJoinPool(Runtime.getRuntime().availableProcessors(),
        ForkJoinPool.defaultForkJoinWorkerThreadFactory, null, false);

public static ConcurrentMap<String, CoreDocument> getMultipleCoreDocumentsWay1(Collection<String> str) {
    ConcurrentMap<String, CoreDocument> pipelineCoreDocumentAnnotations = new MapMaker().concurrencyLevel(2).makeMap();
    str.parallelStream().forEach((str1) -> {
        CoreDocument coreDocument = new CoreDocument(str1);
        pipeline.annotate(coreDocument);
        pipelineCoreDocumentAnnotations.put(str1, coreDocument);
        System.out.println("pipelineCoreDocumentAnnotations size1: " + pipelineCoreDocumentAnnotations.size()
                + "\nstr size: " + str.size() + "\n");
    });
    return pipelineCoreDocumentAnnotations;
}
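
Note that parallelStream() runs on the JVM's common ForkJoinPool, not on the executor field declared above. A minimal sketch that forces the stream's workers onto the custom pool (the method name annotateOnCustomPool is hypothetical; it assumes the same pipeline and MapMaker setup):

// Sketch: tasks forked from inside pool.submit(...) stay in that pool,
// so the parallel stream uses the custom ForkJoinPool's threads.
public static ConcurrentMap<String, CoreDocument> annotateOnCustomPool(
        Collection<String> strings, StanfordCoreNLP pipeline, ForkJoinPool pool) throws Exception {
    ConcurrentMap<String, CoreDocument> results = new MapMaker().concurrencyLevel(2).makeMap();
    pool.submit(() -> strings.parallelStream().forEach(s -> {
        CoreDocument doc = new CoreDocument(s);
        pipeline.annotate(doc);
        results.put(s, doc);
    })).get(); // block until every annotation task has completed
    return results;
}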


public static ConcurrentMap<String, CoreDocument> getMultipleCoreDocumentsWay4(Collection<String> str) {
    ConcurrentMap<String, CoreDocument> pipelineCoreDocumentAnnotations = new MapMaker().concurrencyLevel(2).makeMap();
    str.parallelStream().forEach((str1) -> {
        try {
            // RecursiveTask<CoreDocument> (typed, not raw) avoids the unchecked conversion
            ForkJoinTask<CoreDocument> forkCD = new RecursiveTask<CoreDocument>() {
                @Override
                protected CoreDocument compute() {
                    CoreDocument coreDocument = new CoreDocument(str1);
                    pipeline.annotate(coreDocument);
                    return coreDocument;
                }
            };
            forkCD.invoke();
            pipelineCoreDocumentAnnotations.put(str1, forkCD.get());
            System.out.println("pipelineCoreDocumentAnnotations2 size: " + pipelineCoreDocumentAnnotations.size()
                    + "\nstr size: " + str.size() + "\n");
        } catch (InterruptedException | ExecutionException ex) {
            Logger.getLogger(Parsertest.class.getName()).log(Level.SEVERE, null, ex);
        }
    });
    return pipelineCoreDocumentAnnotations;
}
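
Here each RecursiveTask is invoke()d synchronously inside the stream lambda, so the task wrapper adds overhead without adding any parallelism beyond what the stream already provides. A sketch of the fork-all-then-join alternative (the method name annotateForkJoinAll is hypothetical; it reuses the executor pool and pipeline fields from above):

// Sketch: submit every annotation task first, then join them all,
// so the pool can run them concurrently instead of one per stream element.
public static ConcurrentMap<String, CoreDocument> annotateForkJoinAll(
        Collection<String> strings, StanfordCoreNLP pipeline) {
    ConcurrentMap<String, CoreDocument> results = new MapMaker().concurrencyLevel(2).makeMap();
    Map<String, ForkJoinTask<CoreDocument>> tasks = new LinkedHashMap<>();
    for (String s : strings) {
        tasks.put(s, executor.submit(new RecursiveTask<CoreDocument>() {
            @Override
            protected CoreDocument compute() {
                CoreDocument doc = new CoreDocument(s);
                pipeline.annotate(doc);
                return doc;
            }
        }));
    }
    // join() blocks until each task finishes and rethrows failures unchecked
    tasks.forEach((s, task) -> results.put(s, task.join()));
    return results;
}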

public static ConcurrentMap<String, CoreDocument> getMultipleCoreDocumentsWay7(ConcurrentMap<Integer, String> hlstatsSTR) {
    RecursiveDocumentAnnotation recursiveAnnotation = new RecursiveDocumentAnnotation(hlstatsSTR, pipeline);
    ConcurrentMap<String, CoreDocument> returnMap = new MapMaker().concurrencyLevel(2).makeMap();
    executor.execute(recursiveAnnotation);
    try {
        returnMap = recursiveAnnotation.get();
    } catch (InterruptedException | ExecutionException ex) {
        Logger.getLogger(Parsertest.class.getName()).log(Level.SEVERE, null, ex);
    }
    System.out.println("reached end\n");
    return returnMap;
}
RecursiveDocumentAnnotation class:

public class RecursiveDocumentAnnotation extends RecursiveTask<ConcurrentMap<String, CoreDocument>> {

    private String str;
    private StanfordCoreNLP nlp;
    private static ConcurrentMap<String, CoreDocument> pipelineCoreDocumentAnnotations;
    private static ConcurrentMap<Integer, String> hlstatsStrMap;

    public static ConcurrentMap<String, CoreDocument> getPipelineCoreDocumentAnnotations() {
        return pipelineCoreDocumentAnnotations;
    }

    public RecursiveDocumentAnnotation(ConcurrentMap<Integer, String> hlstatsStrMap, StanfordCoreNLP pipeline) {
        this.pipelineCoreDocumentAnnotations = new MapMaker().concurrencyLevel(2).makeMap();
        this.str = hlstatsStrMap.get(0);
        this.nlp = pipeline;
        this.hlstatsStrMap = hlstatsStrMap;
    }

    public RecursiveDocumentAnnotation(ConcurrentMap<Integer, String> hlstatsStrMap, StanfordCoreNLP pipeline,
            ConcurrentMap<String, CoreDocument> returnMap) {
        this.str = hlstatsStrMap.get(returnMap.size());
        this.nlp = pipeline;
        this.hlstatsStrMap = hlstatsStrMap;
        this.pipelineCoreDocumentAnnotations = returnMap;
    }

    @Override
    protected ConcurrentMap<String, CoreDocument> compute() {
        CoreDocument coreDocument = new CoreDocument(str);
        nlp.annotate(coreDocument);
        pipelineCoreDocumentAnnotations.put(str, coreDocument);
        System.out.println("hlstatsStrMap size: " + hlstatsStrMap.size()
                + "\npipelineCoreDocumentAnnotations size: " + pipelineCoreDocumentAnnotations.size() + "\n");
        if (pipelineCoreDocumentAnnotations.size() >= hlstatsStrMap.size()) {
            return pipelineCoreDocumentAnnotations;
        }
        RecursiveDocumentAnnotation recursiveAnnotation = new RecursiveDocumentAnnotation(hlstatsStrMap, nlp, pipelineCoreDocumentAnnotations);
        recursiveAnnotation.fork();
        return recursiveAnnotation.join();
    }
}
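
Note that this recursion is effectively sequential: each compute() annotates one document, then forks exactly one child task and immediately join()s it, so the documents are processed as a chain rather than in parallel, which is consistent with this being the slowest variant. A divide-and-conquer sketch that splits the work instead (the class name SplitAnnotationTask and the THRESHOLD value are hypothetical):

// Sketch: split the text list in half, fork one half, compute the other,
// so the pool's workers can actually annotate documents concurrently.
public class SplitAnnotationTask extends RecursiveTask<ConcurrentMap<String, CoreDocument>> {

    private static final int THRESHOLD = 8; // annotate at most 8 docs per leaf task
    private final List<String> texts;
    private final StanfordCoreNLP nlp;

    public SplitAnnotationTask(List<String> texts, StanfordCoreNLP nlp) {
        this.texts = texts;
        this.nlp = nlp;
    }

    @Override
    protected ConcurrentMap<String, CoreDocument> compute() {
        if (texts.size() <= THRESHOLD) {
            // Small enough: annotate sequentially on this worker thread
            ConcurrentMap<String, CoreDocument> out = new MapMaker().concurrencyLevel(2).makeMap();
            for (String s : texts) {
                CoreDocument doc = new CoreDocument(s);
                nlp.annotate(doc);
                out.put(s, doc);
            }
            return out;
        }
        int mid = texts.size() / 2;
        SplitAnnotationTask left = new SplitAnnotationTask(texts.subList(0, mid), nlp);
        SplitAnnotationTask right = new SplitAnnotationTask(texts.subList(mid, texts.size()), nlp);
        left.fork();                                                  // left half runs asynchronously
        ConcurrentMap<String, CoreDocument> result = right.compute(); // right half on this thread
        result.putAll(left.join());                                   // wait for the fork and merge
        return result;
    }
}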

Time, parallel 1: 336562 ms.

Time, parallel 4: 391556 ms.

Time, parallel 7: 491639 ms.

Honestly, the best solution would be if the pipeline itself could somehow perform the multi-annotation, but as long as I don't know how to achieve that, I hope someone can explain how to optimize the individual CoreDocument annotations. PS: Merging all strings into a single CoreDocument for annotation is also not what I want, since I need separate CoreDocuments for comparison afterwards.

Best Answer

I have not timed anything, but you could try this example code (add some test strings to the list of strings)... it should work on 4 documents at a time:

package edu.stanford.nlp.examples;

import edu.stanford.nlp.pipeline.*;

import java.util.*;
import java.util.function.*;
import java.util.stream.*;


public class MultiThreadStringExample {

    public static class AnnotationCollector<T> implements Consumer<T> {

        List<T> annotations = new ArrayList<T>();

        public void accept(T ann) {
            annotations.add(ann);
        }
    }

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,depparse");
        props.setProperty("threads", "4");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        AnnotationCollector<Annotation> annCollector = new AnnotationCollector<Annotation>();
        List<String> exampleStrings = new ArrayList<String>();
        for (String exampleString : exampleStrings) {
            pipeline.annotate(new Annotation(exampleString), annCollector);
        }
        Thread.sleep(10000);
        List<CoreDocument> coreDocs =
                annCollector.annotations.stream().map(ann -> new CoreDocument(ann)).collect(Collectors.toList());
        for (CoreDocument coreDoc : coreDocs) {
            System.out.println(coreDoc.tokens());
        }
    }

}
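
One caveat about the example: Thread.sleep(10000) is only a crude way to wait for the asynchronous annotate(Annotation, Consumer) callbacks to finish. A sketch of a deterministic wait (the class name LatchedCollector is hypothetical; it assumes the same callback API as above and requires import java.util.concurrent.CountDownLatch):

// Sketch: count down a latch per completed annotation so the caller
// can wait exactly until all callbacks have fired, instead of sleeping.
public static class LatchedCollector<T> implements Consumer<T> {

    final List<T> annotations = Collections.synchronizedList(new ArrayList<T>());
    final CountDownLatch latch;

    public LatchedCollector(int expectedCount) {
        this.latch = new CountDownLatch(expectedCount);
    }

    public void accept(T ann) {
        annotations.add(ann);
        latch.countDown();
    }
}

// Usage, replacing Thread.sleep(10000):
// LatchedCollector<Annotation> collector = new LatchedCollector<>(exampleStrings.size());
// for (String s : exampleStrings) {
//     pipeline.annotate(new Annotation(s), collector);
// }
// collector.latch.await(); // returns as soon as the last document is done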

Regarding "java - How to annotate multiple Stanford CoreNLP CoreDocuments more efficiently?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/55909786/
