gpt4 book ai didi

java - 使用 Stanford CoreNLP 进行 CorefResolution

转载 作者:行者123 更新时间:2023-11-30 08:39:18 24 4
gpt4 key购买 nike

我正在尝试使用 Stanford CoreNLP 执行 Coref 解析。我使用的版本是stanford-corenlp-full-2015-12-09。基本上,我写了一些类:

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;


public class CorefResolution {
public static String corefResolute(String text, List<String> tokenToReplace) {
Properties props = new Properties();
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

Annotation doc = new Annotation(text);
pipeline.annotate(doc);

Map<Integer, CorefChain> corefs = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
System.out.println(corefs);
List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
List<String> resolved = new ArrayList<String>();

for (CoreMap sentence : sentences) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

for (CoreLabel token : tokens) {

Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
token.get(Coref)

if (corefClustId == null) {
System.out.println("NULL NULL NULL\n");
resolved.add(token.word());
continue;
}
else {
System.out.println("Exist Exist Exist\n");
}

System.out.println("coreClustId is "+corefClustId.toString()+"\n");
CorefChain chain = corefs.get(corefClustId);

if (chain == null || chain.getMentionsInTextualOrder().size() == 1) {
resolved.add(token.word());
} else {
int sentINdx = chain.getRepresentativeMention().sentNum - 1;
CoreMap corefSentence = sentences.get(sentINdx);
List<CoreLabel> corefSentenceTokens = corefSentence.get(CoreAnnotations.TokensAnnotation.class);

CorefChain.CorefMention reprMent = chain.getRepresentativeMention();

if (tokenToReplace.contains(token.word())) {
for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
resolved.add(matchedLabel.word());
}
} else {
resolved.add(token.word());
}
}
}
}

Detokenizer detokenizer = new Detokenizer();
String resolvedStr = detokenizer.detokenize(resolved);

return resolvedStr;
}
}

另一个类

import java.util.Arrays;
import java.util.List;
import java.util.LinkedList;


/**
 * Joins a list of tokens back into a sentence, suppressing the space around punctuation,
 * brackets, clitics and double quotes.
 */
public class Detokenizer {

    /**
     * Concatenates {@code tokens} into a single string.
     *
     * <p>Tokens are separated by one space except before closing punctuation and clitics
     * ({@code , . ; : ) } ] ' 's n't}) and after opening brackets ({@code ( [ {}).
     * Double quotes are assumed to come in matching pairs: an opening quote is glued to the
     * token that follows it, a closing quote to the token that precedes it.
     *
     * <p>The supplied list is only read, never modified, so fixed-size and immutable lists
     * are accepted.
     *
     * @param tokens the tokens to join; must not be null
     * @return the detokenized text, or an empty string for an empty list
     */
    public String detokenize(List<String> tokens) {
        // Tokens that attach to whatever precedes them.
        List<String> noSpaceBefore = Arrays.asList(",", ".", ";", ":", ")", "}", "]", "'", "'s", "n't");
        // Tokens that attach to whatever follows them; "" stands for "start of sentence".
        List<String> noSpaceAfter = Arrays.asList("(", "[", "{", "");

        StringBuilder sentence = new StringBuilder();
        String previous = "";
        // True while the next double quote to take effect is an opening one.
        boolean quoteOpens = true;

        for (String token : tokens) {
            boolean tokenIsQuote = "\"".equals(token);
            boolean previousIsQuote = "\"".equals(previous);

            // A closing quote sticks to the preceding token, an opening quote to the following one.
            boolean gluedToPrevious = tokenIsQuote ? !quoteOpens : noSpaceBefore.contains(token);
            boolean previousGluesNext = previousIsQuote ? quoteOpens : noSpaceAfter.contains(previous);

            if (!gluedToPrevious && !previousGluesNext) {
                sentence.append(' ');
            }
            sentence.append(token);

            // The quote role flips once the token following a quote has been emitted.
            if (previousIsQuote) {
                quoteOpens = !quoteOpens;
            }
            previous = token;
        }
        return sentence.toString();
    }
}

另一个类文件

import java.io.*;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;


/**
 * Command-line helper that runs coreference resolution over a UTF-8 text file, one line at a
 * time, and writes the resolved lines to another UTF-8 file.
 */
public class PlainTextCorefResolver {

    /** Pronouns that are replaced by the representative mention of their chain. */
    private static final List<String> TOKENS_TO_REPLACE =
            Arrays.asList("He", "he", "She", "she", "It", "it", "They", "they");

    /**
     * Resolves every line of {@code inputFile} and writes the result to {@code outputFile}.
     *
     * <p>Each line is passed to {@code CorefResolution.corefResolute} on its own, so references
     * that span several lines are not resolved. Failures are reported on standard error and do
     * not propagate to the caller.
     *
     * @param inputFile  the UTF-8 text file to read
     * @param outputFile the file to create or overwrite with the resolved text
     */
    public static void resolveFile(File inputFile, File outputFile) {
        // Check before opening anything: a missing input would otherwise only surface as an exception.
        if (!inputFile.isFile()) {
            System.err.println("Input file [" + inputFile.getAbsoluteFile() + "] does not exist or is not a regular file");
            return;
        }

        // try-with-resources closes both streams even when resolution fails half-way through.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
             PrintWriter writer = new PrintWriter(outputFile, "UTF-8")) {

            String line;
            while ((line = reader.readLine()) != null) {
                writer.println(CorefResolution.corefResolute(line, TOKENS_TO_REPLACE));
            }

            // PrintWriter never throws on write errors; checkError() flushes and reports them.
            if (writer.checkError()) {
                System.err.println("Failed to write output file [" + outputFile.getAbsoluteFile() + "]");
            }
        } catch (IOException | RuntimeException e) {
            System.err.println("Failed to open/resolve input file [" +inputFile.getAbsoluteFile()+ "] in loader");
            e.printStackTrace();
        }
    }

    /**
     * Entry point.
     *
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]});
     *             the built-in default paths are used for any argument that is missing
     */
    public static void main(String[] args) {
        String inputFileName = args.length > 0 ? args[0] : "path/file.txt";
        String outputFileName = args.length > 1 ? args[1] : "path/file.resolved.txt";
        resolveFile(new File(inputFileName), new File(outputFileName));
    }

}

但是,它没有给出任何有用的结果。 corefClusterId 始终为空,因此我总是得到一堆“NULL NULL NULL”输出。

我怎样才能正确执行共指消解,把“He/he/She/she/It/it/The stadium/...”这类指代词替换为最具代表性的提及(人名或组织名称)?

例如,给定:“Estadio El Madrigal 是西班牙的一座体育场,从 1923 年开始使用。它(It)目前主要用于足球比赛。”我想得到“Estadio El Madrigal 是西班牙的一座体育场,从 1923 年开始使用。Estadio El Madrigal 目前主要用于足球比赛。”

最佳答案

我认为在您的示例中,我们的 coref 系统并不会把“It”与“Estadio El Madrigal”关联起来。

这里是一些用于访问 CorefChains 和一般提及的示例代码。

import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
import edu.stanford.nlp.hcoref.data.CorefChain;
import edu.stanford.nlp.hcoref.data.Mention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;

/**
 * Minimal demonstration of reading coreference chains and per-sentence mentions from an
 * annotated document.
 */
public class CorefExample {

    /**
     * Annotates a fixed two-sentence text and prints its coreference chains followed by the
     * mentions found in each sentence.
     */
    public static void main(String[] args) throws Exception {
        Annotation annotated = new Annotation("John Kerry is the secretary of state. He ran for president in 2004.");

        Properties settings = new Properties();
        settings.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,mention,coref");
        StanfordCoreNLP nlp = new StanfordCoreNLP(settings);
        nlp.annotate(annotated);

        System.out.println("---");
        System.out.println("coref chains");
        printChains(annotated);
        printMentions(annotated);
    }

    /** Prints every chain, its mention map, and the details of each mention in textual order. */
    private static void printChains(Annotation annotated) {
        for (CorefChain chain : annotated.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
            System.out.println("\t" + chain);
            System.out.println(chain.getMentionMap());
            for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
                System.out.println("---");
                System.out.println("full text: " + mention.mentionSpan);
                System.out.println("position: " + mention.position);
                System.out.println("start index of first word: " + mention.startIndex);
            }
        }
    }

    /** Prints, sentence by sentence, every mention attached to that sentence. */
    private static void printMentions(Annotation annotated) {
        List<CoreMap> allSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap current : allSentences) {
            System.out.println("---");
            System.out.println("mentions");
            for (Mention found : current.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
                System.out.println("\t" + found);
            }
        }
    }
}

======================
更新
@StanfordNLPHelper,使用“coref”而不是“dcoref”时出现错误:

INFO: Read 25 rules
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ...
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator mention
Using mention detector type: rule
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator coref
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.Arrays.copyOfRange(Arrays.java:3664)
at java.lang.String.<init>(String.java:207)
at java.lang.StringBuilder.toString(StringBuilder.java:407)
at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3079)
at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:2874)
at java.io.ObjectInputStream.readString(ObjectInputStream.java:1639)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1342)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
at java.util.HashMap.readObject(HashMap.java:1394)
at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1900)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2000)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1924)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
at edu.stanford.nlp.io.IOUtils.readObjectFromURLOrClasspathOrFileSystem(IOUtils.java:324)
at edu.stanford.nlp.scoref.SimpleLinearClassifier.<init>(SimpleLinearClassifier.java:30)
at edu.stanford.nlp.scoref.PairwiseModel.<init>(PairwiseModel.java:75)
at edu.stanford.nlp.scoref.PairwiseModel$Builder.build(PairwiseModel.java:57)
at edu.stanford.nlp.scoref.ClusteringCorefSystem.<init>(ClusteringCorefSystem.java:31)
at edu.stanford.nlp.scoref.StatisticalCorefSystem.fromProps(StatisticalCorefSystem.java:48)
at edu.stanford.nlp.pipeline.CorefAnnotator.<init>(CorefAnnotator.java:66)
at edu.stanford.nlp.pipeline.AnnotatorImplementations.coref(AnnotatorImplementations.java:220)
at edu.stanford.nlp.pipeline.AnnotatorFactories$13.create(AnnotatorFactories.java:515)
at edu.stanford.nlp.pipeline.AnnotatorPool.get(AnnotatorPool.java:85)
at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(StanfordCoreNLP.java:375)

Process finished with exit code 1

关于java - 使用 Stanford CoreNLP 进行 CorefResolution,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36204856/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com