gpt4 book ai didi

java - 无法更新 Open NLP 模型

转载 作者:行者123 更新时间:2023-11-30 03:42:11 25 4
gpt4 key购买 nike

我正在一个项目中使用 Apache OpenNLP。由于预训练模型 (en-ner-location.bin) 无法识别我需要的某个位置,我正在创建一个新模型来识别位置。

这是代码:

package com.equinox.nlp;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;

/**
 * Small demo of location recognition with Apache OpenNLP: it loads a name
 * finder model from disk, runs it over sample sentences, trains a custom
 * location model from a local training file and runs that model as well.
 */
public class NlpTesting {
    /** Not used by any method of this class; kept for subclasses that may rely on it. */
    protected Map<String, NameFinderME> finders;

    /** Tokenizer used by {@link #findLocation(String, NameFinderME)}; assigned on every call. */
    protected Tokenizer tokenizer;

    /**
     * Runs the pretrained location model over two sample sentences, trains
     * the custom model, then runs the custom model over the second sentence.
     *
     * @param args ignored
     * @throws InvalidFormatException if a model file is not a valid OpenNLP model
     * @throws IOException if a model or the training file cannot be read or written
     */
    public static void main(String[] args) throws InvalidFormatException,
            IOException {

        String bankura = "In the 2011 census, Bankura municipality had a population of 138,036, out of which 70,734 were males and 67,302 were females.";
        String london = "London is the capital city of England and the United Kingdom.";

        NlpTesting nlpTesting = new NlpTesting();
        NameFinderME nameFinderA = nlpTesting.createNameFinder("./opennlp-models/en-ner-location.bin");
        nlpTesting.findLocation(london, nameFinderA);
        System.out.println("--------------------------");
        nlpTesting.findLocation(bankura, nameFinderA);

        nlpTesting.train();

        NameFinderME nameFinderB = nlpTesting.createNameFinder("./opennlp-models/en-ner-custom-location.bin");

        nlpTesting.findLocation(bankura, nameFinderB);
    }

    /**
     * Tokenizes {@code str}, runs {@code nameFinder} over the tokens and
     * returns the distinct names it found.
     *
     * <p>Each name is the full text covered by its span, so a multi-token name
     * such as "United Kingdom" is reported whole rather than as its first
     * token only. The result is also printed to standard output.
     *
     * @param str text to search
     * @param nameFinder name finder to apply to the tokens of {@code str}
     * @return the distinct names, each followed by a comma (so a non-empty
     *         result ends with a trailing comma); the empty string when
     *         nothing was found. Order is unspecified because the names are
     *         de-duplicated through a {@link HashSet}.
     * @throws InvalidFormatException declared for interface compatibility
     * @throws IOException declared for interface compatibility
     */
    public String findLocation(String str, NameFinderME nameFinder) throws InvalidFormatException,
            IOException {
        tokenizer = SimpleTokenizer.INSTANCE;

        String[] tokens = tokenizer.tokenize(str);
        Span[] nameSpans = nameFinder.find(tokens);

        // Resolve every span to the complete token range it covers instead of
        // taking only the token at the span's start index.
        String[] names = Span.spansToStrings(nameSpans, tokens);

        HashSet<String> locationSet = new HashSet<String>();
        for (int i = 0; i < names.length; i++) {
            locationSet.add(names[i]);
        }

        StringBuilder commaSeparatedLocationNames = new StringBuilder();
        for (String location : locationSet) {
            commaSeparatedLocationNames.append(location).append(',');
        }

        String result = commaSeparatedLocationNames.toString();
        System.out.println(result);
        return result;
    }

    /**
     * Trains a "location" name finder model from {@code ./train/train.txt}
     * (read as UTF-8) and writes it to
     * {@code ./opennlp-models/en-ner-custom-location.bin}.
     *
     * <p>The model is always trained from scratch on the complete training
     * file; to teach it new names, extend the training file and train again.
     *
     * @throws IOException if the training file cannot be read or the model
     *         cannot be written
     */
    public void train() throws IOException {
        File trainerFile = new File("./train/train.txt");
        File output = new File("./opennlp-models/en-ner-custom-location.bin");

        TokenNameFinderModel model;
        FileInputStream trainingIn = new FileInputStream(trainerFile);
        try {
            ObjectStream<String> lineStream = new PlainTextByLineStream(
                    trainingIn, "UTF-8");
            ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
                    lineStream);
            System.out.println("lineStream = " + lineStream);
            model = NameFinderME.train("en", "location",
                    sampleStream, Collections.<String, Object> emptyMap());
        } finally {
            // Release the training file even when training fails.
            trainingIn.close();
        }

        BufferedOutputStream modelOut = null;
        try {
            modelOut = new BufferedOutputStream(new FileOutputStream(output));
            model.serialize(modelOut);
        } finally {
            if (modelOut != null) {
                modelOut.close();
            }
        }
    }

    /**
     * Loads a name finder model from the given file and wraps it in a
     * {@link NameFinderME}.
     *
     * @param str path of the serialized model file
     * @return a name finder backed by the loaded model
     * @throws InvalidFormatException if the file is not a valid model
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException if the file cannot be read
     */
    public NameFinderME createNameFinder(String str) throws InvalidFormatException,
            FileNotFoundException, IOException {
        FileInputStream modelIn = new FileInputStream(new File(str));
        try {
            return new NameFinderME(new TokenNameFinderModel(modelIn));
        } finally {
            // The model is fully read by the constructor; close the file handle.
            modelIn.close();
        }
    }

}

到目前为止,一切正常。

问题是我无法向我创建的此自定义模型添加另一个位置。因此,我浏览了 OpenNLP - README 文档。

在那里,它说,“注意:为了训练模型,您需要所有训练数据。目前没有一种机制可以使用附加数据更新随项目一起分发的模型。”

这是否意味着我也无法更新我的自定义模型?有什么办法可以做到这一点吗?我在创建模型时很可能还没有全部数据,因此应该有更新模型的途径。请帮助我。

最佳答案

这正是它所说的:每次要添加新的训练实例时,您都需要从头开始重新训练整个模型。

如果您需要更新模型而不重新训练,那么 OpenNLP 不是适合您任务的工具。

关于java - 无法更新 Open NLP 模型,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26583483/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com