gpt4 book ai didi

opennlp.tools.tokenize.WhitespaceTokenizer.tokenize()方法的使用及代码示例

转载 作者:知者 更新时间:2024-03-25 03:49:05 33 4
gpt4 key购买 nike

本文整理了Java中opennlp.tools.tokenize.WhitespaceTokenizer.tokenize()方法的一些代码示例,展示了WhitespaceTokenizer.tokenize()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WhitespaceTokenizer.tokenize()方法的具体详情如下:
包路径:opennlp.tools.tokenize.WhitespaceTokenizer
类名称:WhitespaceTokenizer
方法名:tokenize

WhitespaceTokenizer.tokenize介绍

暂无

代码示例

代码示例来源:origin: apache/opennlp

/**
 * Parses a whitespace-separated sentence of {@code word_tag} tokens into a
 * {@link POSSample}.
 *
 * @param sentenceString the sentence; every token must end in {@code _tag}
 * @return a sample pairing each word with its POS tag
 * @throws InvalidFormatException if any token contains no underscore separator
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
  final String[] taggedTokens = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

  final String[] words = new String[taggedTokens.length];
  final String[] posTags = new String[taggedTokens.length];

  int index = 0;
  for (String taggedToken : taggedTokens) {
    // The tag is everything after the LAST underscore, so words may
    // themselves contain underscores.
    final int separator = taggedToken.lastIndexOf('_');
    if (separator == -1) {
      throw new InvalidFormatException("Cannot find \"_\" inside token '" + taggedToken + "'!");
    }
    words[index] = taggedToken.substring(0, separator);
    posTags[index] = taggedToken.substring(separator + 1);
    index++;
  }

  return new POSSample(words, posTags);
}

代码示例来源:origin: apache/opennlp

String typeName = WhitespaceTokenizer.INSTANCE.tokenize(line)[0];

代码示例来源:origin: apache/opennlp

/**
 * Reads the next line from the underlying stream and converts it into a
 * {@link DocumentSample}: the first whitespace token is the category label,
 * the remaining tokens form the document text.
 *
 * @return the next sample, or {@code null} when the stream is exhausted
 * @throws IOException if a line is empty or holds only a category string
 */
public DocumentSample read() throws IOException {
  final String sampleString = samples.read();

  if (sampleString == null) {
    return null;
  }

  // Whitespace tokenize entire string; token[0] is the category.
  final String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sampleString);

  if (tokens.length <= 1) {
    throw new IOException("Empty lines, or lines with only a category string are not allowed!");
  }

  final String[] docTokens = new String[tokens.length - 1];
  System.arraycopy(tokens, 1, docTokens, 0, docTokens.length);

  return new DocumentSample(tokens[0], docTokens);
}
}

代码示例来源:origin: apache/opennlp

/**
 * Reads the next {@link SentenceSample} and normalizes it: each sentence is
 * whitespace-tokenized and then detokenized again, and the document text plus
 * sentence spans are rebuilt from the detokenized output (sentences separated
 * by a single space).
 *
 * @return the normalized sample, or {@code null} at end of stream
 * @throws IOException if reading from the underlying stream fails
 */
@Override
public SentenceSample read() throws IOException {

  final SentenceSample inputSample = samples.read();

  if (inputSample == null) {
    return null;
  }

  final StringBuilder rebuiltDocument = new StringBuilder();
  final List<Span> rebuiltSpans = new ArrayList<>();

  for (Span sentenceSpan : inputSample.getSentences()) {
    // Raw sentence text covered by this span in the original document.
    final String sentenceText =
        inputSample.getDocument().substring(sentenceSpan.getStart(), sentenceSpan.getEnd());

    final String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sentenceText);

    final int sentenceStart = rebuiltDocument.length();
    rebuiltDocument.append(detokenizer.detokenize(tokens, null));
    rebuiltSpans.add(new Span(sentenceStart, rebuiltDocument.length()));
    rebuiltDocument.append(' ');
  }

  return new SentenceSample(rebuiltDocument,
      rebuiltSpans.toArray(new Span[rebuiltSpans.size()]));
}
}

代码示例来源:origin: apache/opennlp

String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line);

代码示例来源:origin: apache/opennlp

String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens);

代码示例来源:origin: apache/opennlp

/**
 * Verifies that a single token is returned regardless of leading or trailing
 * whitespace around it.
 */
@Test
public void testOneToken() {
  for (String input : new String[] {"one", " one", "one "}) {
    Assert.assertEquals("one", WhitespaceTokenizer.INSTANCE.tokenize(input)[0]);
  }
}

代码示例来源:origin: apache/opennlp

/**
 * Ensures that strings containing no tokens (empty or whitespace-only) yield
 * an empty token array.
 */
@Test
public void testTokenizationOfStringWithoutTokens() {
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize("").length);   // empty
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize(" ").length);  // space
  // Use an explicit tab escape; the original character was lost and the case
  // silently duplicated the single-space assertion above.
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize("\t").length); // tab
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize("     ").length); // multiple spaces
}
}

代码示例来源:origin: apache/opennlp

/**
 * Tests if it can tokenize whitespace separated tokens, collapsing runs of
 * consecutive spaces between tokens.
 */
@Test
public void testWhitespaceTokenization() {
  String text = "a b c  d     e                f    ";
  String[] tokenizedText = WhitespaceTokenizer.INSTANCE.tokenize(text);

  // Check the length first so a short result fails as an assertion instead of
  // an ArrayIndexOutOfBoundsException; assertEquals also reports the actual
  // value on failure, which assertTrue(equals) does not.
  Assert.assertEquals(6, tokenizedText.length);

  String[] expected = {"a", "b", "c", "d", "e", "f"};
  for (int i = 0; i < expected.length; i++) {
    Assert.assertEquals(expected[i], tokenizedText[i]);
  }
}

代码示例来源:origin: apache/opennlp

/**
 * Checks that the regex name finder marks the whitespace token holding the
 * URL and that the token text matches the expected address.
 */
@Test
public void testURL() throws Exception {
  Span expectedUrlSpan = new Span(13, 14, "URL");

  String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
  List<Span> found = Arrays.asList(regexNameFinder.find(tokens));

  Assert.assertTrue(found.contains(expectedUrlSpan));
  Assert.assertEquals("https://www.google.com", tokens[expectedUrlSpan.getStart()]);
}

代码示例来源:origin: apache/opennlp

/**
 * Checks that the regex name finder marks the whitespace token holding the
 * phone number and that the token text matches the expected number.
 */
@Test
public void testPhoneNumber() throws Exception {
  Span expectedPhoneSpan = new Span(9, 10, "PHONE_NUM");

  String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
  List<Span> found = Arrays.asList(regexNameFinder.find(tokens));

  Assert.assertTrue(found.contains(expectedPhoneSpan));
  Assert.assertEquals("123-234-5678", tokens[expectedPhoneSpan.getStart()]);
}

代码示例来源:origin: apache/opennlp

String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(tokenizedLine);

代码示例来源:origin: apache/opennlp

String line;
while ((line = untokenizedLineStream.read()) != null) {
 String[] whitespaceTokenizerLine = WhitespaceTokenizer.INSTANCE.tokenize(line);

代码示例来源:origin: apache/opennlp

/**
 * Checks that the regex name finder marks the whitespace token holding the
 * email address and that the token text matches the expected address.
 */
@Test
public void testEmail() throws Exception {
  String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
  Span[] find = regexNameFinder.find(tokens);
  List<Span> spanList = Arrays.asList(find);

  // Construct the expected span once; the original built an identical
  // Span(3, 4, "EMAIL") twice.
  Span emailSpan = new Span(3, 4, "EMAIL");
  Assert.assertTrue(spanList.contains(emailSpan));
  Assert.assertEquals("opennlp@gmail.com", tokens[emailSpan.getStart()]);
}

代码示例来源:origin: apache/opennlp

/**
 * Checks that both MGRS coordinate forms in the text are detected and that
 * each detected span starts at the expected token.
 */
@Test
public void testMgrs() throws Exception {
  String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
  List<Span> found = Arrays.asList(regexNameFinder.find(tokens));

  Span compactMgrs = new Span(18, 19, "MGRS");
  Span spacedMgrs = new Span(20, 24, "MGRS");

  Assert.assertTrue(found.contains(compactMgrs));
  Assert.assertTrue(found.contains(spacedMgrs));
  Assert.assertEquals("11SKU528111".toLowerCase(), tokens[compactMgrs.getStart()]);
  Assert.assertEquals("11S", tokens[spacedMgrs.getStart()]);
}
}

代码示例来源:origin: apache/opennlp

/**
 * Checks that both degrees/minutes/seconds latitude-longitude expressions are
 * detected and that each detected span starts at the expected token.
 */
@Test
public void testLatLong() throws Exception {
  String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
  List<Span> found = Arrays.asList(regexNameFinder.find(tokens));

  Span firstLatLong = new Span(22, 24, "DEGREES_MIN_SEC_LAT_LON");
  Span secondLatLong = new Span(35, 41, "DEGREES_MIN_SEC_LAT_LON");

  Assert.assertTrue(found.contains(firstLatLong));
  Assert.assertTrue(found.contains(secondLatLong));
  Assert.assertEquals("528", tokens[firstLatLong.getStart()]);
  Assert.assertEquals("45", tokens[secondLatLong.getStart()]);
}

代码示例来源:origin: apache/opennlp

while ((line = lineStream.read()) != null) {
 String[] whitespaceTokenizerLine = WhitespaceTokenizer.INSTANCE.tokenize(line);
 String[] tags = tagger.tag(whitespaceTokenizerLine);

代码示例来源:origin: apache/opennlp

String document;
while ((document = documentStream.read()) != null) {
 String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document);

代码示例来源:origin: org.apache.opennlp/opennlp-tools

/**
 * Converts a whitespace-separated string of {@code word_tag} tokens into a
 * {@link POSSample}.
 *
 * @param sentenceString sentence whose tokens each carry a trailing
 *                       {@code _tag} suffix
 * @return the parsed sample of words and tags
 * @throws InvalidFormatException when a token lacks the underscore separator
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
  String[] pairs = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);
  int count = pairs.length;
  String[] words = new String[count];
  String[] labels = new String[count];

  for (int idx = 0; idx < count; idx++) {
    String pair = pairs[idx];
    // Split on the last underscore so words containing '_' keep their prefix.
    int cut = pair.lastIndexOf('_');
    if (cut == -1) {
      throw new InvalidFormatException("Cannot find \"_\" inside token '" + pair + "'!");
    }
    words[idx] = pair.substring(0, cut);
    labels[idx] = pair.substring(cut + 1);
  }

  return new POSSample(words, labels);
}

代码示例来源:origin: ai.idylnlp/idylnlp-opennlp-tools-1.8.3

/**
 * Builds a {@link POSSample} from a sentence whose whitespace-separated
 * tokens are each of the form {@code word_tag}.
 *
 * @param sentenceString the tagged sentence to parse
 * @return a sample pairing words with their tags
 * @throws InvalidFormatException if a token has no {@code _} separator
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
  final String[] raw = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);
  final String[] terms = new String[raw.length];
  final String[] tagValues = new String[raw.length];

  for (int position = 0; position < raw.length; position++) {
    final String token = raw[position];
    final int lastUnderscore = token.lastIndexOf('_');

    if (lastUnderscore < 0) {
      throw new InvalidFormatException("Cannot find \"_\" inside token '" + token + "'!");
    }

    // Everything before the last underscore is the word, the rest is the tag.
    terms[position] = token.substring(0, lastUnderscore);
    tagValues[position] = token.substring(lastUnderscore + 1);
  }

  return new POSSample(terms, tagValues);
}

33 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com