gpt4 book ai didi

opennlp.tools.tokenize.WhitespaceTokenizer类的使用及代码示例

转载 作者:知者 更新时间:2024-03-25 04:07:05 30 4
gpt4 key购买 nike

本文整理了Java中opennlp.tools.tokenize.WhitespaceTokenizer类的一些代码示例，展示了WhitespaceTokenizer类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台，是从一些精选项目中提取出来的代码，具有较强的参考意义，能在一定程度上帮助到你。WhitespaceTokenizer类的具体详情如下：
包路径:opennlp.tools.tokenize.WhitespaceTokenizer
类名称:WhitespaceTokenizer

WhitespaceTokenizer介绍

[英]This tokenizer uses white spaces to tokenize the input text. To obtain an instance of this tokenizer use the static final INSTANCE field.
[中]此分词器使用空白字符对输入文本进行分词。要获取此分词器的实例，请使用静态 final 字段 INSTANCE。

代码示例

代码示例来源:origin: apache/opennlp

/**
 * Parses a whitespace separated string of {@code word_tag} pairs into a {@link POSSample}.
 *
 * @param sentenceString whitespace separated tokens, each of the form {@code word_tag}
 * @return the parsed sample holding the words and their POS tags
 * @throws InvalidFormatException if a token contains no underscore separator
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
 String[] pairs = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

 int count = pairs.length;
 String[] words = new String[count];
 String[] posTags = new String[count];

 for (int idx = 0; idx < count; idx++) {
  String pair = pairs[idx];
  // The tag follows the LAST underscore; the word itself may contain underscores.
  int sep = pair.lastIndexOf("_");
  if (sep == -1) {
   throw new InvalidFormatException("Cannot find \"_\" inside token '" + pair + "'!");
  }
  words[idx] = pair.substring(0, sep);
  posTags[idx] = pair.substring(sep + 1);
 }

 return new POSSample(words, posTags);
}

代码示例来源:origin: apache/opennlp

@Override
 protected Span[] tokenize(CAS cas, AnnotationFS sentence) {
  // Whitespace-tokenize the sentence's covered text and return the token spans.
  String coveredText = sentence.getCoveredText();
  return opennlp.tools.tokenize.WhitespaceTokenizer.INSTANCE.tokenizePos(coveredText);
 }
}

代码示例来源:origin: apache/opennlp

/**
 * Reads the next document sample.
 *
 * Expected line format: the first whitespace separated token is the
 * category, the remaining tokens form the document text.
 *
 * @return the next {@link DocumentSample}, or {@code null} once the
 *         underlying sample stream is exhausted
 * @throws IOException if a line is empty or contains only a category string
 */
public DocumentSample read() throws IOException {
  String line = samples.read();

  if (line == null) {
   return null;
  }

  // Whitespace tokenize entire string
  String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(line);

  if (parts.length <= 1) {
   throw new IOException("Empty lines, or lines with only a category string are not allowed!");
  }

  // First token is the category; everything after it is the document body.
  String[] docTokens = new String[parts.length - 1];
  System.arraycopy(parts, 1, docTokens, 0, parts.length - 1);

  return new DocumentSample(parts[0], docTokens);
 }
}

代码示例来源:origin: apache/opennlp

Objects.requireNonNull(separatorChars, "separatorChars must not be null");
Span[] whitespaceTokenSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);

代码示例来源:origin: apache/opennlp

String typeName = WhitespaceTokenizer.INSTANCE.tokenize(line)[0];

代码示例来源:origin: apache/opennlp

Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
newTokens.clear();
tokProbs.clear();

代码示例来源:origin: apache/opennlp

@Override
 public SentenceSample read() throws IOException {

  SentenceSample input = samples.read();

  if (input == null) {
   return null;
  }

  // Rebuild the document: detokenize each sentence's whitespace tokens and
  // re-join the sentences with a single space, tracking the new boundaries.
  StringBuilder documentText = new StringBuilder();
  List<Span> sentenceSpans = new ArrayList<>();

  for (Span sentenceSpan : input.getSentences()) {
   String sentenceText =
     input.getDocument().substring(sentenceSpan.getStart(), sentenceSpan.getEnd());
   String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sentenceText);

   int begin = documentText.length();
   documentText.append(detokenizer.detokenize(tokens, null));
   sentenceSpans.add(new Span(begin, documentText.length()));
   documentText.append(' ');
  }

  return new SentenceSample(documentText, sentenceSpans.toArray(new Span[0]));
 }
}

代码示例来源:origin: apache/opennlp

Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(line);

代码示例来源:origin: apache/opennlp

String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line);

代码示例来源:origin: apache/opennlp

Span[] candTokens = WhitespaceTokenizer.INSTANCE.tokenizePos(sent);

代码示例来源:origin: apache/opennlp

String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens);

代码示例来源:origin: org.apache.opennlp/opennlp-uima

@Override
 protected Span[] tokenize(CAS cas, AnnotationFS sentence) {
  // Delegate to the shared WhitespaceTokenizer; the returned spans are
  // offsets into the string produced by sentence.getCoveredText().
  return opennlp.tools.tokenize.WhitespaceTokenizer.INSTANCE.
    tokenizePos(sentence.getCoveredText());
 }
}

代码示例来源:origin: apache/opennlp

@Test
 public void testTokenizationOfStringWithoutTokens() {
  // Inputs containing no tokens must yield an empty token array.
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize("").length); // empty
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize(" ").length); // space
  // The comment says "tab" but the literal was a plain space (the \t was
  // lost in transcription, duplicating the case above) — restore the tab.
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize("\t").length); // tab
  Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize("     ").length);
 }
}

代码示例来源:origin: org.apache.opennlp/opennlp-tools

Objects.requireNonNull(separatorChars, "separatorChars must not be null");
Span[] whitespaceTokenSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);

代码示例来源:origin: apache/opennlp

@Test
public void testOneToken() {
 // A single token must be found regardless of leading/trailing whitespace.
 for (String input : new String[] {"one", " one", "one "}) {
  Assert.assertEquals("one", WhitespaceTokenizer.INSTANCE.tokenize(input)[0]);
 }
}

代码示例来源:origin: ai.idylnlp/idylnlp-opennlp-tools-1.8.3

Objects.requireNonNull(separatorChars, "separatorChars must not be null");
Span[] whitespaceTokenSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);

代码示例来源:origin: apache/opennlp

/**
 * Tests if it can tokenize whitespace separated tokens.
 */
@Test
public void testWhitespaceTokenization() {
 String text = "a b c  d     e                f    ";
 String[] tokenizedText = WhitespaceTokenizer.INSTANCE.tokenize(text);
 // Check the count first so index errors below read as a count mismatch,
 // and use assertEquals so failures report expected vs. actual values
 // (assertTrue(x.equals(y)) only reports "false").
 Assert.assertEquals(6, tokenizedText.length);
 Assert.assertEquals("a", tokenizedText[0]);
 Assert.assertEquals("b", tokenizedText[1]);
 Assert.assertEquals("c", tokenizedText[2]);
 Assert.assertEquals("d", tokenizedText[3]);
 Assert.assertEquals("e", tokenizedText[4]);
 Assert.assertEquals("f", tokenizedText[5]);
}

代码示例来源:origin: org.apache.opennlp/opennlp-tools

Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
newTokens.clear();
tokProbs.clear();

代码示例来源:origin: apache/opennlp

@Test
public void testURL() throws Exception {
 // 'text' and 'regexNameFinder' are fields of the enclosing test class
 // (not visible here); presumably 'text' contains the URL below at token
 // index 13 — confirm against the fixture setup.
 String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
 Span[] find = regexNameFinder.find(tokens);
 List<Span> spanList = Arrays.asList(find);
 // Expect a one-token URL entity covering token 13.
 Span urlSpan = new Span(13, 14, "URL");
 Assert.assertTrue(spanList.contains(urlSpan));
 Assert.assertEquals("https://www.google.com", tokens[urlSpan.getStart()]);
}

代码示例来源:origin: ai.idylnlp/idylnlp-opennlp-tools-1.8.3

Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
newTokens.clear();
tokProbs.clear();

30 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com