gpt4 book ai didi

info.bliki.wiki.dump.WikiXMLParser类的使用及代码示例

转载 作者:知者 更新时间:2024-03-25 07:31:05 27 4
gpt4 key购买 nike

本文整理了Java中info.bliki.wiki.dump.WikiXMLParser类的一些代码示例,展示了WikiXMLParser类的具体用法。这些代码示例主要来源于GitHub/Stack Overflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WikiXMLParser类的具体详情如下:
包路径:info.bliki.wiki.dump.WikiXMLParser
类名称:WikiXMLParser

WikiXMLParser介绍

[英]A Wikipedia XML dump file parser. Original version used with permission from Marco Schmidt. See: http://schmidt.devlib.org/software/lucene-wikipedia.html
[中]维基百科XML转储文件解析器。原始版本经Marco Schmidt许可使用。请参阅:http://schmidt.devlib.org/software/lucene-wikipedia.html

代码示例

代码示例来源:origin: edu.illinois.cs.cogcomp/wikipediaAPI

/**
 * Runs a {@link WikiXMLParser} over the supplied Wikipedia XML dump stream
 * and then calls {@code finishUp()} on the filter.
 *
 * <p>The caller creates the filter; it is handed to the parser to receive
 * the call backs.
 *
 * @param is     the Wikipedia XML dump stream to parse
 * @param parser the caller-supplied filter passed to the XML parser
 */
public static void parseDump(InputStream is, WikiDumpFilter parser)
    throws UnsupportedEncodingException, FileNotFoundException,
    IOException, SAXException {
  WikiXMLParser dumpParser = new WikiXMLParser(is, parser);
  dumpParser.parse();
  // Only reached when parsing completed without throwing.
  parser.finishUp();
}

代码示例来源:origin: info.bliki.wiki/bliki-core

/**
 * Creates a parser for the given dump file.
 *
 * <p>Delegates to the other constructor, passing it the result of
 * {@code getReader(filename)} together with the filter.
 *
 * @param filename the dump file to read
 * @param filter   the article filter handed to the delegated constructor
 * @throws IOException  declared by this constructor; presumably raised when the file cannot be opened
 * @throws SAXException declared by this constructor; presumably raised when the XML parser cannot be set up
 */
public WikiXMLParser(File filename, IArticleFilter filter) throws IOException, SAXException {
  this(getReader(filename), filter);
}

代码示例来源:origin: stackoverflow.com

import edu.jhu.nlp.wikipedia.*;
  public class InfoboxParser {

  public static void main(String[] args) throws Exception{
    WikiXMLParser parser = WikiXMLParserFactory.getSAXParser("/path_to_wiki_dump/enwiki-20131202-pages-articles-multistream.xml.bz2.xml");
      parser.setPageCallback(new PageCallbackHandler() {
        public void process(WikiPage page) {
         InfoBox infobox=page.getInfoBox();
          //do something with info box
        }
      });
      parser.parse();
  }

}

代码示例来源:origin: stackoverflow.com

// Parse the dump file with the given handler; I/O and SAX failures are
// reported on stderr via their stack trace and otherwise ignored.
File f = new File("c:/path/to/svwiki-20151102-pages-meta-current.xml");
WikiXMLParser wxp;
try {
  wxp = new WikiXMLParser(f, handler);
  wxp.parse();
} catch (IOException | SAXException e) {
  e.printStackTrace();
}

代码示例来源:origin: diegoceccarelli/json-wikipedia

/**
 * Sets up a converter from the xml dump to a json dump.
 *
 * <p>Creates the article parser for the given language, the XML dump parser
 * over {@code inputFile}, and the writer for {@code outputFile}. If the XML
 * parser cannot be created, the error is logged and the JVM exits with
 * status {@code -1}.
 *
 * @param inputFile
 *            - the xml file (compressed)
 * @param outputFile
 *            - the json output file, containing one article per line (if
 *            the filename ends with {@code .gz} the output will be
 *            compressed).
 * @param lang
 *            - the language of the dump
 */
public WikipediaArticleReader(File inputFile, File outputFile, String lang) {
  JsonConverter handler = new JsonConverter();
  parser = new ArticleParser(lang);
  try {
    wxp = new WikiXMLParser(new File(inputFile.getAbsolutePath()), handler);
  } catch (Exception e) {
    // Pass the exception as the trailing argument so the stack trace is
    // logged as well; the message text itself is unchanged.
    logger.error("creating the parser {}", e.toString(), e);
    System.exit(-1);
  }
  out = IOUtils.getPlainOrCompressedUTF8Writer(outputFile
      .getAbsolutePath());
}

代码示例来源:origin: info.bliki.wiki/bliki-core

if (fSiteinfo != null) {
  if (WIKIPEDIA_NAMESPACE.equals(qName) && fNamespaceKey != null) {
    fSiteinfo.addNamespace(fNamespaceKey, getString());
  } else if ("sitename".equals(qName)) {
    fSiteinfo.setSitename(getString());
  } else if ("base".equals(qName)) {
    fSiteinfo.setBase(getString());
  } else if ("generator".equals(qName)) {
    fSiteinfo.setGenerator(getString());
  } else if ("case".equals(qName)) {
    fSiteinfo.setCharacterCase(getString());
  fArticle.setText(getString());
  try {
    fArticleFilter.process(fArticle, fSiteinfo);
  fArticle.setTitle(getString(), fSiteinfo);
} else if (WIKIPEDIA_TIMESTAMP.equals(qName)) {
  fArticle.setTimeStamp(getString());
} else if (!fRevision && WIKIPEDIA_ID.equals(qName)) {
  fArticle.setId(getString());
} else if (fRevision && WIKIPEDIA_ID.equals(qName)) {
  fArticle.setRevisionId(getString());

代码示例来源:origin: diegoceccarelli/json-wikipedia

/**
 * Starts the parsing, closes the output, and logs the article statistics.
 *
 * <p>The output is closed even when parsing fails, so the writer is not
 * leaked; the statistics line is only logged after a successful run.
 *
 * @throws IOException  if parsing or closing the output fails
 * @throws SAXException if the XML parser reports an error
 */
public void start() throws IOException, SAXException {
  try {
    wxp.parse();
  } finally {
    // Always release the output, including when parse() throws.
    out.close();
  }
  logger.info(sw.stat("articles"));
}

代码示例来源:origin: yahoo/FEL

/**
 * Command-line entry point: builds an {@code ExtractFirstParagraphs} handler
 * from {@code arg[1]} and parses the dump named by {@code arg[0]} with it.
 *
 * <p>When fewer than two arguments are given, the usage message is printed
 * and the method returns without attempting to parse.
 *
 * @param arg {@code arg[0]} is the input file, {@code arg[1]} the output file
 */
public static void main( String[] arg ) {
  if( arg.length < 2 ) {
    System.err.println( " USAGE java ExtractFirstParagraphs  <inputFile> <outputFile>" );
    // Stop here; otherwise the arg[1] access below throws
    // ArrayIndexOutOfBoundsException and buries the usage message under a stack trace.
    return;
  }
  try {
    ExtractFirstParagraphs handler = new ExtractFirstParagraphs( arg[ 1 ] );
    WikiXMLParser wxp = new WikiXMLParser( arg[ 0 ], ( IArticleFilter ) handler );
    wxp.parse();
  } catch( Exception e ) {
    e.printStackTrace();
  }
}

代码示例来源:origin: stackoverflow.com

/**
 * Reads the given dump file with the SAX parser and prints the title of
 * every page the parser calls back with.
 *
 * <p>Any exception raised while registering the callback or parsing is
 * reported on stderr and not rethrown.
 *
 * @param dumpfile location of the dump, passed to the parser factory
 */
public void wikiDumpReader(String dumpfile) {
  WikiXMLParser dumpParser = WikiXMLParserFactory.getSAXParser(dumpfile);
  System.out.println("Going to process dump file");
  try {
    PageCallbackHandler titlePrinter = new PageCallbackHandler() {
      @Override
      public void process(WikiPage page) {
        System.out.println(page.getTitle());
      }
    };
    dumpParser.setPageCallback(titlePrinter);
    dumpParser.parse();
  } catch (Exception e) {
    System.err.println("Error :" + e);
  }
}

代码示例来源:origin: yahoo/FEL

/**
 * Command-line entry point: builds an {@code ExtractLinks} handler from
 * {@code arg[1]} and parses the dump named by {@code arg[0]} with it.
 *
 * <p>When fewer than two arguments are given, the usage message is printed
 * and the method returns without attempting to parse.
 *
 * @param arg {@code arg[0]} is the input file, {@code arg[1]} the output file
 */
public static void main(String[] arg) {
  if (arg.length < 2) {
    System.err.println(" USAGE java ExtractLinks  <inputFile> <output file> ");
    // Stop here; otherwise the arg[1] access below throws
    // ArrayIndexOutOfBoundsException and buries the usage message under a stack trace.
    return;
  }
  try {
    ExtractLinks handler = new ExtractLinks(arg[1]);
    WikiXMLParser wxp = new WikiXMLParser(arg[0], (IArticleFilter)handler);
    wxp.parse();
  }
  catch (Exception e) {
    e.printStackTrace();
  }
}

代码示例来源:origin: edu.illinois.cs.cogcomp/wikipediaAPI-multilingual

/**
 * Runs a {@link WikiXMLParser} over the given Wikipedia XML dump file and
 * then calls {@code finishUp()} on the filter.
 *
 * <p>The caller creates the filter; it is handed to the parser to receive
 * the call backs.
 *
 * @param file   the dump file, passed to the XML parser as a string
 * @param parser the caller-supplied filter passed to the XML parser
 * @throws UnsupportedEncodingException declared; may be raised while opening or parsing the dump
 * @throws FileNotFoundException declared; may be raised while opening the dump
 * @throws IOException declared; may be raised while reading the dump
 * @throws SAXException declared; may be raised by the XML parser
 */
public static void parseDump(String file, MLWikiDumpFilter parser)
    throws UnsupportedEncodingException, FileNotFoundException,
    IOException, SAXException {
  WikiXMLParser dumpParser = new WikiXMLParser(file, parser);
  dumpParser.parse();
  // Only reached when parsing completed without throwing.
  parser.finishUp();
}

代码示例来源:origin: edu.illinois.cs.cogcomp/wikipediaAPI

/**
 * Runs a {@link WikiXMLParser} over the given Wikipedia XML dump file and
 * then calls {@code finishUp()} on the filter.
 *
 * <p>The caller creates the filter; it is handed to the parser to receive
 * the call backs.
 *
 * @param file   the dump file, passed to the XML parser as a string
 * @param parser the caller-supplied filter passed to the XML parser
 * @throws UnsupportedEncodingException declared; may be raised while opening or parsing the dump
 * @throws FileNotFoundException declared; may be raised while opening the dump
 * @throws IOException declared; may be raised while reading the dump
 * @throws SAXException declared; may be raised by the XML parser
 */
public static void parseDump(String file, DumpFilter parser)
    throws UnsupportedEncodingException, FileNotFoundException,
    IOException, SAXException {
  WikiXMLParser dumpParser = new WikiXMLParser(file, parser);
  dumpParser.parse();
  // Only reached when parsing completed without throwing.
  parser.finishUp();
}

代码示例来源:origin: info.bliki.wiki/bliki-core

/**
 * Command-line entry point: parses the dump file named by the first
 * argument using a {@code PrintArticle} filter constructed with 999999.
 *
 * <p>Without arguments, prints the usage line to stderr and exits with
 * status 1.
 *
 * @param args {@code args[0]} is the dump file to parse
 * @throws Exception anything thrown while creating or running the parser
 */
public static void main(String args[]) throws Exception {
  if (args.length == 0) {
    System.err.println("PrintArticle <dump.xml>");
    System.exit(1);
    return;
  }
  File dumpFile = new File(args[0]);
  WikiXMLParser parser = new WikiXMLParser(dumpFile, new PrintArticle(999999));
  parser.parse();
}
}

代码示例来源:origin: edu.illinois.cs.cogcomp/DatalessClassification

/**
 * Command-line entry point: parses the single file given on the command
 * line with a {@code DemoArticleFilter} whose annotation hook does nothing,
 * then calls {@code finishUp()} on that filter.
 *
 * <p>Exits with status -1 after printing the usage line unless exactly one
 * argument is supplied. Exceptions raised while parsing are printed as a
 * stack trace and not rethrown.
 *
 * @param args {@code args[0]} is the file to parse
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.println("Usage: Parser <XML-FILE>");
    System.exit(-1);
  }
  String dumpPath = args[0];
  try {
    DemoArticleFilter articleFilter = new DemoArticleFilter(10) {
      @Override
      void processAnnotation(PageParser pageParser) {
        // intentionally empty
      }
    };
    new WikiXMLParser(dumpPath, articleFilter).parse();
    articleFilter.finishUp();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
}

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com