gpt4 book ai didi

org.archive.io.warc.WARCReader类的使用及代码示例

转载 作者:知者 更新时间:2024-03-27 14:47:05 32 4
gpt4 key购买 nike

本文整理了Java中org.archive.io.warc.WARCReader类的一些代码示例,展示了WARCReader类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WARCReader类的具体详情如下:
包路径:org.archive.io.warc.WARCReader
类名称:WARCReader

WARCReader介绍

[英]WARCReader. Go via WARCReaderFactory to get instance.
[中]WARCReader。请通过WARCReaderFactory获取实例。

代码示例

代码示例来源:origin: internetarchive/heritrix3

reader.setDigest(false);
try {
  l.setLevel(Level.WARNING);
  for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
    WARCRecord r = (WARCRecord)i.next();
    if (!isARCType(r.getHeader().getMimetype())) {
    reader.close();

代码示例来源:origin: internetarchive/heritrix3

/**
 * Opens the given WARC file, prepares an {@code ARCWriter} targeting
 * {@code dir}, and hands both to {@code transform(reader, writer)}.
 *
 * <p>Both {@code warc} and {@code dir} are checked with
 * {@code FileUtils.assertReadable} before anything is opened. The writer's
 * metadata holds a single provenance line naming the reader identifier,
 * this class and its revision. The writer inherits the reader's
 * compression setting.
 *
 * @param warc WARC file to read from
 * @param dir directory passed to the writer settings as its only output dir
 * @param prefix prefix passed through to the writer settings
 * @param suffix suffix passed through to the writer settings
 * @param force not read by this method
 * @throws IOException as declared; raised by the calls made here
 * @throws java.text.ParseException as declared
 */
public void transform(final File warc, final File dir, final String prefix,
    final String suffix, final boolean force)
throws IOException, java.text.ParseException {
  FileUtils.assertReadable(warc);
  FileUtils.assertReadable(dir);

  final WARCReader source = WARCReaderFactory.get(warc);

  // Single provenance line recorded in the output's metadata.
  final String provenance = "Made from " + source.getReaderIdentifier()
    + " by " + this.getClass().getName() + "/" + getRevision();
  final List<String> headerLines = new ArrayList<String>();
  headerLines.add(provenance);

  final List<File> outputDirs = Arrays.asList(dir);
  // NOTE(review): -12 is passed as the third settings argument exactly as
  // before; its meaning is defined by WriterPoolSettingsData - confirm.
  final WriterPoolSettingsData settings = new WriterPoolSettingsData(
      prefix, suffix, -12, source.isCompressed(), outputDirs, headerLines);

  final ARCWriter sink = new ARCWriter(new AtomicInteger(), settings);
  transform(source, sink);
}

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

Options options = getOptions();
PosixParser parser = new PosixParser();
CommandLine cmdline = parser.parse(options, args, false);
  usage(formatter, options, 0);
  switch(cmdlineOptions[i].getId()) {
    case 'h':
      usage(formatter, options, 0);
      break;
      digest = getTrueOrFalse(cmdlineOptions[i].getValue());
      break;
        usage(formatter, options, 1);
  if (cmdlineArgs.size() != 1) {
    System.out.println("Error: Pass one arcfile only.");
    usage(formatter, options, 1);
  r.setStrict(strict);
  outputRecord(r, format);
} else {
  for (Iterator<String> i = cmdlineArgs.iterator(); i.hasNext();) {
    try {
      WARCReader r = WARCReaderFactory.get(urlOrPath);
      r.setStrict(strict);
      r.setDigest(digest);

代码示例来源:origin: ViDA-NYU/ache

/**
 * Opens a directory stream over {@code directory} and, when the directory
 * has at least one entry, opens the first entry via {@code openFile} and
 * positions the record iterator on it.
 *
 * <p>If an {@link IOException} occurs after the directory stream has been
 * opened (for example while opening the first file), the stream is closed
 * before the exception is rethrown. The constructor is aborting, so no
 * caller would otherwise get the chance to release it.
 *
 * @param directory folder whose entries are iterated
 * @throws IllegalArgumentException wrapping any {@link IOException} raised
 *     while opening the directory or its first file
 */
public WarcRecordsIterator(Path directory) {
  // Tracks the directory stream once it is open so the failure path can
  // release it without relying on the field's state.
  java.io.Closeable opened = null;
  try {
    filesStream = Files.newDirectoryStream(directory);
    opened = filesStream;
    filesIt = filesStream.iterator();
    if (filesIt.hasNext()) {
      Path file = filesIt.next();
      warcReader = openFile(file);
      warcRecordIterator = warcReader.iterator();
    }
  } catch (IOException e) {
    if (opened != null) {
      try {
        opened.close();
      } catch (IOException closeFailure) {
        // Keep the original failure as the primary cause.
        e.addSuppressed(closeFailure);
      }
    }
    throw new IllegalArgumentException("Failed to open target repository folder: " + directory, e);
  }
}

代码示例来源:origin: lintool/warcbase

for (Iterator<ArchiveRecord> ii = reader.iterator(); ii.hasNext();) {
  WARCRecord r = (WARCRecord) ii.next();
ArchiveRecordHeader h = r.getHeader();
 if (reader != null)
  try {
   reader.close();
  } catch (IOException e) {
   e.printStackTrace();

代码示例来源:origin: ViDA-NYU/ache

warcReader.close();
if (!filesIt.hasNext()) {
  IOUtils.closeQuietly(filesStream);
  filePath = filesIt.next();
  warcReader = openFile(filePath);
  warcRecordIterator = warcReader.iterator();
  nextRecord = (WARCRecord) warcReader.get();
} catch (IOException e) {
  String f = filePath == null ? null : filePath.toString();

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

/**
 * Builds a new {@code WARCRecord} and registers it through
 * {@code currentRecord}.
 *
 * <p>The record is constructed with this reader's identifier and its
 * current digest and strict settings.
 *
 * @param is stream the record is constructed over
 * @param offset absolute offset into the WARC file, passed to the record
 * @return the value returned by {@code currentRecord}, cast to
 *     {@code WARCRecord}
 * @throws IOException as declared; raised by record construction
 */
protected WARCRecord createArchiveRecord(InputStream is, long offset)
throws IOException {
  final WARCRecord fresh = new WARCRecord(
    is, getReaderIdentifier(), offset, isDigest(), isStrict());
  return (WARCRecord) currentRecord(fresh);
}

代码示例来源:origin: org.netpreserve.commons/commons-web

/**
 * Generates a CDX index file for a WARC file.
 *
 * <p>The file is opened through {@code WARCReaderFactory}, with strict
 * mode switched off and digesting switched on, and then passed to
 * {@code output} with the {@code CDX_FILE} format.
 *
 * @param urlOrPath location of the WARC file to index
 * @throws IOException as declared; raised while opening or reading
 * @throws java.text.ParseException as declared
 */
public static void createCDXIndexFile(String urlOrPath)
throws IOException, java.text.ParseException {
  final WARCReader indexSource = WARCReaderFactory.get(urlOrPath);
  indexSource.setStrict(false);
  indexSource.setDigest(true);
  output(indexSource, CDX_FILE);
}

代码示例来源:origin: lintool/warcbase

/**
 * Closes the wrapped {@code reader}.
 *
 * <p>The method is {@code synchronized}, so calls on the same instance do
 * not run concurrently with each other.
 *
 * @throws IOException if closing the wrapped reader fails
 */
@Override
 public synchronized void close() throws IOException {
  reader.close();
 }
}

代码示例来源:origin: org.netpreserve.commons/commons-web

/**
 * Consumes the end-of-record terminator so the input is lined up on the
 * start of the next record.
 *
 * @param record record that must already have been read to its end
 * @throws IOException if {@code record} still reports available bytes, or
 *     as raised while reading the terminator
 */
protected void gotoEOR(ArchiveRecord record) throws IOException {
  final boolean exhausted = record.available() == 0;
  if (!exhausted) {
    throw new IOException("Record should be exhausted before coming in here");
  }
  // A record is terminated by two consecutive CRLF pairs; read both.
  for (int pair = 0; pair < 2; pair++) {
    readExpectedChar(getIn(), CRLF.charAt(0));
    readExpectedChar(getIn(), CRLF.charAt(1));
  }
}

代码示例来源:origin: lintool/warcbase

/**
 * Parses an in-memory byte array into a {@code WARCRecord}.
 *
 * <p>The bytes are wrapped in a buffered stream, a reader is obtained from
 * {@code WARCReaderFactory} with an empty identifier, and the record
 * returned by the reader's {@code get()} is handed back. The reader is not
 * closed here.
 *
 * @param bytes raw bytes to parse
 * @return the record returned by the reader, cast to {@code WARCRecord}
 * @throws IOException as declared; raised while opening or reading
 */
public static WARCRecord fromBytes(byte[] bytes) throws IOException {
 final BufferedInputStream buffered =
   new BufferedInputStream(new ByteArrayInputStream(bytes));
 // NOTE(review): the trailing false is forwarded unchanged; its meaning is
 // defined by WARCReaderFactory.get - confirm against that API.
 final WARCReader inMemoryReader =
   (WARCReader) WARCReaderFactory.get("", buffered, false);
 return (WARCRecord) inMemoryReader.get();
}

代码示例来源:origin: org.netpreserve.commons/commons-web

Options options = getOptions();
PosixParser parser = new PosixParser();
CommandLine cmdline = parser.parse(options, args, false);
  usage(formatter, options, 0);
  switch(cmdlineOptions[i].getId()) {
    case 'h':
      usage(formatter, options, 0);
      break;
      digest = getTrueOrFalse(cmdlineOptions[i].getValue());
      break;
        usage(formatter, options, 1);
  if (cmdlineArgs.size() != 1) {
    System.out.println("Error: Pass one arcfile only.");
    usage(formatter, options, 1);
  r.setStrict(strict);
  outputRecord(r, format);
} else {
  for (Iterator<String> i = cmdlineArgs.iterator(); i.hasNext();) {
    try {
      WARCReader r = WARCReaderFactory.get(urlOrPath);
      r.setStrict(strict);
      r.setDigest(digest);

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

/**
 * Prints every record of this reader to standard output.
 *
 * <p>For each record the header's string form is printed on its own line,
 * the record's own {@code dump()} is invoked, and a blank line follows.
 *
 * @param compress not read by this implementation
 * @throws IOException as declared; raised while dumping a record
 * @throws java.text.ParseException as declared
 */
@Override
public void dump(boolean compress)
throws IOException, java.text.ParseException {
  final Iterator<ArchiveRecord> records = iterator();
  while (records.hasNext()) {
    final ArchiveRecord current = records.next();
    final String headerText = current.getHeader().toString();
    System.out.println(headerText);
    current.dump();
    System.out.println();
  }
}

代码示例来源:origin: ViDA-NYU/ache

boolean readSecond = false;
for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
  WARCRecord ar = (WARCRecord) i.next();
  if (!readWarcInfoRecord) {
reader.close();

代码示例来源:origin: iipc/webarchive-commons

/**
 * Creates the next {@code WARCRecord} and passes it through
 * {@code currentRecord}.
 *
 * <p>The new record receives this reader's identifier together with the
 * reader's current digest and strict flags.
 *
 * @param is input stream the record is built on
 * @param offset absolute offset into the WARC file, given to the record
 * @return the result of {@code currentRecord}, cast to {@code WARCRecord}
 * @throws IOException as declared; raised by record construction
 */
protected WARCRecord createArchiveRecord(InputStream is, long offset)
throws IOException {
  final WARCRecord created = new WARCRecord(
    is, getReaderIdentifier(), offset, isDigest(), isStrict());
  return (WARCRecord) currentRecord(created);
}

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

/**
 * Writes a CDX index for the WARC file at the given location.
 *
 * <p>A reader is obtained from {@code WARCReaderFactory}, configured as
 * non-strict with digesting enabled, and passed to {@code output} together
 * with the {@code CDX_FILE} format.
 *
 * @param urlOrPath URL or path of the WARC file to index
 * @throws IOException as declared; raised while opening or reading
 * @throws java.text.ParseException as declared
 */
public static void createCDXIndexFile(String urlOrPath)
throws IOException, java.text.ParseException {
  final WARCReader warcReader = WARCReaderFactory.get(urlOrPath);
  warcReader.setStrict(false);
  warcReader.setDigest(true);
  output(warcReader, CDX_FILE);
}

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

/**
 * Reads past the trailing line terminators of a finished record so the
 * next read starts at the following record.
 *
 * @param record record expected to have no bytes left to read
 * @throws IOException if {@code record} still reports available bytes, or
 *     as raised while reading the terminator
 */
protected void gotoEOR(ArchiveRecord record) throws IOException {
  if (record.available() != 0) {
    final String complaint = "Record should be exhausted before coming in here";
    throw new IOException(complaint);
  }
  // Terminator is CR, LF, CR, LF: four reads alternating the two CRLF chars.
  for (int position = 0; position < 4; position++) {
    readExpectedChar(getIn(), CRLF.charAt(position % 2));
  }
}

代码示例来源:origin: iipc/openwayback

/**
 * Fetches the record at {@code offset} of the archive at {@code url} and
 * converts it into a {@code Resource}.
 *
 * <p>The reader is obtained from {@code defaultTimeoutReader}. ARC and
 * WARC readers are each handed to their matching conversion helper; any
 * other reader is rejected. On success the elapsed wall-clock time is
 * reported to {@code PerformanceLogger} under the name
 * {@code "Http11Resource"}.
 *
 * @param url location of the archive file
 * @param offset offset passed to the reader factory
 * @return the converted resource
 * @throws IOException as declared; raised while opening or reading
 * @throws ResourceNotAvailableException if the reader is neither an
 *     {@code ARCReader} nor a {@code WARCReader}
 */
public static Resource getResource(URL url, long offset)
throws IOException, ResourceNotAvailableException {

  final long startedAt = System.currentTimeMillis();
  final TimeoutArchiveReaderFactory factory = defaultTimeoutReader;
  final ArchiveReader opened = factory.getArchiveReader(url, offset);

  final Resource resource;
  if (opened instanceof ARCReader) {
    final ARCReader arc = (ARCReader) opened;
    resource = ARCArchiveRecordToResource(arc.get(), arc);
  } else if (opened instanceof WARCReader) {
    final WARCReader warc = (WARCReader) opened;
    resource = WARCArchiveRecordToResource(warc.get(), warc);
  } else {
    throw new ResourceNotAvailableException("Unknown ArchiveReader");
  }

  final long elapsed = System.currentTimeMillis() - startedAt;
  PerformanceLogger.noteElapsed("Http11Resource", elapsed, url.toExternalForm());
  return resource;
}

代码示例来源:origin: iipc/webarchive-commons

Options options = getOptions();
PosixParser parser = new PosixParser();
CommandLine cmdline = parser.parse(options, args, false);
  usage(formatter, options, 0);
  switch(cmdlineOptions[i].getId()) {
    case 'h':
      usage(formatter, options, 0);
      break;
      digest = getTrueOrFalse(cmdlineOptions[i].getValue());
      break;
        usage(formatter, options, 1);
  if (cmdlineArgs.size() != 1) {
    System.out.println("Error: Pass one arcfile only.");
    usage(formatter, options, 1);
  r.setStrict(strict);
  outputRecord(r, format);
} else {
  for (Iterator<String> i = cmdlineArgs.iterator(); i.hasNext();) {
    try {
      WARCReader r = WARCReaderFactory.get(urlOrPath);
      r.setStrict(strict);
      r.setDigest(digest);

代码示例来源:origin: org.archive.heritrix/heritrix-commons

reader.setDigest(false);
try {
  l.setLevel(Level.WARNING);
  for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
    WARCRecord r = (WARCRecord)i.next();
    if (!isARCType(r.getHeader().getMimetype())) {
    reader.close();

32 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com