
java - Extending SequenceFileInputFormat to include file name + offset


I would like to be able to create a custom InputFormat that reads sequence files, but additionally exposes the file path and the offset within that file where each record is located.

To take a step back, here's the use case: I have sequence files containing variably-sized data. The keys are mostly irrelevant, and the values are up to a few megabytes, containing a variety of different fields. I would like to index some of those fields in Elasticsearch, along with the file name and offset. That way, I can query those fields from Elasticsearch, and then use the file name and offset to go back to the sequence file and retrieve the original record, instead of storing the whole thing in ES.

I have this whole process working as a single Java program. The SequenceFile.Reader class conveniently provides getPosition and seek methods to make this possible.
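To make that retrieval step concrete, here is a minimal sketch of the lookup side; the RecordFetcher class and its fetch signature are illustrative (not from the original post), and the offset must be a position previously captured via getPosition:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class RecordFetcher {

    // Re-read the single record that starts at `offset` in the given file.
    // `offset` must be a value previously obtained from getPosition().
    public static void fetch(Configuration conf, Path path, long offset) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            reader.seek(offset);              // jump straight to the record boundary
            if (reader.next(key, value)) {
                // ... hand the recovered record back to the caller ...
            }
        } finally {
            reader.close();
        }
    }
}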

However, there will eventually be several terabytes of data involved, so I need to turn this into a MapReduce job (probably map-only). Since the actual keys in the sequence files are irrelevant, the approach I was hoping to take was to create a custom InputFormat that extends or somehow leverages SequenceFileInputFormat, but instead of returning the actual keys, returns a composite key consisting of the file and offset.

However, that is proving more difficult in practice. It seems like it should be possible, but given the actual APIs and what is exposed, it's tricky. Any ideas? Or perhaps an alternative approach I should take?

Best Answer

In case anyone runs into a similar problem, here is the solution I came up with. I ended up simply copying over some of the code from SequenceFileInputFormat/RecordReader and modifying it. I had hoped to write a subclass or a decorator or something... This way isn't pretty, but it works:

SequenceFileOffsetInputFormat.java:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

public class SequenceFileOffsetInputFormat<V extends Writable> extends FileInputFormat<PathOffsetWritable, V> {

    private static class SequenceFileOffsetRecordReader<V extends Writable> extends RecordReader<PathOffsetWritable, V> {

        private SequenceFile.Reader in;
        private long start;
        private long end;
        private boolean more = true;
        private PathOffsetWritable key = null;
        private Writable k = null;   // the real key from the file, read and then discarded
        private V value = null;
        private Configuration conf;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) split;
            conf = context.getConfiguration();
            Path path = fileSplit.getPath();
            FileSystem fs = path.getFileSystem(conf);
            this.in = new SequenceFile.Reader(fs, path, conf);
            try {
                this.k = (Writable) in.getKeyClass().newInstance();
                this.value = (V) in.getValueClass().newInstance();
            } catch (InstantiationException e) {
                throw new IOException(e);
            } catch (IllegalAccessException e) {
                throw new IOException(e);
            }
            this.end = fileSplit.getStart() + fileSplit.getLength();

            // If the split starts past the current position, skip ahead to the
            // first sync marker at or after the split boundary.
            if (fileSplit.getStart() > in.getPosition()) {
                in.sync(fileSplit.getStart());
            }

            this.start = in.getPosition();
            more = start < end;

            key = new PathOffsetWritable(path, start);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (!more) {
                return false;
            }
            // Record the position *before* reading, so it can later be passed
            // to SequenceFile.Reader.seek() to get this record back.
            long pos = in.getPosition();

            more = in.next(k, value);
            if (!more || (pos >= end && in.syncSeen())) {
                key = null;
                value = null;
                more = false;
            } else {
                key.setOffset(pos);
            }
            return more;
        }

        @Override
        public PathOffsetWritable getCurrentKey() {
            return key;
        }

        @Override
        public V getCurrentValue() {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (end == start) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start));
            }
        }

        @Override
        public void close() throws IOException {
            in.close();
        }

    }

    @Override
    public RecordReader<PathOffsetWritable, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        return new SequenceFileOffsetRecordReader<V>();
    }

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // Delegate split calculation to the stock SequenceFileInputFormat.
        return new SequenceFileInputFormat<PathOffsetWritable, V>().getSplits(context);
    }

    @Override
    public long getFormatMinSplitSize() {
        // Never split below one sync interval, same as SequenceFileInputFormat.
        return SequenceFile.SYNC_INTERVAL;
    }

}

PathOffsetWritable.java:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class PathOffsetWritable implements WritableComparable<PathOffsetWritable> {

    private Text t = new Text();
    private Path path;
    private long offset;

    // Hadoop instantiates Writables reflectively during deserialization, so a
    // no-arg constructor is required in addition to the convenience one.
    public PathOffsetWritable() {
    }

    public PathOffsetWritable(Path path, long offset) {
        this.path = path;
        this.offset = offset;
    }

    public Path getPath() {
        return path;
    }

    public long getOffset() {
        return offset;
    }

    public void setPath(Path path) {
        this.path = path;
    }

    public void setOffset(long offset) {
        this.offset = offset;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        t.readFields(in);
        path = new Path(t.toString());
        offset = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        t.set(path.toString());
        t.write(out);
        out.writeLong(offset);
    }

    @Override
    public int compareTo(PathOffsetWritable o) {
        // Order by path first, then by offset within the file.
        int x = path.compareTo(o.path);
        if (x != 0) {
            return x;
        } else {
            return Long.valueOf(offset).compareTo(Long.valueOf(o.offset));
        }
    }

}
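For completeness, here is a minimal sketch of how a map-only job might wire this format up. The OffsetIndexJob and IndexMapper classes are illustrative (not part of the original answer), and Text is assumed as the value type purely for the example:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OffsetIndexJob {

    // Receives (path, offset) as the key and the original record as the value;
    // this is where fields would be extracted and sent to Elasticsearch.
    public static class IndexMapper extends Mapper<PathOffsetWritable, Text, Text, NullWritable> {
        @Override
        protected void map(PathOffsetWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(new Text(key.getPath() + "\t" + key.getOffset()), NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "sequence file offset index");
        job.setJarByClass(OffsetIndexJob.class);
        job.setInputFormatClass(SequenceFileOffsetInputFormat.class);
        job.setMapperClass(IndexMapper.class);
        job.setNumReduceTasks(0);                    // map-only, as described above
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}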

Regarding java - Extending SequenceFileInputFormat to include file name + offset, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/18642875/
