gpt4 book ai didi

java - Apache Beam 框架 - 按降序排序

转载 作者:塔克拉玛干 更新时间:2023-11-02 19:09:32 25 4
gpt4 key购买 nike

如何使用 Apache Beam 框架按降序排序?

我设法创建了一个字数统计管道,它按字词的字母顺序对输出进行排序,但没有弄清楚如何反转排序顺序。

代码如下:

public class SortedWordCount {

public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.create();
Pipeline p = Pipeline.create(options);

BufferedExternalSorter.Options options1 = BufferedExternalSorter.options();

p.apply(TextIO.read().from("d:/dev/playground/apache/beam/word-count-beam/src/test/resources/bible/whole_bible.txt"))
.apply("ExtractWords", ParDo.of(new DoFn<String, String>() {
@ProcessElement
public void processElement(ProcessContext c) {
for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
if (!word.isEmpty()) {
c.output(word);
}
}
}
}))
.apply(Count.perElement())
.apply(ParDo.of(new DoFn<KV<String, Long>, KV<String, Long>>() {
@ProcessElement
public void processElement(ProcessContext c){
KV<String, Long> element = c.element();
if(element.getKey().length() > 2) {
c.output(element);
}
}
}))
.apply("CreateKey", MapElements.via(new SimpleFunction<KV<String, Long>, KV<String, KV<String, Long>>>() {
public KV<String, KV<String, Long>> apply(KV<String, Long> input) {
return KV.of("sort", KV.of(input.getKey().toLowerCase(), input.getValue()));
}
}))
.apply(GroupByKey.create())
.apply(SortValues.create(options1))
.apply("FormatResults", MapElements.via(new SimpleFunction<KV<String, Iterable<KV<String, Long>>>, String>() {
@Override
public String apply(KV<String, Iterable<KV<String, Long>>> input) {
return StreamSupport.stream(input.getValue().spliterator(), false)
.map(value -> String.format("%20s: %s", value.getKey(), value.getValue()))
.collect(Collectors.joining(String.format("%n")));
}
}))
.apply(TextIO.write().to("bible"));
// Run the pipeline.
p.run().waitUntilFinish();
}
}

此代码生成一个按字母顺序排序的单词列表及其各自的计数:

           aaron: 350
aaronites: 2
abaddon: 1
abagtha: 1
abana: 1
abarim: 4
abase: 4
abased: 4
abasing: 1
abated: 6
abba: 3
abda: 2
abdeel: 1
abdi: 3
abdiel: 1
abdon: 8
abednego: 15
abel: 16
abelbethmaachah: 2
abelmaim: 1

编辑 1:

经过一些调试我知道代码使用类:

org.apache.beam.sdk.extensions.sorter.InMemorySorter

此类在执行排序方法期间使用静态最终比较器:

// Fixed ordering used for record keys: whatever UnsignedBytes.lexicographicalComparator()
// returns (presumably Guava's unsigned, lexicographic byte[] ordering - confirm against
// the class's imports). It is private, static and final, and sort() below reads it
// directly, so this excerpt offers no way for a caller to substitute another ordering.
private static final Comparator<byte[]> COMPARATOR = UnsignedBytes.lexicographicalComparator();

/**
 * Sorts the buffered records by key and returns them.
 *
 * <p>May be invoked only once per instance: the {@code checkState} guard rejects a second
 * call (presumably by throwing IllegalStateException - confirm against checkState's origin).
 *
 * @return an unmodifiable view of {@code records}, ordered by key according to
 *     {@code COMPARATOR}; values take no part in the comparison
 */
public Iterable<KV<byte[], byte[]>> sort() {
checkState(!sortCalled, "sort() can only be called once.");

sortCalled = true;

// Adapts the byte[] comparator to whole records by comparing only their keys.
Comparator<KV<byte[], byte[]>> kvComparator =
new Comparator<KV<byte[], byte[]>>() {

@Override
public int compare(KV<byte[], byte[]> o1, KV<byte[], byte[]> o2) {
return COMPARATOR.compare(o1.getKey(), o2.getKey());
}
};
// Sorts the backing list in place, then exposes it read-only.
Collections.sort(records, kvComparator);
return Collections.unmodifiableList(records);
}

无法在此类中注入(inject)比较器。

最佳答案

我最终采纳了 jkff 的建议,使用 Apache Beam 重写了小型 WordCount。我还去掉了 SortValues,只是将所有记录分组到同一个键下,然后自己对它们进行排序。

这是我想出的:

import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.values.KV;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Locale;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * Word-count pipeline that writes every word (longer than one character) with its
 * count, ordered by word in descending alphabetical order.
 *
 * <p>All counted entries are grouped under one constant key, so the whole result
 * arrives in a single {@code Iterable} that is then sorted in memory and rendered
 * as one multi-line string.
 */
public class DescendingWordCount {

  /** Layout of one output line: word right-aligned to 20 columns, then the count. */
  private static final String LINE_FORMAT = "%20s : %d%n";

  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.create();
    Pipeline p = Pipeline.create(options);
    p.apply(TextIO.read().from("d:/whole_bible.txt"))
        // Tokenize each line; keep tokens longer than one character, lower-cased
        // before counting so that differently-cased spellings are counted together.
        .apply("ExtractWords", ParDo.of(new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
              if (word.length() > 1) {
                // Locale.ROOT keeps lower-casing independent of the JVM's default
                // locale (e.g. the Turkish dotless-i mapping).
                c.output(word.toLowerCase(Locale.ROOT));
              }
            }
          }
        }))
        .apply(Count.perElement())
        // Put every (word, count) pair under the same key so GroupByKey yields one group.
        .apply("CreateKey", ParDo.of(new DoFn<KV<String, Long>, KV<String, KV<String, Long>>>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            KV<String, Long> element = c.element();
            c.output(KV.of("single", KV.of(element.getKey(), element.getValue())));
          }
        }))
        .apply(GroupByKey.create())
        .apply("FormatResults",
            MapElements.via(
                new SimpleFunction<KV<String, Iterable<KV<String, Long>>>, String>() {
                  @Override
                  public String apply(KV<String, Iterable<KV<String, Long>>> input) {
                    return formatDescending(input.getValue());
                  }
                }))
        .apply(TextIO.write().withNumShards(1).to("minimal-wordcount-bible"));
    p.run().waitUntilFinish();
  }

  /**
   * Renders the given word counts as text, one line per word, sorted by word in
   * descending order.
   *
   * @param counts the (word, count) pairs to render
   * @return the concatenated lines, each formatted with {@link #LINE_FORMAT}; empty
   *     if {@code counts} has no elements
   */
  private static String formatDescending(Iterable<KV<String, Long>> counts) {
    return StreamSupport.stream(counts.spliterator(), false)
        .sorted(Comparator.comparing((KV<String, Long> kv) -> kv.getKey()).reversed())
        .map(kv -> String.format(LINE_FORMAT, kv.getKey(), kv.getValue()))
        // Collectors.joining() supplies a correct combiner, unlike a hand-written
        // collect(...) with an empty combiner lambda.
        .collect(Collectors.joining());
  }
}

这会打印出如下输出:

          zuzims : 1
zurishaddai : 5
zuriel : 1
zur : 5
zuph : 3
zuar : 5
zorobabel : 3
zorites : 1
zoreah : 1
zorathites : 1
zorah : 8
zophim : 1
zophar : 4
zophai : 1
zophah : 2
zoheth : 1
zoheleth : 1
zohar : 4
zobebah : 1
zobah : 11
zoba : 2
zoar : 10
zoan : 7
zizah : 1
ziza : 2
ziz : 1
zithri : 1
zipporah : 3

关于java - Apache Beam 框架 - 按降序排序,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/47760301/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com