// org.apache.tez.examples.OrderedWordCount Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.examples;
import java.io.IOException;
import org.apache.tez.client.CallerContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.tez.client.TezClient;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.examples.WordCount.TokenProcessor;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.mapreduce.processor.SimpleMRProcessor;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.api.KeyValuesReader;
import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig;
import org.apache.tez.runtime.library.partitioner.HashPartitioner;
import org.apache.tez.runtime.library.processor.SimpleProcessor;
import com.google.common.base.Preconditions;
/**
 * Simple example that extends the WordCount example to show a chain of processing.
 * The example extends WordCount by sorting the words by their count.
 *
 * <p>The DAG built by {@link #createDAG} has three vertices: a tokenizer (WordCount's
 * {@code TokenProcessor}), a summation vertex ({@link SumProcessor}) and a single-task
 * sorter vertex ({@link NoOpSorter}) that writes the final output.
 */
public class OrderedWordCount extends TezExampleBase {
  // Names of the data source/sink and vertices. The processors look up their inputs and
  // outputs by these names, so they must match the names used when building the DAG.
  private static final String INPUT = WordCount.INPUT;
  private static final String OUTPUT = WordCount.OUTPUT;
  private static final String TOKENIZER = WordCount.TOKENIZER;
  private static final String SUMMATION = WordCount.SUMMATION;
  private static final String SORTER = "Sorter";
  private static final Logger LOG = LoggerFactory.getLogger(OrderedWordCount.class);

  /**
   * SumProcessor similar to WordCount except that it writes the count as key and the
   * word as value. This is because we can use an ordered partitioned key value edge to group
   * the words with the same count (as key) and order the counts.
   */
  public static class SumProcessor extends SimpleProcessor {
    public SumProcessor(ProcessorContext context) {
      super(context);
    }

    /**
     * Reads (word, counts) groups from the tokenizer input, sums the counts of each word and
     * writes (sum, word) to the output that feeds the sorter vertex.
     *
     * @throws IllegalArgumentException if the processor does not have exactly one input and
     *     exactly one output
     */
    @Override
    public void run() throws Exception {
      Preconditions.checkArgument(getInputs().size() == 1,
          "SumProcessor expects exactly one input but found " + getInputs().size());
      Preconditions.checkArgument(getOutputs().size() == 1,
          "SumProcessor expects exactly one output but found " + getOutputs().size());
      // the recommended approach is to cast the reader/writer to a specific type instead
      // of casting the input/output. This allows the actual input/output type to be replaced
      // without affecting the semantic guarantees of the data type that are represented by
      // the reader and writer.
      // The inputs/outputs are referenced via the names assigned in the DAG.
      KeyValueWriter kvWriter = (KeyValueWriter) getOutputs().get(SORTER).getWriter();
      KeyValuesReader kvReader = (KeyValuesReader) getInputs().get(TOKENIZER).getReader();
      while (kvReader.next()) {
        Text word = (Text) kvReader.getCurrentKey();
        int sum = 0;
        for (Object value : kvReader.getCurrentValues()) {
          sum += ((IntWritable) value).get();
        }
        // write the sum as the key and the word as the value
        kvWriter.write(new IntWritable(sum), word);
      }
    }
  }

  /**
   * No-op sorter processor. It does not need to apply any logic since the ordered partitioned
   * edge ensures that we get the data sorted and grouped by the sum key.
   */
  public static class NoOpSorter extends SimpleMRProcessor {
    public NoOpSorter(ProcessorContext context) {
      super(context);
    }

    /**
     * Reads (sum, words) groups from the summation input and writes one (word, sum) pair per
     * word to the data sink, swapping key and value back into word-count order.
     *
     * @throws IllegalArgumentException if the processor does not have exactly one input and
     *     exactly one output
     */
    @Override
    public void run() throws Exception {
      Preconditions.checkArgument(getInputs().size() == 1,
          "NoOpSorter expects exactly one input but found " + getInputs().size());
      Preconditions.checkArgument(getOutputs().size() == 1,
          "NoOpSorter expects exactly one output but found " + getOutputs().size());
      KeyValueWriter kvWriter = (KeyValueWriter) getOutputs().get(OUTPUT).getWriter();
      KeyValuesReader kvReader = (KeyValuesReader) getInputs().get(SUMMATION).getReader();
      while (kvReader.next()) {
        Object sum = kvReader.getCurrentKey();
        for (Object word : kvReader.getCurrentValues()) {
          kvWriter.write(word, sum);
        }
      }
      // deriving from SimpleMRProcessor takes care of committing the output
    }
  }

  /**
   * Builds the three-vertex OrderedWordCount DAG: tokenizer -> summation -> sorter.
   *
   * @param tezConf configuration copied for the input/output builders and used to override
   *     the edge configuration options
   * @param inputPath path handed to the {@code TextInputFormat} data source
   * @param outputPath path handed to the {@code TextOutputFormat} data sink
   * @param numPartitions parallelism of the summation vertex
   * @param disableSplitGrouping when {@code true}, split grouping is turned off on the input
   * @param isGenerateSplitInClient when {@code true}, splits are not generated in the AM
   * @param dagName name given to the created DAG
   * @return the assembled DAG
   * @throws IOException declared by the signature; presumably raised while building the
   *     input/output descriptors (not visible from this file)
   */
  public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
      int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient,
      String dagName) throws IOException {
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf),
        TextInputFormat.class, inputPath).groupSplits(!disableSplitGrouping)
        .generateSplitsInAM(!isGenerateSplitInClient).build();

    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf),
        TextOutputFormat.class, outputPath).build();

    Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
        TokenProcessor.class.getName()));
    tokenizerVertex.addDataSource(INPUT, dataSource);

    // Use Text key and IntWritable value to bring counts for each word in the same partition
    // The setFromConfiguration call is optional and allows overriding the config options with
    // command line parameters.
    OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(),
            HashPartitioner.class.getName())
        .setFromConfiguration(tezConf)
        .build();

    // This vertex will be reading intermediate data via an input edge and writing intermediate
    // data via an output edge.
    Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(
        SumProcessor.class.getName()), numPartitions);

    // Use IntWritable key and Text value to bring all words with the same count in the same
    // partition. The data will be ordered by count and words grouped by count. The
    // setFromConfiguration call is optional and allows overriding the config options with
    // command line parameters.
    OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(IntWritable.class.getName(), Text.class.getName(),
            HashPartitioner.class.getName())
        .setFromConfiguration(tezConf)
        .build();

    // Use 1 task to bring all the data in one place for global sorted order. Essentially the
    // number of partitions is 1. So the NoOpSorter can be used to produce the globally ordered
    // output
    Vertex sorterVertex = Vertex.create(SORTER, ProcessorDescriptor.create(
        NoOpSorter.class.getName()), 1);
    sorterVertex.addDataSink(OUTPUT, dataSink);

    // No need to add jar containing this class as assumed to be part of the tez jars.
    DAG dag = DAG.create(dagName);
    dag.addVertex(tokenizerVertex)
        .addVertex(summationVertex)
        .addVertex(sorterVertex)
        .addEdge(
            Edge.create(tokenizerVertex, summationVertex,
                summationEdgeConf.createDefaultEdgeProperty()))
        .addEdge(
            Edge.create(summationVertex, sorterVertex,
                sorterEdgeConf.createDefaultEdgeProperty()));
    return dag;
  }

  /** Prints the command line synopsis to standard error. */
  @Override
  protected void printUsage() {
    System.err.println("Usage: " + " orderedwordcount in out [numPartitions]");
  }

  /**
   * Checks the positional arguments: {@code in out [numPartitions]}.
   *
   * @param otherArgs the positional command line arguments
   * @return 0 when the arguments are usable; 2 when the argument count is wrong or the
   *     optional {@code numPartitions} is not an integer
   */
  @Override
  protected int validateArgs(String[] otherArgs) {
    if (otherArgs.length < 2 || otherArgs.length > 3) {
      return 2;
    }
    if (otherArgs.length == 3) {
      // Reject a non-numeric partition count here so that runJob's Integer.parseInt cannot
      // fail with an uncaught NumberFormatException later. The numeric range is deliberately
      // not checked; that is left to Vertex.create.
      try {
        Integer.parseInt(otherArgs[2]);
      } catch (NumberFormatException e) {
        System.err.println("Invalid numPartitions (expected an integer): " + otherArgs[2]);
        return 2;
      }
    }
    return 0;
  }

  /**
   * Builds the OrderedWordCount DAG from the arguments and runs it.
   *
   * @param args positional arguments: input path, output path and optionally the number of
   *     partitions for the summation vertex (defaults to 1)
   * @param tezConf configuration used to build the DAG
   * @param tezClient not used directly by this method
   * @return the value returned by {@code runDag}
   */
  @Override
  protected int runJob(String[] args, TezConfiguration tezConf,
      TezClient tezClient) throws Exception {
    int numPartitions = args.length == 3 ? Integer.parseInt(args[2]) : 1;
    DAG dag = createDAG(tezConf, args[0], args[1], numPartitions,
        isDisableSplitGrouping(), isGenerateSplitInClient(), "OrderedWordCount");
    LOG.info("Running OrderedWordCount");
    return runDag(dag, isCountersLog(), LOG);
  }

  /** Command line entry point; exits the JVM with the status returned by the tool. */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new OrderedWordCount(), args);
    System.exit(res);
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy