/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in org.apache.hadoop.shaded.com.liance
* with the License. You may obtain a copy of the License at
*
 * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.task.annotation.Checkpointable;
import java.util.Iterator;
/**
* Reduces a set of intermediate values which share a key to a smaller set of
* values.
*
 * <p><code>Reducer</code> implementations
 * can access the {@link Configuration} for the job via the
 * {@link JobContext#getConfiguration()} method.</p>
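 *
 * <p>For instance, a minimal sketch of reading a job parameter in
 * {@link #setup(org.apache.hadoop.mapreduce.Reducer.Context)} (the
 * <code>my.min.count</code> property name below is a made-up placeholder):</p>
 * <p><blockquote><pre>
 * private int minCount;
 *
 * protected void setup(Context context) {
 *   Configuration conf = context.getConfiguration();
 *   minCount = conf.getInt("my.min.count", 1);  // hypothetical property
 * }
 * </pre></blockquote></p>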
 *
 * <p><code>Reducer</code> has 3 primary phases:</p>
 * <ol>
 *   <li>
 *   <b id="Shuffle">Shuffle</b>
 *
 *   <p>The <code>Reducer</code> copies the sorted output from each
 *   {@link Mapper} using HTTP across the network.</p>
 *   </li>
 *
 *   <li>
 *   <b id="Sort">Sort</b>
 *
 *   <p>The framework merge sorts <code>Reducer</code> inputs by
 *   <code>key</code>s
 *   (since different <code>Mapper</code>s may have output the same key).</p>
 *
 *   <p>The shuffle and sort phases occur simultaneously, i.e. while outputs are
 *   being fetched they are merged.</p>
 *
 *   <b id="SecondarySort">SecondarySort</b>
 *
 *   <p>To achieve a secondary sort on the values returned by the value
 *   iterator, the application should extend the key with the secondary
 *   key and define a grouping comparator. The keys will be sorted using the
 *   entire key, but will be grouped using the grouping comparator to decide
 *   which keys and values are sent in the same call to reduce. The grouping
 *   comparator is specified via
 *   {@link Job#setGroupingComparatorClass(Class)}. The sort order is
 *   controlled by
 *   {@link Job#setSortComparatorClass(Class)}.</p>
*
*
 *   For example, say that you want to find duplicate web pages and tag them
 *   all with the url of the "best" known example. You would set up the job
 *   like (a configuration sketch follows the list):
 *   <ul>
 *     <li>Map Input Key: url</li>
 *     <li>Map Input Value: document</li>
 *     <li>Map Output Key: document checksum, url pagerank</li>
 *     <li>Map Output Value: url</li>
 *     <li>Partitioner: by checksum</li>
 *     <li>OutputKeyComparator: by checksum and then decreasing pagerank</li>
 *     <li>OutputValueGroupingComparator: by checksum</li>
 *   </ul>
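 *
 *   <p>A minimal sketch of that wiring, assuming hypothetical
 *   <code>ChecksumPartitioner</code>, <code>ChecksumPageRankComparator</code>
 *   and <code>ChecksumGroupingComparator</code> classes (illustrative
 *   placeholders, not part of this API):</p>
 *   <p><blockquote><pre>
 * Job job = Job.getInstance(new Configuration(), "find-duplicates");
 * // partition by checksum
 * job.setPartitionerClass(ChecksumPartitioner.class);
 * // sort by checksum, then decreasing pagerank
 * job.setSortComparatorClass(ChecksumPageRankComparator.class);
 * // group reduce calls by checksum only
 * job.setGroupingComparatorClass(ChecksumGroupingComparator.class);
 *   </pre></blockquote></p>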
 *   </li>
 *
 *   <li>
 *   <b id="Reduce">Reduce</b>
 *
 *   <p>In this phase the
 *   {@link #reduce(Object, Iterable, org.apache.hadoop.mapreduce.Reducer.Context)}
 *   method is called for each <code>&lt;key, (collection of values)&gt;</code> in
 *   the sorted inputs.</p>
 *   <p>The output of the reduce task is typically written to a
 *   {@link RecordWriter} via
 *   {@link Context#write(Object, Object)}.</p>
 *   </li>
 * </ol>
 *
 * <p>The output of the <code>Reducer</code> is <b>not re-sorted</b>.</p>
 *
 * <p>Example:</p>
 * <p><blockquote><pre>
 * public class IntSumReducer&lt;Key&gt; extends
 *     Reducer&lt;Key,IntWritable,Key,IntWritable&gt; {
 *   private IntWritable result = new IntWritable();
 *
 *   public void reduce(Key key, Iterable&lt;IntWritable&gt; values,
 *                      Context context) throws IOException, InterruptedException {
 *     int sum = 0;
 *     for (IntWritable val : values) {
 *       sum += val.get();
 *     }
 *     result.set(sum);
 *     context.write(key, result);
 *   }
 * }
 * </pre></blockquote></p>
 *
* @see Mapper
* @see Partitioner
*/
@Checkpointable
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Reducer<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  /**
   * The <code>Context</code> passed on to the {@link Reducer} implementations.
   */
  public abstract class Context
    implements ReduceContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  }
  /**
   * Called once at the start of the task.
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }
  /**
   * This method is called once for each key. Most applications will define
   * their reduce class by overriding this method. The default implementation
   * is an identity function.
   */
  @SuppressWarnings("unchecked")
  protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context
                        ) throws IOException, InterruptedException {
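    // Identity reduce: emit every value unchanged under its original key.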
    for(VALUEIN value: values) {
      context.write((KEYOUT) key, (VALUEOUT) value);
    }
  }
  /**
   * Called once at the end of the task.
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }
  /**
   * Advanced application writers can use the
   * {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
   * control how the reduce task works.
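   *
   * <p>A minimal sketch of such an override, mirroring the default loop
   * (the comment marks where custom per-key logic could go):</p>
   * <p><blockquote><pre>
   * public void run(Context context) throws IOException, InterruptedException {
   *   setup(context);
   *   try {
   *     while (context.nextKey()) {
   *       // e.g. filter keys or report custom status/progress here
   *       reduce(context.getCurrentKey(), context.getValues(), context);
   *     }
   *   } finally {
   *     cleanup(context);
   *   }
   * }
   * </pre></blockquote></p>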
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKey()) {
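        // One reduce() call consumes all values grouped under the current key.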
        reduce(context.getCurrentKey(), context.getValues(), context);
        // If a backup store is used, reset it
        Iterator<VALUEIN> iter = context.getValues().iterator();
        if(iter instanceof ReduceContext.ValueIterator) {
          ((ReduceContext.ValueIterator<VALUEIN>)iter).resetBackupStore();
        }
      }
    } finally {
      cleanup(context);
    }
  }
}