/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapred.RawKeyValueIterator;
/**
 * Reduces a set of intermediate values which share a key to a smaller set of
 * values.
 *
 * <p><code>Reducer</code> implementations
 * can access the {@link Configuration} for the job via the
 * {@link JobContext#getConfiguration()} method.</p>
 *
 * <p><code>Reducer</code> has 3 primary phases:</p>
 * <ol>
 *   <li>
 *   <b id="Shuffle">Shuffle</b>
 *
 *   <p>The <code>Reducer</code> copies the sorted output from each
 *   {@link Mapper} using HTTP across the network.</p>
 *   </li>
 *
 *   <li>
 *   <b id="Sort">Sort</b>
 *
 *   <p>The framework merge sorts <code>Reducer</code> inputs by
 *   <code>key</code>s
 *   (since different <code>Mapper</code>s may have output the same key).</p>
 *
 *   <p>The shuffle and sort phases occur simultaneously i.e. while outputs are
 *   being fetched they are merged.</p>
 *
 *   <b id="SecondarySort">SecondarySort</b>
 *
 *   <p>To achieve a secondary sort on the values returned by the value
 *   iterator, the application should extend the key with the secondary
 *   key and define a grouping comparator. The keys will be sorted using the
 *   entire key, but will be grouped using the grouping comparator to decide
 *   which keys and values are sent in the same call to reduce. The grouping
 *   comparator is specified via
 *   {@link Job#setGroupingComparatorClass(Class)}. The sort order is
 *   controlled by
 *   {@link Job#setSortComparatorClass(Class)}.</p>
 *
 *   <p>For example, say that you want to find duplicate web pages and tag them
 *   all with the url of the "best" known example. You would set up the job
 *   like:</p>
 *   <ul>
 *     <li>Map Input Key: url</li>
 *     <li>Map Input Value: document</li>
 *     <li>Map Output Key: document checksum, url pagerank</li>
 *     <li>Map Output Value: url</li>
 *     <li>Partitioner: by checksum</li>
 *     <li>OutputKeyComparator: by checksum and then decreasing pagerank</li>
 *     <li>OutputValueGroupingComparator: by checksum</li>
 *   </ul>
 *
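 *   <p>A minimal sketch of the driver-side setup for such a job; the
 *   partitioner and comparator class names below are hypothetical
 *   placeholders that the application would supply, not classes provided
 *   by this package:</p>
 *   <p><blockquote><pre>
 *   Job job = new Job(conf, "tag-duplicate-pages");
 *   job.setPartitionerClass(ChecksumPartitioner.class);        // partition by checksum only
 *   job.setSortComparatorClass(ChecksumRankComparator.class);  // checksum, then decreasing pagerank
 *   job.setGroupingComparatorClass(ChecksumComparator.class);  // group reduce calls by checksum
 *   </pre></blockquote></p>
 *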
 *   </li>
 *
 *   <li>
 *   <b id="Reduce">Reduce</b>
 *
 *   <p>In this phase the
 *   {@link #reduce(Object, Iterable, Context)}
 *   method is called for each <code>&lt;key, (collection of values)&gt;</code> in
 *   the sorted inputs.</p>
 *   <p>The output of the reduce task is typically written to a
 *   {@link RecordWriter} via
 *   {@link Context#write(Object, Object)}.</p>
 *   </li>
 * </ol>
 *
 * <p>The output of the <code>Reducer</code> is <b>not re-sorted</b>.</p>
 *
 * <p>Example:</p>
 * <p><blockquote><pre>
 * public class IntSumReducer&lt;Key&gt; extends
 *     Reducer&lt;Key, IntWritable, Key, IntWritable&gt; {
 *   private IntWritable result = new IntWritable();
 *
 *   public void reduce(Key key, Iterable&lt;IntWritable&gt; values,
 *                      Context context) throws IOException, InterruptedException {
 *     int sum = 0;
 *     for (IntWritable val : values) {
 *       sum += val.get();
 *     }
 *     result.set(sum);
 *     context.write(key, result);
 *   }
 * }
 * </pre></blockquote></p>
 *
*
* @see Mapper
* @see Partitioner
*/
public class Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  public class Context
      extends ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    public Context(Configuration conf, TaskAttemptID taskid,
                   RawKeyValueIterator input,
                   Counter inputKeyCounter,
                   Counter inputValueCounter,
                   RecordWriter<KEYOUT, VALUEOUT> output,
                   OutputCommitter committer,
                   StatusReporter reporter,
                   RawComparator<KEYIN> comparator,
                   Class<KEYIN> keyClass,
                   Class<VALUEIN> valueClass
                   ) throws IOException, InterruptedException {
      super(conf, taskid, input, inputKeyCounter, inputValueCounter,
            output, committer, reporter,
            comparator, keyClass, valueClass);
    }
  }

  /**
   * Called once at the start of the task.
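   *
   * <p>A common use is to read per-job settings from the configuration.
   * A minimal sketch; the property name and the <code>threshold</code>
   * field are hypothetical examples, not part of this API:</p>
   * <p><blockquote><pre>
   * &#64;Override
   * protected void setup(Context context) {
   *   // Assumes the subclass declares an int field named threshold.
   *   threshold = context.getConfiguration().getInt("example.threshold", 1);
   * }
   * </pre></blockquote></p>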
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * This method is called once for each key. Most applications will define
   * their reduce class by overriding this method. The default implementation
   * is an identity function.
   */
  @SuppressWarnings("unchecked")
  protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context
                        ) throws IOException, InterruptedException {
    for (VALUEIN value : values) {
      context.write((KEYOUT) key, (VALUEOUT) value);
    }
  }

  /**
   * Called once at the end of the task.
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Advanced application writers can use the
   * {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
   * control how the reduce task works.
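   *
   * <p>For example, an override can wrap the standard key loop with extra
   * logic. A minimal sketch; <code>shouldSkip</code> is a hypothetical
   * application-defined predicate, not part of this API:</p>
   * <p><blockquote><pre>
   * public void run(Context context) throws IOException, InterruptedException {
   *   setup(context);
   *   while (context.nextKey()) {
   *     // Hypothetical filter: skip keys the application wants to ignore.
   *     if (!shouldSkip(context.getCurrentKey())) {
   *       reduce(context.getCurrentKey(), context.getValues(), context);
   *     }
   *   }
   *   cleanup(context);
   * }
   * </pre></blockquote></p>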
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    while (context.nextKey()) {
      context.progress();
      reduce(context.getCurrentKey(), context.getValues(), context);
    }
    cleanup(context);
  }
}