/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.task.annotation.Checkpointable;

import java.util.Iterator;

/** 
 * Reduces a set of intermediate values which share a key to a smaller set of
 * values.  
 * 
 * <p><code>Reducer</code> implementations 
 * can access the {@link Configuration} for the job via the 
 * {@link JobContext#getConfiguration()} method.</p>
 *
 * <p><code>Reducer</code> has 3 primary phases:</p>
 * <ol>
 *   <li>
 *   <b id="Shuffle">Shuffle</b>
 *   
 *   <p>The <code>Reducer</code> copies the sorted output from each 
 *   {@link Mapper} using HTTP across the network.</p>
 *   </li>
 *   
 *   <li>
 *   <b id="Sort">Sort</b>
 *   
 *   <p>The framework merge sorts <code>Reducer</code> inputs by 
 *   <code>key</code>s 
 *   (since different <code>Mapper</code>s may have output the same key).</p>
 *   
 *   <p>The shuffle and sort phases occur simultaneously, i.e. while outputs
 *   are being fetched they are merged.</p>
 *   
 *   <b id="SecondarySort">SecondarySort</b>
 *   
 *   <p>To achieve a secondary sort on the values returned by the value 
 *   iterator, the application should extend the key with the secondary
 *   key and define a grouping comparator. The keys will be sorted using the
 *   entire key, but will be grouped using the grouping comparator to decide
 *   which keys and values are sent in the same call to reduce. The grouping 
 *   comparator is specified via 
 *   {@link Job#setGroupingComparatorClass(Class)}. The sort order is 
 *   controlled by 
 *   {@link Job#setSortComparatorClass(Class)}.</p>
 *   
 *   For example, say that you want to find duplicate web pages and tag them 
 *   all with the url of the "best" known example. You would set up the job 
 *   like:
 *   <ul>
 *     <li>Map Input Key: url</li>
 *     <li>Map Input Value: document</li>
 *     <li>Map Output Key: document checksum, url pagerank</li>
 *     <li>Map Output Value: url</li>
 *     <li>Partitioner: by checksum</li>
 *     <li>OutputKeyComparator: by checksum and then decreasing pagerank</li>
 *     <li>OutputValueGroupingComparator: by checksum</li>
 *   </ul>
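 *   
 *   <p>A driver for such a job might be configured as sketched below.
 *   This is a minimal sketch: the partitioner and comparator class names
 *   are hypothetical placeholders for application-supplied implementations,
 *   not classes shipped with Hadoop.</p>
 *   <p><blockquote><pre>
 *   Job job = Job.getInstance(new Configuration(), "find-duplicate-pages");
 *   // partition by checksum so equal checksums meet in one reduce task
 *   job.setPartitionerClass(ChecksumPartitioner.class);
 *   // full sort order: by checksum, then by decreasing pagerank
 *   job.setSortComparatorClass(ChecksumThenRankComparator.class);
 *   // group by checksum only, so all urls for one checksum share a reduce call
 *   job.setGroupingComparatorClass(ChecksumGroupingComparator.class);
 *   </pre></blockquote>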
 *   </li>
 *   
 *   <li>
 *   <b id="Reduce">Reduce</b>
 *   
 *   <p>In this phase the 
 *   {@link #reduce(Object, Iterable, org.apache.hadoop.mapreduce.Reducer.Context)}
 *   method is called for each <key, (collection of values)> in
 *   the sorted inputs.</p>
 *   
 *   <p>The output of the reduce task is typically written to a 
 *   {@link RecordWriter} via 
 *   {@link Context#write(Object, Object)}.</p>
 *   </li>
 * </ol>
 * 
 * <p>The output of the <code>Reducer</code> is <b>not re-sorted</b>.</p>
 * 
 * <p>Example:</p>
 * <p><blockquote><pre>
 * public class IntSumReducer<Key> extends Reducer<Key,IntWritable,
 *                                                 Key,IntWritable> {
 *   private IntWritable result = new IntWritable();
 * 
 *   public void reduce(Key key, Iterable<IntWritable> values,
 *                      Context context) throws IOException, InterruptedException {
 *     int sum = 0;
 *     for (IntWritable val : values) {
 *       sum += val.get();
 *     }
 *     result.set(sum);
 *     context.write(key, result);
 *   }
 * }
 * 
 * </pre></blockquote>
 * 
 * @see Mapper
 * @see Partitioner 
 */
@Checkpointable
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Reducer<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {

  /**
   * The <code>Context</code> passed on to the {@link Reducer} implementations.
   */
  public abstract class Context 
    implements ReduceContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  }

  /**
   * Called once at the start of the task.
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * This method is called once for each key. Most applications will define
   * their reduce class by overriding this method. The default implementation
   * is an identity function.
   */
  @SuppressWarnings("unchecked")
  protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context
                        ) throws IOException, InterruptedException {
    for (VALUEIN value : values) {
      context.write((KEYOUT) key, (VALUEOUT) value);
    }
  }

  /**
   * Called once at the end of the task.
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Advanced application writers can use the 
   * {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
   * control how the reduce task works.
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context);
        // If a backup store is used, reset it
        Iterator<VALUEIN> iter = context.getValues().iterator();
        if (iter instanceof ReduceContext.ValueIterator) {
          ((ReduceContext.ValueIterator<VALUEIN>) iter).resetBackupStore();
        }
      }
    } finally {
      cleanup(context);
    }
  }
}
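
As a usage illustration, the IntSumReducer from the Javadoc above can be wired into a job roughly as follows. This is a hedged sketch, not part of the Hadoop sources: the TokenizingMapper class and the input/output path arguments are hypothetical placeholders; the Job, FileInputFormat, and FileOutputFormat calls are standard Hadoop MapReduce APIs.

// Minimal word-count style driver wiring a Reducer subclass into a job.
// Assumes a mapper emitting <Text, IntWritable> pairs; TokenizingMapper
// is a hypothetical placeholder for such a mapper.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IntSumDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "int-sum");
    job.setJarByClass(IntSumDriver.class);
    job.setMapperClass(TokenizingMapper.class); // hypothetical mapper
    // IntSumReducer is associative and commutative, so it can also
    // run map-side as a combiner to cut shuffle volume.
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}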
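
The run(Context) method above is also the documented hook for advanced writers who want to control the reduce loop itself. Below is a minimal sketch of overriding it, assuming a Text/IntWritable schema; the class name and key counter are illustrative, and only the setup/nextKey/getCurrentKey/getValues/reduce/cleanup calls come from the Reducer API.

// Sketch of a Reducer that overrides run() to count distinct keys processed.
// Unlike the stock run(), this sketch omits the backup-store reset, which
// only matters when values are re-iterated via a MarkableIterator.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CountingIntSumReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {
  private long keysSeen = 0; // illustrative per-task counter

  @Override
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKey()) {
        keysSeen++;
        reduce(context.getCurrentKey(), context.getValues(), context);
      }
    } finally {
      cleanup(context);
    }
  }

  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();
    }
    context.write(key, new IntWritable(sum));
  }
}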



