All downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.hadoop.mapred.lib.Chain Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred.lib;

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.GenericsUtil;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


/**
 * The Chain class provides all the common functionality for the
 * {@link ChainMapper} and the {@link ChainReducer} classes.
 */
class Chain extends org.apache.hadoop.mapreduce.lib.chain.Chain {

  private static final String MAPPER_BY_VALUE = "chain.mapper.byValue";
  private static final String REDUCER_BY_VALUE = "chain.reducer.byValue";

  private JobConf chainJobConf;

  private List mappers = new ArrayList();
  private Reducer reducer;

  // to cache the key/value output class serializations for each chain element
  // to avoid everytime lookup.
  private List mappersKeySerialization =
    new ArrayList();
  private List mappersValueSerialization =
    new ArrayList();
  private Serialization reducerKeySerialization;
  private Serialization reducerValueSerialization;

  /**
   * Creates a Chain instance configured for a Mapper or a Reducer.
   *
   * @param isMap TRUE indicates the chain is for a Mapper, FALSE that is for a
   *              Reducer.
   */
  Chain(boolean isMap) {
    super(isMap);
  }

  /**
   * Adds a Mapper class to the chain job's JobConf.
   * 

* The configuration properties of the chain job have precedence over the * configuration properties of the Mapper. * * @param isMap indicates if the Chain is for a Mapper or for a * Reducer. * @param jobConf chain job's JobConf to add the Mapper class. * @param klass the Mapper class to add. * @param inputKeyClass mapper input key class. * @param inputValueClass mapper input value class. * @param outputKeyClass mapper output key class. * @param outputValueClass mapper output value class. * @param byValue indicates if key/values should be passed by value * to the next Mapper in the chain, if any. * @param mapperConf a JobConf with the configuration for the Mapper * class. It is recommended to use a JobConf without default values using the * JobConf(boolean loadDefaults) constructor with FALSE. */ public static void addMapper(boolean isMap, JobConf jobConf, Class> klass, Class inputKeyClass, Class inputValueClass, Class outputKeyClass, Class outputValueClass, boolean byValue, JobConf mapperConf) { String prefix = getPrefix(isMap); // if a reducer chain check the Reducer has been already set checkReducerAlreadySet(isMap, jobConf, prefix, true); // set the mapper class int index = getIndex(jobConf, prefix); jobConf.setClass(prefix + CHAIN_MAPPER_CLASS + index, klass, Mapper.class); validateKeyValueTypes(isMap, jobConf, inputKeyClass, inputValueClass, outputKeyClass, outputValueClass, index, prefix); // if the Mapper does not have a private JobConf create an empty one if (mapperConf == null) { // using a JobConf without defaults to make it lightweight. // still the chain JobConf may have all defaults and this conf is // overlapped to the chain JobConf one. 
mapperConf = new JobConf(true); } // store in the private mapper conf if it works by value or by reference mapperConf.setBoolean(MAPPER_BY_VALUE, byValue); setMapperConf(isMap, jobConf, inputKeyClass, inputValueClass, outputKeyClass, outputValueClass, mapperConf, index, prefix); } /** * Sets the Reducer class to the chain job's JobConf. *

* The configuration properties of the chain job have precedence over the * configuration properties of the Reducer. * * @param jobConf chain job's JobConf to add the Reducer class. * @param klass the Reducer class to add. * @param inputKeyClass reducer input key class. * @param inputValueClass reducer input value class. * @param outputKeyClass reducer output key class. * @param outputValueClass reducer output value class. * @param byValue indicates if key/values should be passed by value * to the next Mapper in the chain, if any. * @param reducerConf a JobConf with the configuration for the Reducer * class. It is recommended to use a JobConf without default values using the * JobConf(boolean loadDefaults) constructor with FALSE. */ public static void setReducer(JobConf jobConf, Class> klass, Class inputKeyClass, Class inputValueClass, Class outputKeyClass, Class outputValueClass, boolean byValue, JobConf reducerConf) { String prefix = getPrefix(false); checkReducerAlreadySet(false, jobConf, prefix, false); jobConf.setClass(prefix + CHAIN_REDUCER_CLASS, klass, Reducer.class); // if the Reducer does not have a private JobConf create an empty one if (reducerConf == null) { // using a JobConf without defaults to make it lightweight. // still the chain JobConf may have all defaults and this conf is // overlapped to the chain JobConf one. reducerConf = new JobConf(false); } // store in the private reducer conf the input/output classes of the reducer // and if it works by value or by reference reducerConf.setBoolean(REDUCER_BY_VALUE, byValue); setReducerConf(jobConf, inputKeyClass, inputValueClass, outputKeyClass, outputValueClass, reducerConf, prefix); } /** * Configures all the chain elements for the task. * * @param jobConf chain job's JobConf. 
*/ public void configure(JobConf jobConf) { String prefix = getPrefix(isMap); chainJobConf = jobConf; SerializationFactory serializationFactory = new SerializationFactory(chainJobConf); int index = jobConf.getInt(prefix + CHAIN_MAPPER_SIZE, 0); for (int i = 0; i < index; i++) { Class klass = jobConf.getClass(prefix + CHAIN_MAPPER_CLASS + i, null, Mapper.class); JobConf mConf = new JobConf( getChainElementConf(jobConf, prefix + CHAIN_MAPPER_CONFIG + i)); Mapper mapper = ReflectionUtils.newInstance(klass, mConf); mappers.add(mapper); if (mConf.getBoolean(MAPPER_BY_VALUE, true)) { mappersKeySerialization.add(serializationFactory.getSerialization( mConf.getClass(MAPPER_OUTPUT_KEY_CLASS, null))); mappersValueSerialization.add(serializationFactory.getSerialization( mConf.getClass(MAPPER_OUTPUT_VALUE_CLASS, null))); } else { mappersKeySerialization.add(null); mappersValueSerialization.add(null); } } Class klass = jobConf.getClass(prefix + CHAIN_REDUCER_CLASS, null, Reducer.class); if (klass != null) { JobConf rConf = new JobConf( getChainElementConf(jobConf, prefix + CHAIN_REDUCER_CONFIG)); reducer = ReflectionUtils.newInstance(klass, rConf); if (rConf.getBoolean(REDUCER_BY_VALUE, true)) { reducerKeySerialization = serializationFactory .getSerialization(rConf.getClass(REDUCER_OUTPUT_KEY_CLASS, null)); reducerValueSerialization = serializationFactory .getSerialization(rConf.getClass(REDUCER_OUTPUT_VALUE_CLASS, null)); } else { reducerKeySerialization = null; reducerValueSerialization = null; } } } /** * Returns the chain job conf. * * @return the chain job conf. */ protected JobConf getChainJobConf() { return chainJobConf; } /** * Returns the first Mapper instance in the chain. * * @return the first Mapper instance in the chain or NULL if none. */ public Mapper getFirstMap() { return (mappers.size() > 0) ? mappers.get(0) : null; } /** * Returns the Reducer instance in the chain. * * @return the Reducer instance in the chain or NULL if none. 
*/ public Reducer getReducer() { return reducer; } /** * Returns the OutputCollector to be used by a Mapper instance in the chain. * * @param mapperIndex index of the Mapper instance to get the OutputCollector. * @param output the original OutputCollector of the task. * @param reporter the reporter of the task. * @return the OutputCollector to be used in the chain. */ @SuppressWarnings({"unchecked"}) public OutputCollector getMapperCollector(int mapperIndex, OutputCollector output, Reporter reporter) { Serialization keySerialization = mappersKeySerialization.get(mapperIndex); Serialization valueSerialization = mappersValueSerialization.get(mapperIndex); return new ChainOutputCollector(mapperIndex, keySerialization, valueSerialization, output, reporter); } /** * Returns the OutputCollector to be used by a Mapper instance in the chain. * * @param output the original OutputCollector of the task. * @param reporter the reporter of the task. * @return the OutputCollector to be used in the chain. */ @SuppressWarnings({"unchecked"}) public OutputCollector getReducerCollector(OutputCollector output, Reporter reporter) { return new ChainOutputCollector(reducerKeySerialization, reducerValueSerialization, output, reporter); } /** * Closes all the chain elements. * * @throws IOException thrown if any of the chain elements threw an * IOException exception. */ public void close() throws IOException { for (Mapper map : mappers) { map.close(); } if (reducer != null) { reducer.close(); } } // using a ThreadLocal to reuse the ByteArrayOutputStream used for ser/deser // it has to be a thread local because if not it would break if used from a // MultiThreadedMapRunner. private final ThreadLocal threadLocalDataOutputBuffer = new ThreadLocal() { protected DataOutputBuffer initialValue() { return new DataOutputBuffer(1024); } }; /** * OutputCollector implementation used by the chain tasks. *

* If it is not the end of the chain, a {@link #collect} invocation invokes * the next Mapper in the chain. If it is the end of the chain the task * OutputCollector is called. */ private class ChainOutputCollector implements OutputCollector { private int nextMapperIndex; private Serialization keySerialization; private Serialization valueSerialization; private OutputCollector output; private Reporter reporter; /* * Constructor for Mappers */ public ChainOutputCollector(int index, Serialization keySerialization, Serialization valueSerialization, OutputCollector output, Reporter reporter) { this.nextMapperIndex = index + 1; this.keySerialization = keySerialization; this.valueSerialization = valueSerialization; this.output = output; this.reporter = reporter; } /* * Constructor for Reducer */ public ChainOutputCollector(Serialization keySerialization, Serialization valueSerialization, OutputCollector output, Reporter reporter) { this.nextMapperIndex = 0; this.keySerialization = keySerialization; this.valueSerialization = valueSerialization; this.output = output; this.reporter = reporter; } @SuppressWarnings({"unchecked"}) public void collect(K key, V value) throws IOException { if (nextMapperIndex < mappers.size()) { // there is a next mapper in chain // only need to ser/deser if there is next mapper in the chain if (keySerialization != null) { key = makeCopyForPassByValue(keySerialization, key); value = makeCopyForPassByValue(valueSerialization, value); } // gets ser/deser and mapper of next in chain Serialization nextKeySerialization = mappersKeySerialization.get(nextMapperIndex); Serialization nextValueSerialization = mappersValueSerialization.get(nextMapperIndex); Mapper nextMapper = mappers.get(nextMapperIndex); // invokes next mapper in chain nextMapper.map(key, value, new ChainOutputCollector(nextMapperIndex, nextKeySerialization, nextValueSerialization, output, reporter), reporter); } else { // end of chain, user real output collector output.collect(key, value); 
} } private E makeCopyForPassByValue(Serialization serialization, E obj) throws IOException { Serializer ser = serialization.getSerializer(GenericsUtil.getClass(obj)); Deserializer deser = serialization.getDeserializer(GenericsUtil.getClass(obj)); DataOutputBuffer dof = threadLocalDataOutputBuffer.get(); dof.reset(); ser.open(dof); ser.serialize(obj); ser.close(); obj = ReflectionUtils.newInstance(GenericsUtil.getClass(obj), getChainJobConf()); ByteArrayInputStream bais = new ByteArrayInputStream(dof.getData(), 0, dof.getLength()); deser.open(bais); deser.deserialize(obj); deser.close(); return obj; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy