datafu.hourglass.jobs.PartitionCollapsingIncrementalJob Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of datafu-hourglass Show documentation
A framework for incrementally processing data in Hadoop
The newest version!
/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.jobs;

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;

import datafu.hourglass.model.Accumulator;
import datafu.hourglass.model.Mapper;
import datafu.hourglass.model.Merger;

/**
 * A concrete version of {@link AbstractPartitionCollapsingIncrementalJob}.
 * 
 * This provides an alternative to extending {@link AbstractPartitionCollapsingIncrementalJob}.
 * Rather than subclassing that abstract class and implementing its abstract methods, this
 * concrete version can be used directly.  Getters and setters have been provided in place of
 * the abstract methods.
 * 
 * @author "Matthew Hayes"
 *
 */
public class PartitionCollapsingIncrementalJob extends AbstractPartitionCollapsingIncrementalJob
{
  // NOTE(review): Mapper, Accumulator and Merger are used without type arguments here.
  // If they are declared generic, parameterize these fields and the matching
  // getters/setters to agree with the superclass signatures -- confirm against
  // AbstractPartitionCollapsingIncrementalJob before changing.
  private Mapper _mapper;
  private Accumulator _combiner;
  private Accumulator _reducer;
  private Schema _keySchema;
  private Schema _intermediateValueSchema;
  private Schema _outputValueSchema;
  private Merger _merger;
  private Merger _oldMerger;
  // Optional callback invoked from config(Configuration); null means no callback.
  private Setup _setup;

  /**
   * Initializes the job.  The job name is derived from the name of a provided class.
   * 
   * @param cls class to base job name on; its fully qualified name becomes the job name
   * @throws IOException declared for initialization failures; nothing in this constructor body
   *         throws it directly, so it presumably originates in the superclass (TODO confirm)
   */
  public PartitionCollapsingIncrementalJob(Class<?> cls) throws IOException
  {
    setName(cls.getName());
  }

  /**
   * Gets the mapper.
   * 
   * @return the mapper provided through {@link #setMapper(Mapper)}, or null if none was set
   */
  @Override
  public Mapper getMapper()
  {
    return _mapper;
  }

  /**
   * Gets the accumulator for the combiner.
   * 
   * @return the accumulator provided through {@link #setCombinerAccumulator(Accumulator)},
   *         or null if none was set
   */
  @Override
  public Accumulator getCombinerAccumulator()
  {
    return _combiner;
  }
  
  /**
   * Gets the accumulator for the reducer.
   * 
   * @return the accumulator provided through {@link #setReducerAccumulator(Accumulator)},
   *         or null if none was set
   */
  @Override
  public Accumulator getReducerAccumulator()
  {
    return _reducer;
  }

  /**
   * Gets the Avro schema for the key.
   * 
   * @return the schema provided through {@link #setKeySchema(Schema)}, or null if none was set
   */
  @Override
  protected Schema getKeySchema()
  {
    return _keySchema;
  }

  /**
   * Gets the Avro schema for the intermediate value.
   * 
   * @return the schema provided through {@link #setIntermediateValueSchema(Schema)},
   *         or null if none was set
   */
  @Override
  protected Schema getIntermediateValueSchema()
  {
    return _intermediateValueSchema;
  }

  /**
   * Gets the Avro schema for the output data.
   * 
   * @return the schema provided through {@link #setOutputValueSchema(Schema)},
   *         or null if none was set
   */
  @Override
  protected Schema getOutputValueSchema()
  {
    return _outputValueSchema;
  }

  /**
   * Gets the record merger used to merge previous output with new partial output.
   * 
   * @return the merger provided through {@link #setMerger(Merger)}, or null if none was set
   */
  @Override
  public Merger getRecordMerger()
  {
    return _merger;
  }

  /**
   * Gets the record merger used to unmerge old partial output from the new output.
   * 
   * @return the merger provided through {@link #setOldMerger(Merger)}, or null if none was set
   */
  @Override
  public Merger getOldRecordMerger()
  {
    return _oldMerger;
  }

  /**
   * Set the mapper.
   * 
   * @param mapper the mapper to be returned by {@link #getMapper()}
   */
  public void setMapper(Mapper mapper)
  {
    this._mapper = mapper;
  }

  /**
   * Set the accumulator for the combiner
   * 
   * @param combiner accumulator for the combiner
   */
  public void setCombinerAccumulator(Accumulator combiner)
  {
    this._combiner = combiner;
  }

  /**
   * Set the accumulator for the reducer.
   * 
   * @param reducer accumulator for the reducer
   */
  public void setReducerAccumulator(Accumulator reducer)
  {
    this._reducer = reducer;
  }

  /**
   * Sets the Avro schema for the key.
   * 
   * This is also used as the key for the map output.
   * 
   * @param keySchema key schema
   */
  public void setKeySchema(Schema keySchema)
  {
    this._keySchema = keySchema;
  }
  
  /**
   * Sets the Avro schema for the intermediate value.
   * 
   * This is also used for the value for the map output.
   * 
   * @param intermediateValueSchema intermediate value schema
   */
  public void setIntermediateValueSchema(Schema intermediateValueSchema)
  {
    this._intermediateValueSchema = intermediateValueSchema;
  }
  
  /**
   * Sets the Avro schema for the output data.
   *  
   * @param outputValueSchema output value schema
   */
  public void setOutputValueSchema(Schema outputValueSchema)
  {
    this._outputValueSchema = outputValueSchema;
  }

  /**
   * Sets the record merger that is capable of merging previous output with a new partial output.
   * This is only needed when reusing previous output where the intermediate and output schemas are different.
   * New partial output is produced by the reducer from new input that is after the previous output.
   * 
   * @param merger the merger to be returned by {@link #getRecordMerger()}
   */
  public void setMerger(Merger merger)
  {
    this._merger = merger;
  }

  /**
   * Sets the record merger that is capable of unmerging old partial output from the new output.
   * This is only needed when reusing previous output for a fixed-length sliding window.
   * The new output is the result of merging the previous output with the new partial output.
   * The old partial output is produced by the reducer from old input data before the time range of
   * the previous output. 
   * 
   * @param oldMerger merger
   */
  public void setOldMerger(Merger oldMerger)
  {
    this._oldMerger = oldMerger;
  } 
  
  /**
   * Set callback to provide custom configuration before job begins execution.
   * 
   * @param setup object with callback method; may be null, in which case no callback is invoked
   */
  public void setOnSetup(Setup setup)
  {
    this._setup = setup;
  }
  
  /**
   * Configures the job.  The superclass configuration is applied first; afterwards the
   * callback registered through {@link #setOnSetup(Setup)}, if any, is given the same
   * configuration object.
   * 
   * @param conf configuration passed to the superclass and then to the setup callback
   */
  @Override
  public void config(Configuration conf)
  {    
    super.config(conf);
    if (_setup != null)
    {
      _setup.setup(conf);
    }
  } 
}