All downloads are free. The search and download features use the official Maven repository.

com.twitter.elephantbird.cascading3.scheme.CombinedSequenceFile Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.cascading3.scheme;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper;
import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat;

import cascading.flow.FlowProcess;
import cascading.scheme.hadoop.SequenceFile;
import cascading.tap.Tap;
import cascading.tuple.Fields;

/**
 * A {@link SequenceFile} scheme whose splits are combined through the
 * {@link DelegateCombineFileInputFormat} before they are read. It can be used to combine the
 * inputs of intermediate MR jobs in Cascading.
 *
 * <p>To enable, set {@code cascading.flowconnector.intermediateschemeclass} to this class in the
 * Hadoop configuration.
 *
 * <p>The scheme-specific keys {@link #COMPRESS_ENABLE}, {@link #COMPRESS_TYPE} and
 * {@link #COMPRESS_CODEC} are honoured in both {@code sourceConfInit} and {@code sinkConfInit}:
 * each one that is present in the configuration has its value copied onto the corresponding
 * {@code mapreduce.output.fileoutputformat.compress*} key of that same configuration.
 *
 * @author Akihiro Matsukawa
 */
public class CombinedSequenceFile extends SequenceFile {

  // Scheme-specific keys a caller may set to override compression for this scheme only.
  public static final String COMPRESS_ENABLE = "elephantbird.cascading.combinedsequencefile.compress.enable";
  public static final String COMPRESS_TYPE = "elephantbird.cascading.combinedsequencefile.compress.type";
  public static final String COMPRESS_CODEC = "elephantbird.cascading.combinedsequencefile.compress.codec";

  // MapReduce output-compression keys that receive the overriding values.
  private static final String MR_COMPRESS_ENABLE = "mapreduce.output.fileoutputformat.compress";
  private static final String MR_COMPRESS_TYPE = "mapreduce.output.fileoutputformat.compress.type";
  private static final String MR_COMPRESS_CODEC = "mapreduce.output.fileoutputformat.compress.codec";

  /** No-argument constructor; delegates to the {@link SequenceFile} no-argument constructor. */
  protected CombinedSequenceFile() {
    super();
  }

  /**
   * Creates the scheme for the given fields.
   *
   * @param fields the fields handed to the {@link SequenceFile} constructor
   */
  public CombinedSequenceFile(Fields fields) {
    super(fields);
  }

  /**
   * Copies the value stored under {@code overrideKey} onto {@code targetKey}, leaving the
   * configuration untouched when {@code overrideKey} has no value.
   *
   * @param conf        configuration that is both read and written
   * @param overrideKey scheme-specific key to look up
   * @param targetKey   key that receives the value when one is present
   */
  private static void copyIfSet(Configuration conf, String overrideKey, String targetKey) {
    String override = conf.get(overrideKey);
    if (override == null) {
      return;
    }
    conf.set(targetKey, override);
  }

  /**
   * Applies the compression overrides that are local to this scheme: enable flag first, then
   * compression type, then codec.
   *
   * @param conf configuration to update in place
   */
  private void updateJobConfForLocalSettings(Configuration conf) {
    copyIfSet(conf, COMPRESS_ENABLE, MR_COMPRESS_ENABLE);
    copyIfSet(conf, COMPRESS_TYPE, MR_COMPRESS_TYPE);
    copyIfSet(conf, COMPRESS_CODEC, MR_COMPRESS_CODEC);
  }

  /**
   * Runs the parent source initialisation, applies the scheme-local compression overrides and
   * then registers the combining input format chain on {@code conf}.
   *
   * @param flowProcess passed through to the parent implementation
   * @param tap         passed through to the parent implementation
   * @param conf        configuration to update in place
   */
  @Override
  public void sourceConfInit(FlowProcess flowProcess, Tap tap, Configuration conf) {
    super.sourceConfInit(flowProcess, tap, conf);
    updateJobConfForLocalSettings(conf);

    // The EB combiner and Cascading3 both work over the mapreduce API, whereas
    // SequenceFileInputFormat lives in the mapred API. To use the EB combiner, the mapred
    // SequenceFileInputFormat is first wrapped by MapReduceInputFormatWrapper, and that wrapper
    // is then installed as the delegate of DelegateCombineFileInputFormat.
    MapReduceInputFormatWrapper.setWrappedInputFormat(SequenceFileInputFormat.class, conf);
    DelegateCombineFileInputFormat.setDelegateInputFormat(conf, MapReduceInputFormatWrapper.class);
  }

  /**
   * Runs the parent sink initialisation and then applies the scheme-local compression overrides.
   *
   * @param flowProcess passed through to the parent implementation
   * @param tap         passed through to the parent implementation
   * @param conf        configuration to update in place
   */
  @Override
  public void sinkConfInit(FlowProcess flowProcess, Tap tap, Configuration conf) {
    super.sinkConfInit(flowProcess, tap, conf);
    updateJobConfForLocalSettings(conf);
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy