/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;
import java.io.Serializable;
import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import com.google.common.base.Preconditions;
/**
* Base class for all data processing functions in Crunch.
*
* <p>
* Note that all {@code DoFn} instances implement {@link Serializable}, and thus
* all of their non-transient member variables must implement
* {@code Serializable} as well. If your {@code DoFn} depends on non-serializable
* classes for data processing, they may be declared as {@code transient} and
* initialized in the {@code DoFn}'s {@code initialize} method.
*
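* <p>A minimal sketch of that pattern, using {@code java.security.MessageDigest}
* (which is not {@code Serializable}) as an illustrative dependency; the class
* name and algorithm choice are hypothetical:
*
* <pre>{@code
* public class Md5Fn extends DoFn<String, String> {
*   // Not Serializable, so declared transient and rebuilt per task attempt.
*   private transient MessageDigest digest;
*
*   public void initialize() {
*     try {
*       digest = MessageDigest.getInstance("MD5");
*     } catch (NoSuchAlgorithmException e) {
*       throw new CrunchRuntimeException(e);
*     }
*   }
*
*   public void process(String input, Emitter<String> emitter) {
*     digest.reset();
*     emitter.emit(new BigInteger(1, digest.digest(input.getBytes())).toString(16));
*   }
* }
* }</pre>
*
* @param <S> The type of the input to this function
* @param <T> The type of the output of this function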
*/
public abstract class DoFn<S, T> implements Serializable {
/** This will be null prior to being set in {@link #setContext(TaskInputOutputContext)}. */
@CheckForNull
private transient TaskInputOutputContext<?, ?, ?, ?> context;
/** This will be null prior to being set in {@link #setConfiguration(Configuration)}. */
@CheckForNull
private transient Configuration conf;
/**
* Configure this DoFn. Subclasses may override this method to modify the
* configuration of the Job that this DoFn instance belongs to.
*
* <p>
* Called during the job planning phase by the crunch-client.
*
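* <p>For example, an override might set a tuning property on the job's
* configuration (the property name below is purely illustrative):
*
* <pre>{@code
* public void configure(Configuration conf) {
*   conf.setInt("example.max.record.length", 4096);  // hypothetical setting
* }
* }</pre>
*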
* @param conf
* The Configuration instance for the Job.
*/
public void configure(Configuration conf) {
}
/**
* Initialize this DoFn. This initialization will happen before the actual
* {@link #process(Object, Emitter)} is triggered. Subclasses may override
* this method to do appropriate initialization.
*
* <p>
* Called during the setup of the job instance this {@code DoFn} is associated
* with. See the class documentation above for an example that initializes a
* transient, non-serializable member here.
*
*/
public void initialize() {
}
/**
* Processes the records from a {@link PCollection}.
*
* <p>
* <b>Note:</b> Crunch can reuse a single input record object whose content
* changes on each {@link #process(Object, Emitter)} method call. This
* behavior is imposed by Hadoop's {@code Reducer} implementation: the
* framework reuses the key and value objects that are passed into the
* reduce, so an application must clone any objects it wants to keep a
* copy of.
*
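* <p>A sketch of the defensive copy this implies, assuming the input is a
* Hadoop {@code Writable} (the {@code buffered} field is illustrative):
*
* <pre>{@code
* private transient List<Text> buffered;
*
* public void process(Text input, Emitter<Text> emitter) {
*   // Copy before keeping a reference: the framework may overwrite
*   // the contents of 'input' on the next call.
*   buffered.add(WritableUtils.clone(input, getConfiguration()));
* }
* }</pre>
*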
* @param input
* The input record.
* @param emitter
* The emitter to send the output to
*/
public abstract void process(S input, Emitter<T> emitter);
/**
* Called during the cleanup of the MapReduce job this {@code DoFn} is
* associated with. Subclasses may override this method to do appropriate
* cleanup.
*
* @param emitter
* The emitter that was used for output
*/
public void cleanup(Emitter<T> emitter) {
}
/**
* Called during setup to pass the {@link TaskInputOutputContext} to this
* {@code DoFn} instance. The specified {@code TaskInputOutputContext} must not be null.
*/
public void setContext(@Nonnull TaskInputOutputContext<?, ?, ?, ?> context) {
Preconditions.checkNotNull(context);
this.context = context;
}
/**
* Called during the setup of an initialized {@link org.apache.crunch.types.PType} that
* relies on this instance.
*
* @param conf
* The non-null configuration for the {@code PType} being initialized
*/
public void setConfiguration(@Nonnull Configuration conf) {
Preconditions.checkNotNull(conf);
this.conf = conf;
}
/**
* Returns an estimate of how applying this function to a {@link PCollection}
* will cause it to change in size. The optimizer uses these estimates to
* decide where to break up dependent MR jobs into separate Map and Reduce
* phases in order to minimize I/O.
*
* <p>
* Subclasses of {@code DoFn} that will substantially alter the size of the
* resulting {@code PCollection} should override this method.
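*
* <p>For example, a function that drops roughly nine out of ten records
* might report (the value is illustrative, not a rule):
*
* <pre>{@code
* public float scaleFactor() {
*   return 0.1f;  // output expected to be ~10% the size of the input
* }
* }</pre>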
*/
public float scaleFactor() {
return 0.99f;
}
/**
* By default, Crunch will do a defensive deep copy of the outputs of a
* DoFn when there are multiple downstream consumers of that item, in order to
* prevent the downstream functions from making concurrent modifications to
* data objects. This introduces some extra overhead in cases where you know
* that the downstream code only reads the objects and does not modify them,
* so you can disable this feature by overriding this function to
* return {@code true}.
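*
* <p>For example, a purely read-only stage could opt out of the copy; this
* is only safe if every consumer treats its inputs as immutable:
*
* <pre>{@code
* public boolean disableDeepCopy() {
*   return true;  // skip defensive copies; consumers must not mutate inputs
* }
* }</pre>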
*/
public boolean disableDeepCopy() {
return false;
}
protected TaskInputOutputContext<?, ?, ?, ?> getContext() {
return context;
}
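/**
* Returns the {@code Configuration} set via {@link #setConfiguration(Configuration)},
* falling back to the context's configuration, or null if neither has been set.
*/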
protected Configuration getConfiguration() {
if (conf != null) {
return conf;
} else if (context != null) {
return context.getConfiguration();
} else {
return null;
}
}
/**
* @deprecated The {@link Counter} class changed incompatibly between Hadoop 1
* and 2 (from a class to an interface), so user programs should avoid this
* method and use one of the {@code increment} methods instead, such as
* {@link #increment(Enum)}.
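* <p>For example, instead of {@code getCounter(MyCounters.ERRORS).increment(1)}
* (where {@code MyCounters} is an illustrative user-defined enum), call:
* <pre>{@code
* increment(MyCounters.ERRORS);
* }</pre>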
*/
@Deprecated
protected Counter getCounter(Enum<?> counterName) {
if (context == null) {
return null;
}
return context.getCounter(counterName);
}
/**
* @deprecated The {@link Counter} class changed incompatibly between Hadoop 1
* and 2 (from a class to an interface), so user programs should avoid this
* method and use one of the {@code increment} methods instead, such as
* {@link #increment(Enum)}.
*/
@Deprecated
protected Counter getCounter(String groupName, String counterName) {
if (context == null) {
return null;
}
return context.getCounter(groupName, counterName);
}
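/** Increments the given counter by 1, or does nothing if no task context is set. */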
protected void increment(String groupName, String counterName) {
increment(groupName, counterName, 1);
}
protected void increment(String groupName, String counterName, long value) {
if (context != null) {
context.getCounter(groupName, counterName).increment(value);
}
}
protected void increment(Enum<?> counterName) {
increment(counterName, 1);
}
protected void increment(Enum<?> counterName, long value) {
if (context != null) {
context.getCounter(counterName).increment(value);
}
}
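/** Reports progress to the framework so long-running work is not timed out. */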
protected void progress() {
if (context != null) {
context.progress();
}
}
protected TaskAttemptID getTaskAttemptID() {
if (context == null) {
return null;
}
return context.getTaskAttemptID();
}
protected void setStatus(String status) {
if (context != null) {
context.setStatus(status);
}
}
protected String getStatus() {
if (context == null) {
return null;
}
return context.getStatus();
}
}