
org.apache.crunch.DoFn

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch;

import java.io.Serializable;

import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;

import com.google.common.base.Preconditions;

/**
 * Base class for all data processing functions in Crunch.
 *
 * <p>
 * Note that all {@code DoFn} instances implement {@link Serializable}, and thus
 * all of their non-transient member variables must implement
 * {@code Serializable} as well. If your DoFn depends on non-serializable
 * classes for data processing, they may be declared as {@code transient} and
 * initialized in the DoFn's {@code initialize} method.
 */
public abstract class DoFn<S, T> implements Serializable {

  /** This will be null prior to being set in {@link #setContext(TaskInputOutputContext)}. */
  @CheckForNull
  private transient TaskInputOutputContext<?, ?, ?, ?> context;

  /** This will be null prior to being set in {@link #setConfiguration(Configuration)}. */
  @CheckForNull
  private transient Configuration conf;

  /**
   * Configure this DoFn. Subclasses may override this method to modify the
   * configuration of the Job that this DoFn instance belongs to.
   *
   * <p>
   * Called during the job planning phase by the crunch-client.
   *
   * @param conf
   *          The Configuration instance for the Job.
   */
  public void configure(Configuration conf) {
  }

  /**
   * Initialize this DoFn. This initialization will happen before the actual
   * {@link #process(Object, Emitter)} is triggered. Subclasses may override
   * this method to do appropriate initialization.
   *
   * <p>
   * Called during the setup of the job instance this {@code DoFn} is associated
   * with.
   */
  public void initialize() {
  }

  /**
   * Processes the records from a {@link PCollection}.
   *
   * <p>
   * Note: Crunch can reuse a single input record object whose content changes
   * on each {@link #process(Object, Emitter)} method call. This functionality
   * is imposed by Hadoop's Reducer implementation: the framework will reuse
   * the key and value objects that are passed into the reduce, therefore the
   * application should clone the objects they want to keep a copy of.
   *
   * @param input
   *          The input record.
   * @param emitter
   *          The emitter to send the output to
   */
  public abstract void process(S input, Emitter<T> emitter);

  /**
   * Called during the cleanup of the MapReduce job this {@code DoFn} is
   * associated with. Subclasses may override this method to do appropriate
   * cleanup.
   *
   * @param emitter
   *          The emitter that was used for output
   */
  public void cleanup(Emitter<T> emitter) {
  }

  /**
   * Called during setup to pass the {@link TaskInputOutputContext} to this
   * {@code DoFn} instance. The specified {@code TaskInputOutputContext} must
   * not be null.
   */
  public void setContext(@Nonnull TaskInputOutputContext<?, ?, ?, ?> context) {
    Preconditions.checkNotNull(context);
    this.context = context;
  }

  /**
   * Called during the setup of an initialized {@link org.apache.crunch.types.PType} that
   * relies on this instance.
   *
   * @param conf
   *          The non-null configuration for the {@code PType} being initialized
   */
  public void setConfiguration(@Nonnull Configuration conf) {
    Preconditions.checkNotNull(conf);
    this.conf = conf;
  }

  /**
   * Returns an estimate of how applying this function to a {@link PCollection}
   * will cause it to change in size. The optimizer uses these estimates to
   * decide where to break up dependent MR jobs into separate Map and Reduce
   * phases in order to minimize I/O.
   *
   * <p>
   * Subclasses of {@code DoFn} that will substantially alter the size of the
   * resulting {@code PCollection} should override this method.
   */
  public float scaleFactor() {
    return 0.99f;
  }

  /**
   * By default, Crunch will do a defensive deep copy of the outputs of a
   * DoFn when there are multiple downstream consumers of that item, in order
   * to prevent the downstream functions from making concurrent modifications
   * to data objects. This introduces some extra overhead in cases where you
   * know that the downstream code is only reading the objects and not
   * modifying them, so you can disable this feature by overriding this
   * function to return {@code true}.
   */
  public boolean disableDeepCopy() {
    return false;
  }

  protected TaskInputOutputContext<?, ?, ?, ?> getContext() {
    return context;
  }

  protected Configuration getConfiguration() {
    if (conf != null) {
      return conf;
    } else if (context != null) {
      return context.getConfiguration();
    } else {
      return null;
    }
  }

  /**
   * @deprecated The {@link Counter} class changed incompatibly between Hadoop 1 and 2
   * (from a class to an interface) so user programs should avoid this method and use
   * one of the increment methods instead, such as {@link #increment(Enum)}.
   */
  @Deprecated
  protected Counter getCounter(Enum<?> counterName) {
    if (context == null) {
      return null;
    }
    return context.getCounter(counterName);
  }

  /**
   * @deprecated The {@link Counter} class changed incompatibly between Hadoop 1 and 2
   * (from a class to an interface) so user programs should avoid this method and use
   * one of the increment methods instead, such as {@link #increment(Enum)}.
   */
  @Deprecated
  protected Counter getCounter(String groupName, String counterName) {
    if (context == null) {
      return null;
    }
    return context.getCounter(groupName, counterName);
  }

  protected void increment(String groupName, String counterName) {
    increment(groupName, counterName, 1);
  }

  protected void increment(String groupName, String counterName, long value) {
    if (context != null) {
      context.getCounter(groupName, counterName).increment(value);
    }
  }

  protected void increment(Enum<?> counterName) {
    increment(counterName, 1);
  }

  protected void increment(Enum<?> counterName, long value) {
    if (context != null) {
      context.getCounter(counterName).increment(value);
    }
  }

  protected void progress() {
    if (context != null) {
      context.progress();
    }
  }

  protected TaskAttemptID getTaskAttemptID() {
    if (context == null) {
      return null;
    }
    return context.getTaskAttemptID();
  }

  protected void setStatus(String status) {
    if (context != null) {
      context.setStatus(status);
    }
  }

  protected String getStatus() {
    if (context == null) {
      return null;
    }
    return context.getStatus();
  }
}
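The class Javadoc above describes keeping non-serializable helpers in transient fields and creating them in initialize(). A minimal sketch of that pattern, assuming a hypothetical Sha256Fn that hashes each input string (MessageDigest is not Serializable, so it cannot be an ordinary member of a Serializable DoFn):

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;

public class Sha256Fn extends DoFn<String, String> {

  // MessageDigest is not Serializable, so it is declared transient and
  // created in initialize(), which runs on the task before process().
  private transient MessageDigest digest;

  @Override
  public void initialize() {
    try {
      digest = MessageDigest.getInstance("SHA-256");
    } catch (NoSuchAlgorithmException e) {
      throw new IllegalStateException(e);
    }
  }

  @Override
  public void process(String input, Emitter<String> emitter) {
    digest.reset();
    byte[] hash = digest.digest(input.getBytes(StandardCharsets.UTF_8));
    emitter.emit(new BigInteger(1, hash).toString(16));
  }
}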

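configure(Configuration) runs on the client during job planning, while getConfiguration() is how a DoFn reads the resolved settings back on the task side. A sketch under the assumption of an application-specific property; the class name PrefixFn and the key example.prefix are hypothetical:

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.hadoop.conf.Configuration;

public class PrefixFn extends DoFn<String, String> {

  // Hypothetical application-specific property key.
  private static final String PREFIX_KEY = "example.prefix";

  private transient String prefix;

  @Override
  public void configure(Configuration conf) {
    // Runs at planning time on the client; seeds a default the tasks can read.
    if (conf.get(PREFIX_KEY) == null) {
      conf.set(PREFIX_KEY, "row-");
    }
  }

  @Override
  public void initialize() {
    // Reads the configuration on the task side before process() is called.
    prefix = getConfiguration().get(PREFIX_KEY, "row-");
  }

  @Override
  public void process(String input, Emitter<String> emitter) {
    emitter.emit(prefix + input);
  }
}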

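The process() Javadoc warns that the framework may hand the same input object to every call, mutating its contents in between. A sketch of a DoFn that buffers a record across calls and therefore copies it, emitting the longest line from cleanup(); LongestLineFn and its logic are illustrative, not part of Crunch:

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.hadoop.io.Text;

public class LongestLineFn extends DoFn<Text, String> {

  private transient Text longest;

  @Override
  public void process(Text input, Emitter<String> emitter) {
    if (longest == null || input.getLength() > longest.getLength()) {
      // The same Text instance may be reused on every call, so keep a copy
      // rather than a reference to the input object.
      longest = new Text(input);
    }
  }

  @Override
  public void cleanup(Emitter<String> emitter) {
    // cleanup() is called when the phase running this DoFn finishes.
    if (longest != null) {
      emitter.emit(longest.toString());
    }
  }
}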

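The protected increment(...) helpers are the recommended way to bump Hadoop counters; the getCounter methods are deprecated because Counter changed incompatibly between Hadoop 1 and 2. A small sketch using an enum-based counter, with made-up class and counter names:

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;

public class ParseIntFn extends DoFn<String, Integer> {

  enum Stat { MALFORMED }

  @Override
  public void process(String input, Emitter<Integer> emitter) {
    try {
      emitter.emit(Integer.parseInt(input.trim()));
    } catch (NumberFormatException e) {
      // Recorded in the job's counters; a no-op when there is no task context.
      increment(Stat.MALFORMED);
      // Equivalent string-based form: increment("parse", "malformed", 1);
    }
  }
}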

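scaleFactor() is only a planning hint: the default of 0.99 says the output is roughly as large as the input. A DoFn that drops most of its records can return a much smaller value so the planner can split the dependent Map and Reduce phases with less I/O. A hypothetical sketch:

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;

public class SampleOnePercentFn extends DoFn<String, String> {

  @Override
  public void process(String input, Emitter<String> emitter) {
    // Keep roughly 1 record in 100, deterministically by hash.
    if (Math.abs(input.hashCode() % 100) == 0) {
      emitter.emit(input);
    }
  }

  @Override
  public float scaleFactor() {
    // Tell the planner the output is roughly 1% of the input size.
    return 0.01f;
  }
}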
