/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData.Builder;
import com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn;
import com.google.cloud.dataflow.sdk.util.DirectModeExecutionContext;
import com.google.cloud.dataflow.sdk.util.DirectSideInputReader;
import com.google.cloud.dataflow.sdk.util.DoFnRunner;
import com.google.cloud.dataflow.sdk.util.DoFnRunnerBase;
import com.google.cloud.dataflow.sdk.util.DoFnRunners;
import com.google.cloud.dataflow.sdk.util.IllegalMutationException;
import com.google.cloud.dataflow.sdk.util.MutationDetector;
import com.google.cloud.dataflow.sdk.util.MutationDetectors;
import com.google.cloud.dataflow.sdk.util.PTuple;
import com.google.cloud.dataflow.sdk.util.SerializableUtils;
import com.google.cloud.dataflow.sdk.util.SideInputReader;
import com.google.cloud.dataflow.sdk.util.StringUtils;
import com.google.cloud.dataflow.sdk.util.UserCodeException;
import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import com.google.cloud.dataflow.sdk.values.TypedPValue;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;

import javax.annotation.Nullable;

/**
 * {@link ParDo} is the core element-wise transform in Google Cloud
 * Dataflow, invoking a user-specified function on each of the elements of the input
 * {@link PCollection} to produce zero or more output elements, all
 * of which are collected into the output {@link PCollection}.
 *
 * <p>Elements are processed independently, and possibly in parallel across
 * distributed cloud resources.
 *
 * <p>The {@link ParDo} processing style is similar to what happens inside
 * the "Mapper" or "Reducer" class of a MapReduce-style algorithm.
 *
 * <h2>{@link DoFn DoFns}</h2>
 *
 * <p>The function to use to process each element is specified by a
 * {@link DoFn DoFn&lt;InputT, OutputT&gt;}, primarily via its
 * {@link DoFn#processElement processElement} method. The {@link DoFn} may also
 * override the default implementations of {@link DoFn#startBundle startBundle}
 * and {@link DoFn#finishBundle finishBundle}.
 *
 * <p>Conceptually, when a {@link ParDo} transform is executed, the
 * elements of the input {@link PCollection} are first divided up
 * into some number of "bundles". These are farmed off to distributed
 * worker machines (or run locally, if using the {@link DirectPipelineRunner}).
 * For each bundle of input elements, processing proceeds as follows:
 *
 * <ol>
 *   <li>A fresh instance of the argument {@link DoFn} is created on a worker. This may
 *   be through deserialization or other means. If the {@link DoFn} subclass
 *   does not override {@link DoFn#startBundle startBundle} or
 *   {@link DoFn#finishBundle finishBundle} then this may be optimized since
 *   it cannot observe the start and end of a bundle.</li>
 *   <li>The {@link DoFn DoFn's} {@link DoFn#startBundle} method is called to
 *   initialize it. If this method is not overridden, the call may be optimized
 *   away.</li>
 *   <li>The {@link DoFn DoFn's} {@link DoFn#processElement} method
 *   is called on each of the input elements in the bundle.</li>
 *   <li>The {@link DoFn DoFn's} {@link DoFn#finishBundle} method is called
 *   to complete its work. After {@link DoFn#finishBundle} is called, the
 *   framework will never again invoke any of these three processing methods.
 *   If this method is not overridden, this call may be optimized away.</li>
 * </ol>
 *
 * <p>Each of the calls to any of the {@link DoFn DoFn's} processing
 * methods can produce zero or more output elements. All of the
 * output elements from all of the {@link DoFn} instances
 * are included in the output {@link PCollection}.
 *
 * <p>For example:
 *
 * <pre> {@code
 * PCollection<String> lines = ...;
 * PCollection<String> words =
 *     lines.apply(ParDo.of(new DoFn<String, String>() {
 *         public void processElement(ProcessContext c) {
 *           String line = c.element();
 *           for (String word : line.split("[^a-zA-Z']+")) {
 *             c.output(word);
 *           }
 *         }}));
 * PCollection<Integer> wordLengths =
 *     words.apply(ParDo.of(new DoFn<String, Integer>() {
 *         public void processElement(ProcessContext c) {
 *           String word = c.element();
 *           Integer length = word.length();
 *           c.output(length);
 *         }}));
 * } </pre>
 *
 * <p>Each output element has the same timestamp and is in the same windows
 * as its corresponding input element, and the output {@code PCollection}
 * has the same {@link WindowFn} associated with it as the input.
 *
 * <h2>Naming {@link ParDo ParDo} transforms</h2>
 *
 * <p>The name of a transform is used to provide a name for any node in the
 * {@link Pipeline} graph resulting from application of the transform.
 * It is best practice to provide a name at the time of application,
 * via {@link PCollection#apply(String, PTransform)}. Otherwise,
 * a unique name - which may not be stable across pipeline revisions -
 * will be generated, based on the transform name.
 *
 * <p>If a {@link ParDo} is applied exactly once inlined, then
 * it can be given a name via {@link #named}. For example:
 *
 * <pre> {@code
 * PCollection<String> words =
 *     lines.apply(ParDo.named("ExtractWords")
 *                      .of(new DoFn<String, String>() { ... }));
 * PCollection<Integer> wordLengths =
 *     words.apply(ParDo.named("ComputeWordLengths")
 *                      .of(new DoFn<String, Integer>() { ... }));
 * } </pre>
 *
 * <h2>Side Inputs</h2>
 *
 * <p>While a {@link ParDo} processes elements from a single "main input"
 * {@link PCollection}, it can take additional "side input"
 * {@link PCollectionView PCollectionViews}. These side input
 * {@link PCollectionView PCollectionViews} express styles of accessing
 * {@link PCollection PCollections} computed by earlier pipeline operations,
 * passed in to the {@link ParDo} transform using
 * {@link #withSideInputs}, and their contents accessible to each of
 * the {@link DoFn} operations via {@link DoFn.ProcessContext#sideInput sideInput}.
 * For example:
 *
 * <pre> {@code
 * PCollection<String> words = ...;
 * PCollection<Integer> maxWordLengthCutOff = ...; // Singleton PCollection
 * final PCollectionView<Integer> maxWordLengthCutOffView =
 *     maxWordLengthCutOff.apply(View.<Integer>asSingleton());
 * PCollection<String> wordsBelowCutOff =
 *     words.apply(ParDo.withSideInputs(maxWordLengthCutOffView)
 *                      .of(new DoFn<String, String>() {
 *         public void processElement(ProcessContext c) {
 *           String word = c.element();
 *           int lengthCutOff = c.sideInput(maxWordLengthCutOffView);
 *           if (word.length() <= lengthCutOff) {
 *             c.output(word);
 *           }
 *         }}));
 * } </pre>
 *
 * <h2>Side Outputs</h2>
 *
 * <p>Optionally, a {@link ParDo} transform can produce multiple
 * output {@link PCollection PCollections}, both a "main output"
 * {@code PCollection} plus any number of "side output"
 * {@link PCollection PCollections}, each keyed by a distinct {@link TupleTag},
 * and bundled in a {@link PCollectionTuple}. The {@link TupleTag TupleTags}
 * to be used for the output {@link PCollectionTuple} are specified by
 * invoking {@link #withOutputTags}. Unconsumed side outputs do not
 * necessarily need to be explicitly specified, even if the {@link DoFn}
 * generates them. Within the {@link DoFn}, an element is added to the
 * main output {@link PCollection} as normal, using
 * {@link DoFn.Context#output}, while an element is added to a side output
 * {@link PCollection} using {@link DoFn.Context#sideOutput}. For example:
 *
 * <pre> {@code
 * PCollection<String> words = ...;
 * // Select words whose length is below a cut off,
 * // plus the lengths of words that are above the cut off.
 * // Also select words starting with "MARKER".
 * final int wordLengthCutOff = 10;
 * // Create tags to use for the main and side outputs.
 * final TupleTag<String> wordsBelowCutOffTag =
 *     new TupleTag<String>(){};
 * final TupleTag<Integer> wordLengthsAboveCutOffTag =
 *     new TupleTag<Integer>(){};
 * final TupleTag<String> markedWordsTag =
 *     new TupleTag<String>(){};
 * PCollectionTuple results =
 *     words.apply(
 *         ParDo
 *         // Specify the main and consumed side output tags of the
 *         // PCollectionTuple result:
 *         .withOutputTags(wordsBelowCutOffTag,
 *                         TupleTagList.of(wordLengthsAboveCutOffTag)
 *                                     .and(markedWordsTag))
 *         .of(new DoFn<String, String>() {
 *             // Create a tag for the unconsumed side output.
 *             final TupleTag<String> specialWordsTag =
 *                 new TupleTag<String>(){};
 *             public void processElement(ProcessContext c) {
 *               String word = c.element();
 *               if (word.length() <= wordLengthCutOff) {
 *                 // Emit this short word to the main output.
 *                 c.output(word);
 *               } else {
 *                 // Emit this long word's length to a side output.
 *                 c.sideOutput(wordLengthsAboveCutOffTag, word.length());
 *               }
 *               if (word.startsWith("MARKER")) {
 *                 // Emit this word to a different side output.
 *                 c.sideOutput(markedWordsTag, word);
 *               }
 *               if (word.startsWith("SPECIAL")) {
 *                 // Emit this word to the unconsumed side output.
 *                 c.sideOutput(specialWordsTag, word);
 *               }
 *             }}));
 * // Extract the PCollection results, by tag.
 * PCollection<String> wordsBelowCutOff =
 *     results.get(wordsBelowCutOffTag);
 * PCollection<Integer> wordLengthsAboveCutOff =
 *     results.get(wordLengthsAboveCutOffTag);
 * PCollection<String> markedWords =
 *     results.get(markedWordsTag);
 * } </pre>
 *
 * <h2>Properties May Be Specified In Any Order</h2>
 *
 * <p>Several properties can be specified for a {@link ParDo}
 * {@link PTransform}, including name, side inputs, side output tags,
 * and {@link DoFn} to invoke. Only the {@link DoFn} is required; the
 * name is encouraged but not required, and side inputs and side
 * output tags are only specified when they're needed. These
 * properties can be specified in any order, as long as they're
 * specified before the {@link ParDo} {@link PTransform} is applied.
 *
 * <p>The approach used to allow these properties to be specified in
 * any order, with some properties omitted, is to have each of the
 * property "setter" methods defined as static factory methods on
 * {@link ParDo} itself, which return an instance of either the
 * {@link ParDo.Unbound} or
 * {@link ParDo.Bound} nested classes, each of which offers
 * property setter instance methods to enable setting additional
 * properties. {@link ParDo.Bound} is used for {@link ParDo}
 * transforms whose {@link DoFn} is specified and whose input and
 * output static types have been bound. {@link ParDo.Unbound ParDo.Unbound} is used
 * for {@link ParDo} transforms that have not yet had their
 * {@link DoFn} specified. Only {@link ParDo.Bound} instances can be
 * applied.
 *
 * <p>Another benefit of this approach is that it reduces the number
 * of type parameters that need to be specified manually. In
 * particular, the input and output types of the {@link ParDo}
 * {@link PTransform} are inferred automatically from the type
 * parameters of the {@link DoFn} argument passed to {@link ParDo#of}.
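 *
 * <p>For example, the following two applications are equivalent; this is an
 * illustrative sketch (the transform name is arbitrary) reusing the
 * {@code words} collection and singleton view from the Side Inputs example:
 *
 * <pre> {@code
 * // Name first, then side inputs, then the DoFn:
 * words.apply(ParDo.named("FilterWords")
 *                  .withSideInputs(maxWordLengthCutOffView)
 *                  .of(new DoFn<String, String>() { ... }));
 * // Side inputs first, then name, then the DoFn:
 * words.apply(ParDo.withSideInputs(maxWordLengthCutOffView)
 *                  .named("FilterWords")
 *                  .of(new DoFn<String, String>() { ... }));
 * } </pre>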
 *
 * <h2>Output Coders</h2>
 *
 * <p>By default, the {@link Coder Coder&lt;OutputT&gt;} for the
 * elements of the main output {@link PCollection PCollection&lt;OutputT&gt;} is
 * inferred from the concrete type of the {@link DoFn DoFn&lt;InputT, OutputT&gt;}.
 *
 * <p>By default, the {@link Coder Coder&lt;SideOutputT&gt;} for the elements of
 * a side output {@link PCollection PCollection&lt;SideOutputT&gt;} is inferred
 * from the concrete type of the corresponding {@link TupleTag TupleTag&lt;SideOutputT&gt;}.
 * To be successful, the {@link TupleTag} should be created as an instance
 * of a trivial anonymous subclass, with {@code {}} suffixed to the
 * constructor call. Such uses block Java's generic type parameter
 * inference, so the {@code <X>} argument must be provided explicitly.
 * For example:
 *
 * <pre> {@code
 * // A TupleTag to use for a side input can be written concisely:
 * final TupleTag<Integer> sideInputTag = new TupleTag<>();
 * // A TupleTag to use for a side output should be written with "{}",
 * // and explicit generic parameter type:
 * final TupleTag<Integer> sideOutputTag = new TupleTag<Integer>(){};
 * } </pre>
 *
 * <p>This style of {@code TupleTag} instantiation is used in the example of
 * multiple side outputs, above.
 *
 * <h2>Serializability of {@link DoFn DoFns}</h2>
 *
 * <p>A {@link DoFn} passed to a {@link ParDo} transform must be
 * {@link Serializable}. This allows the {@link DoFn} instance
 * created in this "main program" to be sent (in serialized form) to
 * remote worker machines and reconstituted for each bundle of elements
 * of the input {@link PCollection} being processed. A {@link DoFn}
 * can have instance variable state, and non-transient instance
 * variable state will be serialized in the main program and then
 * deserialized on remote worker machines for each bundle of elements
 * to process.
 *
 * <p>To aid in ensuring that {@link DoFn DoFns} are properly
 * {@link Serializable}, even local execution using the
 * {@link DirectPipelineRunner} will serialize and then deserialize
 * {@link DoFn DoFns} before executing them on a bundle.
 *
 * <p>{@link DoFn DoFns} expressed as anonymous inner classes can be
 * convenient, but due to a quirk in Java's rules for serializability,
 * non-static inner or nested classes (including anonymous inner
 * classes) automatically capture their enclosing class's instance in
 * their serialized state. This can lead to including much more than
 * intended in the serialized state of a {@link DoFn}, or even things
 * that aren't {@link Serializable}.
 *
 * <p>There are two ways to avoid unintended serialized state in a
 * {@link DoFn}:
 *
 * <ul>
 *
 * <li>Define the {@link DoFn} as a named, static class.
 *
 * <li>Define the {@link DoFn} as an anonymous inner class inside of
 * a static method.
 *
 * </ul>
 *
 * <p>Both of these approaches ensure that there is no implicit enclosing
 * instance serialized along with the {@link DoFn} instance, as the sketch
 * below illustrates.
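 *
 * <p>For example, this illustrative sketch (the class and variable names are
 * assumptions, not part of the SDK) defines the {@link DoFn} as a named,
 * static class with explicit, serializable instance state, reusing the
 * {@code lines} collection from the first example:
 *
 * <pre> {@code
 * static class ExtractWordsFn extends DoFn<String, String> {
 *   // Explicit, serializable state; no enclosing instance is captured.
 *   private final String separatorRegex;
 *
 *   ExtractWordsFn(String separatorRegex) {
 *     this.separatorRegex = separatorRegex;
 *   }
 *
 *   @Override
 *   public void processElement(ProcessContext c) {
 *     for (String word : c.element().split(separatorRegex)) {
 *       c.output(word);
 *     }
 *   }
 * }
 *
 * PCollection<String> words =
 *     lines.apply(ParDo.of(new ExtractWordsFn("[^a-zA-Z']+")));
 * } </pre>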
 *
 * <p>Prior to Java 8, any local variables of the enclosing
 * method referenced from within an anonymous inner class need to be
 * marked as {@code final}. If defining the {@link DoFn} as a named
 * static class, such variables would be passed as explicit
 * constructor arguments and stored in explicit instance variables.
 *
 * <p>There are three main ways to initialize the state of a
 * {@link DoFn} instance processing a bundle, the third of which is
 * sketched after this list:
 *
 * <ul>
 *
 * <li>Define instance variable state (including implicit instance
 * variables holding final variables captured by an anonymous inner
 * class), initialized by the {@link DoFn}'s constructor (which is
 * implicit for an anonymous inner class). This state will be
 * automatically serialized and then deserialized in the {@code DoFn}
 * instance created for each bundle. This method is good for state
 * known when the original {@code DoFn} is created in the main
 * program, if it's not overly large.
 *
 * <li>Compute the state as a singleton {@link PCollection} and pass it
 * in as a side input to the {@link DoFn}. This is good if the state
 * needs to be computed by the pipeline, or if the state is very large
 * and so is best read from file(s) rather than sent as part of the
 * {@code DoFn}'s serialized state.
 *
 * <li>Initialize the state in each {@link DoFn} instance, in
 * {@link DoFn#startBundle}. This is good if the initialization
 * doesn't depend on any information known only by the main program or
 * computed by earlier pipeline operations, but is the same for all
 * instances of this {@link DoFn} for all program executions, say
 * setting up empty caches or initializing constant data.
 *
 * </ul>
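 *
 * <p>As a sketch of the third option (the cache field and its fill logic are
 * illustrative assumptions), per-bundle state can be set up in
 * {@link DoFn#startBundle}:
 *
 * <pre> {@code
 * new DoFn<String, String>() {
 *   // Transient: rebuilt in startBundle for each bundle rather than
 *   // serialized along with the DoFn.
 *   private transient Map<String, String> cache;
 *
 *   @Override
 *   public void startBundle(Context c) {
 *     cache = new HashMap<>();  // same for all instances and executions
 *   }
 *
 *   @Override
 *   public void processElement(ProcessContext c) {
 *     ... // look up c.element() in the cache and emit results
 *   }
 * }
 * } </pre>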
 *
 * <h2>No Global Shared State</h2>
 *
 * <p>{@link ParDo} operations are intended to be able to run in
 * parallel across multiple worker machines. This precludes easy
 * sharing and updating mutable state across those machines. There is
 * no support in the Google Cloud Dataflow system for communicating
 * and synchronizing updates to shared state across worker machines,
 * so programs should not access any mutable static variable state in
 * their {@link DoFn}, without understanding that the Java processes
 * for the main program and workers will each have its own independent
 * copy of such state, and there won't be any automatic copying of
 * that state across Java processes. All information should be
 * communicated to {@link DoFn} instances via main and side inputs and
 * serialized state, and all output should be communicated from a
 * {@link DoFn} instance via main and side outputs, in the absence of
 * external communication mechanisms written by user code.
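 *
 * <p>As an illustration of the pitfall (a hypothetical sketch, not a pattern
 * to imitate), a static counter is not global; each worker JVM increments
 * only its own copy:
 *
 * <pre> {@code
 * static int count = 0;  // NOT shared or synchronized across workers!
 * ...
 * words.apply(ParDo.of(new DoFn<String, String>() {
 *   public void processElement(ProcessContext c) {
 *     count++;  // updates only this process's copy; other workers see nothing
 *     c.output(c.element());
 *   }
 * }));
 * } </pre>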
 *
 * <h2>Fault Tolerance</h2>
 *
 * <p>In a distributed system, things can fail: machines can crash,
 * machines can be unable to communicate across the network, etc.
 * While individual failures are rare, the larger the job, the greater
 * the chance that something, somewhere, will fail. The Google Cloud
 * Dataflow service strives to mask such failures automatically,
 * principally by retrying failed {@link DoFn} bundles. This means
 * that a {@code DoFn} instance might process a bundle partially, then
 * crash for some reason, then be rerun (often on a different worker
 * machine) on that same bundle and on the same elements as before.
 * Sometimes two or more {@link DoFn} instances will be running on the
 * same bundle simultaneously, with the system taking the results of
 * the first instance to complete successfully. Consequently, the
 * code in a {@link DoFn} needs to be written such that these
 * duplicate (sequential or concurrent) executions do not cause
 * problems. If the outputs of a {@link DoFn} are a pure function of
 * its inputs, then this requirement is satisfied. However, if a
 * {@link DoFn DoFn's} execution has external side-effects, such as performing
 * updates to external HTTP services, then the {@link DoFn DoFn's} code
 * needs to take care to ensure that those updates are idempotent and
 * that concurrent updates are acceptable. This property can be
 * difficult to achieve, so it is advisable to strive to keep
 * {@link DoFn DoFns} as pure functions as much as possible.
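 *
 * <p>As an illustration (the external service and its {@code putIfAbsent}
 * API are hypothetical), an update keyed by a deterministic ID derived from
 * the element is idempotent, so retried or duplicated bundles are harmless:
 *
 * <pre> {@code
 * new DoFn<Record, Void>() {
 *   public void processElement(ProcessContext c) {
 *     Record r = c.element();
 *     // Re-executions write the same value under the same key,
 *     // so duplicate (sequential or concurrent) updates are acceptable.
 *     externalService.putIfAbsent(r.getId(), r);
 *   }
 * }
 * } </pre>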
 *
 * <h2>Optimization</h2>
 *
 * <p>The Google Cloud Dataflow service automatically optimizes a
 * pipeline before it is executed. A key optimization, fusion,
 * relates to {@link ParDo} operations. If one {@link ParDo} operation produces a
 * {@link PCollection} that is then consumed as the main input of another
 * {@link ParDo} operation, the two {@link ParDo} operations will be fused
 * together into a single ParDo operation and run in a single pass;
 * this is "producer-consumer fusion". Similarly, if
 * two or more ParDo operations have the same {@link PCollection} main input,
 * they will be fused into a single {@link ParDo} that makes just one pass
 * over the input {@link PCollection}; this is "sibling fusion".
 *
 * <p>If after fusion there are no more unfused references to a
 * {@link PCollection} (e.g., one between a producer ParDo and a consumer
 * {@link ParDo}), the {@link PCollection} itself is "fused away" and won't ever be
 * written to disk, saving all the I/O and space expense of
 * constructing it.
 *
 * <p>The Google Cloud Dataflow service applies fusion as much as
 * possible, greatly reducing the cost of executing pipelines. As a
 * result, it is essentially "free" to write {@link ParDo} operations in a
 * very modular, composable style, each {@link ParDo} operation doing one
 * clear task, and stringing together sequences of {@link ParDo} operations to
 * get the desired overall effect. Such programs can be easier to
 * understand, easier to unit-test, easier to extend and evolve, and
 * easier to reuse in new programs. The predefined library of
 * PTransforms that come with Google Cloud Dataflow makes heavy use of
 * this modular, composable style, trusting to the Google Cloud
 * Dataflow service's optimizer to "flatten out" all the compositions
 * into highly optimized stages.
 *
 * @see <a href="https://cloud.google.com/dataflow/model/par-do">the web
 * documentation for ParDo</a>
 */
public class ParDo {

  /**
   * Creates a {@link ParDo} {@link PTransform} with the given name.
   *
   * <p>See the discussion of naming above for more explanation.
   *
   * <p>The resulting {@link PTransform} is incomplete, and its
   * input/output types are not yet bound. Use
   * {@link ParDo.Unbound#of} to specify the {@link DoFn} to
   * invoke, which will also bind the input/output types of this
   * {@link PTransform}.
   */
  public static Unbound named(String name) {
    return new Unbound().named(name);
  }

  /**
   * Creates a {@link ParDo} {@link PTransform} with the given
   * side inputs.
   *
   * <p>Side inputs are {@link PCollectionView PCollectionViews}, whose contents are
   * computed during pipeline execution and then made accessible to
   * {@link DoFn} code via {@link DoFn.ProcessContext#sideInput sideInput}. Each
   * invocation of the {@link DoFn} receives the same values for these
   * side inputs.
   *
   * <p>See the discussion of Side Inputs above for more explanation.
   *
   * <p>The resulting {@link PTransform} is incomplete, and its
   * input/output types are not yet bound. Use
   * {@link ParDo.Unbound#of} to specify the {@link DoFn} to
   * invoke, which will also bind the input/output types of this
   * {@link PTransform}.
   */
  public static Unbound withSideInputs(PCollectionView<?>... sideInputs) {
    return new Unbound().withSideInputs(sideInputs);
  }

  /**
   * Creates a {@link ParDo} with the given side inputs.
   *
   * <p>Side inputs are {@link PCollectionView PCollectionViews}, whose contents are
   * computed during pipeline execution and then made accessible to
   * {@code DoFn} code via {@link DoFn.ProcessContext#sideInput sideInput}.
   *
   * <p>See the discussion of Side Inputs above for more explanation.
   *
   * <p>The resulting {@link PTransform} is incomplete, and its
   * input/output types are not yet bound. Use
   * {@link ParDo.Unbound#of} to specify the {@link DoFn} to
   * invoke, which will also bind the input/output types of this
   * {@link PTransform}.
   */
  public static Unbound withSideInputs(
      Iterable<? extends PCollectionView<?>> sideInputs) {
    return new Unbound().withSideInputs(sideInputs);
  }

  /**
   * Creates a multi-output {@link ParDo} {@link PTransform} whose
   * output {@link PCollection}s will be referenced using the given main
   * output and side output tags.
   *
   * <p>{@link TupleTag TupleTags} are used to name (with its static element
   * type {@code T}) each main and side output {@code PCollection}.
   * This {@link PTransform PTransform's} {@link DoFn} emits elements to the main
   * output {@link PCollection} as normal, using
   * {@link DoFn.Context#output}. The {@link DoFn} emits elements to
   * a side output {@code PCollection} using
   * {@link DoFn.Context#sideOutput}, passing that side output's tag
   * as an argument. The result of invoking this {@link PTransform}
   * will be a {@link PCollectionTuple}, and any of the main and
   * side output {@code PCollection}s can be retrieved from it via
   * {@link PCollectionTuple#get}, passing the output's tag as an
   * argument.
   *
   * <p>See the discussion of Side Outputs above for more explanation.
   *
   * <p>The resulting {@link PTransform} is incomplete, and its input
   * type is not yet bound. Use {@link ParDo.UnboundMulti#of}
   * to specify the {@link DoFn} to invoke, which will also bind the
   * input type of this {@link PTransform}.
   */
  public static <OutputT> UnboundMulti<OutputT> withOutputTags(
      TupleTag<OutputT> mainOutputTag,
      TupleTagList sideOutputTags) {
    return new Unbound().withOutputTags(mainOutputTag, sideOutputTags);
  }

  /**
   * Creates a {@link ParDo} {@link PTransform} that will invoke the
   * given {@link DoFn} function.
   *
   * <p>The resulting {@link PTransform PTransform's} types have been bound, with the
   * input being a {@code PCollection<InputT>} and the output a
   * {@code PCollection<OutputT>}, inferred from the types of the argument
   * {@code DoFn<InputT, OutputT>}. It is ready to be applied, or further
   * properties can be set on it first.
   */
  public static <InputT, OutputT> Bound<InputT, OutputT> of(DoFn<InputT, OutputT> fn) {
    return of(fn, fn.getClass());
  }

  private static <InputT, OutputT> Bound<InputT, OutputT> of(
      DoFn<InputT, OutputT> fn, Class<?> fnClass) {
    return new Unbound().of(fn, fnClass);
  }

  private static <InputT, OutputT> DoFn<InputT, OutputT>
      adapt(DoFnWithContext<InputT, OutputT> fn) {
    return DoFnReflector.of(fn.getClass()).toDoFn(fn);
  }

  /**
   * Creates a {@link ParDo} {@link PTransform} that will invoke the
   * given {@link DoFnWithContext} function.
   *
   * <p>The resulting {@link PTransform PTransform's} types have been bound, with the
   * input being a {@code PCollection<InputT>} and the output a
   * {@code PCollection<OutputT>}, inferred from the types of the argument
   * {@code DoFn<InputT, OutputT>}. It is ready to be applied, or further
   * properties can be set on it first.
   *
   * <p>{@link DoFnWithContext} is an experimental alternative to
   * {@link DoFn} which simplifies accessing the window of the element.
   */
  @Experimental
  public static <InputT, OutputT> Bound<InputT, OutputT> of(DoFnWithContext<InputT, OutputT> fn) {
    return of(adapt(fn), fn.getClass());
  }

  /**
   * An incomplete {@link ParDo} transform, with unbound input/output types.
   *
   * <p>Before being applied, {@link ParDo.Unbound#of} must be
   * invoked to specify the {@link DoFn} to invoke, which will also
   * bind the input/output types of this {@link PTransform}.
   */
  public static class Unbound {
    private final String name;
    private final List<PCollectionView<?>> sideInputs;

    Unbound() {
      this(null, ImmutableList.<PCollectionView<?>>of());
    }

    Unbound(String name, List<PCollectionView<?>> sideInputs) {
      this.name = name;
      this.sideInputs = sideInputs;
    }

    /**
     * Returns a new {@link ParDo} transform that's like this
     * transform but with the specified name. Does not modify this
     * transform. The resulting transform is still incomplete.
     *
     * <p>See the discussion of naming above for more explanation.
     */
    public Unbound named(String name) {
      return new Unbound(name, sideInputs);
    }

    /**
     * Returns a new {@link ParDo} transform that's like this
     * transform but with the specified additional side inputs.
     * Does not modify this transform. The resulting transform is
     * still incomplete.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public Unbound withSideInputs(PCollectionView<?>... sideInputs) {
      return withSideInputs(Arrays.asList(sideInputs));
    }

    /**
     * Returns a new {@link ParDo} transform that is like this
     * transform but with the specified additional side inputs. Does not modify
     * this transform. The resulting transform is still incomplete.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public Unbound withSideInputs(
        Iterable<? extends PCollectionView<?>> sideInputs) {
      ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
      builder.addAll(this.sideInputs);
      builder.addAll(sideInputs);
      return new Unbound(name, builder.build());
    }

    /**
     * Returns a new multi-output {@link ParDo} transform that's like
     * this transform but with the specified main and side output
     * tags. Does not modify this transform. The resulting transform
     * is still incomplete.
     *
     * <p>See the discussion of Side Outputs above and on
     * {@link ParDo#withOutputTags} for more explanation.
     */
    public <OutputT> UnboundMulti<OutputT> withOutputTags(TupleTag<OutputT> mainOutputTag,
                                                          TupleTagList sideOutputTags) {
      return new UnboundMulti<>(
          name, sideInputs, mainOutputTag, sideOutputTags);
    }

    /**
     * Returns a new {@link ParDo} {@link PTransform} that's like this
     * transform but that will invoke the given {@link DoFn}
     * function, and that has its input and output types bound. Does
     * not modify this transform. The resulting {@link PTransform} is
     * sufficiently specified to be applied, but more properties can
     * still be specified.
     */
    public <InputT, OutputT> Bound<InputT, OutputT> of(DoFn<InputT, OutputT> fn) {
      return of(fn, fn.getClass());
    }

    private <InputT, OutputT> Bound<InputT, OutputT> of(
        DoFn<InputT, OutputT> fn, Class<?> fnClass) {
      return new Bound<>(name, sideInputs, fn, fnClass);
    }

    /**
     * Returns a new {@link ParDo} {@link PTransform} that's like this
     * transform but which will invoke the given {@link DoFnWithContext}
     * function, and which has its input and output types bound. Does
     * not modify this transform. The resulting {@link PTransform} is
     * sufficiently specified to be applied, but more properties can
     * still be specified.
     */
    public <InputT, OutputT> Bound<InputT, OutputT> of(DoFnWithContext<InputT, OutputT> fn) {
      return of(adapt(fn), fn.getClass());
    }
  }

  /**
   * A {@link PTransform} that, when applied to a {@code PCollection<InputT>},
   * invokes a user-specified {@code DoFn<InputT, OutputT>} on all its elements,
   * with all its outputs collected into an output
   * {@code PCollection<OutputT>}.
   *
   * <p>A multi-output form of this transform can be created with
   * {@link ParDo.Bound#withOutputTags}.
   *
   * @param <InputT> the type of the (main) input {@link PCollection} elements
   * @param <OutputT> the type of the (main) output {@link PCollection} elements
   */
  public static class Bound<InputT, OutputT>
      extends PTransform<PCollection<? extends InputT>, PCollection<OutputT>> {
    // Inherits name.
    private final List<PCollectionView<?>> sideInputs;
    private final DoFn<InputT, OutputT> fn;
    private final Class<?> fnClass;

    Bound(String name,
          List<PCollectionView<?>> sideInputs,
          DoFn<InputT, OutputT> fn,
          Class<?> fnClass) {
      super(name);
      this.sideInputs = sideInputs;
      this.fn = SerializableUtils.clone(fn);
      this.fnClass = fnClass;
    }

    /**
     * Returns a new {@link ParDo} {@link PTransform} that's like this
     * {@link PTransform} but with the specified name. Does not
     * modify this {@link PTransform}.
     *
     * <p>See the discussion of Naming above for more explanation.
     */
    public Bound<InputT, OutputT> named(String name) {
      return new Bound<>(name, sideInputs, fn, fnClass);
    }

    /**
     * Returns a new {@link ParDo} {@link PTransform} that's like this
     * {@link PTransform} but with the specified additional side inputs. Does not
     * modify this {@link PTransform}.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public Bound<InputT, OutputT> withSideInputs(PCollectionView<?>... sideInputs) {
      return withSideInputs(Arrays.asList(sideInputs));
    }

    /**
     * Returns a new {@link ParDo} {@link PTransform} that's like this
     * {@link PTransform} but with the specified additional side inputs. Does not
     * modify this {@link PTransform}.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public Bound<InputT, OutputT> withSideInputs(
        Iterable<? extends PCollectionView<?>> sideInputs) {
      ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
      builder.addAll(this.sideInputs);
      builder.addAll(sideInputs);
      return new Bound<>(name, builder.build(), fn, fnClass);
    }

    /**
     * Returns a new multi-output {@link ParDo} {@link PTransform}
     * that's like this {@link PTransform} but with the specified main
     * and side output tags. Does not modify this {@link PTransform}.
     *
     * <p>See the discussion of Side Outputs above and on
     * {@link ParDo#withOutputTags} for more explanation.
     */
    public BoundMulti<InputT, OutputT> withOutputTags(TupleTag<OutputT> mainOutputTag,
                                                      TupleTagList sideOutputTags) {
      return new BoundMulti<>(
          name, sideInputs, mainOutputTag, sideOutputTags, fn, fnClass);
    }

    @Override
    public PCollection<OutputT> apply(PCollection<? extends InputT> input) {
      return PCollection.<OutputT>createPrimitiveOutputInternal(
              input.getPipeline(),
              input.getWindowingStrategy(),
              input.isBounded())
          .setTypeDescriptorInternal(fn.getOutputTypeDescriptor());
    }

    @Override
    @SuppressWarnings("unchecked")
    protected Coder<OutputT> getDefaultOutputCoder(PCollection<? extends InputT> input)
        throws CannotProvideCoderException {
      return input.getPipeline().getCoderRegistry().getDefaultCoder(
          fn.getOutputTypeDescriptor(),
          fn.getInputTypeDescriptor(),
          ((PCollection<InputT>) input).getCoder());
    }

    @Override
    protected String getKindString() {
      Class<?> clazz = DoFnReflector.getDoFnClass(fn);
      if (clazz.isAnonymousClass()) {
        return "AnonymousParDo";
      } else {
        return String.format("ParDo(%s)", StringUtils.approximateSimpleName(clazz));
      }
    }

    /**
     * {@inheritDoc}
     *
     * <p>{@link ParDo} registers its internal {@link DoFn} as a subcomponent for display data.
     * {@link DoFn} implementations can register display data by overriding
     * {@link DoFn#populateDisplayData}.
     */
    @Override
    public void populateDisplayData(Builder builder) {
      super.populateDisplayData(builder);
      ParDo.populateDisplayData(builder, fn, fnClass);
    }

    public DoFn<InputT, OutputT> getFn() {
      return fn;
    }

    public List<PCollectionView<?>> getSideInputs() {
      return sideInputs;
    }
  }

  /**
   * An incomplete multi-output {@link ParDo} transform, with unbound
   * input type.
   *
   * <p>Before being applied, {@link ParDo.UnboundMulti#of} must be
   * invoked to specify the {@link DoFn} to invoke, which will also
   * bind the input type of this {@link PTransform}.
   *
   * @param <OutputT> the type of the main output {@code PCollection} elements
   */
  public static class UnboundMulti<OutputT> {
    private final String name;
    private final List<PCollectionView<?>> sideInputs;
    private final TupleTag<OutputT> mainOutputTag;
    private final TupleTagList sideOutputTags;

    UnboundMulti(String name,
                 List<PCollectionView<?>> sideInputs,
                 TupleTag<OutputT> mainOutputTag,
                 TupleTagList sideOutputTags) {
      this.name = name;
      this.sideInputs = sideInputs;
      this.mainOutputTag = mainOutputTag;
      this.sideOutputTags = sideOutputTags;
    }

    /**
     * Returns a new multi-output {@link ParDo} transform that's like
     * this transform but with the specified name. Does not modify
     * this transform. The resulting transform is still incomplete.
     *
     * <p>See the discussion of Naming above for more explanation.
     */
    public UnboundMulti<OutputT> named(String name) {
      return new UnboundMulti<>(
          name, sideInputs, mainOutputTag, sideOutputTags);
    }

    /**
     * Returns a new multi-output {@link ParDo} transform that's like
     * this transform but with the specified side inputs. Does not
     * modify this transform. The resulting transform is still
     * incomplete.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public UnboundMulti<OutputT> withSideInputs(
        PCollectionView<?>... sideInputs) {
      return withSideInputs(Arrays.asList(sideInputs));
    }

    /**
     * Returns a new multi-output {@link ParDo} transform that's like
     * this transform but with the specified additional side inputs. Does not
     * modify this transform. The resulting transform is still
     * incomplete.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public UnboundMulti<OutputT> withSideInputs(
        Iterable<? extends PCollectionView<?>> sideInputs) {
      ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
      builder.addAll(this.sideInputs);
      builder.addAll(sideInputs);
      return new UnboundMulti<>(
          name, builder.build(), mainOutputTag, sideOutputTags);
    }

    /**
     * Returns a new multi-output {@link ParDo} {@link PTransform}
     * that's like this transform but that will invoke the given
     * {@link DoFn} function, and that has its input type bound.
     * Does not modify this transform. The resulting
     * {@link PTransform} is sufficiently specified to be applied, but
     * more properties can still be specified.
     */
    public <InputT> BoundMulti<InputT, OutputT> of(DoFn<InputT, OutputT> fn) {
      return of(fn, fn.getClass());
    }

    public <InputT> BoundMulti<InputT, OutputT> of(DoFn<InputT, OutputT> fn, Class<?> fnClass) {
      return new BoundMulti<>(
          name, sideInputs, mainOutputTag,
          sideOutputTags, fn, fnClass);
    }

    /**
     * Returns a new multi-output {@link ParDo} {@link PTransform}
     * that's like this transform but which will invoke the given
     * {@link DoFnWithContext} function, and which has its input type bound.
     * Does not modify this transform. The resulting
     * {@link PTransform} is sufficiently specified to be applied, but
     * more properties can still be specified.
     */
    public <InputT> BoundMulti<InputT, OutputT> of(DoFnWithContext<InputT, OutputT> fn) {
      return of(adapt(fn), fn.getClass());
    }
  }

  /**
   * A {@link PTransform} that, when applied to a
   * {@code PCollection<InputT>}, invokes a user-specified
   * {@code DoFn<InputT, OutputT>} on all its elements, which can emit elements
   * to any of the {@link PTransform}'s main and side output
   * {@code PCollection}s, which are bundled into a result
   * {@code PCollectionTuple}.
   *
   * @param <InputT> the type of the (main) input {@code PCollection} elements
   * @param <OutputT> the type of the main output {@code PCollection} elements
   */
  public static class BoundMulti<InputT, OutputT>
      extends PTransform<PCollection<? extends InputT>, PCollectionTuple> {
    // Inherits name.
    private final List<PCollectionView<?>> sideInputs;
    private final TupleTag<OutputT> mainOutputTag;
    private final TupleTagList sideOutputTags;
    private final DoFn<InputT, OutputT> fn;
    private final Class<?> fnClass;

    BoundMulti(String name,
               List<PCollectionView<?>> sideInputs,
               TupleTag<OutputT> mainOutputTag,
               TupleTagList sideOutputTags,
               DoFn<InputT, OutputT> fn,
               Class<?> fnClass) {
      super(name);
      this.sideInputs = sideInputs;
      this.mainOutputTag = mainOutputTag;
      this.sideOutputTags = sideOutputTags;
      this.fn = SerializableUtils.clone(fn);
      this.fnClass = fnClass;
    }

    /**
     * Returns a new multi-output {@link ParDo} {@link PTransform}
     * that's like this {@link PTransform} but with the specified
     * name. Does not modify this {@link PTransform}.
     *
     * <p>See the discussion of Naming above for more explanation.
     */
    public BoundMulti<InputT, OutputT> named(String name) {
      return new BoundMulti<>(
          name, sideInputs, mainOutputTag, sideOutputTags, fn, fnClass);
    }

    /**
     * Returns a new multi-output {@link ParDo} {@link PTransform}
     * that's like this {@link PTransform} but with the specified additional side
     * inputs. Does not modify this {@link PTransform}.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public BoundMulti<InputT, OutputT> withSideInputs(
        PCollectionView<?>... sideInputs) {
      return withSideInputs(Arrays.asList(sideInputs));
    }

    /**
     * Returns a new multi-output {@link ParDo} {@link PTransform}
     * that's like this {@link PTransform} but with the specified additional side
     * inputs. Does not modify this {@link PTransform}.
     *
     * <p>See the discussion of Side Inputs above and on
     * {@link ParDo#withSideInputs} for more explanation.
     */
    public BoundMulti<InputT, OutputT> withSideInputs(
        Iterable<? extends PCollectionView<?>> sideInputs) {
      ImmutableList.Builder<PCollectionView<?>> builder = ImmutableList.builder();
      builder.addAll(this.sideInputs);
      builder.addAll(sideInputs);
      return new BoundMulti<>(
          name, builder.build(), mainOutputTag,
          sideOutputTags, fn, fnClass);
    }

    @Override
    public PCollectionTuple apply(PCollection<? extends InputT> input) {
      PCollectionTuple outputs = PCollectionTuple.ofPrimitiveOutputsInternal(
          input.getPipeline(),
          TupleTagList.of(mainOutputTag).and(sideOutputTags.getAll()),
          input.getWindowingStrategy(),
          input.isBounded());

      // The fn will likely be an instance of an anonymous subclass
      // such as DoFn<Integer, String> { }, thus will have a high-fidelity
      // TypeDescriptor for the output type.
      outputs.get(mainOutputTag).setTypeDescriptorInternal(fn.getOutputTypeDescriptor());

      return outputs;
    }

    @Override
    protected Coder<OutputT> getDefaultOutputCoder() {
      throw new RuntimeException(
          "internal error: shouldn't be calling this on a multi-output ParDo");
    }

    @Override
    public <T> Coder<T> getDefaultOutputCoder(
        PCollection<? extends InputT> input, TypedPValue<T> output)
        throws CannotProvideCoderException {
      @SuppressWarnings("unchecked")
      Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
      return input.getPipeline().getCoderRegistry().getDefaultCoder(
          output.getTypeDescriptor(),
          fn.getInputTypeDescriptor(),
          inputCoder);
    }

    @Override
    protected String getKindString() {
      Class<?> clazz = DoFnReflector.getDoFnClass(fn);
      if (fn.getClass().isAnonymousClass()) {
        return "AnonymousParMultiDo";
      } else {
        return String.format("ParMultiDo(%s)", StringUtils.approximateSimpleName(clazz));
      }
    }

    @Override
    public void populateDisplayData(Builder builder) {
      super.populateDisplayData(builder);
      ParDo.populateDisplayData(builder, fn, fnClass);
    }

    public DoFn<InputT, OutputT> getFn() {
      return fn;
    }

    public TupleTag<OutputT> getMainOutputTag() {
      return mainOutputTag;
    }

    public TupleTagList getSideOutputTags() {
      return sideOutputTags;
    }

    public List<PCollectionView<?>> getSideInputs() {
      return sideInputs;
    }
  }

  /////////////////////////////////////////////////////////////////////////////

  static {
    DirectPipelineRunner.registerDefaultTransformEvaluator(
        Bound.class,
        new DirectPipelineRunner.TransformEvaluator<Bound>() {
          @Override
          public void evaluate(
              Bound transform,
              DirectPipelineRunner.EvaluationContext context) {
            evaluateSingleHelper(transform, context);
          }
        });
  }

  private static <InputT, OutputT> void evaluateSingleHelper(
      Bound<InputT, OutputT> transform,
      DirectPipelineRunner.EvaluationContext context) {
    TupleTag<OutputT> mainOutputTag = new TupleTag<>("out");

    DirectModeExecutionContext executionContext = DirectModeExecutionContext.create();

    PCollectionTuple outputs = PCollectionTuple.of(mainOutputTag, context.getOutput(transform));

    evaluateHelper(
        transform.fn,
        context.getStepName(transform),
        context.getInput(transform),
        transform.sideInputs,
        mainOutputTag,
        Collections.<TupleTag<?>>emptyList(),
        outputs,
        context,
        executionContext);

    context.setPCollectionValuesWithMetadata(
        context.getOutput(transform),
        executionContext.getOutput(mainOutputTag));
  }

  /////////////////////////////////////////////////////////////////////////////

  static {
    DirectPipelineRunner.registerDefaultTransformEvaluator(
        BoundMulti.class,
        new DirectPipelineRunner.TransformEvaluator<BoundMulti>() {
          @Override
          public void evaluate(
              BoundMulti transform,
              DirectPipelineRunner.EvaluationContext context) {
            evaluateMultiHelper(transform, context);
          }
        });
  }

  private static <InputT, OutputT> void evaluateMultiHelper(
      BoundMulti<InputT, OutputT> transform,
      DirectPipelineRunner.EvaluationContext context) {
    DirectModeExecutionContext executionContext = DirectModeExecutionContext.create();

    evaluateHelper(
        transform.fn,
        context.getStepName(transform),
        context.getInput(transform),
        transform.sideInputs,
        transform.mainOutputTag,
        transform.sideOutputTags.getAll(),
        context.getOutput(transform),
        context,
        executionContext);

    for (Map.Entry<TupleTag<?>, PCollection<?>> entry
        : context.getOutput(transform).getAll().entrySet()) {
      @SuppressWarnings("unchecked")
      TupleTag<Object> tag = (TupleTag<Object>) entry.getKey();
      @SuppressWarnings("unchecked")
      PCollection<Object> pc = (PCollection<Object>) entry.getValue();

      context.setPCollectionValuesWithMetadata(
          pc,
          (tag == transform.mainOutputTag
              ? executionContext.getOutput(tag)
              : executionContext.getSideOutput(tag)));
    }
  }

  /**
   * Evaluates a single-output or multi-output {@link ParDo} directly.
   *
   * <p>This evaluation method is intended for use in testing scenarios; it is designed for clarity
   * and correctness-checking, not speed.
   *
   * <p>Of particular note, this performs best-effort checking that inputs and outputs are not
   * mutated in violation of the requirements upon a {@link DoFn}.
   */
  private static <InputT, OutputT, ActualInputT extends InputT> void evaluateHelper(
      DoFn<InputT, OutputT> doFn,
      String stepName,
      PCollection<ActualInputT> input,
      List<PCollectionView<?>> sideInputs,
      TupleTag<OutputT> mainOutputTag,
      List<TupleTag<?>> sideOutputTags,
      PCollectionTuple outputs,
      DirectPipelineRunner.EvaluationContext context,
      DirectModeExecutionContext executionContext) {
    // TODO: Run multiple shards?
    DoFn<InputT, OutputT> fn = context.ensureSerializable(doFn);

    SideInputReader sideInputReader = makeSideInputReader(context, sideInputs);

    // When evaluating via the DirectPipelineRunner, this output manager checks each output for
    // illegal mutations when the next output comes along. We then verify again after
    // finishBundle(). The common case we expect this to catch is a user mutating an input in
    // order to repeatedly emit "variations".
    ImmutabilityCheckingOutputManager<ActualInputT> outputManager =
        new ImmutabilityCheckingOutputManager<>(
            fn.getClass().getSimpleName(),
            new DoFnRunnerBase.ListOutputManager(),
            outputs);

    DoFnRunner<InputT, OutputT> fnRunner =
        DoFnRunners.createDefault(
            context.getPipelineOptions(),
            fn,
            sideInputReader,
            outputManager,
            mainOutputTag,
            sideOutputTags,
            executionContext.getOrCreateStepContext(stepName, stepName, null),
            context.getAddCounterMutator(),
            input.getWindowingStrategy());

    fnRunner.startBundle();

    for (DirectPipelineRunner.ValueWithMetadata<ActualInputT> elem
        : context.getPCollectionValuesWithMetadata(input)) {
      if (elem.getValue() instanceof KV) {
        // In case the DoFn needs keyed state, set the implicit keys to the keys
        // in the input elements.
        @SuppressWarnings("unchecked")
        KV<?, ?> kvElem = (KV<?, ?>) elem.getValue();
        executionContext.setKey(kvElem.getKey());
      } else {
        executionContext.setKey(elem.getKey());
      }

      // We check the input for mutations only through the call span of processElement.
      // This will miss some cases, but the check is ad hoc and best effort. The common case
      // is that the input is mutated to be used for output.
      try {
        MutationDetector inputMutationDetector = MutationDetectors.forValueWithCoder(
            elem.getWindowedValue().getValue(), input.getCoder());
        @SuppressWarnings("unchecked")
        WindowedValue<InputT> windowedElem = ((WindowedValue<InputT>) elem.getWindowedValue());
        fnRunner.processElement(windowedElem);
        inputMutationDetector.verifyUnmodified();
      } catch (CoderException e) {
        throw UserCodeException.wrap(e);
      } catch (IllegalMutationException exn) {
        throw new IllegalMutationException(
            String.format("DoFn %s mutated input value %s of class %s (new value was %s)."
                + " Input values must not be mutated in any way.",
                fn.getClass().getSimpleName(),
                exn.getSavedValue(),
                exn.getSavedValue().getClass(),
                exn.getNewValue()),
            exn.getSavedValue(),
            exn.getNewValue(),
            exn);
      }
    }

    // Note that the input could have been retained and mutated prior to this final output,
    // but for now it degrades readability too much to be worth trying to catch that particular
    // corner case.
    fnRunner.finishBundle();
    outputManager.verifyLatestOutputsUnmodified();
  }

  private static SideInputReader makeSideInputReader(
      DirectPipelineRunner.EvaluationContext context, List<PCollectionView<?>> sideInputs) {
    PTuple sideInputValues = PTuple.empty();
    for (PCollectionView<?> view : sideInputs) {
      sideInputValues = sideInputValues.and(
          view.getTagInternal(),
          context.getPCollectionView(view));
    }
    return DirectSideInputReader.of(sideInputValues);
  }

  private static void populateDisplayData(
      DisplayData.Builder builder, DoFn<?, ?> fn, Class<?> fnClass) {
    builder
        .include(fn)
        .add(DisplayData.item("fn", fnClass)
            .withLabel("Transform Function"));
  }

  /**
   * A {@code DoFnRunner.OutputManager} that provides facilities for checking output values for
   * illegal mutations.
   *
   * <p>When used via the try-with-resources pattern, it is guaranteed that every value passed
   * to {@link #output} will have been checked for illegal mutation.
   */
  private static class ImmutabilityCheckingOutputManager<InputT>
      implements DoFnRunners.OutputManager, AutoCloseable {

    private final DoFnRunners.OutputManager underlyingOutputManager;
    private final ConcurrentMap<TupleTag<?>, MutationDetector> mutationDetectorForTag;
    private final PCollectionTuple outputs;
    private String doFnName;

    public ImmutabilityCheckingOutputManager(
        String doFnName,
        DoFnRunners.OutputManager underlyingOutputManager,
        PCollectionTuple outputs) {
      this.doFnName = doFnName;
      this.underlyingOutputManager = underlyingOutputManager;
      this.outputs = outputs;
      this.mutationDetectorForTag = Maps.newConcurrentMap();
    }

    @Override
    public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
      // Skip verifying undeclared outputs, since we don't have coders for them.
      if (outputs.has(tag)) {
        try {
          MutationDetector newDetector =
              MutationDetectors.forValueWithCoder(
                  output.getValue(), outputs.get(tag).getCoder());
          MutationDetector priorDetector = mutationDetectorForTag.put(tag, newDetector);
          verifyOutputUnmodified(priorDetector);
        } catch (CoderException e) {
          throw UserCodeException.wrap(e);
        }
      }

      // Actually perform the output.
      underlyingOutputManager.output(tag, output);
    }

    /**
     * Throws {@link IllegalMutationException} if the prior output for any tag has been mutated
     * since being output.
     */
    public void verifyLatestOutputsUnmodified() {
      for (MutationDetector detector : mutationDetectorForTag.values()) {
        verifyOutputUnmodified(detector);
      }
    }

    /**
     * Adapts the error message from the provided {@code detector}.
     *
     * <p>The {@code detector} may be null, in which case no check is performed. This is merely
     * to consolidate null checking to this method.
     */
    private void verifyOutputUnmodified(@Nullable MutationDetector detector) {
      if (detector == null) {
        return;
      }

      try {
        detector.verifyUnmodified();
      } catch (IllegalMutationException exn) {
        throw new IllegalMutationException(String.format(
            "DoFn %s mutated value %s after it was output (new value was %s)."
                + " Values must not be mutated in any way after being output.",
                doFnName, exn.getSavedValue(), exn.getNewValue()),
            exn.getSavedValue(), exn.getNewValue(),
            exn);
      }
    }

    /**
     * When used in a {@code try}-with-resources block, verifies all of the latest outputs upon
     * {@link #close()}.
     */
    @Override
    public void close() {
      verifyLatestOutputsUnmodified();
    }
  }
}