/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.runners;
import static com.google.cloud.dataflow.sdk.util.StringUtils.approximatePTransformName;
import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;
import static com.google.cloud.dataflow.sdk.util.WindowedValue.valueInEmptyWindows;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.client.json.JsonFactory;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.clouddebugger.v2.Clouddebugger;
import com.google.api.services.clouddebugger.v2.model.Debuggee;
import com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeRequest;
import com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeResponse;
import com.google.api.services.dataflow.Dataflow;
import com.google.api.services.dataflow.model.DataflowPackage;
import com.google.api.services.dataflow.model.Job;
import com.google.api.services.dataflow.model.ListJobsResponse;
import com.google.api.services.dataflow.model.WorkerPool;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
import com.google.cloud.dataflow.sdk.PipelineResult.State;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.BigEndianLongCoder;
import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
import com.google.cloud.dataflow.sdk.coders.IterableCoder;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.coders.ListCoder;
import com.google.cloud.dataflow.sdk.coders.MapCoder;
import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
import com.google.cloud.dataflow.sdk.coders.StandardCoder;
import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
import com.google.cloud.dataflow.sdk.io.AvroIO;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.BoundedSource;
import com.google.cloud.dataflow.sdk.io.FileBasedSink;
import com.google.cloud.dataflow.sdk.io.PubsubIO;
import com.google.cloud.dataflow.sdk.io.PubsubIO.Read.Bound.PubsubReader;
import com.google.cloud.dataflow.sdk.io.PubsubIO.Write.Bound.PubsubWriter;
import com.google.cloud.dataflow.sdk.io.PubsubUnboundedSink;
import com.google.cloud.dataflow.sdk.io.PubsubUnboundedSource;
import com.google.cloud.dataflow.sdk.io.Read;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.io.UnboundedSource;
import com.google.cloud.dataflow.sdk.io.Write;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineDebugOptions;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineWorkerPoolOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsValidator;
import com.google.cloud.dataflow.sdk.options.StreamingOptions;
import com.google.cloud.dataflow.sdk.options.ValueProvider.NestedValueProvider;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.JobSpecification;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TransformTranslator;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TranslationContext;
import com.google.cloud.dataflow.sdk.runners.dataflow.AssignWindows;
import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowAggregatorTransforms;
import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowUnboundedReadFromBoundedSource;
import com.google.cloud.dataflow.sdk.runners.dataflow.ReadTranslator;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.IsmRecord;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.IsmRecordCoder;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.MetadataKeyCoder;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Flatten;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.View.CreatePCollectionView;
import com.google.cloud.dataflow.sdk.transforms.WithKeys;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.windowing.AfterPane;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.util.CoderUtils;
import com.google.cloud.dataflow.sdk.util.DataflowReleaseInfo;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
import com.google.cloud.dataflow.sdk.util.MimeTypes;
import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
import com.google.cloud.dataflow.sdk.util.PCollectionViews;
import com.google.cloud.dataflow.sdk.util.PathValidator;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.Reshuffle;
import com.google.cloud.dataflow.sdk.util.SystemDoFnInternal;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.ValueWithRecordId;
import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.util.WindowedValue.FullWindowedValueCoder;
import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
import com.google.cloud.dataflow.sdk.values.PCollectionList;
import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.POutput;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.base.Utf8;
import com.google.common.collect.ForwardingMap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.joda.time.DateTimeUtils;
import org.joda.time.DateTimeZone;
import org.joda.time.Duration;
import org.joda.time.format.DateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import javax.annotation.Nullable;
/**
* A {@link PipelineRunner} that executes the operations in the
* pipeline by first translating them to the Dataflow representation
* using the {@link DataflowPipelineTranslator} and then submitting
* them to a Dataflow service for execution.
*
* <h3>Permissions</h3>
* When reading from a Dataflow source or writing to a Dataflow sink using
* {@code DataflowPipelineRunner}, the Google cloud services account and the Google Compute Engine
* service account of the GCP project running the Dataflow job will need access to the corresponding
* source/sink.
*
* <p>Please see <i>Google Cloud Dataflow Security and Permissions</i> for more details.
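*
* <p>A minimal usage sketch (the project ID and staging location below are hypothetical
* placeholders, not defaults):
*
* <pre>{@code
* DataflowPipelineOptions options =
*     PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
* options.setRunner(DataflowPipelineRunner.class);
* options.setProject("my-project");
* options.setStagingLocation("gs://my-bucket/staging");
*
* Pipeline p = Pipeline.create(options);
* // ... apply reads, transforms, and writes ...
* DataflowPipelineJob job = (DataflowPipelineJob) p.run();
* }</pre>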
*/
public class DataflowPipelineRunner extends PipelineRunner<DataflowPipelineJob> {
private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineRunner.class);
/** Provided configuration options. */
private final DataflowPipelineOptions options;
/** Client for the Dataflow service. This is used to actually submit jobs. */
private final Dataflow dataflowClient;
/** Translator for this DataflowPipelineRunner, based on options. */
private final DataflowPipelineTranslator translator;
/** Custom transforms implementations. */
private final Map<Class<?>, Class<?>> overrides;
/** A set of user defined functions to invoke at different points in execution. */
private DataflowPipelineRunnerHooks hooks;
// Environment version information.
private static final String ENVIRONMENT_MAJOR_VERSION = "6";
// Default Docker container images that execute Dataflow worker harness, residing in Google
// Container Registry, separately for Batch and Streaming.
public static final String BATCH_WORKER_HARNESS_CONTAINER_IMAGE
= "dataflow.gcr.io/v1beta3/java-batch:1.9.0";
public static final String STREAMING_WORKER_HARNESS_CONTAINER_IMAGE
= "dataflow.gcr.io/v1beta3/java-streaming:1.9.0";
// The limit of CreateJob request size.
private static final int CREATE_JOB_REQUEST_LIMIT_BYTES = 10 * 1024 * 1024;
@VisibleForTesting
static final int GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT = 1 * 1024 * 1024;
private final Set<PCollection<?>> pcollectionsRequiringIndexedFormat;
/**
* Project IDs must contain lowercase letters, digits, or dashes.
* IDs must start with a letter and may not end with a dash.
* This regex isn't exact - this allows for patterns that would be rejected by
* the service, but this is sufficient for basic validation of project IDs.
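*
* <p>For example, {@code "my-project-123"} is accepted by this pattern, while
* {@code "123project"} (leading digit) and {@code "project-"} (trailing dash) are rejected.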
*/
public static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]+[a-z0-9]";
private static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
/**
* Construct a runner from the provided options.
*
* @param options Properties that configure the runner.
* @return The newly created runner.
*/
public static DataflowPipelineRunner fromOptions(PipelineOptions options) {
// (Re-)register standard IO factories. Clobbers any prior credentials.
IOChannelUtils.registerStandardIOFactories(options);
DataflowPipelineOptions dataflowOptions =
PipelineOptionsValidator.validate(DataflowPipelineOptions.class, options);
ArrayList<String> missing = new ArrayList<>();
if (dataflowOptions.getAppName() == null) {
missing.add("appName");
}
if (missing.size() > 0) {
throw new IllegalArgumentException(
"Missing required values: " + Joiner.on(',').join(missing));
}
PathValidator validator = dataflowOptions.getPathValidator();
checkArgument(!(Strings.isNullOrEmpty(dataflowOptions.getTempLocation())
&& Strings.isNullOrEmpty(dataflowOptions.getStagingLocation())),
"Missing required value: at least one of tempLocation or stagingLocation must be set.");
if (dataflowOptions.getStagingLocation() != null) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getStagingLocation());
}
if (dataflowOptions.getTempLocation() != null) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getTempLocation());
}
if (!Strings.isNullOrEmpty(dataflowOptions.getSaveProfilesToGcs())) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getSaveProfilesToGcs());
}
if (dataflowOptions.getEnableProfilingAgent()) {
LOG.error("--enableProfilingAgent is no longer supported, and will be ignored. "
+ "Use --saveProfilesToGcs instead.");
}
if (Strings.isNullOrEmpty(dataflowOptions.getTempLocation())) {
dataflowOptions.setTempLocation(dataflowOptions.getStagingLocation());
} else if (Strings.isNullOrEmpty(dataflowOptions.getStagingLocation())) {
try {
dataflowOptions.setStagingLocation(
IOChannelUtils.resolve(dataflowOptions.getTempLocation(), "staging"));
} catch (IOException e) {
throw new IllegalArgumentException("Unable to resolve PipelineOptions.stagingLocation "
+ "from PipelineOptions.tempLocation. Please set the staging location explicitly.", e);
}
}
if (dataflowOptions.getFilesToStage() == null) {
dataflowOptions.setFilesToStage(detectClassPathResourcesToStage(
DataflowPipelineRunner.class.getClassLoader()));
LOG.info("PipelineOptions.filesToStage was not specified. "
+ "Defaulting to files from the classpath: will stage {} files. "
+ "Enable logging at DEBUG level to see which files will be staged.",
dataflowOptions.getFilesToStage().size());
LOG.debug("Classpath elements: {}", dataflowOptions.getFilesToStage());
}
// Verify jobName according to service requirements, converting to lowercase if
// necessary.
String jobName =
dataflowOptions
.getJobName()
.toLowerCase();
checkArgument(
jobName.matches("[a-z]([-a-z0-9]*[a-z0-9])?"),
"JobName invalid; the name must consist of only the characters "
+ "[-a-z0-9], starting with a letter and ending with a letter "
+ "or number");
if (!jobName.equals(dataflowOptions.getJobName())) {
LOG.info(
"PipelineOptions.jobName did not match the service requirements. "
+ "Using {} instead of {}.",
jobName,
dataflowOptions.getJobName());
}
dataflowOptions.setJobName(jobName);
// Verify project
String project = dataflowOptions.getProject();
if (project.matches("[0-9]*")) {
throw new IllegalArgumentException("Project ID '" + project
+ "' invalid. Please make sure you specified the Project ID, not project number.");
} else if (!project.matches(PROJECT_ID_REGEXP)) {
throw new IllegalArgumentException("Project ID '" + project
+ "' invalid. Please make sure you specified the Project ID, not project description.");
}
DataflowPipelineDebugOptions debugOptions =
dataflowOptions.as(DataflowPipelineDebugOptions.class);
// Verify the number of worker threads is a valid value
if (debugOptions.getNumberOfWorkerHarnessThreads() < 0) {
throw new IllegalArgumentException("Number of worker harness threads '"
+ debugOptions.getNumberOfWorkerHarnessThreads()
+ "' invalid. Please make sure the value is non-negative.");
}
if (dataflowOptions.isStreaming() && dataflowOptions.getGcsUploadBufferSizeBytes() == null) {
dataflowOptions.setGcsUploadBufferSizeBytes(GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT);
}
return new DataflowPipelineRunner(dataflowOptions);
}
@VisibleForTesting protected DataflowPipelineRunner(DataflowPipelineOptions options) {
this.options = options;
this.dataflowClient = options.getDataflowClient();
this.translator = DataflowPipelineTranslator.fromOptions(options);
this.pcollectionsRequiringIndexedFormat = new HashSet<>();
this.ptransformViewsWithNonDeterministicKeyCoders = new HashSet<>();
ImmutableMap.Builder<Class<?>, Class<?>> builder = ImmutableMap.<Class<?>, Class<?>>builder();
if (options.isStreaming()) {
builder.put(Combine.GloballyAsSingletonView.class,
StreamingCombineGloballyAsSingletonView.class);
builder.put(Create.Values.class, StreamingCreate.class);
builder.put(View.AsMap.class, StreamingViewAsMap.class);
builder.put(View.AsMultimap.class, StreamingViewAsMultimap.class);
builder.put(View.AsSingleton.class, StreamingViewAsSingleton.class);
builder.put(View.AsList.class, StreamingViewAsList.class);
builder.put(View.AsIterable.class, StreamingViewAsIterable.class);
builder.put(Read.Unbounded.class, StreamingUnboundedRead.class);
builder.put(Read.Bounded.class, StreamingBoundedRead.class);
builder.put(AvroIO.Write.Bound.class, UnsupportedIO.class);
builder.put(Window.Bound.class, AssignWindows.class);
// In streaming mode must use either the custom Pubsub unbounded source/sink or
// defer to Windmill's built-in implementation.
builder.put(PubsubReader.class, UnsupportedIO.class);
builder.put(PubsubWriter.class, UnsupportedIO.class);
if (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_pubsub_sink")) {
builder.put(PubsubIO.Write.Bound.class, StreamingPubsubIOWrite.class);
}
} else {
builder.put(Read.Unbounded.class, UnsupportedIO.class);
builder.put(Window.Bound.class, AssignWindows.class);
builder.put(Write.Bound.class, BatchWrite.class);
// In batch mode must use the custom Pubsub bounded source/sink.
builder.put(PubsubUnboundedSource.class, UnsupportedIO.class);
builder.put(PubsubUnboundedSink.class, UnsupportedIO.class);
if (options.getExperiments() == null
|| !options.getExperiments().contains("disable_ism_side_input")) {
builder.put(View.AsMap.class, BatchViewAsMap.class);
builder.put(View.AsMultimap.class, BatchViewAsMultimap.class);
builder.put(View.AsSingleton.class, BatchViewAsSingleton.class);
builder.put(View.AsList.class, BatchViewAsList.class);
builder.put(View.AsIterable.class, BatchViewAsIterable.class);
}
if (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_bigquery_source")) {
builder.put(BigQueryIO.Read.Bound.class, BatchBigQueryIONativeRead.class);
}
if (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_bigquery_sink")) {
builder.put(BigQueryIO.Write.Bound.class, BatchBigQueryIOWrite.class);
}
}
overrides = builder.build();
}
/**
* Applies the given transform to the input. For transforms with customized definitions
* for the Dataflow pipeline runner, the application is intercepted and modified here.
*/
@Override
public <OutputT extends POutput, InputT extends PInput> OutputT apply(
PTransform<InputT, OutputT> transform, InputT input) {
if (Combine.GroupedValues.class.equals(transform.getClass())
|| GroupByKey.class.equals(transform.getClass())) {
// For both Dataflow runners (streaming and batch), GroupByKey and GroupedValues are
// primitives. Returning a primitive output instead of the expanded definition
// signals to the translator that translation is necessary.
@SuppressWarnings("unchecked")
PCollection<?> pc = (PCollection<?>) input;
@SuppressWarnings("unchecked")
OutputT outputT = (OutputT) PCollection.createPrimitiveOutputInternal(
pc.getPipeline(),
transform instanceof GroupByKey
? ((GroupByKey<?, ?>) transform).updateWindowingStrategy(pc.getWindowingStrategy())
: pc.getWindowingStrategy(),
pc.isBounded());
return outputT;
} else if (PubsubIO.Read.Bound.class.equals(transform.getClass())
&& options.isStreaming()
&& (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_pubsub_source"))) {
// casting to wildcard
@SuppressWarnings("unchecked")
OutputT pubsub = (OutputT) applyPubsubStreamingRead((PubsubIO.Read.Bound<?>) transform,
input);
return pubsub;
} else if (Window.Bound.class.equals(transform.getClass())) {
/*
* TODO: make this the generic way overrides are applied (using super.apply() rather than
* Pipeline.applyTransform()); this allows the apply method to be replaced without inserting
* additional nodes into the graph.
*/
// casting to wildcard
@SuppressWarnings("unchecked")
OutputT windowed = (OutputT) applyWindow((Window.Bound<?>) transform, (PCollection<?>) input);
return windowed;
} else if (Flatten.FlattenPCollectionList.class.equals(transform.getClass())
&& ((PCollectionList<?>) input).size() == 0) {
return (OutputT) Pipeline.applyTransform(input, Create.of());
} else if (overrides.containsKey(transform.getClass())) {
// It is the responsibility of whoever constructs overrides to ensure this is type safe.
@SuppressWarnings("unchecked")
Class<PTransform<InputT, OutputT>> transformClass =
(Class<PTransform<InputT, OutputT>>) transform.getClass();
@SuppressWarnings("unchecked")
Class<PTransform<InputT, OutputT>> customTransformClass =
(Class<PTransform<InputT, OutputT>>) overrides.get(transform.getClass());
PTransform<InputT, OutputT> customTransform =
InstanceBuilder.ofType(customTransformClass)
.withArg(DataflowPipelineRunner.class, this)
.withArg(transformClass, transform)
.build();
return Pipeline.applyTransform(input, customTransform);
} else {
return super.apply(transform, input);
}
}
private <T> PCollection<T>
applyPubsubStreamingRead(PubsubIO.Read.Bound<?> initialTransform, PInput input) {
// types are matched at compile time
@SuppressWarnings("unchecked")
PubsubIO.Read.Bound<T> transform = (PubsubIO.Read.Bound<T>) initialTransform;
return PCollection.createPrimitiveOutputInternal(
input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED)
.setCoder(transform.getCoder());
}
private <T> PCollection<T> applyWindow(
Window.Bound<?> initialTransform, PCollection<?> initialInput) {
// types are matched at compile time
@SuppressWarnings("unchecked")
Window.Bound<T> transform = (Window.Bound<T>) initialTransform;
@SuppressWarnings("unchecked")
PCollection<T> input = (PCollection<T>) initialInput;
return super.apply(new AssignWindows<>(transform), input);
}
private String debuggerMessage(String projectId, String uniquifier) {
return String.format("To debug your job, visit Google Cloud Debugger at: "
+ "https://console.developers.google.com/debug?project=%s&dbgee=%s",
projectId, uniquifier);
}
private void maybeRegisterDebuggee(DataflowPipelineOptions options, String uniquifier) {
if (!options.getEnableCloudDebugger()) {
return;
}
if (options.getDebuggee() != null) {
throw new RuntimeException("Should not specify the debuggee");
}
Clouddebugger debuggerClient = Transport.newClouddebuggerClient(options).build();
Debuggee debuggee = registerDebuggee(debuggerClient, uniquifier);
options.setDebuggee(debuggee);
System.out.println(debuggerMessage(options.getProject(), debuggee.getUniquifier()));
}
private Debuggee registerDebuggee(Clouddebugger debuggerClient, String uniquifier) {
RegisterDebuggeeRequest registerReq = new RegisterDebuggeeRequest();
registerReq.setDebuggee(new Debuggee()
.setProject(options.getProject())
.setUniquifier(uniquifier)
.setDescription(uniquifier)
.setAgentVersion("google.com/cloud-dataflow-java/v1"));
try {
RegisterDebuggeeResponse registerResponse =
debuggerClient.controller().debuggees().register(registerReq).execute();
Debuggee debuggee = registerResponse.getDebuggee();
if (debuggee.getStatus() != null && debuggee.getStatus().getIsError()) {
throw new RuntimeException("Unable to register with the debugger: "
+ debuggee.getStatus().getDescription().getFormat());
}
return debuggee;
} catch (IOException e) {
throw new RuntimeException("Unable to register with the debugger: ", e);
}
}
@Override
public DataflowPipelineJob run(Pipeline pipeline) {
logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications "
+ "related to Google Compute Engine usage and other Google Cloud Services.");
List<DataflowPackage> packages = options.getStager().stageFiles();
// Set a unique client_request_id in the CreateJob request.
// This is used to ensure idempotence of job creation across retried
// attempts to create a job. Specifically, if the service returns a job with
// a different client_request_id, it means the returned one is a different
// job previously created with the same job name, and that the job creation
// has been effectively rejected. The SDK should return
// Error::Already_Exists to user in that case.
int randomNum = new Random().nextInt(9000) + 1000;
String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC)
.print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
// Try to create a debuggee ID. This must happen before the job is translated since it may
// update the options.
DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
maybeRegisterDebuggee(dataflowOptions, requestId);
JobSpecification jobSpecification =
translator.translate(pipeline, this, packages);
Job newJob = jobSpecification.getJob();
newJob.setClientRequestId(requestId);
String version = DataflowReleaseInfo.getReleaseInfo().getVersion();
System.out.println("Dataflow SDK version: " + version);
newJob.getEnvironment().setUserAgent(DataflowReleaseInfo.getReleaseInfo());
// The Dataflow Service may write to the temporary directory directly, so
// must be verified.
if (!Strings.isNullOrEmpty(options.getTempLocation())) {
newJob.getEnvironment().setTempStoragePrefix(
dataflowOptions.getPathValidator().verifyPath(options.getTempLocation()));
}
newJob.getEnvironment().setDataset(options.getTempDatasetId());
newJob.getEnvironment().setExperiments(options.getExperiments());
// Set the Docker container image that executes Dataflow worker harness, residing in Google
// Container Registry. Translator is guaranteed to create a worker pool prior to this point.
String workerHarnessContainerImage =
options.as(DataflowPipelineWorkerPoolOptions.class)
.getWorkerHarnessContainerImage();
for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
workerPool.setWorkerHarnessContainerImage(workerHarnessContainerImage);
}
// Requirements about the service.
Map<String, Object> environmentVersion = new HashMap<>();
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_MAJOR_KEY, ENVIRONMENT_MAJOR_VERSION);
newJob.getEnvironment().setVersion(environmentVersion);
// Default jobType is JAVA_BATCH_AUTOSCALING: a Java batch job whose workers
// may be autoscaled if so configured.
String jobType = "JAVA_BATCH_AUTOSCALING";
if (options.isStreaming()) {
jobType = "STREAMING";
}
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_JOB_TYPE_KEY, jobType);
if (hooks != null) {
hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
}
if (!Strings.isNullOrEmpty(options.getDataflowJobFile())) {
runJobFileHooks(newJob);
}
if (hooks != null && !hooks.shouldActuallyRunJob()) {
return null;
}
String jobIdToUpdate = null;
if (options.getUpdate()) {
jobIdToUpdate = getJobIdFromName(options.getJobName());
newJob.setTransformNameMapping(options.getTransformNameMapping());
newJob.setReplaceJobId(jobIdToUpdate);
}
Job jobResult;
try {
jobResult = dataflowClient
.projects()
.jobs()
.create(options.getProject(), newJob)
.execute();
} catch (GoogleJsonResponseException e) {
String errorMessages = "Unexpected errors";
if (e.getDetails() != null) {
if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
errorMessages = "The size of the serialized JSON representation of the pipeline "
+ "exceeds the allowable limit. "
+ "For more information, please check the FAQ link below:\n"
+ "https://cloud.google.com/dataflow/faq";
} else {
errorMessages = e.getDetails().getMessage();
}
}
throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
} catch (IOException e) {
throw new RuntimeException("Failed to create a workflow job", e);
}
// Obtain all of the extractors from the PTransforms used in the pipeline so the
// DataflowPipelineJob has access to them.
AggregatorPipelineExtractor aggregatorExtractor = new AggregatorPipelineExtractor(pipeline);
Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps =
aggregatorExtractor.getAggregatorSteps();
DataflowAggregatorTransforms aggregatorTransforms =
new DataflowAggregatorTransforms(aggregatorSteps, jobSpecification.getStepNames());
// Use a raw client for post-launch monitoring, as status calls may fail
// regularly and need not be retried automatically.
DataflowPipelineJob dataflowPipelineJob =
new DataflowPipelineJob(options.getProject(), jobResult.getId(),
Transport.newDataflowClient(options).build(), aggregatorTransforms);
// If the service returned a client request id, the SDK needs to compare it
// with the original id generated in the request. If they are not the same
// (i.e., the returned job was not created by this request), throw
// DataflowJobAlreadyExistsException or DataflowJobAlreadyUpdatedException
// depending on whether this is an update or a new submission.
if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty()
&& !jobResult.getClientRequestId().equals(requestId)) {
// If updating a job.
if (options.getUpdate()) {
throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob,
String.format("The job named %s with id: %s has already been updated into job id: %s "
+ "and cannot be updated again.",
newJob.getName(), jobIdToUpdate, jobResult.getId()));
} else {
throw new DataflowJobAlreadyExistsException(dataflowPipelineJob,
String.format("There is already an active job named %s with id: %s. If you want "
+ "to submit a second job, try again by setting a different name using --jobName.",
newJob.getName(), jobResult.getId()));
}
}
LOG.info("To access the Dataflow monitoring console, please navigate to {}",
MonitoringUtil.getJobMonitoringPageURL(options.getProject(), jobResult.getId()));
System.out.println("Submitted job: " + jobResult.getId());
LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}",
MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
return dataflowPipelineJob;
}
/**
* Returns the DataflowPipelineTranslator associated with this object.
*/
public DataflowPipelineTranslator getTranslator() {
return translator;
}
/**
* Sets callbacks to invoke during execution; see {@code DataflowPipelineRunnerHooks}.
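*
* <p>A minimal sketch of installing a hook (the body of the override is illustrative only;
* see also {@code shouldActuallyRunJob} and {@code failOnJobFileWriteFailure}):
*
* <pre>{@code
* DataflowPipelineRunner runner = DataflowPipelineRunner.fromOptions(options);
* runner.setHooks(new DataflowPipelineRunnerHooks() {
*   public void modifyEnvironmentBeforeSubmission(Environment environment) {
*     // Inspect or adjust the job's Environment before it is submitted.
*   }
* });
* }</pre>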
*/
@Experimental
public void setHooks(DataflowPipelineRunnerHooks hooks) {
this.hooks = hooks;
}
/////////////////////////////////////////////////////////////////////////////
/** Outputs a warning about PCollection views without deterministic key coders. */
private void logWarningIfPCollectionViewHasNonDeterministicKeyCoder(Pipeline pipeline) {
// We need to wait until this point to determine the names of the transforms, since only
// now do we know the full hierarchy of the transforms; otherwise we could
// have just recorded the full names at apply time.
if (!ptransformViewsWithNonDeterministicKeyCoders.isEmpty()) {
final SortedSet<String> ptransformViewNamesWithNonDeterministicKeyCoders = new TreeSet<>();
pipeline.traverseTopologically(new PipelineVisitor() {
@Override
public void visitValue(PValue value, TransformTreeNode producer) {
}
@Override
public void visitTransform(TransformTreeNode node) {
if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
}
}
@Override
public void enterCompositeTransform(TransformTreeNode node) {
if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
}
}
@Override
public void leaveCompositeTransform(TransformTreeNode node) {
}
});
LOG.warn("Unable to use indexed implementation for View.AsMap and View.AsMultimap for {} "
+ "because the key coder is not deterministic. Falling back to singleton implementation "
+ "which may cause memory and/or performance problems. Future major versions of "
+ "Dataflow will require deterministic key coders.",
ptransformViewNamesWithNonDeterministicKeyCoders);
}
}
private void runJobFileHooks(Job newJob) {
try {
WritableByteChannel writer =
IOChannelUtils.create(options.getDataflowJobFile(), MimeTypes.TEXT);
PrintWriter printWriter = new PrintWriter(Channels.newOutputStream(writer));
String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
printWriter.print(workSpecJson);
printWriter.flush();
printWriter.close();
LOG.info("Printed job specification to {}", options.getDataflowJobFile());
} catch (IllegalStateException ex) {
String error = "Cannot translate workflow spec to JSON.";
if (hooks != null && hooks.failOnJobFileWriteFailure()) {
throw new RuntimeException(error, ex);
} else {
LOG.warn(error, ex);
}
} catch (IOException ex) {
String error =
String.format("Cannot create output file at {}", options.getDataflowJobFile());
if (hooks != null && hooks.failOnJobFileWriteFailure()) {
throw new RuntimeException(error, ex);
} else {
LOG.warn(error, ex);
}
}
}
/**
* Returns true if the passed in {@link PCollection} needs to be materialized using
* an indexed format.
*/
boolean doesPCollectionRequireIndexedFormat(PCollection<?> pcol) {
return pcollectionsRequiringIndexedFormat.contains(pcol);
}
/**
* Marks the passed in {@link PCollection} as requiring materialization using
* an indexed format.
*/
private void addPCollectionRequiringIndexedFormat(PCollection<?> pcol) {
pcollectionsRequiringIndexedFormat.add(pcol);
}
/** A set of {@link View}s with non-deterministic key coders. */
Set<PTransform<?, ?>> ptransformViewsWithNonDeterministicKeyCoders;
/**
* Records that the {@link PTransform} requires a deterministic key coder.
*/
private void recordViewUsesNonDeterministicKeyCoder(PTransform<?, ?> ptransform) {
ptransformViewsWithNonDeterministicKeyCoders.add(ptransform);
}
/**
* A {@link GroupByKey} transform for the {@link DataflowPipelineRunner} which sorts
* values using the secondary key {@code K2}.
*
* <p>The {@link PCollection} created by this {@link PTransform} will have values in
* the empty window. Care must be taken *afterwards* to either re-window
* (using {@link Window#into}) or only use {@link PTransform}s that do not depend on the
* values being within a window.
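*
* <p>For illustration, a sketch of the transform's input/output shape (using the type
* parameters {@code K1}, {@code K2}, and {@code V} declared below):
*
* <pre>{@code
* PCollection<KV<K1, KV<K2, V>>>  ->  PCollection<KV<K1, Iterable<KV<K2, V>>>>
* }</pre>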
*/
static class GroupByKeyAndSortValuesOnly<K1, K2, V>
extends PTransform<PCollection<KV<K1, KV<K2, V>>>, PCollection<KV<K1, Iterable<KV<K2, V>>>>> {
private GroupByKeyAndSortValuesOnly() {
}
@Override
public PCollection<KV<K1, Iterable<KV<K2, V>>>> apply(PCollection<KV<K1, KV<K2, V>>> input) {
PCollection<KV<K1, Iterable<KV<K2, V>>>> rval =
PCollection.<KV<K1, Iterable<KV<K2, V>>>>createPrimitiveOutputInternal(
input.getPipeline(),
WindowingStrategy.globalDefault(),
IsBounded.BOUNDED);
@SuppressWarnings({"unchecked", "rawtypes"})
KvCoder<K1, KV<K2, V>> inputCoder = (KvCoder) input.getCoder();
rval.setCoder(
KvCoder.of(inputCoder.getKeyCoder(),
IterableCoder.of(inputCoder.getValueCoder())));
return rval;
}
}
/**
* A {@link PTransform} that groups the values by a hash of the window's byte representation
* and sorts the values using the window's byte representation.
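*
* <p>Shape sketch (with {@code T} the element type and {@code W} the window type declared
* below):
*
* <pre>{@code
* PCollection<T>  ->  PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>>
* }</pre>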
*/
private static class GroupByWindowHashAsKeyAndWindowAsSortKey<T, W extends BoundedWindow> extends
PTransform<PCollection<T>, PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>>> {
/**
* A {@link DoFn} that for each element outputs a {@code KV} structure suitable for
* grouping by the hash of the window's byte representation and sorting the grouped values
* using the window's byte representation.
*/
@SystemDoFnInternal
private static class UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W extends BoundedWindow>
extends DoFn<T, KV<Integer, KV<W, WindowedValue<T>>>> implements DoFn.RequiresWindowAccess {
private final IsmRecordCoder<?> ismCoderForHash;
private UseWindowHashAsKeyAndWindowAsSortKeyDoFn(IsmRecordCoder<?> ismCoderForHash) {
this.ismCoderForHash = ismCoderForHash;
}
@Override
public void processElement(ProcessContext c) throws Exception {
@SuppressWarnings("unchecked")
W window = (W) c.window();
c.output(
KV.of(ismCoderForHash.hash(ImmutableList.of(window)),
KV.of(window,
WindowedValue.of(
c.element(),
c.timestamp(),
c.window(),
c.pane()))));
}
}
private final IsmRecordCoder<?> ismCoderForHash;
private GroupByWindowHashAsKeyAndWindowAsSortKey(IsmRecordCoder<?> ismCoderForHash) {
this.ismCoderForHash = ismCoderForHash;
}
@Override
public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> apply(PCollection<T> input) {
@SuppressWarnings("unchecked")
Coder<W> windowCoder = (Coder<W>)
input.getWindowingStrategy().getWindowFn().windowCoder();
PCollection<KV<Integer, KV<W, WindowedValue<T>>>> rval =
input.apply(ParDo.of(
new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
rval.setCoder(
KvCoder.of(
VarIntCoder.of(),
KvCoder.of(windowCoder,
FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
return rval.apply(new GroupByKeyAndSortValuesOnly<Integer, W, WindowedValue<T>>());
}
}
/**
* Specialized implementation for
* {@link com.google.cloud.dataflow.sdk.transforms.View.AsSingleton View.AsSingleton} for the
* Dataflow runner in batch mode.
*
* <p>Creates a set of files in the {@link IsmFormat} sharded by the hash of the window's
* byte representation and with records having:
* <ul>
*   <li>Key 1: Window</li>
*   <li>Value: Windowed value</li>
* </ul>
*/
static class BatchViewAsSingleton<T>
extends PTransform<PCollection<T>, PCollectionView<T>> {
/**
* A {@link DoFn} that outputs {@link IsmRecord}s. These records are structured as follows:
* <ul>
*   <li>Key 1: Window</li>
*   <li>Value: Windowed value</li>
* </ul>
*/
static class IsmRecordForSingularValuePerWindowDoFn<T, W extends BoundedWindow>
extends DoFn<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>,
IsmRecord<WindowedValue<T>>> {
private final Coder<W> windowCoder;
IsmRecordForSingularValuePerWindowDoFn(Coder<W> windowCoder) {
this.windowCoder = windowCoder;
}
@Override
public void processElement(ProcessContext c) throws Exception {
Optional