/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.runners;
import static com.google.cloud.dataflow.sdk.util.StringUtils.approximatePTransformName;
import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;
import static com.google.cloud.dataflow.sdk.util.WindowedValue.valueInEmptyWindows;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.client.json.JsonFactory;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.clouddebugger.v2.Clouddebugger;
import com.google.api.services.clouddebugger.v2.model.Debuggee;
import com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeRequest;
import com.google.api.services.clouddebugger.v2.model.RegisterDebuggeeResponse;
import com.google.api.services.dataflow.Dataflow;
import com.google.api.services.dataflow.model.DataflowPackage;
import com.google.api.services.dataflow.model.Job;
import com.google.api.services.dataflow.model.ListJobsResponse;
import com.google.api.services.dataflow.model.WorkerPool;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
import com.google.cloud.dataflow.sdk.PipelineResult.State;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.BigEndianLongCoder;
import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.Coder.NonDeterministicException;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
import com.google.cloud.dataflow.sdk.coders.IterableCoder;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.coders.ListCoder;
import com.google.cloud.dataflow.sdk.coders.MapCoder;
import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
import com.google.cloud.dataflow.sdk.coders.StandardCoder;
import com.google.cloud.dataflow.sdk.coders.TableRowJsonCoder;
import com.google.cloud.dataflow.sdk.coders.VarIntCoder;
import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
import com.google.cloud.dataflow.sdk.io.AvroIO;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.BoundedSource;
import com.google.cloud.dataflow.sdk.io.FileBasedSink;
import com.google.cloud.dataflow.sdk.io.PubsubIO;
import com.google.cloud.dataflow.sdk.io.PubsubIO.Read.Bound.PubsubReader;
import com.google.cloud.dataflow.sdk.io.PubsubIO.Write.Bound.PubsubWriter;
import com.google.cloud.dataflow.sdk.io.PubsubUnboundedSink;
import com.google.cloud.dataflow.sdk.io.PubsubUnboundedSource;
import com.google.cloud.dataflow.sdk.io.Read;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.io.UnboundedSource;
import com.google.cloud.dataflow.sdk.io.Write;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineDebugOptions;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineWorkerPoolOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsValidator;
import com.google.cloud.dataflow.sdk.options.StreamingOptions;
import com.google.cloud.dataflow.sdk.options.ValueProvider.NestedValueProvider;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.JobSpecification;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TransformTranslator;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.TranslationContext;
import com.google.cloud.dataflow.sdk.runners.dataflow.AssignWindows;
import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowAggregatorTransforms;
import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowUnboundedReadFromBoundedSource;
import com.google.cloud.dataflow.sdk.runners.dataflow.ReadTranslator;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.IsmRecord;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.IsmRecordCoder;
import com.google.cloud.dataflow.sdk.runners.worker.IsmFormat.MetadataKeyCoder;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Flatten;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.View.CreatePCollectionView;
import com.google.cloud.dataflow.sdk.transforms.WithKeys;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.windowing.AfterPane;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.util.CoderUtils;
import com.google.cloud.dataflow.sdk.util.DataflowReleaseInfo;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
import com.google.cloud.dataflow.sdk.util.MimeTypes;
import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
import com.google.cloud.dataflow.sdk.util.PCollectionViews;
import com.google.cloud.dataflow.sdk.util.PathValidator;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.Reshuffle;
import com.google.cloud.dataflow.sdk.util.SystemDoFnInternal;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.ValueWithRecordId;
import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.util.WindowedValue.FullWindowedValueCoder;
import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
import com.google.cloud.dataflow.sdk.values.PCollectionList;
import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.POutput;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.base.Utf8;
import com.google.common.collect.ForwardingMap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.joda.time.DateTimeUtils;
import org.joda.time.DateTimeZone;
import org.joda.time.Duration;
import org.joda.time.format.DateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import javax.annotation.Nullable;
/**
* A {@link PipelineRunner} that executes the operations in the
* pipeline by first translating them to the Dataflow representation
* using the {@link DataflowPipelineTranslator} and then submitting
* them to a Dataflow service for execution.
*
* <h3>Permissions</h3>
* When reading from a Dataflow source or writing to a Dataflow sink using
* {@code DataflowPipelineRunner}, the Google cloud services account and the Google Compute Engine
* service account of the GCP project running the Dataflow job will need access to the corresponding
* source/sink.
*
* <p>Please see <i>Google Cloud Dataflow Security and Permissions</i> for more details.
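*
* <p>A minimal usage sketch (the project ID and staging location below are hypothetical
* placeholders, not defaults):
*
* <pre>{@code
* DataflowPipelineOptions options =
*     PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
* options.setRunner(DataflowPipelineRunner.class);
* options.setProject("my-project");
* options.setStagingLocation("gs://my-bucket/staging");
*
* Pipeline p = Pipeline.create(options);
* // ... apply reads, transforms, and writes ...
* DataflowPipelineJob job = (DataflowPipelineJob) p.run();
* }</pre>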
*/
public class DataflowPipelineRunner extends PipelineRunner<DataflowPipelineJob> {
private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineRunner.class);
/** Provided configuration options. */
private final DataflowPipelineOptions options;
/** Client for the Dataflow service. This is used to actually submit jobs. */
private final Dataflow dataflowClient;
/** Translator for this DataflowPipelineRunner, based on options. */
private final DataflowPipelineTranslator translator;
/** Custom transforms implementations. */
private final Map<Class<?>, Class<?>> overrides;
/** A set of user defined functions to invoke at different points in execution. */
private DataflowPipelineRunnerHooks hooks;
// Environment version information.
private static final String ENVIRONMENT_MAJOR_VERSION = "6";
// Default Docker container images that execute Dataflow worker harness, residing in Google
// Container Registry, separately for Batch and Streaming.
public static final String BATCH_WORKER_HARNESS_CONTAINER_IMAGE
= "dataflow.gcr.io/v1beta3/java-batch:1.9.0";
public static final String STREAMING_WORKER_HARNESS_CONTAINER_IMAGE
= "dataflow.gcr.io/v1beta3/java-streaming:1.9.0";
// The limit of CreateJob request size.
private static final int CREATE_JOB_REQUEST_LIMIT_BYTES = 10 * 1024 * 1024;
@VisibleForTesting
static final int GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT = 1 * 1024 * 1024;
private final Set<PCollection<?>> pcollectionsRequiringIndexedFormat;
/**
* Project IDs must contain lowercase letters, digits, or dashes.
* IDs must start with a letter and may not end with a dash.
* This regex isn't exact - this allows for patterns that would be rejected by
* the service, but this is sufficient for basic validation of project IDs.
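*
* <p>For example, {@code "my-project-123"} is accepted by this pattern, while
* {@code "123project"} (leading digit) and {@code "project-"} (trailing dash) are rejected.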
*/
public static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]+[a-z0-9]";
private static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
/**
* Construct a runner from the provided options.
*
* @param options Properties that configure the runner.
* @return The newly created runner.
*/
public static DataflowPipelineRunner fromOptions(PipelineOptions options) {
// (Re-)register standard IO factories. Clobbers any prior credentials.
IOChannelUtils.registerStandardIOFactories(options);
DataflowPipelineOptions dataflowOptions =
PipelineOptionsValidator.validate(DataflowPipelineOptions.class, options);
ArrayList<String> missing = new ArrayList<>();
if (dataflowOptions.getAppName() == null) {
missing.add("appName");
}
if (missing.size() > 0) {
throw new IllegalArgumentException(
"Missing required values: " + Joiner.on(',').join(missing));
}
PathValidator validator = dataflowOptions.getPathValidator();
checkArgument(!(Strings.isNullOrEmpty(dataflowOptions.getTempLocation())
&& Strings.isNullOrEmpty(dataflowOptions.getStagingLocation())),
"Missing required value: at least one of tempLocation or stagingLocation must be set.");
if (dataflowOptions.getStagingLocation() != null) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getStagingLocation());
}
if (dataflowOptions.getTempLocation() != null) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getTempLocation());
}
if (!Strings.isNullOrEmpty(dataflowOptions.getSaveProfilesToGcs())) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getSaveProfilesToGcs());
}
if (dataflowOptions.getEnableProfilingAgent()) {
LOG.error("--enableProfilingAgent is no longer supported, and will be ignored. "
+ "Use --saveProfilesToGcs instead.");
}
if (Strings.isNullOrEmpty(dataflowOptions.getTempLocation())) {
dataflowOptions.setTempLocation(dataflowOptions.getStagingLocation());
} else if (Strings.isNullOrEmpty(dataflowOptions.getStagingLocation())) {
try {
dataflowOptions.setStagingLocation(
IOChannelUtils.resolve(dataflowOptions.getTempLocation(), "staging"));
} catch (IOException e) {
throw new IllegalArgumentException("Unable to resolve PipelineOptions.stagingLocation "
+ "from PipelineOptions.tempLocation. Please set the staging location explicitly.", e);
}
}
if (dataflowOptions.getFilesToStage() == null) {
dataflowOptions.setFilesToStage(detectClassPathResourcesToStage(
DataflowPipelineRunner.class.getClassLoader()));
LOG.info("PipelineOptions.filesToStage was not specified. "
+ "Defaulting to files from the classpath: will stage {} files. "
+ "Enable logging at DEBUG level to see which files will be staged.",
dataflowOptions.getFilesToStage().size());
LOG.debug("Classpath elements: {}", dataflowOptions.getFilesToStage());
}
// Verify jobName according to service requirements, converting to lowercase if
// necessary.
String jobName =
dataflowOptions
.getJobName()
.toLowerCase();
checkArgument(
jobName.matches("[a-z]([-a-z0-9]*[a-z0-9])?"),
"JobName invalid; the name must consist of only the characters "
+ "[-a-z0-9], starting with a letter and ending with a letter "
+ "or number");
if (!jobName.equals(dataflowOptions.getJobName())) {
LOG.info(
"PipelineOptions.jobName did not match the service requirements. "
+ "Using {} instead of {}.",
jobName,
dataflowOptions.getJobName());
}
dataflowOptions.setJobName(jobName);
// Verify project
String project = dataflowOptions.getProject();
if (project.matches("[0-9]*")) {
throw new IllegalArgumentException("Project ID '" + project
+ "' invalid. Please make sure you specified the Project ID, not project number.");
} else if (!project.matches(PROJECT_ID_REGEXP)) {
throw new IllegalArgumentException("Project ID '" + project
+ "' invalid. Please make sure you specified the Project ID, not project description.");
}
DataflowPipelineDebugOptions debugOptions =
dataflowOptions.as(DataflowPipelineDebugOptions.class);
// Verify the number of worker threads is a valid value
if (debugOptions.getNumberOfWorkerHarnessThreads() < 0) {
throw new IllegalArgumentException("Number of worker harness threads '"
+ debugOptions.getNumberOfWorkerHarnessThreads()
+ "' invalid. Please make sure the value is non-negative.");
}
if (dataflowOptions.isStreaming() && dataflowOptions.getGcsUploadBufferSizeBytes() == null) {
dataflowOptions.setGcsUploadBufferSizeBytes(GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT);
}
return new DataflowPipelineRunner(dataflowOptions);
}
@VisibleForTesting protected DataflowPipelineRunner(DataflowPipelineOptions options) {
this.options = options;
this.dataflowClient = options.getDataflowClient();
this.translator = DataflowPipelineTranslator.fromOptions(options);
this.pcollectionsRequiringIndexedFormat = new HashSet<>();
this.ptransformViewsWithNonDeterministicKeyCoders = new HashSet<>();
ImmutableMap.Builder<Class<?>, Class<?>> builder = ImmutableMap.<Class<?>, Class<?>>builder();
if (options.isStreaming()) {
builder.put(Combine.GloballyAsSingletonView.class,
StreamingCombineGloballyAsSingletonView.class);
builder.put(Create.Values.class, StreamingCreate.class);
builder.put(View.AsMap.class, StreamingViewAsMap.class);
builder.put(View.AsMultimap.class, StreamingViewAsMultimap.class);
builder.put(View.AsSingleton.class, StreamingViewAsSingleton.class);
builder.put(View.AsList.class, StreamingViewAsList.class);
builder.put(View.AsIterable.class, StreamingViewAsIterable.class);
builder.put(Read.Unbounded.class, StreamingUnboundedRead.class);
builder.put(Read.Bounded.class, StreamingBoundedRead.class);
builder.put(AvroIO.Write.Bound.class, UnsupportedIO.class);
builder.put(Window.Bound.class, AssignWindows.class);
// In streaming mode must use either the custom Pubsub unbounded source/sink or
// defer to Windmill's built-in implementation.
builder.put(PubsubReader.class, UnsupportedIO.class);
builder.put(PubsubWriter.class, UnsupportedIO.class);
if (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_pubsub_sink")) {
builder.put(PubsubIO.Write.Bound.class, StreamingPubsubIOWrite.class);
}
} else {
builder.put(Read.Unbounded.class, UnsupportedIO.class);
builder.put(Window.Bound.class, AssignWindows.class);
builder.put(Write.Bound.class, BatchWrite.class);
// In batch mode must use the custom Pubsub bounded source/sink.
builder.put(PubsubUnboundedSource.class, UnsupportedIO.class);
builder.put(PubsubUnboundedSink.class, UnsupportedIO.class);
if (options.getExperiments() == null
|| !options.getExperiments().contains("disable_ism_side_input")) {
builder.put(View.AsMap.class, BatchViewAsMap.class);
builder.put(View.AsMultimap.class, BatchViewAsMultimap.class);
builder.put(View.AsSingleton.class, BatchViewAsSingleton.class);
builder.put(View.AsList.class, BatchViewAsList.class);
builder.put(View.AsIterable.class, BatchViewAsIterable.class);
}
if (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_bigquery_source")) {
builder.put(BigQueryIO.Read.Bound.class, BatchBigQueryIONativeRead.class);
}
if (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_bigquery_sink")) {
builder.put(BigQueryIO.Write.Bound.class, BatchBigQueryIOWrite.class);
}
}
overrides = builder.build();
}
/**
* Applies the given transform to the input. For transforms with customized definitions
* for the Dataflow pipeline runner, the application is intercepted and modified here.
*/
@Override
public <OutputT extends POutput, InputT extends PInput> OutputT apply(
PTransform<InputT, OutputT> transform, InputT input) {
if (Combine.GroupedValues.class.equals(transform.getClass())
|| GroupByKey.class.equals(transform.getClass())) {
// For both Dataflow runners (streaming and batch), GroupByKey and GroupedValues are
// primitives. Returning a primitive output instead of the expanded definition
// signals to the translator that translation is necessary.
@SuppressWarnings("unchecked")
PCollection<?> pc = (PCollection<?>) input;
@SuppressWarnings("unchecked")
OutputT outputT = (OutputT) PCollection.createPrimitiveOutputInternal(
pc.getPipeline(),
transform instanceof GroupByKey
? ((GroupByKey<?, ?>) transform).updateWindowingStrategy(pc.getWindowingStrategy())
: pc.getWindowingStrategy(),
pc.isBounded());
return outputT;
} else if (PubsubIO.Read.Bound.class.equals(transform.getClass())
&& options.isStreaming()
&& (options.getExperiments() == null
|| !options.getExperiments().contains("enable_custom_pubsub_source"))) {
// casting to wildcard
@SuppressWarnings("unchecked")
OutputT pubsub = (OutputT) applyPubsubStreamingRead((PubsubIO.Read.Bound<?>) transform,
input);
return pubsub;
} else if (Window.Bound.class.equals(transform.getClass())) {
/*
* TODO: make this the generic way overrides are applied (using super.apply() rather than
* Pipeline.applyTransform()); this allows the apply method to be replaced without inserting
* additional nodes into the graph.
*/
// casting to wildcard
@SuppressWarnings("unchecked")
OutputT windowed = (OutputT) applyWindow((Window.Bound<?>) transform, (PCollection<?>) input);
return windowed;
} else if (Flatten.FlattenPCollectionList.class.equals(transform.getClass())
&& ((PCollectionList<?>) input).size() == 0) {
return (OutputT) Pipeline.applyTransform(input, Create.of());
} else if (overrides.containsKey(transform.getClass())) {
// It is the responsibility of whoever constructs overrides to ensure this is type safe.
@SuppressWarnings("unchecked")
Class<PTransform<InputT, OutputT>> transformClass =
(Class<PTransform<InputT, OutputT>>) transform.getClass();
@SuppressWarnings("unchecked")
Class<PTransform<InputT, OutputT>> customTransformClass =
(Class<PTransform<InputT, OutputT>>) overrides.get(transform.getClass());
PTransform<InputT, OutputT> customTransform =
InstanceBuilder.ofType(customTransformClass)
.withArg(DataflowPipelineRunner.class, this)
.withArg(transformClass, transform)
.build();
return Pipeline.applyTransform(input, customTransform);
} else {
return super.apply(transform, input);
}
}
private <T> PCollection<T>
applyPubsubStreamingRead(PubsubIO.Read.Bound<?> initialTransform, PInput input) {
// types are matched at compile time
@SuppressWarnings("unchecked")
PubsubIO.Read.Bound<T> transform = (PubsubIO.Read.Bound<T>) initialTransform;
return PCollection.createPrimitiveOutputInternal(
input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED)
.setCoder(transform.getCoder());
}
private <T> PCollection<T> applyWindow(
Window.Bound<?> initialTransform, PCollection<?> initialInput) {
// types are matched at compile time
@SuppressWarnings("unchecked")
Window.Bound<T> transform = (Window.Bound<T>) initialTransform;
@SuppressWarnings("unchecked")
PCollection<T> input = (PCollection<T>) initialInput;
return super.apply(new AssignWindows<>(transform), input);
}
private String debuggerMessage(String projectId, String uniquifier) {
return String.format("To debug your job, visit Google Cloud Debugger at: "
+ "https://console.developers.google.com/debug?project=%s&dbgee=%s",
projectId, uniquifier);
}
private void maybeRegisterDebuggee(DataflowPipelineOptions options, String uniquifier) {
if (!options.getEnableCloudDebugger()) {
return;
}
if (options.getDebuggee() != null) {
throw new RuntimeException("Should not specify the debuggee");
}
Clouddebugger debuggerClient = Transport.newClouddebuggerClient(options).build();
Debuggee debuggee = registerDebuggee(debuggerClient, uniquifier);
options.setDebuggee(debuggee);
System.out.println(debuggerMessage(options.getProject(), debuggee.getUniquifier()));
}
private Debuggee registerDebuggee(Clouddebugger debuggerClient, String uniquifier) {
RegisterDebuggeeRequest registerReq = new RegisterDebuggeeRequest();
registerReq.setDebuggee(new Debuggee()
.setProject(options.getProject())
.setUniquifier(uniquifier)
.setDescription(uniquifier)
.setAgentVersion("google.com/cloud-dataflow-java/v1"));
try {
RegisterDebuggeeResponse registerResponse =
debuggerClient.controller().debuggees().register(registerReq).execute();
Debuggee debuggee = registerResponse.getDebuggee();
if (debuggee.getStatus() != null && debuggee.getStatus().getIsError()) {
throw new RuntimeException("Unable to register with the debugger: "
+ debuggee.getStatus().getDescription().getFormat());
}
return debuggee;
} catch (IOException e) {
throw new RuntimeException("Unable to register with the debugger: ", e);
}
}
@Override
public DataflowPipelineJob run(Pipeline pipeline) {
logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications "
+ "related to Google Compute Engine usage and other Google Cloud Services.");
List<DataflowPackage> packages = options.getStager().stageFiles();
// Set a unique client_request_id in the CreateJob request.
// This is used to ensure idempotence of job creation across retried
// attempts to create a job. Specifically, if the service returns a job with
// a different client_request_id, it means the returned one is a different
// job previously created with the same job name, and that the job creation
// has been effectively rejected. The SDK should return
// Error::Already_Exists to user in that case.
int randomNum = new Random().nextInt(9000) + 1000;
String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC)
.print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
// Try to create a debuggee ID. This must happen before the job is translated since it may
// update the options.
DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
maybeRegisterDebuggee(dataflowOptions, requestId);
JobSpecification jobSpecification =
translator.translate(pipeline, this, packages);
Job newJob = jobSpecification.getJob();
newJob.setClientRequestId(requestId);
String version = DataflowReleaseInfo.getReleaseInfo().getVersion();
System.out.println("Dataflow SDK version: " + version);
newJob.getEnvironment().setUserAgent(DataflowReleaseInfo.getReleaseInfo());
// The Dataflow Service may write to the temporary directory directly, so
// must be verified.
if (!Strings.isNullOrEmpty(options.getTempLocation())) {
newJob.getEnvironment().setTempStoragePrefix(
dataflowOptions.getPathValidator().verifyPath(options.getTempLocation()));
}
newJob.getEnvironment().setDataset(options.getTempDatasetId());
newJob.getEnvironment().setExperiments(options.getExperiments());
// Set the Docker container image that executes Dataflow worker harness, residing in Google
// Container Registry. Translator is guaranteed to create a worker pool prior to this point.
String workerHarnessContainerImage =
options.as(DataflowPipelineWorkerPoolOptions.class)
.getWorkerHarnessContainerImage();
for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
workerPool.setWorkerHarnessContainerImage(workerHarnessContainerImage);
}
// Requirements about the service.
Map<String, Object> environmentVersion = new HashMap<>();
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_MAJOR_KEY, ENVIRONMENT_MAJOR_VERSION);
newJob.getEnvironment().setVersion(environmentVersion);
// Default jobType is JAVA_BATCH_AUTOSCALING: a Java batch job whose workers
// may be autoscaled if so configured.
String jobType = "JAVA_BATCH_AUTOSCALING";
if (options.isStreaming()) {
jobType = "STREAMING";
}
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_JOB_TYPE_KEY, jobType);
if (hooks != null) {
hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
}
if (!Strings.isNullOrEmpty(options.getDataflowJobFile())) {
runJobFileHooks(newJob);
}
if (hooks != null && !hooks.shouldActuallyRunJob()) {
return null;
}
String jobIdToUpdate = null;
if (options.getUpdate()) {
jobIdToUpdate = getJobIdFromName(options.getJobName());
newJob.setTransformNameMapping(options.getTransformNameMapping());
newJob.setReplaceJobId(jobIdToUpdate);
}
Job jobResult;
try {
jobResult = dataflowClient
.projects()
.jobs()
.create(options.getProject(), newJob)
.execute();
} catch (GoogleJsonResponseException e) {
String errorMessages = "Unexpected errors";
if (e.getDetails() != null) {
if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
errorMessages = "The size of the serialized JSON representation of the pipeline "
+ "exceeds the allowable limit. "
+ "For more information, please check the FAQ link below:\n"
+ "https://cloud.google.com/dataflow/faq";
} else {
errorMessages = e.getDetails().getMessage();
}
}
throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
} catch (IOException e) {
throw new RuntimeException("Failed to create a workflow job", e);
}
// Obtain all of the extractors from the PTransforms used in the pipeline so the
// DataflowPipelineJob has access to them.
AggregatorPipelineExtractor aggregatorExtractor = new AggregatorPipelineExtractor(pipeline);
Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps =
aggregatorExtractor.getAggregatorSteps();
DataflowAggregatorTransforms aggregatorTransforms =
new DataflowAggregatorTransforms(aggregatorSteps, jobSpecification.getStepNames());
// Use a raw client for post-launch monitoring, as status calls may fail
// regularly and need not be retried automatically.
DataflowPipelineJob dataflowPipelineJob =
new DataflowPipelineJob(options.getProject(), jobResult.getId(),
Transport.newDataflowClient(options).build(), aggregatorTransforms);
// If the service returned a client request id, the SDK needs to compare it
// with the original id generated in the request. If they are not the same
// (i.e., the returned job was not created by this request), throw
// DataflowJobAlreadyExistsException or DataflowJobAlreadyUpdatedException
// depending on whether this is an update or a new submission.
if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty()
&& !jobResult.getClientRequestId().equals(requestId)) {
// If updating a job.
if (options.getUpdate()) {
throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob,
String.format("The job named %s with id: %s has already been updated into job id: %s "
+ "and cannot be updated again.",
newJob.getName(), jobIdToUpdate, jobResult.getId()));
} else {
throw new DataflowJobAlreadyExistsException(dataflowPipelineJob,
String.format("There is already an active job named %s with id: %s. If you want "
+ "to submit a second job, try again by setting a different name using --jobName.",
newJob.getName(), jobResult.getId()));
}
}
LOG.info("To access the Dataflow monitoring console, please navigate to {}",
MonitoringUtil.getJobMonitoringPageURL(options.getProject(), jobResult.getId()));
System.out.println("Submitted job: " + jobResult.getId());
LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}",
MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
return dataflowPipelineJob;
}
/**
* Returns the DataflowPipelineTranslator associated with this object.
*/
public DataflowPipelineTranslator getTranslator() {
return translator;
}
/**
* Sets callbacks to invoke during execution; see {@code DataflowPipelineRunnerHooks}.
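*
* <p>A minimal sketch of installing a hook (the body of the override is illustrative only;
* see also {@code shouldActuallyRunJob} and {@code failOnJobFileWriteFailure}):
*
* <pre>{@code
* DataflowPipelineRunner runner = DataflowPipelineRunner.fromOptions(options);
* runner.setHooks(new DataflowPipelineRunnerHooks() {
*   public void modifyEnvironmentBeforeSubmission(Environment environment) {
*     // Inspect or adjust the job's Environment before it is submitted.
*   }
* });
* }</pre>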
*/
@Experimental
public void setHooks(DataflowPipelineRunnerHooks hooks) {
this.hooks = hooks;
}
/////////////////////////////////////////////////////////////////////////////
/** Outputs a warning about PCollection views without deterministic key coders. */
private void logWarningIfPCollectionViewHasNonDeterministicKeyCoder(Pipeline pipeline) {
// We need to wait until this point to determine the names of the transforms, since only
// now do we know the full hierarchy of the transforms; otherwise we could
// have just recorded the full names at apply time.
if (!ptransformViewsWithNonDeterministicKeyCoders.isEmpty()) {
final SortedSet<String> ptransformViewNamesWithNonDeterministicKeyCoders = new TreeSet<>();
pipeline.traverseTopologically(new PipelineVisitor() {
@Override
public void visitValue(PValue value, TransformTreeNode producer) {
}
@Override
public void visitTransform(TransformTreeNode node) {
if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
}
}
@Override
public void enterCompositeTransform(TransformTreeNode node) {
if (ptransformViewsWithNonDeterministicKeyCoders.contains(node.getTransform())) {
ptransformViewNamesWithNonDeterministicKeyCoders.add(node.getFullName());
}
}
@Override
public void leaveCompositeTransform(TransformTreeNode node) {
}
});
LOG.warn("Unable to use indexed implementation for View.AsMap and View.AsMultimap for {} "
+ "because the key coder is not deterministic. Falling back to singleton implementation "
+ "which may cause memory and/or performance problems. Future major versions of "
+ "Dataflow will require deterministic key coders.",
ptransformViewNamesWithNonDeterministicKeyCoders);
}
}
private void runJobFileHooks(Job newJob) {
try {
WritableByteChannel writer =
IOChannelUtils.create(options.getDataflowJobFile(), MimeTypes.TEXT);
PrintWriter printWriter = new PrintWriter(Channels.newOutputStream(writer));
String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
printWriter.print(workSpecJson);
printWriter.flush();
printWriter.close();
LOG.info("Printed job specification to {}", options.getDataflowJobFile());
} catch (IllegalStateException ex) {
String error = "Cannot translate workflow spec to JSON.";
if (hooks != null && hooks.failOnJobFileWriteFailure()) {
throw new RuntimeException(error, ex);
} else {
LOG.warn(error, ex);
}
} catch (IOException ex) {
String error =
String.format("Cannot create output file at {}", options.getDataflowJobFile());
if (hooks != null && hooks.failOnJobFileWriteFailure()) {
throw new RuntimeException(error, ex);
} else {
LOG.warn(error, ex);
}
}
}
/**
* Returns true if the passed in {@link PCollection} needs to be materialized using
* an indexed format.
*/
boolean doesPCollectionRequireIndexedFormat(PCollection<?> pcol) {
return pcollectionsRequiringIndexedFormat.contains(pcol);
}
/**
* Marks the passed in {@link PCollection} as requiring materialization using
* an indexed format.
*/
private void addPCollectionRequiringIndexedFormat(PCollection<?> pcol) {
pcollectionsRequiringIndexedFormat.add(pcol);
}
/** A set of {@link View}s with non-deterministic key coders. */
Set<PTransform<?, ?>> ptransformViewsWithNonDeterministicKeyCoders;
/**
* Records that the {@link PTransform} requires a deterministic key coder.
*/
private void recordViewUsesNonDeterministicKeyCoder(PTransform<?, ?> ptransform) {
ptransformViewsWithNonDeterministicKeyCoders.add(ptransform);
}
/**
* A {@link GroupByKey} transform for the {@link DataflowPipelineRunner} which sorts
* values using the secondary key {@code K2}.
*
* <p>The {@link PCollection} created by this {@link PTransform} will have values in
* the empty window. Care must be taken *afterwards* to either re-window
* (using {@link Window#into}) or only use {@link PTransform}s that do not depend on the
* values being within a window.
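*
* <p>For illustration, a sketch of the transform's input/output shape (using the type
* parameters {@code K1}, {@code K2}, and {@code V} declared below):
*
* <pre>{@code
* PCollection<KV<K1, KV<K2, V>>>  ->  PCollection<KV<K1, Iterable<KV<K2, V>>>>
* }</pre>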
*/
static class GroupByKeyAndSortValuesOnly<K1, K2, V>
extends PTransform<PCollection<KV<K1, KV<K2, V>>>, PCollection<KV<K1, Iterable<KV<K2, V>>>>> {
private GroupByKeyAndSortValuesOnly() {
}
@Override
public PCollection<KV<K1, Iterable<KV<K2, V>>>> apply(PCollection<KV<K1, KV<K2, V>>> input) {
PCollection<KV<K1, Iterable<KV<K2, V>>>> rval =
PCollection.<KV<K1, Iterable<KV<K2, V>>>>createPrimitiveOutputInternal(
input.getPipeline(),
WindowingStrategy.globalDefault(),
IsBounded.BOUNDED);
@SuppressWarnings({"unchecked", "rawtypes"})
KvCoder<K1, KV<K2, V>> inputCoder = (KvCoder) input.getCoder();
rval.setCoder(
KvCoder.of(inputCoder.getKeyCoder(),
IterableCoder.of(inputCoder.getValueCoder())));
return rval;
}
}
/**
* A {@link PTransform} that groups the values by a hash of the window's byte representation
* and sorts the values using the window's byte representation.
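*
* <p>Shape sketch (with {@code T} the element type and {@code W} the window type declared
* below):
*
* <pre>{@code
* PCollection<T>  ->  PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>>
* }</pre>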
*/
private static class GroupByWindowHashAsKeyAndWindowAsSortKey<T, W extends BoundedWindow> extends
PTransform<PCollection<T>, PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>>> {
/**
* A {@link DoFn} that for each element outputs a {@code KV} structure suitable for
* grouping by the hash of the window's byte representation and sorting the grouped values
* using the window's byte representation.
*/
@SystemDoFnInternal
private static class UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W extends BoundedWindow>
extends DoFn<T, KV<Integer, KV<W, WindowedValue<T>>>> implements DoFn.RequiresWindowAccess {
private final IsmRecordCoder<?> ismCoderForHash;
private UseWindowHashAsKeyAndWindowAsSortKeyDoFn(IsmRecordCoder<?> ismCoderForHash) {
this.ismCoderForHash = ismCoderForHash;
}
@Override
public void processElement(ProcessContext c) throws Exception {
@SuppressWarnings("unchecked")
W window = (W) c.window();
c.output(
KV.of(ismCoderForHash.hash(ImmutableList.of(window)),
KV.of(window,
WindowedValue.of(
c.element(),
c.timestamp(),
c.window(),
c.pane()))));
}
}
private final IsmRecordCoder<?> ismCoderForHash;
private GroupByWindowHashAsKeyAndWindowAsSortKey(IsmRecordCoder<?> ismCoderForHash) {
this.ismCoderForHash = ismCoderForHash;
}
@Override
public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> apply(PCollection<T> input) {
@SuppressWarnings("unchecked")
Coder<W> windowCoder = (Coder<W>)
input.getWindowingStrategy().getWindowFn().windowCoder();
PCollection<KV<Integer, KV<W, WindowedValue<T>>>> rval =
input.apply(ParDo.of(
new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
rval.setCoder(
KvCoder.of(
VarIntCoder.of(),
KvCoder.of(windowCoder,
FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
return rval.apply(new GroupByKeyAndSortValuesOnly<Integer, W, WindowedValue<T>>());
}
}
/**
* Specialized implementation for
* {@link com.google.cloud.dataflow.sdk.transforms.View.AsSingleton View.AsSingleton} for the
* Dataflow runner in batch mode.
*
* <p>Creates a set of files in the {@link IsmFormat} sharded by the hash of the window's
* byte representation and with records having:
* <ul>
*   <li>Key 1: Window</li>
*   <li>Value: Windowed value</li>
* </ul>
*/
static class BatchViewAsSingleton<T>
extends PTransform<PCollection<T>, PCollectionView<T>> {
/**
* A {@link DoFn} that outputs {@link IsmRecord}s. These records are structured as follows:
* <ul>
*   <li>Key 1: Window</li>
*   <li>Value: Windowed value</li>
* </ul>
*/
static class IsmRecordForSingularValuePerWindowDoFn<T, W extends BoundedWindow>
extends DoFn<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>,
IsmRecord<WindowedValue<T>>> {
private final Coder<W> windowCoder;
IsmRecordForSingularValuePerWindowDoFn(Coder<W> windowCoder) {
this.windowCoder = windowCoder;
}
@Override
public void processElement(ProcessContext c) throws Exception {
Optional