
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.runners;
import static com.google.cloud.dataflow.sdk.util.CoderUtils.encodeToByteArray;
import static com.google.cloud.dataflow.sdk.util.SerializableUtils.serializeToByteArray;
import static com.google.cloud.dataflow.sdk.util.StringUtils.byteArrayToJsonString;
import static com.google.cloud.dataflow.sdk.util.StringUtils.jsonStringToByteArray;
import static com.google.cloud.dataflow.sdk.util.Structs.addBoolean;
import static com.google.cloud.dataflow.sdk.util.Structs.addDictionary;
import static com.google.cloud.dataflow.sdk.util.Structs.addList;
import static com.google.cloud.dataflow.sdk.util.Structs.addLong;
import static com.google.cloud.dataflow.sdk.util.Structs.addObject;
import static com.google.cloud.dataflow.sdk.util.Structs.addString;
import static com.google.cloud.dataflow.sdk.util.Structs.getString;
import static com.google.common.base.Preconditions.checkArgument;
import com.google.api.client.util.Preconditions;
import com.google.api.services.dataflow.model.AutoscalingSettings;
import com.google.api.services.dataflow.model.DataflowPackage;
import com.google.api.services.dataflow.model.Disk;
import com.google.api.services.dataflow.model.Environment;
import com.google.api.services.dataflow.model.Job;
import com.google.api.services.dataflow.model.Step;
import com.google.api.services.dataflow.model.WorkerPool;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.coders.IterableCoder;
import com.google.cloud.dataflow.sdk.io.AvroIO;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.PubsubIO;
import com.google.cloud.dataflow.sdk.io.Read;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.StreamingOptions;
import com.google.cloud.dataflow.sdk.runners.dataflow.AvroIOTranslator;
import com.google.cloud.dataflow.sdk.runners.dataflow.BigQueryIOTranslator;
import com.google.cloud.dataflow.sdk.runners.dataflow.PubsubIOTranslator;
import com.google.cloud.dataflow.sdk.runners.dataflow.ReadTranslator;
import com.google.cloud.dataflow.sdk.runners.dataflow.TextIOTranslator;
import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Flatten;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.windowing.DefaultTrigger;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.util.AppliedCombineFn;
import com.google.cloud.dataflow.sdk.util.CloudObject;
import com.google.cloud.dataflow.sdk.util.DoFnInfo;
import com.google.cloud.dataflow.sdk.util.OutputReference;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.SerializableUtils;
import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionTuple;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.POutput;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TypedPValue;
import com.google.common.base.Strings;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
/**
* {@link DataflowPipelineTranslator} knows how to translate {@link Pipeline} objects
* into Cloud Dataflow Service API {@link Job}s.
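 *
 * <p>A typical invocation, sketched; the construction of {@code options},
 * {@code pipeline}, and {@code packages} is elided and illustrative only:
 * <pre>{@code
 * DataflowPipelineTranslator translator =
 *     DataflowPipelineTranslator.fromOptions(options);
 * JobSpecification spec = translator.translate(pipeline, packages);
 * Job job = spec.getJob();
 * }</pre>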
*/
@SuppressWarnings({"rawtypes", "unchecked"})
public class DataflowPipelineTranslator {
// Must be kept in sync with their internal counterparts.
private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineTranslator.class);
private static final ObjectMapper MAPPER = new ObjectMapper();
/**
* A map from {@link PTransform} subclass to the corresponding
* {@link TransformTranslator} to use to translate that transform.
*
   * <p>A static map that contains system-wide defaults.
   */
  private static Map<Class, TransformTranslator> transformTranslators =
      new HashMap<>();
/** Provided configuration options. */
private final DataflowPipelineOptions options;
/**
* Constructs a translator from the provided options.
*
* @param options Properties that configure the translator.
*
* @return The newly created translator.
*/
public static DataflowPipelineTranslator fromOptions(
DataflowPipelineOptions options) {
return new DataflowPipelineTranslator(options);
}
private DataflowPipelineTranslator(DataflowPipelineOptions options) {
this.options = options;
}
/**
* Translates a {@link Pipeline} into a {@code JobSpecification}.
*/
  public JobSpecification translate(Pipeline pipeline, List<DataflowPackage> packages) {
Translator translator = new Translator(pipeline);
Job result = translator.translate(packages);
return new JobSpecification(result, Collections.unmodifiableMap(translator.stepNames));
}
/**
* The result of a job translation.
*
   * <p>Used to pass the result {@link Job} and any state that was used to construct the job that
   * may be of use to other classes (e.g. the {@link PTransform} to StepName mapping).
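   *
   * <p>For example, to look up the step name assigned to a particular transform
   * (the {@code appliedTransform} variable here is hypothetical):
   * <pre>{@code
   * JobSpecification spec = translator.translate(pipeline, packages);
   * String stepName = spec.getStepNames().get(appliedTransform);
   * }</pre>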
*/
public static class JobSpecification {
private final Job job;
    private final Map<AppliedPTransform<?, ?, ?>, String> stepNames;
    public JobSpecification(Job job, Map<AppliedPTransform<?, ?, ?>, String> stepNames) {
this.job = job;
this.stepNames = stepNames;
}
public Job getJob() {
return job;
}
/**
* Returns the mapping of {@link AppliedPTransform AppliedPTransforms} to the internal step
* name for that {@code AppliedPTransform}.
*/
    public Map<AppliedPTransform<?, ?, ?>, String> getStepNames() {
return stepNames;
}
}
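  /**
   * Renders the given {@link Job} as a pretty-printed JSON string, primarily
   * for logging and debugging.
   */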
public static String jobToString(Job job) {
try {
return MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(job);
} catch (JsonProcessingException exc) {
throw new IllegalStateException("Failed to render Job as String.", exc);
}
}
/////////////////////////////////////////////////////////////////////////////
/**
* Records that instances of the specified PTransform class
* should be translated by default by the corresponding
* {@link TransformTranslator}.
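   *
   * <p>For example, where {@code MyTransform} and {@code MyTransformTranslator}
   * are hypothetical user-defined classes:
   * <pre>{@code
   * DataflowPipelineTranslator.registerTransformTranslator(
   *     MyTransform.class, new MyTransformTranslator());
   * }</pre>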
*/
  public static <TransformT extends PTransform> void registerTransformTranslator(
      Class<TransformT> transformClass,
      TransformTranslator<? extends TransformT> transformTranslator) {
if (transformTranslators.put(transformClass, transformTranslator) != null) {
throw new IllegalArgumentException(
"defining multiple translators for " + transformClass);
}
}
/**
* Returns the {@link TransformTranslator} to use for instances of the
* specified PTransform class, or null if none registered.
*/
  public <TransformT extends PTransform>
      TransformTranslator<TransformT> getTransformTranslator(Class<TransformT> transformClass) {
return transformTranslators.get(transformClass);
}
/**
* A {@link TransformTranslator} knows how to translate
* a particular subclass of {@link PTransform} for the
* Cloud Dataflow service. It does so by
* mutating the {@link TranslationContext}.
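   *
   * <p>A minimal sketch of an implementation, assuming a hypothetical
   * {@code MyTransform} whose output is a single {@code PCollection}; the
   * step kind string is illustrative. Note that {@link TranslationContext#addStep}
   * records the user name input automatically:
   * <pre>{@code
   * class MyTransformTranslator implements TransformTranslator<MyTransform> {
   *   public void translate(MyTransform transform, TranslationContext context) {
   *     context.addStep(transform, "MyStepKind");
   *     context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
   *   }
   * }
   * }</pre>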
*/
  public interface TransformTranslator<TransformT extends PTransform> {
public void translate(TransformT transform,
TranslationContext context);
}
/**
* The interface provided to registered callbacks for interacting
* with the {@link DataflowPipelineRunner}, including reading and writing the
* values of {@link PCollection}s and side inputs ({@link PCollectionView}s).
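   *
   * <p>A typical sequence of calls from within a translator, sketched with an
   * illustrative step kind and property name:
   * <pre>{@code
   * context.addStep(transform, "ParallelDo");
   * context.addInput("some_property", "some_value");
   * context.addOutput(PropertyNames.OUTPUT, context.getOutput(transform));
   * }</pre>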
*/
public interface TranslationContext {
/**
* Returns the configured pipeline options.
*/
DataflowPipelineOptions getPipelineOptions();
/**
     * Returns the input of the transform currently being translated.
*/
    <InputT extends PInput> InputT getInput(PTransform<InputT, ?> transform);
/**
     * Returns the output of the transform currently being translated.
*/
    <OutputT extends POutput> OutputT getOutput(PTransform<?, OutputT> transform);
/**
     * Returns the full name of the transform currently being translated.
*/
    String getFullName(PTransform<?, ?> transform);
/**
* Adds a step to the Dataflow workflow for the given transform, with
* the given Dataflow step type.
* This step becomes "current" for the purpose of {@link #addInput} and
* {@link #addOutput}.
*/
    public void addStep(PTransform<?, ?> transform, String type);
/**
* Adds a pre-defined step to the Dataflow workflow. The given PTransform should be
* consistent with the Step, in terms of input, output and coder types.
*
     * <p>This is a low-level operation; when using this method it is up to
* the caller to ensure that names do not collide.
*/
    public void addStep(PTransform<?, ? extends PValue> transform, Step step);
/**
* Sets the encoding for the current Dataflow step.
*/
    public void addEncodingInput(Coder<?> value);
/**
* Adds an input with the given name and value to the current
* Dataflow step.
*/
public void addInput(String name, Boolean value);
/**
* Adds an input with the given name and value to the current
* Dataflow step.
*/
public void addInput(String name, String value);
/**
* Adds an input with the given name and value to the current
* Dataflow step.
*/
public void addInput(String name, Long value);
/**
* Adds an input with the given name to the previously added Dataflow
* step, coming from the specified input PValue.
*/
public void addInput(String name, PInput value);
/**
* Adds an input that is a dictionary of strings to objects.
*/
    public void addInput(String name, Map<String, Object> elements);
/**
* Adds an input that is a list of objects.
*/
    public void addInput(String name, List<? extends Map<String, Object>> elements);
/**
* Adds an output with the given name to the previously added
* Dataflow step, producing the specified output {@code PValue},
* including its {@code Coder} if a {@code TypedPValue}. If the
* {@code PValue} is a {@code PCollection}, wraps its coder inside
* a {@code WindowedValueCoder}.
*/
public void addOutput(String name, PValue value);
/**
* Adds an output with the given name to the previously added
* Dataflow step, producing the specified output {@code PValue},
* including its {@code Coder} if a {@code TypedPValue}. If the
* {@code PValue} is a {@code PCollection}, wraps its coder inside
* a {@code ValueOnlyCoder}.
*/
public void addValueOnlyOutput(String name, PValue value);
/**
* Adds an output with the given name to the previously added
* CollectionToSingleton Dataflow step, consuming the specified
* input {@code PValue} and producing the specified output
* {@code PValue}. This step requires special treatment for its
* output encoding.
*/
public void addCollectionToSingletonOutput(String name,
PValue inputValue,
PValue outputValue);
/**
* Encode a PValue reference as an output reference.
*/
public OutputReference asOutputReference(PValue value);
}
/////////////////////////////////////////////////////////////////////////////
/**
* Translates a Pipeline into the Dataflow representation.
*/
class Translator implements PipelineVisitor, TranslationContext {
/** The Pipeline to translate. */
private final Pipeline pipeline;
/** The Cloud Dataflow Job representation. */
private final Job job = new Job();
/**
* Translator is stateful, as addProperty calls refer to the current step.
*/
private Step currentStep;
/**
* A Map from AppliedPTransform to their unique Dataflow step names.
*/
    private final Map<AppliedPTransform<?, ?, ?>, String> stepNames = new HashMap<>();
/**
* A Map from PValues to their output names used by their producer
* Dataflow steps.
*/
    private final Map<POutput, String> outputNames = new HashMap<>();
/**
* A Map from PValues to the Coders used for them.
*/
    private final Map<POutput, Coder<?>> outputCoders = new HashMap<>();
/**
* The transform currently being applied.
*/
    private AppliedPTransform<?, ?, ?> currentTransform;
/**
* Constructs a Translator that will translate the specified
* Pipeline into Dataflow objects.
*/
public Translator(Pipeline pipeline) {
this.pipeline = pipeline;
}
/**
     * Translates this Translator's pipeline into a Cloud Dataflow {@link Job}.
* @return a Job definition filled in with the type of job, the environment,
* and the job steps.
*/
    public Job translate(List<DataflowPackage> packages) {
job.setName(options.getJobName().toLowerCase());
Environment environment = new Environment();
job.setEnvironment(environment);
try {
environment.setSdkPipelineOptions(
MAPPER.readValue(MAPPER.writeValueAsBytes(options), Map.class));
} catch (IOException e) {
throw new IllegalArgumentException(
"PipelineOptions specified failed to serialize to JSON.", e);
}
WorkerPool workerPool = new WorkerPool();
if (options.getTeardownPolicy() != null) {
workerPool.setTeardownPolicy(options.getTeardownPolicy().getTeardownPolicyName());
}
if (options.isStreaming()) {
job.setType("JOB_TYPE_STREAMING");
} else {
job.setType("JOB_TYPE_BATCH");
workerPool.setDiskType(options.getWorkerDiskType());
}
if (options.getWorkerMachineType() != null) {
workerPool.setMachineType(options.getWorkerMachineType());
}
workerPool.setPackages(packages);
workerPool.setNumWorkers(options.getNumWorkers());
if (options.getDiskSourceImage() != null) {
workerPool.setDiskSourceImage(options.getDiskSourceImage());
}
if (options.isStreaming()) {
// Use separate data disk for streaming.
Disk disk = new Disk();
disk.setDiskType(options.getWorkerDiskType());
workerPool.setDataDisks(Collections.singletonList(disk));
}
if (!Strings.isNullOrEmpty(options.getZone())) {
workerPool.setZone(options.getZone());
}
if (!Strings.isNullOrEmpty(options.getNetwork())) {
workerPool.setNetwork(options.getNetwork());
}
if (options.getDiskSizeGb() > 0) {
workerPool.setDiskSizeGb(options.getDiskSizeGb());
}
if (options.getAutoscalingAlgorithm() != null) {
AutoscalingSettings settings = new AutoscalingSettings();
settings.setAlgorithm(options.getAutoscalingAlgorithm().getAlgorithm());
settings.setMaxNumWorkers(options.getMaxNumWorkers());
workerPool.setAutoscalingSettings(settings);
}
      List<WorkerPool> workerPools = new LinkedList<>();
workerPools.add(workerPool);
environment.setWorkerPools(workerPools);
pipeline.traverseTopologically(this);
return job;
}
@Override
public DataflowPipelineOptions getPipelineOptions() {
return options;
}
@Override
    public <InputT extends PInput> InputT getInput(PTransform<InputT, ?> transform) {
return (InputT) getCurrentTransform(transform).getInput();
}
@Override
    public <OutputT extends POutput> OutputT getOutput(PTransform<?, OutputT> transform) {
return (OutputT) getCurrentTransform(transform).getOutput();
}
@Override
    public String getFullName(PTransform<?, ?> transform) {
return getCurrentTransform(transform).getFullName();
}
    private AppliedPTransform<?, ?, ?> getCurrentTransform(PTransform<?, ?> transform) {
checkArgument(
currentTransform != null && currentTransform.getTransform() == transform,
"can only be called with current transform");
return currentTransform;
}
@Override
public void enterCompositeTransform(TransformTreeNode node) {
}
@Override
public void leaveCompositeTransform(TransformTreeNode node) {
}
@Override
public void visitTransform(TransformTreeNode node) {
      PTransform<?, ?> transform = node.getTransform();
      TransformTranslator translator =
          getTransformTranslator(transform.getClass());
if (translator == null) {
throw new IllegalStateException(
"no translator registered for " + transform);
}
LOG.debug("Translating {}", transform);
currentTransform = AppliedPTransform.of(
node.getFullName(), node.getInput(), node.getOutput(), (PTransform) transform);
translator.translate(transform, this);
currentTransform = null;
}
@Override
public void visitValue(PValue value, TransformTreeNode producer) {
LOG.debug("Checking translation of {}", value);
if (value.getProducingTransformInternal() == null) {
throw new RuntimeException(
"internal error: expecting a PValue "
+ "to have a producingTransform");
}
if (!producer.isCompositeNode()) {
// Primitive transforms are the only ones assigned step names.
asOutputReference(value);
}
}
@Override
    public void addStep(PTransform<?, ?> transform, String type) {
String stepName = genStepName();
if (stepNames.put(getCurrentTransform(transform), stepName) != null) {
throw new IllegalArgumentException(
transform + " already has a name specified");
}
// Start the next "steps" list item.
      List<Step> steps = job.getSteps();
if (steps == null) {
steps = new LinkedList<>();
job.setSteps(steps);
}
currentStep = new Step();
currentStep.setName(stepName);
currentStep.setKind(type);
steps.add(currentStep);
addInput(PropertyNames.USER_NAME, getFullName(transform));
}
@Override
    public void addStep(PTransform<?, ? extends PValue> transform, Step original) {
Step step = original.clone();
String stepName = step.getName();
if (stepNames.put(getCurrentTransform(transform), stepName) != null) {
throw new IllegalArgumentException(transform + " already has a name specified");
}
      Map<String, Object> properties = step.getProperties();
if (properties != null) {
        @Nullable List<Map<String, Object>> outputInfoList = null;