
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.runners;
import static com.google.cloud.dataflow.sdk.util.StringUtils.approximatePTransformName;
import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;
import static java.nio.charset.StandardCharsets.UTF_8;
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.client.util.Joiner;
import com.google.api.services.dataflow.Dataflow;
import com.google.api.services.dataflow.model.DataflowPackage;
import com.google.api.services.dataflow.model.Job;
import com.google.api.services.dataflow.model.ListJobsResponse;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.PipelineResult.State;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.io.AvroIO;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.PubsubIO;
import com.google.cloud.dataflow.sdk.io.Read;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.io.UnboundedSource;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsValidator;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator.JobSpecification;
import com.google.cloud.dataflow.sdk.runners.dataflow.BasicSerializableSourceFormat;
import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowAggregatorTransforms;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.WithKeys;
import com.google.cloud.dataflow.sdk.transforms.Write;
import com.google.cloud.dataflow.sdk.transforms.windowing.AfterPane;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.util.CoderUtils;
import com.google.cloud.dataflow.sdk.util.DataflowReleaseInfo;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.InstanceBuilder;
import com.google.cloud.dataflow.sdk.util.MonitoringUtil;
import com.google.cloud.dataflow.sdk.util.PCollectionViews;
import com.google.cloud.dataflow.sdk.util.PathValidator;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.Reshuffle;
import com.google.cloud.dataflow.sdk.util.StreamingPCollectionViewWriterFn;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.ValueWithRecordId;
import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.POutput;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import org.joda.time.DateTimeUtils;
import org.joda.time.DateTimeZone;
import org.joda.time.Duration;
import org.joda.time.format.DateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
/**
* A {@link PipelineRunner} that executes the operations in the
* pipeline by first translating them to the Dataflow representation
* using the {@link DataflowPipelineTranslator} and then submitting
* them to a Dataflow service for execution.
*
* <h3>Permissions</h3>
*
* <p>When reading from a Dataflow source or writing to a Dataflow sink using
* {@code DataflowPipelineRunner}, the Google Cloud services account and the Google Compute
* Engine service account of the GCP project running the Dataflow job will need access to the
* corresponding source/sink.
*
* <p>Please see the Google Cloud Dataflow Security and Permissions documentation
* for more details.
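*
* <p>A minimal usage sketch follows; the project ID, staging bucket, and input/output paths are
* illustrative placeholders, not values required or defined by this class:
*
* <pre>{@code
* DataflowPipelineOptions options = PipelineOptionsFactory.fromArgs(args)
*     .withValidation().as(DataflowPipelineOptions.class);
* options.setRunner(DataflowPipelineRunner.class);
* options.setProject("my-project-id");                     // hypothetical project ID
* options.setStagingLocation("gs://my-bucket/staging");    // hypothetical GCS staging location
*
* Pipeline p = Pipeline.create(options);
* p.apply(TextIO.Read.from("gs://my-bucket/input-*.txt"))  // hypothetical input files
*     .apply(TextIO.Write.to("gs://my-bucket/output"));    // hypothetical output prefix
* p.run();
* }</pre>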
*/
public class DataflowPipelineRunner extends PipelineRunner<DataflowPipelineJob> {
private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineRunner.class);
/** Provided configuration options. */
private final DataflowPipelineOptions options;
/** Client for the Dataflow service. This is used to actually submit jobs. */
private final Dataflow dataflowClient;
/** Translator for this DataflowPipelineRunner, based on options. */
private final DataflowPipelineTranslator translator;
/** Custom transform implementations for running in streaming mode. */
private final Map<Class<?>, Class<? extends PTransform<?, ?>>> streamingOverrides;
/** A set of user defined functions to invoke at different points in execution. */
private DataflowPipelineRunnerHooks hooks;
// Environment version information
private static final String ENVIRONMENT_MAJOR_VERSION = "3";
// The limit of CreateJob request size.
private static final int CREATE_JOB_REQUEST_LIMIT_BYTES = 10 * 1024 * 1024;
/**
* Project IDs must contain lowercase letters, digits, or dashes.
* IDs must start with a letter and may not end with a dash.
* This regex isn't exact - this allows for patterns that would be rejected by
* the service, but this is sufficient for basic validation of project IDs.
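*
* <p>Illustrative examples: {@code "my-project"} and {@code "example.com:my-project"} match
* this pattern, while {@code "123456789"} (a project number rather than an ID) and
* {@code "-bad-start"} do not.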
*/
public static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]+[a-z0-9]";
/**
* Construct a runner from the provided options.
*
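* <p>A minimal invocation sketch ({@code options} is assumed to already carry the required
* Dataflow settings, such as the project and a staging or temp location):
*
* <pre>{@code
* DataflowPipelineRunner runner = DataflowPipelineRunner.fromOptions(options);
* }</pre>
*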
* @param options Properties that configure the runner.
* @return The newly created runner.
*/
public static DataflowPipelineRunner fromOptions(PipelineOptions options) {
// (Re-)register standard IO factories. Clobbers any prior credentials.
IOChannelUtils.registerStandardIOFactories(options);
DataflowPipelineOptions dataflowOptions =
PipelineOptionsValidator.validate(DataflowPipelineOptions.class, options);
ArrayList<String> missing = new ArrayList<>();
if (dataflowOptions.getAppName() == null) {
missing.add("appName");
}
if (missing.size() > 0) {
throw new IllegalArgumentException(
"Missing required values: " + Joiner.on(',').join(missing));
}
PathValidator validator = dataflowOptions.getPathValidator();
if (dataflowOptions.getStagingLocation() != null) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getStagingLocation());
}
if (dataflowOptions.getTempLocation() != null) {
validator.validateOutputFilePrefixSupported(dataflowOptions.getTempLocation());
}
if (Strings.isNullOrEmpty(dataflowOptions.getTempLocation())) {
dataflowOptions.setTempLocation(dataflowOptions.getStagingLocation());
} else if (Strings.isNullOrEmpty(dataflowOptions.getStagingLocation())) {
try {
dataflowOptions.setStagingLocation(
IOChannelUtils.resolve(dataflowOptions.getTempLocation(), "staging"));
} catch (IOException e) {
throw new IllegalArgumentException("Unable to resolve PipelineOptions.stagingLocation "
+ "from PipelineOptions.tempLocation. Please set the staging location explicitly.", e);
}
}
if (dataflowOptions.getFilesToStage() == null) {
dataflowOptions.setFilesToStage(detectClassPathResourcesToStage(
DataflowPipelineRunner.class.getClassLoader()));
LOG.info("PipelineOptions.filesToStage was not specified. "
+ "Defaulting to files from the classpath: will stage {} files. "
+ "Enable logging at DEBUG level to see which files will be staged.",
dataflowOptions.getFilesToStage().size());
LOG.debug("Classpath elements: {}", dataflowOptions.getFilesToStage());
}
// Verify jobName according to service requirements.
String jobName = dataflowOptions.getJobName().toLowerCase();
Preconditions.checkArgument(
jobName.matches("[a-z]([-a-z0-9]*[a-z0-9])?"),
"JobName invalid; the name must consist of only the characters "
+ "[-a-z0-9], starting with a letter and ending with a letter "
+ "or number");
// Verify project
String project = dataflowOptions.getProject();
if (project.matches("[0-9]*")) {
throw new IllegalArgumentException("Project ID '" + project
+ "' invalid. Please make sure you specified the Project ID, not project number.");
} else if (!project.matches(PROJECT_ID_REGEXP)) {
throw new IllegalArgumentException("Project ID '" + project
+ "' invalid. Please make sure you specified the Project ID, not project description.");
}
return new DataflowPipelineRunner(dataflowOptions);
}
@VisibleForTesting protected DataflowPipelineRunner(DataflowPipelineOptions options) {
this.options = options;
this.dataflowClient = options.getDataflowClient();
this.translator = DataflowPipelineTranslator.fromOptions(options);
this.streamingOverrides = ImmutableMap.<Class<?>, Class<? extends PTransform<?, ?>>>builder()
.put(Create.Values.class, StreamingCreate.class)
.put(View.AsMap.class, StreamingViewAsMap.class)
.put(View.AsMultimap.class, StreamingViewAsMultimap.class)
.put(View.AsSingleton.class, StreamingViewAsSingleton.class)
.put(View.AsIterable.class, StreamingViewAsIterable.class)
.put(Write.Bound.class, StreamingWrite.class)
.put(PubsubIO.Write.Bound.class, StreamingPubsubIOWrite.class)
.put(Read.Unbounded.class, StreamingUnboundedRead.class)
.put(Read.Bounded.class, StreamingUnsupportedIO.class)
.put(AvroIO.Read.Bound.class, StreamingUnsupportedIO.class)
.put(AvroIO.Write.Bound.class, StreamingUnsupportedIO.class)
.put(BigQueryIO.Read.Bound.class, StreamingUnsupportedIO.class)
.put(TextIO.Read.Bound.class, StreamingUnsupportedIO.class)
.put(TextIO.Write.Bound.class, StreamingUnsupportedIO.class)
.build();
}
/**
* Applies the given transform to the input. For transforms with customized definitions
* for the Dataflow pipeline runner, the application is intercepted and modified here.
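*
* <p>Illustrative example (element values are hypothetical): when {@code options.isStreaming()}
* is true, a {@code Create.Values} application such as
*
* <pre>{@code
* PCollection<String> names = p.apply(Create.of("alice", "bob"));
* }</pre>
*
* is intercepted here and executed through the registered {@code StreamingCreate} override
* rather than the default expansion of {@code Create.Values}.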
*/
@Override
public <OutputT extends POutput, InputT extends PInput> OutputT apply(
PTransform<InputT, OutputT> transform, InputT input) {
if (Combine.GroupedValues.class.equals(transform.getClass())
|| GroupByKey.class.equals(transform.getClass())) {
// For both Dataflow runners (streaming and batch), GroupByKey and GroupedValues are
// primitives. Returning a primitive output instead of the expanded definition
// signals to the translator that translation is necessary.
@SuppressWarnings("unchecked")
PCollection<KV<?, ?>> pc = (PCollection<KV<?, ?>>) input;
@SuppressWarnings("unchecked")
OutputT outputT = (OutputT) PCollection.createPrimitiveOutputInternal(
pc.getPipeline(),
transform instanceof GroupByKey
? ((GroupByKey<?, ?>) transform).updateWindowingStrategy(pc.getWindowingStrategy())
: pc.getWindowingStrategy(),
pc.isBounded());
return outputT;
} else if (options.isStreaming() && streamingOverrides.containsKey(transform.getClass())) {
// It is the responsibility of whoever constructs streamingOverrides
// to ensure this is type safe.
@SuppressWarnings("unchecked")
Class<PTransform<InputT, OutputT>> transformClass =
(Class<PTransform<InputT, OutputT>>) transform.getClass();
@SuppressWarnings("unchecked")
Class<PTransform<InputT, OutputT>> customTransformClass =
(Class<PTransform<InputT, OutputT>>) streamingOverrides.get(transform.getClass());
PTransform<InputT, OutputT> customTransform =
InstanceBuilder.ofType(customTransformClass)
.withArg(transformClass, transform)
.build();
return Pipeline.applyTransform(input, customTransform);
} else {
return super.apply(transform, input);
}
}
@Override
public DataflowPipelineJob run(Pipeline pipeline) {
LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications "
+ "related to Google Compute Engine usage and other Google Cloud Services.");
List<DataflowPackage> packages = options.getStager().stageFiles();
JobSpecification jobSpecification = translator.translate(pipeline, packages);
Job newJob = jobSpecification.getJob();
// Set a unique client_request_id in the CreateJob request.
// This is used to ensure idempotence of job creation across retried
// attempts to create a job. Specifically, if the service returns a job with
// a different client_request_id, it means the returned one is a different
// job previously created with the same job name, and that the job creation
// has been effectively rejected. The SDK should return
// Error::Already_Exists to user in that case.
int randomNum = new Random().nextInt(9000) + 1000;
String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC)
.print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
newJob.setClientRequestId(requestId);
String version = DataflowReleaseInfo.getReleaseInfo().getVersion();
System.out.println("Dataflow SDK version: " + version);
newJob.getEnvironment().setUserAgent(DataflowReleaseInfo.getReleaseInfo());
// The Dataflow Service may write to the temporary directory directly, so
// it must be verified.
DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
if (!Strings.isNullOrEmpty(options.getTempLocation())) {
newJob.getEnvironment().setTempStoragePrefix(
dataflowOptions.getPathValidator().verifyPath(options.getTempLocation()));
}
newJob.getEnvironment().setDataset(options.getTempDatasetId());
newJob.getEnvironment().setExperiments(options.getExperiments());
// Requirements about the service.
Map<String, Object> environmentVersion = new HashMap<>();
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_MAJOR_KEY, ENVIRONMENT_MAJOR_VERSION);
newJob.getEnvironment().setVersion(environmentVersion);
// Default jobType is DATA_PARALLEL, which is for java batch.
String jobType = "DATA_PARALLEL";
if (options.isStreaming()) {
jobType = "STREAMING";
}
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_JOB_TYPE_KEY, jobType);
if (hooks != null) {
hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
}
if (!Strings.isNullOrEmpty(options.getDataflowJobFile())) {
try (PrintWriter printWriter = new PrintWriter(
new File(options.getDataflowJobFile()))) {
String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
printWriter.print(workSpecJson);
LOG.info("Printed workflow specification to {}", options.getDataflowJobFile());
} catch (IllegalStateException ex) {
LOG.warn("Cannot translate workflow spec to json for debug.");
} catch (FileNotFoundException ex) {
LOG.warn("Cannot create workflow spec output file.");
}
}
String jobIdToUpdate = null;
if (options.getUpdate()) {
jobIdToUpdate = getJobIdFromName(options.getJobName());
newJob.setTransformNameMapping(options.getTransformNameMapping());
newJob.setReplaceJobId(jobIdToUpdate);
}
Job jobResult;
try {
jobResult = dataflowClient
.projects()
.jobs()
.create(options.getProject(), newJob)
.execute();
} catch (GoogleJsonResponseException e) {
String errorMessages = "Unexpected errors";
if (e.getDetails() != null) {
if (newJob.toString().getBytes(UTF_8).length >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
errorMessages = "The size of the serialized JSON representation of the pipeline "
+ "exceeds the allowable limit. "
+ "For more information, please check the FAQ link below:\n"
+ "https://cloud.google.com/dataflow/faq";
} else {
errorMessages = e.getDetails().getMessage();
}
}
throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
} catch (IOException e) {
throw new RuntimeException("Failed to create a workflow job", e);
}
// Obtain all of the extractors from the PTransforms used in the pipeline so the
// DataflowPipelineJob has access to them.
AggregatorPipelineExtractor aggregatorExtractor = new AggregatorPipelineExtractor(pipeline);
Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps =
aggregatorExtractor.getAggregatorSteps();
DataflowAggregatorTransforms aggregatorTransforms =
new DataflowAggregatorTransforms(aggregatorSteps, jobSpecification.getStepNames());
// Use a raw client for post-launch monitoring, as status calls may fail
// regularly and need not be retried automatically.
DataflowPipelineJob dataflowPipelineJob =
new DataflowPipelineJob(options.getProject(), jobResult.getId(),
Transport.newRawDataflowClient(options).build(), aggregatorTransforms);
// If the service returned client request id, the SDK needs to compare it
// with the original id generated in the request, if they are not the same
// (i.e., the returned job is not created by this request), throw
// DataflowJobAlreadyExistsException or DataflowJobAlreadyUpdatedException
// depending on whether this is a reload or not.
if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty()
&& !jobResult.getClientRequestId().equals(requestId)) {
// If updating a job.
if (options.getUpdate()) {
throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob,
String.format("The job named %s with id: %s has already been updated into job id: %s "
+ "and cannot be updated again.",
newJob.getName(), jobIdToUpdate, jobResult.getId()));
} else {
throw new DataflowJobAlreadyExistsException(dataflowPipelineJob,
String.format("There is already an active job named %s with id: %s. If you want "
+ "to submit a second job, try again by setting a different name using --jobName.",
newJob.getName(), jobResult.getId()));
}
}
LOG.info("To access the Dataflow monitoring console, please navigate to {}",
MonitoringUtil.getJobMonitoringPageURL(options.getProject(), jobResult.getId()));
System.out.println("Submitted job: " + jobResult.getId());
LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}",
MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
return dataflowPipelineJob;
}
/**
* Returns the DataflowPipelineTranslator associated with this object.
*/
public DataflowPipelineTranslator getTranslator() {
return translator;
}
/**
* Sets callbacks to invoke during execution; see {@code DataflowPipelineRunnerHooks}.
*/
@Experimental
public void setHooks(DataflowPipelineRunnerHooks hooks) {
this.hooks = hooks;
}
/////////////////////////////////////////////////////////////////////////////
/**
* Specialized (non-)implementation for {@link Write.Bound} for the Dataflow runner in streaming
* mode.
*/
private static class StreamingWrite<T> extends PTransform<PCollection<T>, PDone> {
private static final long serialVersionUID = 0L;
/**
* Builds an instance of this class from the overridden transform.
*/
public StreamingWrite(Write.Bound<T> transform) { }
@Override
public PDone apply(PCollection<T> input) {
throw new UnsupportedOperationException(
"The Write transform is not supported by the Dataflow streaming runner.");
}
@Override
protected String getKindString() {
return "StreamingWrite";
}
}
/**
* Specialized implementation for {@link PubsubIO.Write} for the Dataflow runner in streaming
* mode.
*
* <p>For internal use only. Subject to change at any time.
*
* <p>Public so the {@link com.google.cloud.dataflow.sdk.runners.dataflow.PubsubIOTranslator}
* can access.
*/
public static class StreamingPubsubIOWrite<T> extends PTransform<PCollection<T>, PDone> {
private static final long serialVersionUID = 0L;
private final PubsubIO.Write.Bound<T> transform;
/**
* Builds an instance of this class from the overridden transform.
*/
public StreamingPubsubIOWrite(PubsubIO.Write.Bound<T> transform) {
this.transform = transform;
}
public PubsubIO.Write.Bound<T> getOverriddenTransform() {
return transform;
}
@Override
public PDone apply(PCollection<T> input) {
return PDone.in(input.getPipeline());
}
@Override
protected String getKindString() {
return "StreamingPubsubIOWrite";
}
}
/**
* Specialized implementation for {@link Read.Unbounded} for the Dataflow runner in streaming
* mode.
*
* In particular, if an UnboundedSource requires deduplication, then features of WindmillSink
* are leveraged to do the deduplication.
*/
private static class StreamingUnboundedRead<T> extends PTransform<PInput, PCollection<T>> {
private static final long serialVersionUID = 0L;
private final UnboundedSource<T, ?> source;
/**
* Builds an instance of this class from the overridden transform.
*/
@SuppressWarnings("unused") // used via reflection in DataflowPipelineRunner#apply()
public StreamingUnboundedRead(Read.Unbounded<T> transform) {
this.source = transform.getSource();
}
@Override
protected Coder<T> getDefaultOutputCoder() {
return source.getDefaultOutputCoder();
}
@Override
public final PCollection<T> apply(PInput input) {
source.validate();
if (source.requiresDeduping()) {
return Pipeline.applyTransform(input, new ReadWithIds<T>(source))
.apply(new Deduplicate<T>());
} else {
return Pipeline.applyTransform(input, new ReadWithIds<T>(source))
.apply(ValueWithRecordId.<T>stripIds());
}
}
/**
* {@link PTransform} that reads {@code (record,recordId)} pairs from an
* {@link UnboundedSource}.
*/
private static class ReadWithIds<T>
extends PTransform<PInput, PCollection<ValueWithRecordId<T>>> {
private static final long serialVersionUID = 0L;
private final UnboundedSource<T, ?> source;
private ReadWithIds(UnboundedSource<T, ?> source) {
this.source = source;
}
@Override
public final PCollection<ValueWithRecordId<T>> apply(PInput input) {
return PCollection.<ValueWithRecordId<T>>createPrimitiveOutputInternal(
input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED);
}
@Override
protected Coder<ValueWithRecordId<T>> getDefaultOutputCoder() {
return ValueWithRecordId.ValueWithRecordIdCoder.of(source.getDefaultOutputCoder());
}
public UnboundedSource<T, ?> getSource() {
return source;
}
}
@Override
public String getKindString() {
return "Read(" + approximateSimpleName(source.getClass()) + ")";
}
static {
DataflowPipelineTranslator.registerTransformTranslator(
ReadWithIds.class, new ReadWithIdsTranslator());
}
private static class ReadWithIdsTranslator
implements DataflowPipelineTranslator.TransformTranslator<ReadWithIds<?>> {
@Override
public void translate(ReadWithIds<?> transform,
DataflowPipelineTranslator.TranslationContext context) {
BasicSerializableSourceFormat.translateReadHelper(
transform.getSource(), transform, context);
}
}
}
/**
* Remove values with duplicate ids.
*/
private static class Deduplicate<T>
extends PTransform<PCollection<ValueWithRecordId<T>>, PCollection<T>> {
private static final long serialVersionUID = 0L;
// Use a finite set of keys to improve bundling. Without this, the key space
// will be the space of ids which is potentially very large, which results in much
// more per-key overhead.
private static final int NUM_RESHARD_KEYS = 10000;
@Override
public PCollection<T> apply(PCollection<ValueWithRecordId<T>> input) {
return input
.apply(WithKeys.of(new SerializableFunction<ValueWithRecordId<T>, Integer>() {
private static final long serialVersionUID = 0L;
@Override
public Integer apply(ValueWithRecordId<T> value) {
return Arrays.hashCode(value.getId()) % NUM_RESHARD_KEYS;
}
}))
// Reshuffle will dedup based on ids in ValueWithRecordId by passing the data through
// WindmillSink.
.apply(Reshuffle.<Integer, ValueWithRecordId<T>>of())
.apply(ParDo.named("StripIds").of(
new DoFn<KV<Integer, ValueWithRecordId<T>>, T>() {
private static final long serialVersionUID = 0L;
@Override
public void processElement(ProcessContext c) {
c.output(c.element().getValue().getValue());
}
}));
}
}
/**
* Specialized implementation for {@link Create.Values} for the Dataflow runner in streaming mode.
*/
private static class StreamingCreate<T> extends PTransform<PInput, PCollection<T>> {
private static final long serialVersionUID = 0L;
private final Create.Values<T> transform;
/**
* Builds an instance of this class from the overridden transform.
*/
@SuppressWarnings("unused") // used via reflection in DataflowPipelineRunner#apply()
public StreamingCreate(Create.Values<T> transform) {
this.transform = transform;
}
/**
* {@link DoFn} that outputs a single {@code KV.of(null, null)} to kick off the {@link GroupByKey}
* in the streaming create implementation.
*/
private static class OutputNullKv extends DoFn<String, KV<Void, Void>> {
private static final long serialVersionUID = 0;
@Override
public void processElement(DoFn<String, KV<Void, Void>>.ProcessContext c) throws Exception {
c.output(KV.of((Void) null, (Void) null));
}
}
/**
* A {@link DoFn} which outputs the specified elements by first encoding them to bytes using
* the specified {@link Coder} so that they are serialized as part of the {@link DoFn} but
* need not implement {@code Serializable}.
*/
private static class OutputElements extends DoFn