
com.google.cloud.dataflow.sdk.io.Write


The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
import com.google.cloud.dataflow.sdk.io.Sink.WriteOperation;
import com.google.cloud.dataflow.sdk.io.Sink.Writer;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;

import org.joda.time.Instant;

import java.util.UUID;

/**
 * A {@link PTransform} that writes to a {@link Sink}. A write begins with a sequential global
 * initialization of a sink, followed by a parallel write, and ends with a sequential finalization
 * of the write. The output of a write is {@link PDone}. In the case of an empty PCollection, only
 * the global initialization and finalization will be performed.
 *
 * <p>Currently, only batch workflows can contain Write transforms.
 *
 * <p>Example usage:
 *
 * <p>{@code p.apply(Write.to(new MySink(...)));}
 */
@Experimental(Experimental.Kind.SOURCE_SINK)
public class Write {
  /**
   * Creates a Write transform that writes to the given Sink.
   */
  public static <T> Bound<T> to(Sink<T> sink) {
    return new Bound<>(sink);
  }

  /**
   * A {@link PTransform} that writes to a {@link Sink}. See {@link Write} and {@link Sink} for
   * documentation about writing to Sinks.
   */
  public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
    private final Sink<T> sink;

    private Bound(Sink<T> sink) {
      this.sink = sink;
    }

    @Override
    public PDone apply(PCollection<T> input) {
      PipelineOptions options = input.getPipeline().getOptions();
      sink.validate(options);
      return createWrite(input, sink.createWriteOperation(options));
    }

    /**
     * Returns the {@link Sink} associated with this PTransform.
     */
    public Sink<T> getSink() {
      return sink;
    }

    /**
     * A write is performed as a sequence of three {@link ParDo}s.
     *
     * <p>In the first, a do-once ParDo is applied to a singleton PCollection containing the
     * Sink's {@link WriteOperation}. In this initialization ParDo, {@link
     * WriteOperation#initialize} is called. The output of this ParDo is a singleton PCollection
     * containing the WriteOperation.
     *
     * <p>This singleton collection containing the WriteOperation is then used as a side input to
     * a ParDo over the PCollection of elements to write. In this bundle-writing phase,
     * {@link WriteOperation#createWriter} is called to obtain a {@link Writer}.
     * {@link Writer#open} and {@link Writer#close} are called in {@link DoFn#startBundle} and
     * {@link DoFn#finishBundle}, respectively, and the {@link Writer#write} method is called for
     * every element in the bundle. The output of this ParDo is a PCollection of writer result
     * objects (see {@link Sink} for a description of writer results), one for each bundle.
     *
     * <p>The final do-once ParDo uses the singleton collection of the WriteOperation as input
     * and the collection of writer results as a side input. In this ParDo,
     * {@link WriteOperation#finalize} is called to finalize the write.
     *
     * <p>If the write of any element in the PCollection fails, {@link Writer#close} will be
     * called before the exception that caused the write to fail is propagated, and the write
     * result will be discarded.
     *
     * <p>Since the {@link WriteOperation} is serialized after the initialization ParDo and
     * deserialized in the bundle-writing and finalization phases, any state change to the
     * WriteOperation object that occurs during initialization is visible in the latter phases.
     * However, the WriteOperation is not serialized after the bundle-writing phase. This is why
     * implementations should guarantee that {@link WriteOperation#createWriter} does not mutate
     * the WriteOperation.
     */
    private <WriteT> PDone createWrite(
        PCollection<T> input, WriteOperation<T, WriteT> writeOperation) {
      Pipeline p = input.getPipeline();

      // A coder to use for the WriteOperation.
      @SuppressWarnings("unchecked")
      Coder<WriteOperation<T, WriteT>> operationCoder =
          (Coder<WriteOperation<T, WriteT>>) SerializableCoder.of(writeOperation.getClass());

      // A singleton collection of the WriteOperation, to be used as input to a ParDo to
      // initialize the sink.
      PCollection<WriteOperation<T, WriteT>> operationCollection =
          p.apply(Create.<WriteOperation<T, WriteT>>of(writeOperation).withCoder(operationCoder));

      // Initialize the resource in a do-once ParDo on the WriteOperation.
      operationCollection = operationCollection
          .apply("Initialize", ParDo.of(
              new DoFn<WriteOperation<T, WriteT>, WriteOperation<T, WriteT>>() {
                @Override
                public void processElement(ProcessContext c) throws Exception {
                  WriteOperation<T, WriteT> writeOperation = c.element();
                  writeOperation.initialize(c.getPipelineOptions());
                  // The WriteOperation is also the output of this ParDo, so it can have mutable
                  // state.
                  c.output(writeOperation);
                }
              }))
          .setCoder(operationCoder);

      // Create a view of the WriteOperation to be used as a side input to the parallel write
      // phase.
      final PCollectionView<WriteOperation<T, WriteT>> writeOperationView =
          operationCollection.apply(View.<WriteOperation<T, WriteT>>asSingleton());

      // Perform the per-bundle writes as a ParDo on the input PCollection (with the
      // WriteOperation as a side input) and collect the results of the writes in a PCollection.
      // There is a dependency between this ParDo and the first (the WriteOperation PCollection
      // as a side input), so this will happen after the initial ParDo.
      PCollection<WriteT> results = input
          .apply("WriteBundles", ParDo.of(new DoFn<T, WriteT>() {
            // Writer that will write the records in this bundle. Lazily
            // initialized in processElement.
            private Writer<T, WriteT> writer = null;

            @Override
            public void processElement(ProcessContext c) throws Exception {
              // Lazily initialize the Writer.
              if (writer == null) {
                WriteOperation<T, WriteT> writeOperation = c.sideInput(writeOperationView);
                writer = writeOperation.createWriter(c.getPipelineOptions());
                writer.open(UUID.randomUUID().toString());
              }
              try {
                writer.write(c.element());
              } catch (Exception e) {
                // Discard the write result and close the write.
                try {
                  writer.close();
                } catch (Exception closeException) {
                  // Do not mask the exception that caused the write to fail.
                }
                throw e;
              }
            }

            @Override
            public void finishBundle(Context c) throws Exception {
              if (writer != null) {
                WriteT result = writer.close();
                // Output the result of the write.
                c.outputWithTimestamp(result, Instant.now());
              }
            }
          }).withSideInputs(writeOperationView))
          .setCoder(writeOperation.getWriterResultCoder())
          .apply(Window.<WriteT>into(new GlobalWindows()));

      final PCollectionView<Iterable<WriteT>> resultsView =
          results.apply(View.<WriteT>asIterable());

      // Finalize the write in another do-once ParDo on the singleton collection containing the
      // Writer. The results from the per-bundle writes are given as an Iterable side input.
      // The WriteOperation's state is the same as after its initialization in the first do-once
      // ParDo. There is a dependency between this ParDo and the parallel write (the writer
      // results collection as a side input), so it will happen after the parallel write.
      @SuppressWarnings("unused")
      final PCollection<Integer> done = operationCollection
          .apply("Finalize", ParDo.of(new DoFn<WriteOperation<T, WriteT>, Integer>() {
            @Override
            public void processElement(ProcessContext c) throws Exception {
              Iterable<WriteT> results = c.sideInput(resultsView);
              WriteOperation<T, WriteT> writeOperation = c.element();
              writeOperation.finalize(results, c.getPipelineOptions());
            }
          }).withSideInputs(resultsView));
      return PDone.in(input.getPipeline());
    }
  }
}
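
The Sink side of the three-phase lifecycle implemented above is easiest to see from a concrete example. Below is a minimal, hypothetical sketch of a Sink that merely counts the elements it is asked to write: each Writer returns its bundle's element count as its writer result, and finalize sums the per-bundle counts. The names LineCountSink, CountOperation, and CountWriter are illustrative, not part of the SDK; the sketch assumes only the Sink, WriteOperation, and Writer methods that the Write transform above actually calls (validate, createWriteOperation, initialize, createWriter, getWriterResultCoder, open, write, close, and finalize).

import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.VarLongCoder;
import com.google.cloud.dataflow.sdk.io.Sink;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;

/** A hypothetical Sink that counts the elements it is asked to write. */
public class LineCountSink extends Sink<String> {
  @Override
  public void validate(PipelineOptions options) {
    // Nothing to validate for this toy sink.
  }

  @Override
  public WriteOperation<String, Long> createWriteOperation(PipelineOptions options) {
    return new CountOperation(this);
  }

  private static class CountOperation extends WriteOperation<String, Long> {
    private final LineCountSink sink;

    CountOperation(LineCountSink sink) {
      this.sink = sink;
    }

    @Override
    public void initialize(PipelineOptions options) {
      // Runs once, sequentially, before any bundle is written (e.g., create an output location).
    }

    @Override
    public Writer<String, Long> createWriter(PipelineOptions options) {
      // Must not mutate this WriteOperation; see the class documentation above.
      return new CountWriter(this);
    }

    @Override
    public void finalize(Iterable<Long> writerResults, PipelineOptions options) {
      // Runs once, sequentially, after the parallel write; one result per bundle.
      long total = 0;
      for (long bundleCount : writerResults) {
        total += bundleCount;
      }
      System.out.println("Wrote " + total + " elements.");
    }

    @Override
    public LineCountSink getSink() {
      return sink;
    }

    @Override
    public Coder<Long> getWriterResultCoder() {
      return VarLongCoder.of();
    }
  }

  private static class CountWriter extends Writer<String, Long> {
    private final CountOperation operation;
    private long count;

    CountWriter(CountOperation operation) {
      this.operation = operation;
    }

    @Override
    public void open(String uId) {
      // uId uniquely identifies this bundle; a real Writer would use it to name its output.
      count = 0;
    }

    @Override
    public void write(String value) {
      // A real Writer would persist the value here.
      count++;
    }

    @Override
    public Long close() {
      // The returned value is this bundle's writer result, passed to finalize.
      return count;
    }

    @Override
    public CountOperation getWriteOperation() {
      return operation;
    }
  }
}

In a batch pipeline, such a sink would then be driven exactly as described in the class documentation, for example:

p.apply(TextIO.Read.from("gs://bucket/input-*"))
 .apply(Write.to(new LineCountSink()));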




