/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.SerializableCoder;
import com.google.cloud.dataflow.sdk.io.Sink.WriteOperation;
import com.google.cloud.dataflow.sdk.io.Sink.Writer;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.View;
import com.google.cloud.dataflow.sdk.transforms.windowing.GlobalWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.PDone;
import org.joda.time.Instant;
import java.util.UUID;
/**
* A {@link PTransform} that writes to a {@link Sink}. A write begins with a sequential global
* initialization of a sink, followed by a parallel write, and ends with a sequential finalization
* of the write. The output of a write is {@link PDone}. In the case of an empty PCollection, only
* the global initialization and finalization will be performed.
*
* <p>Currently, only batch workflows can contain Write transforms.
*
* <p>Example usage:
*
* <p>{@code p.apply(Write.to(new MySink(...)));}
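*
* <p>For illustration, a minimal sketch of a user-defined sink follows. The names
* {@code MySink} and {@code MyWriteOperation} are hypothetical and not part of the SDK; the
* sketch shows only the two methods this transform invokes on the sink, {@code validate} and
* {@code createWriteOperation}, and omits the actual I/O:
*
* <pre>{@code
* class MySink extends Sink<String> {
*   public void validate(PipelineOptions options) {
*     // Verify that the destination is reachable before the pipeline runs.
*   }
*
*   public WriteOperation<String, ?> createWriteOperation(PipelineOptions options) {
*     // MyWriteOperation implements initialize(), createWriter(), and finalize().
*     return new MyWriteOperation(this);
*   }
* }
* }</pre>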
*/
@Experimental(Experimental.Kind.SOURCE_SINK)
public class Write {
/**
* Creates a Write transform that writes to the given Sink.
*/
public static <T> Bound<T> to(Sink<T> sink) {
return new Bound<>(sink);
}
/**
* A {@link PTransform} that writes to a {@link Sink}. See {@link Write} and {@link Sink} for
* documentation about writing to Sinks.
*/
public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
private final Sink<T> sink;
private Bound(Sink<T> sink) {
this.sink = sink;
}
@Override
public PDone apply(PCollection<T> input) {
PipelineOptions options = input.getPipeline().getOptions();
sink.validate(options);
return createWrite(input, sink.createWriteOperation(options));
}
/**
* Returns the {@link Sink} associated with this PTransform.
*/
public Sink<T> getSink() {
return sink;
}
/**
* A write is performed as a sequence of three {@link ParDo}s.
*
* <p>In the first, a do-once ParDo is applied to a singleton PCollection containing the Sink's
* {@link WriteOperation}. In this initialization ParDo, {@link WriteOperation#initialize} is
* called. The output of this ParDo is a singleton PCollection
* containing the WriteOperation.
*
* <p>This singleton collection containing the WriteOperation is then used as a side input to a
* ParDo over the PCollection of elements to write. In this bundle-writing phase,
* {@link WriteOperation#createWriter} is called to obtain a {@link Writer}.
* {@link Writer#open} and {@link Writer#close} are called in {@link DoFn#startBundle} and
* {@link DoFn#finishBundle}, respectively, and the {@link Writer#write} method is called for
* every element in the bundle. The output of this ParDo is a PCollection of writer result
* objects (see {@link Sink} for a description of writer results), one for each bundle.
*
* <p>The final do-once ParDo uses the singleton collection of the WriteOperation as input and
* the collection of writer results as a side input. In this ParDo,
* {@link WriteOperation#finalize} is called to finalize the write.
*
* <p>If the write of any element in the PCollection fails, {@link Writer#close} will be called
* before the exception that caused the write to fail is propagated, and the write result will be
* discarded.
*
* <p>Since the {@link WriteOperation} is serialized after the initialization ParDo and
* deserialized in the bundle-writing and finalization phases, any state change to the
* WriteOperation object that occurs during initialization is visible in the latter phases.
* However, the WriteOperation is not serialized after the bundle-writing phase. This is why
* implementations should guarantee that {@link WriteOperation#createWriter} does not mutate the
* WriteOperation.
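*
* <p>Schematically, the pipeline constructed here has the following shape. The variable and
* function names in this sketch ({@code initializeFn}, {@code writeBundlesFn},
* {@code finalizeFn}) are illustrative placeholders, not identifiers used in the code below:
*
* <pre>{@code
* operations  = p.apply(Create.of(writeOperation))
*                .apply("Initialize", ParDo.of(initializeFn));
* opView      = operations.apply(View.asSingleton());
* results     = input.apply("WriteBundles",
*                   ParDo.of(writeBundlesFn).withSideInputs(opView));
* resultsView = results.apply(View.asIterable());
* operations.apply("Finalize",
*     ParDo.of(finalizeFn).withSideInputs(resultsView));
* }</pre>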
*/
private <WriteT> PDone createWrite(
PCollection<T> input, WriteOperation<T, WriteT> writeOperation) {
Pipeline p = input.getPipeline();
// A coder to use for the WriteOperation.
@SuppressWarnings("unchecked")
Coder<WriteOperation<T, WriteT>> operationCoder =
(Coder<WriteOperation<T, WriteT>>) SerializableCoder.of(writeOperation.getClass());
// A singleton collection of the WriteOperation, to be used as input to a ParDo to initialize
// the sink.
PCollection<WriteOperation<T, WriteT>> operationCollection =
p.apply(Create.<WriteOperation<T, WriteT>>of(writeOperation).withCoder(operationCoder));
// Initialize the resource in a do-once ParDo on the WriteOperation.
operationCollection = operationCollection
.apply("Initialize", ParDo.of(
new DoFn<WriteOperation<T, WriteT>, WriteOperation<T, WriteT>>() {
@Override
public void processElement(ProcessContext c) throws Exception {
WriteOperation<T, WriteT> writeOperation = c.element();
writeOperation.initialize(c.getPipelineOptions());
// The WriteOperation is also the output of this ParDo, so it can have mutable
// state.
c.output(writeOperation);
}
}))
.setCoder(operationCoder);
// Create a view of the WriteOperation to be used as a sideInput to the parallel write phase.
final PCollectionView<WriteOperation<T, WriteT>> writeOperationView =
operationCollection.apply(View.<WriteOperation<T, WriteT>>asSingleton());
// Perform the per-bundle writes as a ParDo on the input PCollection (with the WriteOperation
// as a side input) and collect the results of the writes in a PCollection.
// There is a dependency between this ParDo and the first (the WriteOperation PCollection
// as a side input), so this will happen after the initial ParDo.
PCollection<WriteT> results = input
.apply("WriteBundles", ParDo.of(new DoFn<T, WriteT>() {
// Writer that will write the records in this bundle. Lazily
// initialized in processElement.
private Writer<T, WriteT> writer = null;
@Override
public void processElement(ProcessContext c) throws Exception {
// Lazily initialize the Writer
if (writer == null) {
WriteOperation<T, WriteT> writeOperation = c.sideInput(writeOperationView);
writer = writeOperation.createWriter(c.getPipelineOptions());
writer.open(UUID.randomUUID().toString());
}
try {
writer.write(c.element());
} catch (Exception e) {
// Discard write result and close the write.
try {
writer.close();
} catch (Exception closeException) {
// Do not mask the exception that caused the write to fail.
}
throw e;
}
}
@Override
public void finishBundle(Context c) throws Exception {
if (writer != null) {
WriteT result = writer.close();
// Output the result of the write.
c.outputWithTimestamp(result, Instant.now());
}
}
}).withSideInputs(writeOperationView))
.setCoder(writeOperation.getWriterResultCoder())
.apply(Window.<WriteT>into(new GlobalWindows()));
final PCollectionView<Iterable<WriteT>> resultsView =
results.apply(View.<WriteT>asIterable());
// Finalize the write in another do-once ParDo on the singleton collection containing the
// Writer. The results from the per-bundle writes are given as an Iterable side input.
// The WriteOperation's state is the same as after its initialization in the first do-once
// ParDo. There is a dependency between this ParDo and the parallel write (the writer results
// collection as a side input), so it will happen after the parallel write.
@SuppressWarnings("unused")
final PCollection<Integer> done = operationCollection
.apply("Finalize", ParDo.of(new DoFn<WriteOperation<T, WriteT>, Integer>() {
@Override
public void processElement(ProcessContext c) throws Exception {
Iterable<WriteT> results = c.sideInput(resultsView);
WriteOperation<T, WriteT> writeOperation = c.element();
writeOperation.finalize(results, c.getPipelineOptions());
}
}).withSideInputs(resultsView));
return PDone.in(input.getPipeline());
}
}
}