
com.google.cloud.dataflow.sdk.Pipeline Maven / Gradle / Ivy

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk;

import com.google.cloud.dataflow.sdk.coders.CoderRegistry;
import com.google.cloud.dataflow.sdk.io.Read;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.runners.TransformHierarchy;
import com.google.cloud.dataflow.sdk.runners.TransformTreeNode;
import com.google.cloud.dataflow.sdk.transforms.AppliedPTransform;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.util.UserCodeException;
import com.google.cloud.dataflow.sdk.values.PBegin;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.POutput;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multimap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * A {@link Pipeline} manages a directed acyclic graph of {@link PTransform PTransforms}, and the
 * {@link PCollection PCollections} that the {@link PTransform}s consume and produce.
 *
 * <p>A {@link Pipeline} is initialized with a {@link PipelineRunner} that will later
 * execute the {@link Pipeline}.
 *
 * <p>{@link Pipeline Pipelines} are independent, so they can be constructed and executed
 * concurrently.
 *
 * <p>Each {@link Pipeline} is self-contained and isolated from any other
 * {@link Pipeline}. The {@link PValue PValues} that are inputs and outputs of each of a
 * {@link Pipeline Pipeline's} {@link PTransform PTransforms} are also owned by that
 * {@link Pipeline}. A {@link PValue} owned by one {@link Pipeline} can be read only by
 * {@link PTransform PTransforms} also owned by that {@link Pipeline}.
 *
 * <p>Here is a typical example of use:
 * <pre> {@code
 * // Start by defining the options for the pipeline.
 * PipelineOptions options = PipelineOptionsFactory.create();
 * // Then create the pipeline. The runner is determined by the options.
 * Pipeline p = Pipeline.create(options);
 *
 * // A root PTransform, like TextIO.Read or Create, gets added
 * // to the Pipeline by being applied:
 * PCollection<String> lines =
 *     p.apply(TextIO.Read.from("gs://bucket/dir/file*.txt"));
 *
 * // A Pipeline can have multiple root transforms:
 * PCollection<String> moreLines =
 *     p.apply(TextIO.Read.from("gs://bucket/other/dir/file*.txt"));
 * PCollection<String> yetMoreLines =
 *     p.apply(Create.of("yet", "more", "lines").withCoder(StringUtf8Coder.of()));
 *
 * // Further PTransforms can be applied, in an arbitrary (acyclic) graph.
 * // Subsequent PTransforms (and intermediate PCollections etc.) are
 * // implicitly part of the same Pipeline.
 * PCollection<String> allLines =
 *     PCollectionList.of(lines).and(moreLines).and(yetMoreLines)
 *     .apply(new Flatten());
 * PCollection<KV<String, Long>> wordCounts =
 *     allLines
 *     .apply(ParDo.of(new ExtractWords()))
 *     .apply(new Count());
 * PCollection<String> formattedWordCounts =
 *     wordCounts.apply(ParDo.of(new FormatCounts()));
 * formattedWordCounts.apply(TextIO.Write.to("gs://bucket/dir/counts.txt"));
 *
 * // PTransforms aren't executed when they're applied, rather they're
 * // just added to the Pipeline.  Once the whole Pipeline of PTransforms
 * // is constructed, the Pipeline's PTransforms can be run using a
 * // PipelineRunner.  The default PipelineRunner executes the Pipeline
 * // directly, sequentially, in this one process, which is useful for
 * // unit tests and simple experiments:
 * p.run();
 * } </pre>
 */
public class Pipeline {
  private static final Logger LOG = LoggerFactory.getLogger(Pipeline.class);

  /**
   * Thrown during execution of a {@link Pipeline}, whenever user code within that
   * {@link Pipeline} throws an exception.
   *
   * <p>The original exception thrown by user code may be retrieved via {@link #getCause}.
   */
  public static class PipelineExecutionException extends RuntimeException {
    /**
     * Wraps {@code cause} into a {@link PipelineExecutionException}.
     */
    public PipelineExecutionException(Throwable cause) {
      super(cause);
    }
  }

  /////////////////////////////////////////////////////////////////////////////
  // Public operations.

  /**
   * Constructs a pipeline from the provided options.
   *
   * @return The newly created pipeline.
   */
  public static Pipeline create(PipelineOptions options) {
    Pipeline pipeline = new Pipeline(PipelineRunner.fromOptions(options), options);
    LOG.debug("Creating {}", pipeline);
    return pipeline;
  }

  /**
   * Returns a {@link PBegin} owned by this Pipeline. This is useful
   * as the input of a root PTransform such as {@link Read} or
   * {@link Create}.
   */
  public PBegin begin() {
    return PBegin.in(this);
  }

  /**
   * Like {@link #apply(String, PTransform)} but the transform node in the {@link Pipeline}
   * graph will be named according to {@link PTransform#getName}.
   *
   * @see #apply(String, PTransform)
   */
  public <OutputT extends POutput> OutputT apply(
      PTransform<? super PBegin, OutputT> root) {
    return begin().apply(root);
  }

  /**
   * Adds a root {@link PTransform}, such as {@link Read} or {@link Create},
   * to this {@link Pipeline}.
   *
   * <p>The node in the {@link Pipeline} graph will use the provided {@code name}.
   * This name is used in various places, including the monitoring UI, logging,
   * and to stably identify this node in the {@link Pipeline} graph upon update.
   *
   * <p>Alias for {@code begin().apply(name, root)}.
   */
  public <OutputT extends POutput> OutputT apply(
      String name, PTransform<? super PBegin, OutputT> root) {
    return begin().apply(name, root);
  }

  /**
   * Runs the {@link Pipeline} using its {@link PipelineRunner}.
   */
  public PipelineResult run() {
    LOG.debug("Running {} via {}", this, runner);
    try {
      return runner.run(this);
    } catch (UserCodeException e) {
      // This serves to replace the stack with one that ends here and
      // is caused by the caught UserCodeException, thereby splicing
      // out all the stack frames in between the PipelineRunner itself
      // and where the worker calls into the user's code.
      throw new PipelineExecutionException(e.getCause());
    }
  }

  /////////////////////////////////////////////////////////////////////////////
  // Below here are operations that aren't normally called by users.

  /**
   * Returns the {@link CoderRegistry} that this {@link Pipeline} uses.
   */
  public CoderRegistry getCoderRegistry() {
    if (coderRegistry == null) {
      coderRegistry = new CoderRegistry();
      coderRegistry.registerStandardCoders();
    }
    return coderRegistry;
  }

  /**
   * Sets the {@link CoderRegistry} that this {@link Pipeline} uses.
   */
  public void setCoderRegistry(CoderRegistry coderRegistry) {
    this.coderRegistry = coderRegistry;
  }

  /**
   * A {@link PipelineVisitor} can be passed into
   * {@link Pipeline#traverseTopologically} to be called for each of the
   * transforms and values in the {@link Pipeline}.
   */
  public interface PipelineVisitor {
    /**
     * Called for each composite transform after all topological predecessors have been visited
     * but before any of its component transforms.
     */
    public void enterCompositeTransform(TransformTreeNode node);

    /**
     * Called for each composite transform after all of its component transforms and their outputs
     * have been visited.
     */
    public void leaveCompositeTransform(TransformTreeNode node);

    /**
     * Called for each primitive transform after all of its topological predecessors
     * and inputs have been visited.
     */
    public void visitTransform(TransformTreeNode node);

    /**
     * Called for each value after the transform that produced the value has been
     * visited.
     */
    public void visitValue(PValue value, TransformTreeNode producer);
  }

  /**
   * Invokes the {@link PipelineVisitor PipelineVisitor's}
   * {@link PipelineVisitor#visitTransform} and
   * {@link PipelineVisitor#visitValue} operations on each of this
   * {@link Pipeline Pipeline's} transform and value nodes, in forward
   * topological order.
   *
   * <p>Traversal of the {@link Pipeline} causes {@link PTransform PTransforms} and
   * {@link PValue PValues} owned by the {@link Pipeline} to be marked as finished,
   * at which point they may no longer be modified.
   *
   * <p>Typically invoked by {@link PipelineRunner} subclasses.
   */
  public void traverseTopologically(PipelineVisitor visitor) {
    Set<PValue> visitedValues = new HashSet<>();
    // Visit all the transforms, which should implicitly visit all the values.
    transforms.visit(visitor, visitedValues);
    if (!visitedValues.containsAll(values)) {
      throw new RuntimeException(
          "internal error: should have visited all the values "
          + "after visiting all the transforms");
    }
  }

  /**
   * Like {@link #applyTransform(String, PInput, PTransform)} but defaulting to the name
   * provided by the {@link PTransform}.
   */
  public static <InputT extends PInput, OutputT extends POutput>
  OutputT applyTransform(InputT input,
      PTransform<? super InputT, OutputT> transform) {
    return input.getPipeline().applyInternal(transform.getName(), input, transform);
  }

  /**
   * Applies the given {@code PTransform} to this input {@code InputT} and returns
   * its {@code OutputT}. This uses {@code name} to identify this specific application
   * of the transform. This name is used in various places, including the monitoring UI,
   * logging, and to stably identify this application node in the {@link Pipeline} graph during
   * update.
   *
   * <p>Each {@link PInput} subclass that provides an {@code apply} method should delegate to
   * this method to ensure proper registration with the {@link PipelineRunner}.
   */
  public static <InputT extends PInput, OutputT extends POutput>
  OutputT applyTransform(String name, InputT input,
      PTransform<? super InputT, OutputT> transform) {
    return input.getPipeline().applyInternal(name, input, transform);
  }

  /////////////////////////////////////////////////////////////////////////////
  // Below here are internal operations, never called by users.

  private final PipelineRunner<?> runner;
  private final PipelineOptions options;
  private final TransformHierarchy transforms = new TransformHierarchy();
  private Collection<PValue> values = new ArrayList<>();
  private Set<String> usedFullNames = new HashSet<>();
  private CoderRegistry coderRegistry;
  private Multimap<PTransform<?, ?>, AppliedPTransform<?, ?, ?>> transformApplicationsForTesting =
      HashMultimap.create();

  /**
   * @deprecated replaced by {@link #Pipeline(PipelineRunner, PipelineOptions)}
   */
  @Deprecated
  protected Pipeline(PipelineRunner<?> runner) {
    this(runner, PipelineOptionsFactory.create());
  }

  protected Pipeline(PipelineRunner<?> runner, PipelineOptions options) {
    this.runner = runner;
    this.options = options;
  }

  @Override
  public String toString() {
    return "Pipeline#" + hashCode();
  }

  /**
   * Applies a {@link PTransform} to the given {@link PInput}.
   *
   * @see Pipeline#apply
   */
  private <InputT extends PInput, OutputT extends POutput>
  OutputT applyInternal(String name, InputT input,
      PTransform<? super InputT, OutputT> transform) {
    input.finishSpecifying();

    TransformTreeNode parent = transforms.getCurrent();
    String namePrefix = parent.getFullName();
    String fullName = uniquifyInternal(namePrefix, name);

    boolean nameIsUnique = fullName.equals(buildName(namePrefix, name));

    if (!nameIsUnique) {
      switch (getOptions().getStableUniqueNames()) {
        case OFF:
          break;
        case WARNING:
          LOG.warn("Transform {} does not have a stable unique name. "
              + "This will prevent updating of pipelines.", fullName);
          break;
        case ERROR:
          throw new IllegalStateException(
              "Transform " + fullName + " does not have a stable unique name. "
              + "This will prevent updating of pipelines.");
        default:
          throw new IllegalArgumentException(
              "Unrecognized value for stable unique names: "
              + getOptions().getStableUniqueNames());
      }
    }

    TransformTreeNode child = new TransformTreeNode(parent, transform, fullName, input);
    parent.addComposite(child);

    transforms.addInput(child, input);

    LOG.debug("Adding {} to {}", transform, this);
    try {
      transforms.pushNode(child);
      transform.validate(input);
      OutputT output = runner.apply(transform, input);
      transforms.setOutput(child, output);

      AppliedPTransform<?, ?, ?> applied = AppliedPTransform.of(
          child.getFullName(), input, output, transform);
      transformApplicationsForTesting.put(transform, applied);
      // recordAsOutput is a NOOP if already called;
      output.recordAsOutput(applied);
      verifyOutputState(output, child);
      return output;
    } finally {
      transforms.popNode();
    }
  }

  /**
   * Returns all producing transforms for the {@link PValue PValues} contained
   * in {@code output}.
   */
  private List<AppliedPTransform<?, ?, ?>> getProducingTransforms(POutput output) {
    List<AppliedPTransform<?, ?, ?>> producingTransforms = new ArrayList<>();
    for (PValue value : output.expand()) {
      AppliedPTransform<?, ?, ?> transform = value.getProducingTransformInternal();
      if (transform != null) {
        producingTransforms.add(transform);
      }
    }
    return producingTransforms;
  }

  /**
   * Verifies that the output of a {@link PTransform} is correctly configured in its
   * {@link TransformTreeNode} in the {@link Pipeline} graph.
   *
   * <p>A non-composite {@link PTransform} must have all
   * of its outputs registered as produced by that {@link PTransform}.
   *
   * <p>A composite {@link PTransform} must have all of its outputs
   * registered as produced by the contained primitive {@link PTransform PTransforms}.
   * They have each had the above check performed already, when
   * they were applied, so the only possible failure state is
   * that the composite {@link PTransform} has returned a primitive output.
   */
  private void verifyOutputState(POutput output, TransformTreeNode node) {
    if (!node.isCompositeNode()) {
      PTransform<?, ?> thisTransform = node.getTransform();
      List<AppliedPTransform<?, ?, ?>> producingTransforms = getProducingTransforms(output);
      for (AppliedPTransform<?, ?, ?> producingTransform : producingTransforms) {
        // Using != because object identity indicates that the transforms
        // are the same node in the pipeline
        if (thisTransform != producingTransform.getTransform()) {
          throw new IllegalArgumentException("Output of non-composite transform "
              + thisTransform + " is registered as being produced by"
              + " a different transform: " + producingTransform);
        }
      }
    } else {
      PTransform<?, ?> thisTransform = node.getTransform();
      List<AppliedPTransform<?, ?, ?>> producingTransforms = getProducingTransforms(output);
      for (AppliedPTransform<?, ?, ?> producingTransform : producingTransforms) {
        // Using == because object identity indicates that the transforms
        // are the same node in the pipeline
        if (thisTransform == producingTransform.getTransform()) {
          throw new IllegalStateException("Output of composite transform "
              + thisTransform + " is registered as being produced by it,"
              + " but the output of every composite transform should be"
              + " produced by a primitive transform contained therein.");
        }
      }
    }
  }

  /**
   * Returns the configured {@link PipelineRunner}.
   */
  public PipelineRunner<?> getRunner() {
    return runner;
  }

  /**
   * Returns the configured {@link PipelineOptions}.
   */
  public PipelineOptions getOptions() {
    return options;
  }

  /**
   * @deprecated this method is no longer compatible with the design of {@link Pipeline},
   * as {@link PTransform PTransforms} can be applied multiple times, with different names
   * each time.
   */
  @Deprecated
  public String getFullNameForTesting(PTransform<?, ?> transform) {
    Collection<AppliedPTransform<?, ?, ?>> uses =
        transformApplicationsForTesting.get(transform);
    Preconditions.checkState(uses.size() > 0, "Unknown transform: " + transform);
    Preconditions.checkState(uses.size() <= 1, "Transform used multiple times: " + transform);
    return Iterables.getOnlyElement(uses).getFullName();
  }

  /**
   * Returns a unique name for a transform with the given prefix (from
   * enclosing transforms) and initial name.
   *
   * <p>For internal use only.
   */
  private String uniquifyInternal(String namePrefix, String origName) {
    String name = origName;
    int suffixNum = 2;
    while (true) {
      String candidate = buildName(namePrefix, name);
      if (usedFullNames.add(candidate)) {
        return candidate;
      }
      // A duplicate!  Retry.
      name = origName + suffixNum++;
    }
  }

  /**
   * Builds a name from a "/"-delimited prefix and a name.
   */
  private String buildName(String namePrefix, String name) {
    return namePrefix.isEmpty() ? name : namePrefix + "/" + name;
  }

  /**
   * Adds the given {@link PValue} to this {@link Pipeline}.
   *
   * <p>For internal use only.
   */
  public void addValueInternal(PValue value) {
    this.values.add(value);
    LOG.debug("Adding {} to {}", value, this);
  }
}
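Below is a minimal, self-contained sketch of how the public surface defined above is typically exercised: creating a Pipeline from options, applying a named root transform, and walking the resulting graph with a PipelineVisitor. It is illustrative only; the class name PipelineSketch, the example elements, and the printed messages are assumptions, and the code relies solely on APIs visible in this file plus Create, StringUtf8Coder, PCollection, and TransformTreeNode from the same SDK. In normal use one would simply call p.run() and let the configured PipelineRunner perform this traversal itself.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.runners.TransformTreeNode;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PValue;

public class PipelineSketch {
  public static void main(String[] args) {
    // Create the pipeline from options; the runner is determined by the options.
    PipelineOptions options = PipelineOptionsFactory.create();
    Pipeline p = Pipeline.create(options);

    // Apply a named root transform. The name feeds into the stable full name
    // used for monitoring, logging, and identifying the node across updates.
    PCollection<String> lines = p.apply("CreateLines",
        Create.of("some", "example", "lines").withCoder(StringUtf8Coder.of()));

    // Walk the graph the way a PipelineRunner would: transforms and values are
    // visited in forward topological order, and traversal marks the graph as
    // finished, so it may no longer be modified afterwards.
    p.traverseTopologically(new Pipeline.PipelineVisitor() {
      @Override
      public void enterCompositeTransform(TransformTreeNode node) {
        System.out.println("enter composite: " + node.getFullName());
      }

      @Override
      public void leaveCompositeTransform(TransformTreeNode node) {
        System.out.println("leave composite: " + node.getFullName());
      }

      @Override
      public void visitTransform(TransformTreeNode node) {
        System.out.println("primitive transform: " + node.getFullName());
      }

      @Override
      public void visitValue(PValue value, TransformTreeNode producer) {
        System.out.println("value produced by: " + producer.getFullName());
      }
    });
  }
}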




