org.apache.beam.sdk.Pipeline Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk;

import static com.google.common.base.Preconditions.checkState;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.runners.PipelineRunner;
import org.apache.beam.sdk.runners.TransformHierarchy;
import org.apache.beam.sdk.transforms.Aggregator;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.util.UserCodeException;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PInput;
import org.apache.beam.sdk.values.POutput;
import org.apache.beam.sdk.values.PValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link Pipeline} manages a directed acyclic graph of {@link PTransform PTransforms}, and the
 * {@link PCollection PCollections} that the {@link PTransform}s consume and produce.
 *
 * <p>A {@link Pipeline} is initialized with a {@link PipelineRunner} that will later
 * execute the {@link Pipeline}.
 *
 * <p>{@link Pipeline Pipelines} are independent, so they can be constructed and executed
 * concurrently.
 *
 * <p>Each {@link Pipeline} is self-contained and isolated from any other
 * {@link Pipeline}. The {@link PValue PValues} that are inputs and outputs of each of a
 * {@link Pipeline Pipeline's} {@link PTransform PTransforms} are also owned by that
 * {@link Pipeline}. A {@link PValue} owned by one {@link Pipeline} can be read only by
 * {@link PTransform PTransforms} also owned by that {@link Pipeline}.
 *
 * <p>Here is a typical example of use:
 *
 * <pre>{@code
 * // Start by defining the options for the pipeline.
 * PipelineOptions options = PipelineOptionsFactory.create();
 * // Then create the pipeline. The runner is determined by the options.
 * Pipeline p = Pipeline.create(options);
 *
 * // A root PTransform, like TextIO.Read or Create, gets added
 * // to the Pipeline by being applied:
 * PCollection<String> lines =
 *     p.apply(TextIO.Read.from("gs://bucket/dir/file*.txt"));
 *
 * // A Pipeline can have multiple root transforms:
 * PCollection<String> moreLines =
 *     p.apply(TextIO.Read.from("gs://bucket/other/dir/file*.txt"));
 * PCollection<String> yetMoreLines =
 *     p.apply(Create.of("yet", "more", "lines").withCoder(StringUtf8Coder.of()));
 *
 * // Further PTransforms can be applied, in an arbitrary (acyclic) graph.
 * // Subsequent PTransforms (and intermediate PCollections etc.) are
 * // implicitly part of the same Pipeline.
 * PCollection<String> allLines =
 *     PCollectionList.of(lines).and(moreLines).and(yetMoreLines)
 *     .apply(new Flatten());
 * PCollection<KV<String, Integer>> wordCounts =
 *     allLines
 *     .apply(ParDo.of(new ExtractWords()))
 *     .apply(new Count());
 * PCollection<String> formattedWordCounts =
 *     wordCounts.apply(ParDo.of(new FormatCounts()));
 * formattedWordCounts.apply(TextIO.Write.to("gs://bucket/dir/counts.txt"));
 *
 * // PTransforms aren't executed when they're applied, rather they're
 * // just added to the Pipeline.  Once the whole Pipeline of PTransforms
 * // is constructed, the Pipeline's PTransforms can be run using a
 * // PipelineRunner.  The default PipelineRunner executes the Pipeline
 * // directly, sequentially, in this one process, which is useful for
 * // unit tests and simple experiments:
 * p.run();
 *
 * }</pre>
 */
public class Pipeline {
  private static final Logger LOG = LoggerFactory.getLogger(Pipeline.class);

  /**
   * Thrown during execution of a {@link Pipeline}, whenever user code within that
   * {@link Pipeline} throws an exception.
   *
   * <p>The original exception thrown by user code may be retrieved via {@link #getCause}.
   */
  public static class PipelineExecutionException extends RuntimeException {
    /**
     * Wraps {@code cause} into a {@link PipelineExecutionException}.
     */
    public PipelineExecutionException(Throwable cause) {
      super(cause);
    }
  }

  /////////////////////////////////////////////////////////////////////////////
  // Public operations.

  /**
   * Constructs a pipeline from the provided options.
   *
   * @return The newly created pipeline.
   */
  public static Pipeline create(PipelineOptions options) {
    Pipeline pipeline = new Pipeline(PipelineRunner.fromOptions(options), options);
    LOG.debug("Creating {}", pipeline);
    return pipeline;
  }

  /**
   * Returns a {@link PBegin} owned by this Pipeline. This is useful
   * as the input of a root PTransform such as {@link Read} or
   * {@link Create}.
   */
  public PBegin begin() {
    return PBegin.in(this);
  }

  /**
   * Like {@link #apply(String, PTransform)} but the transform node in the {@link Pipeline}
   * graph will be named according to {@link PTransform#getName}.
   *
   * @see #apply(String, PTransform)
   */
  public <OutputT extends POutput> OutputT apply(
      PTransform<? super PBegin, OutputT> root) {
    return begin().apply(root);
  }

  /**
   * Adds a root {@link PTransform}, such as {@link Read} or {@link Create},
   * to this {@link Pipeline}.
   *
   * <p>The node in the {@link Pipeline} graph will use the provided {@code name}.
   * This name is used in various places, including the monitoring UI, logging,
   * and to stably identify this node in the {@link Pipeline} graph upon update.
   *
   * <p>Alias for {@code begin().apply(name, root)}.
   */
  public <OutputT extends POutput> OutputT apply(
      String name, PTransform<? super PBegin, OutputT> root) {
    return begin().apply(name, root);
  }

  /**
   * Runs the {@link Pipeline} using its {@link PipelineRunner}.
   */
  public PipelineResult run() {
    // Ensure all of the nodes are fully specified before a PipelineRunner gets access to the
    // pipeline.
    LOG.debug("Running {} via {}", this, runner);
    try {
      return runner.run(this);
    } catch (UserCodeException e) {
      // This serves to replace the stack with one that ends here and
      // is caused by the caught UserCodeException, thereby splicing
      // out all the stack frames in between the PipelineRunner itself
      // and where the worker calls into the user's code.
      throw new PipelineExecutionException(e.getCause());
    }
  }

  /////////////////////////////////////////////////////////////////////////////
  // Below here are operations that aren't normally called by users.

  /**
   * Returns the {@link CoderRegistry} that this {@link Pipeline} uses.
   */
  public CoderRegistry getCoderRegistry() {
    if (coderRegistry == null) {
      coderRegistry = new CoderRegistry();
      coderRegistry.registerStandardCoders();
    }
    return coderRegistry;
  }

  /**
   * Sets the {@link CoderRegistry} that this {@link Pipeline} uses.
   */
  public void setCoderRegistry(CoderRegistry coderRegistry) {
    this.coderRegistry = coderRegistry;
  }

  /**
   * A {@link PipelineVisitor} can be passed into
   * {@link Pipeline#traverseTopologically} to be called for each of the
   * transforms and values in the {@link Pipeline}.
   */
  public interface PipelineVisitor {
    /**
     * Called for each composite transform after all topological predecessors have been visited
     * but before any of its component transforms.
     *
     * <p>The return value controls whether or not child transforms are visited.
     */
    CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node);

    /**
     * Called for each composite transform after all of its component transforms and their outputs
     * have been visited.
     */
    void leaveCompositeTransform(TransformHierarchy.Node node);

    /**
     * Called for each primitive transform after all of its topological predecessors
     * and inputs have been visited.
     */
    void visitPrimitiveTransform(TransformHierarchy.Node node);

    /**
     * Called for each value after the transform that produced the value has been
     * visited.
     */
    void visitValue(PValue value, TransformHierarchy.Node producer);

    /**
     * Control enum for indicating whether or not a traversal should process the contents of
     * a composite transform or not.
     */
    enum CompositeBehavior {
      ENTER_TRANSFORM,
      DO_NOT_ENTER_TRANSFORM
    }

    /**
     * Default no-op {@link PipelineVisitor} that enters all composite transforms.
     * User implementations can override just those methods they are interested in.
     */
    class Defaults implements PipelineVisitor {
      @Override
      public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
        return CompositeBehavior.ENTER_TRANSFORM;
      }

      @Override
      public void leaveCompositeTransform(TransformHierarchy.Node node) { }

      @Override
      public void visitPrimitiveTransform(TransformHierarchy.Node node) { }

      @Override
      public void visitValue(PValue value, TransformHierarchy.Node producer) { }
    }
  }

  /**
   * Invokes the {@link PipelineVisitor PipelineVisitor's}
   * {@link PipelineVisitor#visitPrimitiveTransform} and
   * {@link PipelineVisitor#visitValue} operations on each of this
   * {@link Pipeline Pipeline's} transform and value nodes, in forward
   * topological order.
   *
   * <p>Traversal of the {@link Pipeline} causes {@link PTransform PTransforms} and
   * {@link PValue PValues} owned by the {@link Pipeline} to be marked as finished,
   * at which point they may no longer be modified.
   *
   * <p>Typically invoked by {@link PipelineRunner} subclasses.
   */
  public void traverseTopologically(PipelineVisitor visitor) {
    // Ensure all nodes are fully specified before visiting the pipeline
    Set<PValue> visitedValues =
        // Visit all the transforms, which should implicitly visit all the values.
        transforms.visit(visitor);
    checkState(
        visitedValues.containsAll(values),
        "internal error: should have visited all the values after visiting all the transforms");
  }

  /**
   * Like {@link #applyTransform(String, PInput, PTransform)} but defaulting to the name
   * provided by the {@link PTransform}.
   */
  public static <InputT extends PInput, OutputT extends POutput> OutputT applyTransform(
      InputT input, PTransform<? super InputT, OutputT> transform) {
    return input.getPipeline().applyInternal(transform.getName(), input, transform);
  }

  /**
   * Applies the given {@code PTransform} to this input {@code InputT} and returns
   * its {@code OutputT}. This uses {@code name} to identify this specific application
   * of the transform. This name is used in various places, including the monitoring UI,
   * logging, and to stably identify this application node in the {@link Pipeline} graph during
   * update.
   *
   * <p>Each {@link PInput} subclass that provides an {@code apply} method should delegate to
   * this method to ensure proper registration with the {@link PipelineRunner}.
   */
  public static <InputT extends PInput, OutputT extends POutput> OutputT applyTransform(
      String name, InputT input, PTransform<? super InputT, OutputT> transform) {
    return input.getPipeline().applyInternal(name, input, transform);
  }

  /////////////////////////////////////////////////////////////////////////////
  // Below here are internal operations, never called by users.

  private final PipelineRunner<?> runner;
  private final PipelineOptions options;
  private final TransformHierarchy transforms = new TransformHierarchy();
  private Collection<PValue> values = new ArrayList<>();
  private Set<String> usedFullNames = new HashSet<>();
  private CoderRegistry coderRegistry;

  /**
   * @deprecated replaced by {@link #Pipeline(PipelineRunner, PipelineOptions)}
   */
  @Deprecated
  protected Pipeline(PipelineRunner<?> runner) {
    this(runner, PipelineOptionsFactory.create());
  }

  protected Pipeline(PipelineRunner<?> runner, PipelineOptions options) {
    this.runner = runner;
    this.options = options;
  }

  @Override
  public String toString() {
    return "Pipeline#" + hashCode();
  }

  /**
   * Applies a {@link PTransform} to the given {@link PInput}.
   *
   * @see Pipeline#apply
   */
  private <InputT extends PInput, OutputT extends POutput> OutputT applyInternal(
      String name, InputT input, PTransform<? super InputT, OutputT> transform) {
    String namePrefix = transforms.getCurrent().getFullName();
    String uniqueName = uniquifyInternal(namePrefix, name);

    boolean nameIsUnique = uniqueName.equals(buildName(namePrefix, name));
    if (!nameIsUnique) {
      switch (getOptions().getStableUniqueNames()) {
        case OFF:
          break;
        case WARNING:
          LOG.warn(
              "Transform {} does not have a stable unique name. "
                  + "This will prevent updating of pipelines.",
              uniqueName);
          break;
        case ERROR:
          throw new IllegalStateException(
              "Transform " + uniqueName + " does not have a stable unique name. "
                  + "This will prevent updating of pipelines.");
        default:
          throw new IllegalArgumentException(
              "Unrecognized value for stable unique names: "
                  + getOptions().getStableUniqueNames());
      }
    }

    LOG.debug("Adding {} to {}", transform, this);
    transforms.pushNode(uniqueName, input, transform);
    try {
      transforms.finishSpecifyingInput();
      transform.validate(input);
      OutputT output = runner.apply(transform, input);
      transforms.setOutput(output);
      return output;
    } finally {
      transforms.popNode();
    }
  }

  /**
   * Returns the configured {@link PipelineRunner}.
   */
  public PipelineRunner<?> getRunner() {
    return runner;
  }

  /**
   * Returns the configured {@link PipelineOptions}.
   */
  public PipelineOptions getOptions() {
    return options;
  }

  /**
   * Returns a unique name for a transform with the given prefix (from
   * enclosing transforms) and initial name.
   *
   * <p>For internal use only.
   */
  private String uniquifyInternal(String namePrefix, String origName) {
    String name = origName;
    int suffixNum = 2;
    while (true) {
      String candidate = buildName(namePrefix, name);
      if (usedFullNames.add(candidate)) {
        return candidate;
      }
      // A duplicate! Retry.
      name = origName + suffixNum++;
    }
  }

  /**
   * Returns a {@link Map} from each {@link Aggregator} in the {@link Pipeline} to the {@link
   * PTransform PTransforms} in which it is used.
   */
  public Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> getAggregatorSteps() {
    return new AggregatorPipelineExtractor(this).getAggregatorSteps();
  }

  /**
   * Builds a name from a "/"-delimited prefix and a name.
   */
  private String buildName(String namePrefix, String name) {
    return namePrefix.isEmpty() ? name : namePrefix + "/" + name;
  }
}
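
For quick orientation, here is a minimal usage sketch assembled from the API shown in the listing above (Pipeline.create, a named apply, and run). The class name MinimalPipelineExample and the input strings are illustrative placeholders, and the runner is whatever the default PipelineOptions select; this sketch is not part of the Pipeline.java source.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class MinimalPipelineExample {
  public static void main(String[] args) {
    // The runner is chosen by the options; with no configuration this uses the default runner.
    PipelineOptions options = PipelineOptionsFactory.create();
    Pipeline p = Pipeline.create(options);

    // A named root application, as documented for Pipeline#apply(String, PTransform).
    PCollection<String> lines = p.apply("CreateLines", Create.of("yet", "more", "lines"));

    // Transforms are only recorded here; nothing executes until run() hands the
    // constructed graph to the PipelineRunner configured in the options.
    p.run();
  }
}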




