
com.google.cloud.dataflow.sdk.transforms.DoFn Maven / Gradle / Ivy

Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.display.HasDisplayData;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
import com.google.cloud.dataflow.sdk.util.WindowingInternals;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
import com.google.common.base.MoreObjects;

import org.joda.time.Duration;
import org.joda.time.Instant;

import java.io.Serializable;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;

/**
 * The argument to {@link ParDo} providing the code to use to process
 * elements of the input
 * {@link com.google.cloud.dataflow.sdk.values.PCollection}.
 *
 * <p>See {@link ParDo} for more explanation, examples of use, and
 * discussion of constraints on {@code DoFn}s, including their
 * serializability, lack of access to global shared mutable state,
 * requirements for failure tolerance, and benefits of optimization.
 *
 * <p>{@code DoFn}s can be tested in the context of a particular
 * {@code Pipeline} by running that {@code Pipeline} on sample input
 * and then checking its output. Unit testing of a {@code DoFn},
 * separately from any {@code ParDo} transform or {@code Pipeline},
 * can be done via the {@link DoFnTester} harness.
 *
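 * <p>For instance, a {@code DoFn} can be exercised roughly as follows (an
 * illustrative sketch only; {@code MyDoFn} is a hypothetical subclass, and the
 * exact {@link DoFnTester} methods available may vary by SDK version):
 * <pre>{@code
 * // Wrap the DoFn in the test harness and feed it a small batch of elements.
 * DoFnTester<String, Integer> tester = DoFnTester.of(new MyDoFn());
 * List<Integer> outputs = tester.processBatch("hello", "world");
 * }</pre>
 *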
 * <p>{@link DoFnWithContext} (currently experimental) offers an alternative
 * mechanism for accessing {@link ProcessContext#window()} without the need
 * to implement {@link RequiresWindowAccess}.
 *
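 * <p>As a minimal illustration (a sketch, not an excerpt from the SDK, assuming
 * {@code lines} is a {@code PCollection<String>}), a {@code DoFn} that maps each
 * input string to its length might be used like this:
 * <pre>{@code
 * PCollection<Integer> lengths = lines.apply(
 *     ParDo.of(new DoFn<String, Integer>() {
 *       public void processElement(ProcessContext c) {
 *         // Read the current input element and emit one output element.
 *         c.output(c.element().length());
 *       }
 *     }));
 * }</pre>
 *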
 * <p>See also {@link #processElement} for details on implementing the transformation
 * from {@code InputT} to {@code OutputT}.
 *
 * @param <InputT> the type of the (main) input elements
 * @param <OutputT> the type of the (main) output elements
 */
public abstract class DoFn<InputT, OutputT> implements Serializable, HasDisplayData {

  /**
   * Information accessible to all methods in this {@code DoFn}.
   * Used primarily to output elements.
   */
  public abstract class Context {

    /**
     * Returns the {@code PipelineOptions} specified with the
     * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}
     * invoking this {@code DoFn}. The {@code PipelineOptions} will
     * be the default running via {@link DoFnTester}.
     */
    public abstract PipelineOptions getPipelineOptions();

    /**
     * Adds the given element to the main output {@code PCollection}.
     *
     * <p>Once passed to {@code output} the element should be considered
     * immutable and not be modified in any way. It may be cached or retained
     * by the Dataflow runtime or later steps in the pipeline, or used in
     * other unspecified ways.
     *
     * <p>If invoked from {@link DoFn#processElement processElement}, the output
     * element will have the same timestamp and be in the same windows
     * as the input element passed to {@link DoFn#processElement processElement}.
     *
     * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
     * this will attempt to use the
     * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
     * of the input {@code PCollection} to determine what windows the element
     * should be in, throwing an exception if the {@code WindowFn} attempts
     * to access any information about the input element. The output element
     * will have a timestamp of negative infinity.
     */
    public abstract void output(OutputT output);

    /**
     * Adds the given element to the main output {@code PCollection},
     * with the given timestamp.
     *
     * <p>Once passed to {@code outputWithTimestamp} the element should not be
     * modified in any way.
     *
     * <p>If invoked from {@link DoFn#processElement processElement}, the timestamp
     * must not be older than the input element's timestamp minus
     * {@link DoFn#getAllowedTimestampSkew getAllowedTimestampSkew}. The output element will
     * be in the same windows as the input element.
     *
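     * <p>For example, in a {@code DoFn} whose input and output types match, an element
     * can be re-emitted one minute later than its input timestamp. This is an
     * illustrative sketch; a forward shift like this never requires extra skew:
     * <pre>{@code
     * c.outputWithTimestamp(c.element(), c.timestamp().plus(Duration.standardMinutes(1)));
     * }</pre>
     *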
     * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
     * this will attempt to use the
     * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
     * of the input {@code PCollection} to determine what windows the element
     * should be in, throwing an exception if the {@code WindowFn} attempts
     * to access any information about the input element except for the
     * timestamp.
     */
    public abstract void outputWithTimestamp(OutputT output, Instant timestamp);

    /**
     * Adds the given element to the side output {@code PCollection} with the
     * given tag.
     *
     * <p>Once passed to {@code sideOutput} the element should not be modified
     * in any way.
     *
     * <p>The caller of {@code ParDo} uses {@link ParDo#withOutputTags withOutputTags} to
     * specify the tags of side outputs that it consumes. Non-consumed side
     * outputs, e.g., outputs for monitoring purposes only, don't necessarily
     * need to be specified.
     *
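     * <p>A rough sketch of the pattern (the tags, {@code lines}, and the length
     * check here are hypothetical):
     * <pre>{@code
     * final TupleTag<String> mainTag = new TupleTag<String>(){};
     * final TupleTag<String> tooLongTag = new TupleTag<String>(){};
     * PCollectionTuple results = lines.apply(
     *     ParDo.withOutputTags(mainTag, TupleTagList.of(tooLongTag))
     *          .of(new DoFn<String, String>() {
     *            public void processElement(ProcessContext c) {
     *              if (c.element().length() > 100) {
     *                c.sideOutput(tooLongTag, c.element());  // route to the side output
     *              } else {
     *                c.output(c.element());                  // main output
     *              }
     *            }
     *          }));
     * }</pre>
     *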
     * <p>The output element will have the same timestamp and be in the same
     * windows as the input element passed to {@link DoFn#processElement processElement}.
     *
     * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
     * this will attempt to use the
     * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
     * of the input {@code PCollection} to determine what windows the element
     * should be in, throwing an exception if the {@code WindowFn} attempts
     * to access any information about the input element. The output element
     * will have a timestamp of negative infinity.
     *
     * @see ParDo#withOutputTags
     */
    public abstract <T> void sideOutput(TupleTag<T> tag, T output);

    /**
     * Adds the given element to the specified side output {@code PCollection},
     * with the given timestamp.
     *
     * <p>Once passed to {@code sideOutputWithTimestamp} the element should not be
     * modified in any way.
     *
     * <p>If invoked from {@link DoFn#processElement processElement}, the timestamp
     * must not be older than the input element's timestamp minus
     * {@link DoFn#getAllowedTimestampSkew getAllowedTimestampSkew}. The output element will
     * be in the same windows as the input element.
     *
     * <p>If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
     * this will attempt to use the
     * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
     * of the input {@code PCollection} to determine what windows the element
     * should be in, throwing an exception if the {@code WindowFn} attempts
     * to access any information about the input element except for the
     * timestamp.
     *
     * @see ParDo#withOutputTags
     */
    public abstract <T> void sideOutputWithTimestamp(
        TupleTag<T> tag, T output, Instant timestamp);

    /**
     * Creates an {@link Aggregator} in the {@link DoFn} context with the
     * specified name and aggregation logic specified by {@link CombineFn}.
     *
     * <p>For internal use only.
     *
     * @param name the name of the aggregator
     * @param combiner the {@link CombineFn} to use in the aggregator
     * @return an aggregator for the provided name and {@link CombineFn} in this
     *         context
     */
    @Experimental(Kind.AGGREGATOR)
    protected abstract <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT>
        createAggregatorInternal(String name, CombineFn<AggInputT, ?, AggOutputT> combiner);

    /**
     * Sets up {@link Aggregator}s created by the {@link DoFn} so they are
     * usable within this context.
     *
     * <p>This method should be called by runners before {@link DoFn#startBundle}
     * is executed.
     */
    @Experimental(Kind.AGGREGATOR)
    protected final void setupDelegateAggregators() {
      for (DelegatingAggregator<?, ?> aggregator : aggregators.values()) {
        setupDelegateAggregator(aggregator);
      }

      aggregatorsAreFinal = true;
    }

    private <AggInputT, AggOutputT> void setupDelegateAggregator(
        DelegatingAggregator<AggInputT, AggOutputT> aggregator) {
      Aggregator<AggInputT, AggOutputT> delegate = createAggregatorInternal(
          aggregator.getName(), aggregator.getCombineFn());

      aggregator.setDelegate(delegate);
    }
  }

  /**
   * Information accessible when running {@link DoFn#processElement}.
   */
  public abstract class ProcessContext extends Context {

    /**
     * Returns the input element to be processed.
     *
     * <p>The element should be considered immutable. The Dataflow runtime will not mutate the
     * element, so it is safe to cache, etc. The element should not be mutated by any of the
     * {@link DoFn} methods, because it may be cached elsewhere, retained by the Dataflow runtime,
     * or used in other unspecified ways.
     */
    public abstract InputT element();

    /**
     * Returns the value of the side input for the window corresponding to the
     * window of the main input element.
     *
     * <p>See
     * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn#getSideInputWindow}
     * for how this corresponding window is determined.
     *
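     * <p>A sketch of typical usage (the names {@code config}, {@code lines}, and
     * {@code maxLength} are hypothetical): the view is created with {@link View}
     * and registered via {@link ParDo#withSideInputs}:
     * <pre>{@code
     * final PCollectionView<Integer> maxLength =
     *     config.apply(View.<Integer>asSingleton());
     * lines.apply(ParDo.withSideInputs(maxLength)
     *     .of(new DoFn<String, String>() {
     *       public void processElement(ProcessContext c) {
     *         // Look up the side input value for this element's window.
     *         int limit = c.sideInput(maxLength);
     *         if (c.element().length() <= limit) {
     *           c.output(c.element());
     *         }
     *       }
     *     }));
     * }</pre>
     *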
     * @throws IllegalArgumentException if this is not a side input
     * @see ParDo#withSideInputs
     */
    public abstract <T> T sideInput(PCollectionView<T> view);

    /**
     * Returns the timestamp of the input element.
     *
     * <p>See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
     * for more information.
     */
    public abstract Instant timestamp();

    /**
     * Returns the window into which the input element has been assigned.
     *
     * <p>See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
     * for more information.
     *
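     * <p>A sketch of a {@code DoFn} that opts in to window access by implementing
     * {@link RequiresWindowAccess} (an illustrative example, not part of the SDK):
     * <pre>{@code
     * class WindowTagger extends DoFn<String, String>
     *     implements DoFn.RequiresWindowAccess {
     *   public void processElement(ProcessContext c) {
     *     // window() may be called because the DoFn implements RequiresWindowAccess.
     *     c.output(c.element() + " @ " + c.window());
     *   }
     * }
     * }</pre>
     *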
     * @throws UnsupportedOperationException if this {@link DoFn} does
     *         not implement {@link RequiresWindowAccess}.
     */
    public abstract BoundedWindow window();

    /**
     * Returns information about the pane within this window into which the
     * input element has been assigned.
     *
     * <p>Generally all data is in a single, uninteresting pane unless custom
     * triggering and/or late data has been explicitly requested.
     * See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
     * for more information.
     */
    public abstract PaneInfo pane();

    /**
     * Returns the process context to use for implementing windowing.
     */
    @Experimental
    public abstract WindowingInternals<InputT, OutputT> windowingInternals();
  }

  /**
   * Returns the allowed timestamp skew duration, which is the maximum
   * duration that timestamps can be shifted backward in
   * {@link DoFn.Context#outputWithTimestamp}.
   *
   * <p>The default value is {@code Duration.ZERO}, in which case
   * timestamps can only be shifted forward into the future. For infinite
   * skew, return {@code Duration.millis(Long.MAX_VALUE)}.
   *
   * <p>Note that producing an element whose timestamp is less than the
   * current timestamp may result in late data, i.e. returning a non-zero
   * value here does not impact watermark calculations used for firing
   * windows.
   *
   * @deprecated does not interact well with the watermark.
   */
  @Deprecated
  public Duration getAllowedTimestampSkew() {
    return Duration.ZERO;
  }

  /**
   * Interface for signaling that a {@link DoFn} needs to access the window the
   * element is being processed in, via {@link DoFn.ProcessContext#window}.
   */
  @Experimental
  public interface RequiresWindowAccess {}

  public DoFn() {
    this(new HashMap<String, DelegatingAggregator<?, ?>>());
  }

  DoFn(Map<String, DelegatingAggregator<?, ?>> aggregators) {
    this.aggregators = aggregators;
  }

  /////////////////////////////////////////////////////////////////////////////

  private final Map<String, DelegatingAggregator<?, ?>> aggregators;

  /**
   * Protects aggregators from being created after initialization.
   */
  private boolean aggregatorsAreFinal;

  /**
   * Prepares this {@code DoFn} instance for processing a batch of elements.
   *
   * <p>By default, does nothing.
   *
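   * <p>One common pattern, sketched here (the class and field names are hypothetical),
   * is to allocate per-bundle state in {@link #startBundle} and flush it in
   * {@link #finishBundle}; note that elements emitted from {@code finishBundle} get
   * the timestamp behavior described in {@link Context#output}:
   * <pre>{@code
   * class BatchingDoFn extends DoFn<String, String> {
   *   private transient List<String> buffer;
   *
   *   public void startBundle(Context c) {
   *     buffer = new ArrayList<>();   // fresh state for each bundle
   *   }
   *
   *   public void processElement(ProcessContext c) {
   *     buffer.add(c.element());      // defer output until the bundle finishes
   *   }
   *
   *   public void finishBundle(Context c) {
   *     for (String s : buffer) {
   *       c.output(s);                // flush buffered elements
   *     }
   *   }
   * }
   * }</pre>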
   */
  public void startBundle(Context c) throws Exception {
  }

  /**
   * Processes one input element.
   *
   * <p>The current element of the input {@code PCollection} is returned by
   * {@link ProcessContext#element() c.element()}. It should be considered immutable. The Dataflow
   * runtime will not mutate the element, so it is safe to cache, etc. The element should not be
   * mutated by any of the {@link DoFn} methods, because it may be cached elsewhere, retained by the
   * Dataflow runtime, or used in other unspecified ways.
   *
   * <p>A value is added to the main output {@code PCollection} by {@link ProcessContext#output}.
   * Once passed to {@code output} the element should be considered immutable and not be modified in
   * any way. It may be cached elsewhere, retained by the Dataflow runtime, or used in other
   * unspecified ways.
   *
   * @see ProcessContext
   */
  public abstract void processElement(ProcessContext c) throws Exception;

  /**
   * Finishes processing this batch of elements.
   *
   * <p>By default, does nothing.
   */
  public void finishBundle(Context c) throws Exception {
  }

  /**
   * {@inheritDoc}
   *
   * <p>By default, does not register any display data. Implementors may override this method
   * to provide their own display data.
   *
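   * <p>For example (a sketch; {@code pattern} is a hypothetical configuration field
   * of the {@code DoFn}, and the exact {@code DisplayData} builder methods available
   * depend on the SDK version):
   * <pre>{@code
   * public void populateDisplayData(DisplayData.Builder builder) {
   *   super.populateDisplayData(builder);
   *   builder.add(DisplayData.item("pattern", pattern.toString()));
   * }
   * }</pre>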
   */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
  }

  /////////////////////////////////////////////////////////////////////////////

  /**
   * Returns a {@link TypeDescriptor} capturing what is known statically
   * about the input type of this {@code DoFn} instance's most-derived
   * class.
   *
   * <p>See {@link #getOutputTypeDescriptor} for more discussion.
   */
  protected TypeDescriptor<InputT> getInputTypeDescriptor() {
    return new TypeDescriptor<InputT>(getClass()) {};
  }

  /**
   * Returns a {@link TypeDescriptor} capturing what is known statically
   * about the output type of this {@code DoFn} instance's
   * most-derived class.
   *
   * <p>In the normal case of a concrete {@code DoFn} subclass with
   * no generic type parameters of its own (including anonymous inner
   * classes), this will be a complete non-generic type, which is good
   * for choosing a default output {@code Coder} for the output
   * {@code PCollection}.
   */
  protected TypeDescriptor<OutputT> getOutputTypeDescriptor() {
    return new TypeDescriptor<OutputT>(getClass()) {};
  }

  /**
   * Returns an {@link Aggregator} with aggregation logic specified by the
   * {@link CombineFn} argument. The name provided must be unique across
   * {@link Aggregator}s created within the DoFn. Aggregators can only be created
   * during pipeline construction.
   *
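   * <p>For instance (an illustrative sketch; {@code CountingDoFn} is hypothetical),
   * a {@code DoFn} might count empty input lines:
   * <pre>{@code
   * class CountingDoFn extends DoFn<String, String> {
   *   private final Aggregator<Long, Long> emptyLines =
   *       createAggregator("emptyLines", new Sum.SumLongFn());
   *
   *   public void processElement(ProcessContext c) {
   *     if (c.element().isEmpty()) {
   *       emptyLines.addValue(1L);   // increment the counter for this element
   *     }
   *     c.output(c.element());
   *   }
   * }
   * }</pre>
   *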
   * @param name the name of the aggregator
   * @param combiner the {@link CombineFn} to use in the aggregator
   * @return an aggregator for the provided name and combiner in the scope of
   *         this DoFn
   * @throws NullPointerException if the name or combiner is null
   * @throws IllegalArgumentException if the given name collides with another
   *         aggregator in this scope
   * @throws IllegalStateException if called during pipeline processing.
   */
  protected final <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT>
      createAggregator(String name, CombineFn<? super AggInputT, ?, AggOutputT> combiner) {
    checkNotNull(name, "name cannot be null");
    checkNotNull(combiner, "combiner cannot be null");
    checkArgument(!aggregators.containsKey(name),
        "Cannot create aggregator with name %s."
        + " An Aggregator with that name already exists within this scope.",
        name);
    checkState(!aggregatorsAreFinal, "Cannot create an aggregator during DoFn processing."
        + " Aggregators should be registered during pipeline construction.");

    DelegatingAggregator<AggInputT, AggOutputT> aggregator =
        new DelegatingAggregator<>(name, combiner);
    aggregators.put(name, aggregator);
    return aggregator;
  }

  /**
   * Returns an {@link Aggregator} with the aggregation logic specified by the
   * {@link SerializableFunction} argument. The name provided must be unique
   * across {@link Aggregator}s created within the DoFn. Aggregators can only be
   * created during pipeline construction.
   *
   * @param name the name of the aggregator
   * @param combiner the {@link SerializableFunction} to use in the aggregator
   * @return an aggregator for the provided name and combiner in the scope of
   *         this DoFn
   * @throws NullPointerException if the name or combiner is null
   * @throws IllegalArgumentException if the given name collides with another
   *         aggregator in this scope
   * @throws IllegalStateException if called during pipeline processing.
   */
  protected final <AggInputT> Aggregator<AggInputT, AggInputT> createAggregator(String name,
      SerializableFunction<Iterable<AggInputT>, AggInputT> combiner) {
    checkNotNull(combiner, "combiner cannot be null.");
    return createAggregator(name, Combine.IterableCombineFn.of(combiner));
  }

  /**
   * Returns the {@link Aggregator Aggregators} created by this {@code DoFn}.
   */
  Collection<Aggregator<?, ?>> getAggregators() {
    return Collections.<Aggregator<?, ?>>unmodifiableCollection(aggregators.values());
  }

  /**
   * An {@link Aggregator} that delegates calls to {@code addValue} to another
   * aggregator.
   *
   * @param <AggInputT> the type of input element
   * @param <AggOutputT> the type of output element
   */
  static class DelegatingAggregator<AggInputT, AggOutputT>
      implements Aggregator<AggInputT, AggOutputT>, Serializable {
    private final UUID id;

    private final String name;

    private final CombineFn<AggInputT, ?, AggOutputT> combineFn;

    private Aggregator<AggInputT, ?> delegate;

    public DelegatingAggregator(String name,
        CombineFn<? super AggInputT, ?, AggOutputT> combiner) {
      this.id = UUID.randomUUID();
      this.name = checkNotNull(name, "name cannot be null");
      // Safe contravariant cast
      @SuppressWarnings("unchecked")
      CombineFn<AggInputT, ?, AggOutputT> specificCombiner =
          (CombineFn<AggInputT, ?, AggOutputT>) checkNotNull(combiner, "combineFn cannot be null");
      this.combineFn = specificCombiner;
    }

    @Override
    public void addValue(AggInputT value) {
      if (delegate == null) {
        throw new IllegalStateException(
            "addValue cannot be called on Aggregator outside of the execution of a DoFn.");
      } else {
        delegate.addValue(value);
      }
    }

    @Override
    public String getName() {
      return name;
    }

    @Override
    public CombineFn<AggInputT, ?, AggOutputT> getCombineFn() {
      return combineFn;
    }

    /**
     * Sets the current delegate of the Aggregator.
     *
     * @param delegate the delegate to set in this aggregator
     */
    public void setDelegate(Aggregator<AggInputT, ?> delegate) {
      this.delegate = delegate;
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper(getClass())
          .add("name", name)
          .add("combineFn", combineFn)
          .toString();
    }

    @Override
    public int hashCode() {
      return Objects.hash(id, name, combineFn.getClass());
    }

    /**
     * Indicates whether some other object is "equal to" this one.
     *
     * <p>{@code DelegatingAggregator} instances are equal if they have the same name, their
     * {@code CombineFn}s are the same class, and they have identical IDs.
     */
    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      if (o == null) {
        return false;
      }
      if (o instanceof DelegatingAggregator) {
        DelegatingAggregator<?, ?> that = (DelegatingAggregator<?, ?>) o;
        return Objects.equals(this.id, that.id)
            && Objects.equals(this.name, that.name)
            && Objects.equals(this.combineFn.getClass(), that.combineFn.getClass());
      }
      return false;
    }
  }
}