// com.google.cloud.dataflow.sdk.transforms.DoFn (from google-cloud-dataflow-java-sdk-all)
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.transforms;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.Combine.CombineFn;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.transforms.display.HasDisplayData;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.PaneInfo;
import com.google.cloud.dataflow.sdk.util.WindowingInternals;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TypeDescriptor;
import com.google.common.base.MoreObjects;
import org.joda.time.Duration;
import org.joda.time.Instant;
import java.io.Serializable;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
/**
 * The argument to {@link ParDo} providing the code to use to process
 * elements of the input
 * {@link com.google.cloud.dataflow.sdk.values.PCollection}.
 *
 * <p>See {@link ParDo} for more explanation, examples of use, and
 * discussion of constraints on {@code DoFn}s, including their
 * serializability, lack of access to global shared mutable state,
 * requirements for failure tolerance, and benefits of optimization.
 *
 * <p>{@code DoFn}s can be tested in the context of a particular
 * {@code Pipeline} by running that {@code Pipeline} on sample input
 * and then checking its output. Unit testing of a {@code DoFn},
 * separately from any {@code ParDo} transform or {@code Pipeline},
 * can be done via the {@link DoFnTester} harness.
 *
 * <p>{@link DoFnWithContext} (currently experimental) offers an alternative
 * mechanism for accessing {@link ProcessContext#window()} without the need
 * to implement {@link RequiresWindowAccess}.
 *
 * <p>See also {@link #processElement} for details on implementing the transformation
 * from {@code InputT} to {@code OutputT}.
 *
 * @param <InputT> the type of the (main) input elements
 * @param <OutputT> the type of the (main) output elements
 */
public abstract class DoFn implements Serializable, HasDisplayData {
/**
* Information accessible to all methods in this {@code DoFn}.
* Used primarily to output elements.
*/
public abstract class Context {
/**
* Returns the {@code PipelineOptions} specified with the
* {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner}
* invoking this {@code DoFn}. The {@code PipelineOptions} will
* be the default running via {@link DoFnTester}.
*/
public abstract PipelineOptions getPipelineOptions();
/**
* Adds the given element to the main output {@code PCollection}.
*
* Once passed to {@code output} the element should be considered
* immutable and not be modified in any way. It may be cached or retained
* by the Dataflow runtime or later steps in the pipeline, or used in
* other unspecified ways.
*
*
If invoked from {@link DoFn#processElement processElement}, the output
* element will have the same timestamp and be in the same windows
* as the input element passed to {@link DoFn#processElement processElement}.
*
*
If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
* this will attempt to use the
* {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
* of the input {@code PCollection} to determine what windows the element
* should be in, throwing an exception if the {@code WindowFn} attempts
* to access any information about the input element. The output element
* will have a timestamp of negative infinity.
*/
public abstract void output(OutputT output);
/**
* Adds the given element to the main output {@code PCollection},
* with the given timestamp.
*
*
Once passed to {@code outputWithTimestamp} the element should not be
* modified in any way.
*
*
If invoked from {@link DoFn#processElement processElement}, the timestamp
* must not be older than the input element's timestamp minus
* {@link DoFn#getAllowedTimestampSkew getAllowedTimestampSkew}. The output element will
* be in the same windows as the input element.
*
*
If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
* this will attempt to use the
* {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
* of the input {@code PCollection} to determine what windows the element
* should be in, throwing an exception if the {@code WindowFn} attempts
* to access any information about the input element except for the
* timestamp.
*/
public abstract void outputWithTimestamp(OutputT output, Instant timestamp);
/**
* Adds the given element to the side output {@code PCollection} with the
* given tag.
*
*
Once passed to {@code sideOutput} the element should not be modified
* in any way.
*
*
The caller of {@code ParDo} uses {@link ParDo#withOutputTags withOutputTags} to
* specify the tags of side outputs that it consumes. Non-consumed side
* outputs, e.g., outputs for monitoring purposes only, don't necessarily
* need to be specified.
*
*
The output element will have the same timestamp and be in the same
* windows as the input element passed to {@link DoFn#processElement processElement}.
*
*
If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
* this will attempt to use the
* {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
* of the input {@code PCollection} to determine what windows the element
* should be in, throwing an exception if the {@code WindowFn} attempts
* to access any information about the input element. The output element
* will have a timestamp of negative infinity.
*
* @see ParDo#withOutputTags
*/
public abstract void sideOutput(TupleTag tag, T output);
/**
* Adds the given element to the specified side output {@code PCollection},
* with the given timestamp.
*
* Once passed to {@code sideOutputWithTimestamp} the element should not be
* modified in any way.
*
*
If invoked from {@link DoFn#processElement processElement}, the timestamp
* must not be older than the input element's timestamp minus
* {@link DoFn#getAllowedTimestampSkew getAllowedTimestampSkew}. The output element will
* be in the same windows as the input element.
*
*
If invoked from {@link #startBundle startBundle} or {@link #finishBundle finishBundle},
* this will attempt to use the
* {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
* of the input {@code PCollection} to determine what windows the element
* should be in, throwing an exception if the {@code WindowFn} attempts
* to access any information about the input element except for the
* timestamp.
*
* @see ParDo#withOutputTags
*/
public abstract void sideOutputWithTimestamp(
TupleTag tag, T output, Instant timestamp);
/**
* Creates an {@link Aggregator} in the {@link DoFn} context with the
* specified name and aggregation logic specified by {@link CombineFn}.
*
* For internal use only.
*
* @param name the name of the aggregator
* @param combiner the {@link CombineFn} to use in the aggregator
* @return an aggregator for the provided name and {@link CombineFn} in this
* context
*/
@Experimental(Kind.AGGREGATOR)
protected abstract Aggregator
createAggregatorInternal(String name, CombineFn combiner);
/**
* Sets up {@link Aggregator}s created by the {@link DoFn} so they are
* usable within this context.
*
* This method should be called by runners before {@link DoFn#startBundle}
* is executed.
*/
@Experimental(Kind.AGGREGATOR)
protected final void setupDelegateAggregators() {
for (DelegatingAggregator aggregator : aggregators.values()) {
setupDelegateAggregator(aggregator);
}
aggregatorsAreFinal = true;
}
private void setupDelegateAggregator(
DelegatingAggregator aggregator) {
Aggregator delegate = createAggregatorInternal(
aggregator.getName(), aggregator.getCombineFn());
aggregator.setDelegate(delegate);
}
}
/**
* Information accessible when running {@link DoFn#processElement}.
*/
public abstract class ProcessContext extends Context {
/**
* Returns the input element to be processed.
*
* The element should be considered immutable. The Dataflow runtime will not mutate the
* element, so it is safe to cache, etc. The element should not be mutated by any of the
* {@link DoFn} methods, because it may be cached elsewhere, retained by the Dataflow runtime,
* or used in other unspecified ways.
*/
public abstract InputT element();
/**
* Returns the value of the side input for the window corresponding to the
* window of the main input element.
*
*
See
* {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn#getSideInputWindow}
* for how this corresponding window is determined.
*
* @throws IllegalArgumentException if this is not a side input
* @see ParDo#withSideInputs
*/
public abstract T sideInput(PCollectionView view);
/**
* Returns the timestamp of the input element.
*
* See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
* for more information.
*/
public abstract Instant timestamp();
/**
* Returns the window into which the input element has been assigned.
*
*
See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
* for more information.
*
* @throws UnsupportedOperationException if this {@link DoFn} does
* not implement {@link RequiresWindowAccess}.
*/
public abstract BoundedWindow window();
/**
* Returns information about the pane within this window into which the
* input element has been assigned.
*
*
Generally all data is in a single, uninteresting pane unless custom
* triggering and/or late data has been explicitly requested.
* See {@link com.google.cloud.dataflow.sdk.transforms.windowing.Window}
* for more information.
*/
public abstract PaneInfo pane();
/**
* Returns the process context to use for implementing windowing.
*/
@Experimental
public abstract WindowingInternals windowingInternals();
}
/**
* Returns the allowed timestamp skew duration, which is the maximum
* duration that timestamps can be shifted backward in
* {@link DoFn.Context#outputWithTimestamp}.
*
* The default value is {@code Duration.ZERO}, in which case
* timestamps can only be shifted forward to future. For infinite
* skew, return {@code Duration.millis(Long.MAX_VALUE)}.
*
*
Note that producing an element whose timestamp is less than the
* current timestamp may result in late data, i.e. returning a non-zero
* value here does not impact watermark calculations used for firing
* windows.
*
* @deprecated does not interact well with the watermark.
*/
@Deprecated
public Duration getAllowedTimestampSkew() {
return Duration.ZERO;
}
/**
* Interface for signaling that a {@link DoFn} needs to access the window the
* element is being processed in, via {@link DoFn.ProcessContext#window}.
*/
@Experimental
public interface RequiresWindowAccess {}
public DoFn() {
this(new HashMap>());
}
DoFn(Map> aggregators) {
this.aggregators = aggregators;
}
/////////////////////////////////////////////////////////////////////////////
private final Map> aggregators;
/**
* Protects aggregators from being created after initialization.
*/
private boolean aggregatorsAreFinal;
/**
* Prepares this {@code DoFn} instance for processing a batch of elements.
*
* By default, does nothing.
*/
public void startBundle(Context c) throws Exception {
}
/**
* Processes one input element.
*
*
The current element of the input {@code PCollection} is returned by
* {@link ProcessContext#element() c.element()}. It should be considered immutable. The Dataflow
* runtime will not mutate the element, so it is safe to cache, etc. The element should not be
* mutated by any of the {@link DoFn} methods, because it may be cached elsewhere, retained by the
* Dataflow runtime, or used in other unspecified ways.
*
*
A value is added to the main output {@code PCollection} by {@link ProcessContext#output}.
* Once passed to {@code output} the element should be considered immutable and not be modified in
* any way. It may be cached elsewhere, retained by the Dataflow runtime, or used in other
* unspecified ways.
*
* @see ProcessContext
*/
public abstract void processElement(ProcessContext c) throws Exception;
/**
* Finishes processing this batch of elements.
*
*
By default, does nothing.
*/
public void finishBundle(Context c) throws Exception {
}
/**
* {@inheritDoc}
*
*
By default, does not register any display data. Implementors may override this method
* to provide their own display data.
*/
@Override
public void populateDisplayData(DisplayData.Builder builder) {
}
/////////////////////////////////////////////////////////////////////////////
/**
* Returns a {@link TypeDescriptor} capturing what is known statically
* about the input type of this {@code DoFn} instance's most-derived
* class.
*
*
See {@link #getOutputTypeDescriptor} for more discussion.
*/
protected TypeDescriptor getInputTypeDescriptor() {
return new TypeDescriptor(getClass()) {};
}
/**
* Returns a {@link TypeDescriptor} capturing what is known statically
* about the output type of this {@code DoFn} instance's
* most-derived class.
*
* In the normal case of a concrete {@code DoFn} subclass with
* no generic type parameters of its own (including anonymous inner
* classes), this will be a complete non-generic type, which is good
* for choosing a default output {@code Coder} for the output
* {@code PCollection}.
*/
protected TypeDescriptor getOutputTypeDescriptor() {
return new TypeDescriptor(getClass()) {};
}
/**
* Returns an {@link Aggregator} with aggregation logic specified by the
* {@link CombineFn} argument. The name provided must be unique across
* {@link Aggregator}s created within the DoFn. Aggregators can only be created
* during pipeline construction.
*
* @param name the name of the aggregator
* @param combiner the {@link CombineFn} to use in the aggregator
* @return an aggregator for the provided name and combiner in the scope of
* this DoFn
* @throws NullPointerException if the name or combiner is null
* @throws IllegalArgumentException if the given name collides with another
* aggregator in this scope
* @throws IllegalStateException if called during pipeline processing.
*/
protected final Aggregator
createAggregator(String name, CombineFn combiner) {
checkNotNull(name, "name cannot be null");
checkNotNull(combiner, "combiner cannot be null");
checkArgument(!aggregators.containsKey(name),
"Cannot create aggregator with name %s."
+ " An Aggregator with that name already exists within this scope.",
name);
checkState(!aggregatorsAreFinal, "Cannot create an aggregator during DoFn processing."
+ " Aggregators should be registered during pipeline construction.");
DelegatingAggregator aggregator =
new DelegatingAggregator<>(name, combiner);
aggregators.put(name, aggregator);
return aggregator;
}
/**
* Returns an {@link Aggregator} with the aggregation logic specified by the
* {@link SerializableFunction} argument. The name provided must be unique
* across {@link Aggregator}s created within the DoFn. Aggregators can only be
* created during pipeline construction.
*
* @param name the name of the aggregator
* @param combiner the {@link SerializableFunction} to use in the aggregator
* @return an aggregator for the provided name and combiner in the scope of
* this DoFn
* @throws NullPointerException if the name or combiner is null
* @throws IllegalArgumentException if the given name collides with another
* aggregator in this scope
* @throws IllegalStateException if called during pipeline processing.
*/
protected final Aggregator createAggregator(String name,
SerializableFunction, AggInputT> combiner) {
checkNotNull(combiner, "combiner cannot be null.");
return createAggregator(name, Combine.IterableCombineFn.of(combiner));
}
/**
* Returns the {@link Aggregator Aggregators} created by this {@code DoFn}.
*/
Collection> getAggregators() {
return Collections.>unmodifiableCollection(aggregators.values());
}
/**
* An {@link Aggregator} that delegates calls to addValue to another
* aggregator.
*
* @param the type of input element
* @param the type of output element
*/
static class DelegatingAggregator implements
Aggregator, Serializable {
private final UUID id;
private final String name;
private final CombineFn combineFn;
private Aggregator delegate;
public DelegatingAggregator(String name,
CombineFn combiner) {
this.id = UUID.randomUUID();
this.name = checkNotNull(name, "name cannot be null");
// Safe contravariant cast
@SuppressWarnings("unchecked")
CombineFn specificCombiner =
(CombineFn) checkNotNull(combiner, "combineFn cannot be null");
this.combineFn = specificCombiner;
}
@Override
public void addValue(AggInputT value) {
if (delegate == null) {
throw new IllegalStateException(
"addValue cannot be called on Aggregator outside of the execution of a DoFn.");
} else {
delegate.addValue(value);
}
}
@Override
public String getName() {
return name;
}
@Override
public CombineFn getCombineFn() {
return combineFn;
}
/**
* Sets the current delegate of the Aggregator.
*
* @param delegate the delegate to set in this aggregator
*/
public void setDelegate(Aggregator delegate) {
this.delegate = delegate;
}
@Override
public String toString() {
return MoreObjects.toStringHelper(getClass())
.add("name", name)
.add("combineFn", combineFn)
.toString();
}
@Override
public int hashCode() {
return Objects.hash(id, name, combineFn.getClass());
}
/**
* Indicates whether some other object is "equal to" this one.
*
* {@code DelegatingAggregator} instances are equal if they have the same name, their
* CombineFns are the same class, and they have identical IDs.
*/
@Override
public boolean equals(Object o) {
if (o == this) {
return true;
}
if (o == null) {
return false;
}
if (o instanceof DelegatingAggregator) {
DelegatingAggregator that = (DelegatingAggregator) o;
return Objects.equals(this.id, that.id)
&& Objects.equals(this.name, that.name)
&& Objects.equals(this.combineFn.getClass(), that.combineFn.getClass());
}
return false;
}
}
}