// Source listing: com.google.cloud.dataflow.sdk.transforms.View
// (Maven artifact google-cloud-dataflow-java-sdk-all).
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.transforms;
import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.util.PCollectionViews;
import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import java.util.List;
import java.util.Map;
/**
* Transforms for creating {@link PCollectionView PCollectionViews} from
* {@link PCollection PCollections} (to read them as side inputs).
*
* <p>While a {@link PCollection PCollection&lt;ElemT&gt;} has many values of type {@code ElemT} per
* window, a {@link PCollectionView PCollectionView&lt;ViewT&gt;} has a single value of type
* {@code ViewT} for each window. It can be thought of as a mapping from windows to values of
* type {@code ViewT}. The transforms here represent ways of converting the {@code ElemT} values
* in a window into a {@code ViewT} for that window.
*
*
* <p>When a {@link ParDo} transform is processing a main input
* element in a window {@code w} and a {@link PCollectionView} is read via
* {@link DoFn.ProcessContext#sideInput}, the value of the view for {@code w} is
* returned.
*
*
* <p>The SDK supports viewing a {@link PCollection}, per window, as a single value,
* a {@link List}, an {@link Iterable}, a {@link Map}, or a multimap (iterable-valued {@link Map}).
*
* <p>For a {@link PCollection} that contains a single value of type {@code T}
* per window, such as the output of {@link Combine#globally},
* use {@link View#asSingleton()} to prepare it for use as a side input:
*
*
* <pre>{@code
* PCollectionView<T> output = someOtherPCollection
*     .apply(Combine.globally(...))
*     .apply(View.<T>asSingleton());
* }</pre>
*
*
* <p>For a small {@link PCollection} with windows that can fit entirely in memory,
* use {@link View#asList()} to prepare it for use as a {@code List}.
* When read as a side input, the entire list for a window will be cached in memory.
*
* <pre>{@code
* PCollectionView<List<T>> output =
*     smallPCollection.apply(View.<T>asList());
* }</pre>
*
*
* <p>If a {@link PCollection} of {@code KV<K, V>} is known to
* have a single value per window for each key, then use {@link View#asMap()}
* to view it as a {@code Map<K, V>}:
*
* <pre>{@code
* PCollectionView<Map<K, V>> output =
*     somePCollection.apply(View.<K, V>asMap());
* }</pre>
*
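* <p>Otherwise, to access a {@link PCollection} of {@code KV<K, V>} as a
* {@code Map<K, Iterable<V>>} side input, use {@link View#asMultimap()}:
*
* <pre>{@code
* PCollectionView<Map<K, Iterable<V>>> output =
*     somePCollection.apply(View.<K, V>asMultimap());
* }</pre>
*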
* <p>To iterate over an entire window of a {@link PCollection} via
* side input, use {@link View#asIterable()}:
*
* <pre>{@code
* PCollectionView<Iterable<T>> output =
*     somePCollection.apply(View.<T>asIterable());
* }</pre>
*
*
*
* Both {@link View#asMultimap()} and {@link View#asMap()} are useful
* for implementing lookup based "joins" with the main input, when the
* side input is small enough to fit into memory.
*
*
* <p>For example, if you represent a page on a website via some {@code Page} object and
* have some type {@code UrlVisits} logging that a URL was visited, you could convert these
* to more fully structured {@code PageVisit} objects using a side input, something like the
* following:
*
*
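* <pre>{@code
* PCollection<Page> pages = ... // pages fit into memory
* PCollection<UrlVisits> urlVisits = ... // very large collection
* final PCollectionView<Map<URL, Page>> urlToPageView = pages
*     .apply(WithKeys.of( ... )) // extract the URL from the page
*     .apply(View.<URL, Page>asMap());
*
* PCollection<PageVisit> pageVisits = urlVisits
*     .apply(ParDo.withSideInputs(urlToPageView)
*         .of(new DoFn<UrlVisits, PageVisit>() {
*             public void processElement(ProcessContext context) {
*               UrlVisits urlVisit = context.element();
*               Map<URL, Page> urlToPage = context.sideInput(urlToPageView);
*               Page page = urlToPage.get(urlVisit.getUrl());
*               context.output(new PageVisit(page, urlVisit.getVisitData()));
*             }
*         }));
* }</pre>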
*
* See {@link ParDo#withSideInputs} for details on how to access
* this variable inside a {@link ParDo} over another {@link PCollection}.
*/
public class View {
/** Do not instantiate: this class only hosts static factories and nested transform types. */
private View() {
  // Intentionally empty.
}
/**
 * Returns a {@link AsSingleton} transform that takes a
 * {@link PCollection} with a single value per window
 * as input and produces a {@link PCollectionView} that returns
 * the value in the main input window when read as a side input.
 *
 * <pre>{@code
 * PCollection<InputT> input = ...
 * CombineFn<InputT, OutputT> yourCombineFn = ...
 * PCollectionView<OutputT> output = input
 *     .apply(Combine.globally(yourCombineFn))
 *     .apply(View.<OutputT>asSingleton());
 * }</pre>
 *
 * <p>If the input {@link PCollection} is empty,
 * throws {@link java.util.NoSuchElementException} in the consuming
 * {@link DoFn}.
 *
 * <p>If the input {@link PCollection} contains more than one
 * element, throws {@link IllegalArgumentException} in the
 * consuming {@link DoFn}.
 *
 * @param <T> the type of the single element viewed per window
 * @return a new {@link AsSingleton} transform that has no default value
 */
public static <T> AsSingleton<T> asSingleton() {
  return new AsSingleton<>();
}
/**
 * Returns a {@link View.AsList} transform that takes a {@link PCollection} and returns a
 * {@link PCollectionView} mapping each window to a {@link List} containing
 * all of the elements in the window.
 *
 * <p>The resulting list is required to fit in memory.
 *
 * @param <T> the type of the elements of the input {@link PCollection}
 * @return a new {@link View.AsList} transform
 */
public static <T> AsList<T> asList() {
  return new AsList<>();
}
/**
 * Returns a {@link View.AsIterable} transform that takes a {@link PCollection} as input
 * and produces a {@link PCollectionView} mapping each window to an
 * {@link Iterable} of the values in that window.
 *
 * <p>The values of the {@link Iterable} for a window are not required to fit in memory,
 * but they may also not be effectively cached. If it is known that every window fits in memory,
 * and stronger caching is desired, use {@link #asList}.
 *
 * @param <T> the type of the elements of the input {@link PCollection}
 * @return a new {@link View.AsIterable} transform
 */
public static <T> AsIterable<T> asIterable() {
  return new AsIterable<>();
}
/**
 * Returns a {@link View.AsMap} transform that takes a
 * {@link PCollection PCollection&lt;KV&lt;K, V&gt;&gt;} as
 * input and produces a {@link PCollectionView} mapping each window to
 * a {@link Map Map&lt;K, V&gt;}. It is required that each key of the input be
 * associated with a single value, per window. If this is not the case, precede this
 * view with {@code Combine.perKey}, as in the example below, or alternatively
 * use {@link View#asMultimap()}.
 *
 * <pre>{@code
 * PCollection<KV<K, V>> input = ...
 * CombineFn<V, OutputT> yourCombineFn = ...
 * PCollectionView<Map<K, OutputT>> output = input
 *     .apply(Combine.perKey(yourCombineFn))
 *     .apply(View.<K, OutputT>asMap());
 * }</pre>
 *
 * <p>Currently, the resulting map is required to fit into memory.
 *
 * @param <K> the type of the keys of the input key/value pairs
 * @param <V> the type of the values of the input key/value pairs
 * @return a new {@link View.AsMap} transform
 */
public static <K, V> AsMap<K, V> asMap() {
  return new AsMap<>();
}
/**
 * Returns a {@link View.AsMultimap} transform that takes a
 * {@link PCollection PCollection&lt;KV&lt;K, V&gt;&gt;}
 * as input and produces a {@link PCollectionView} mapping
 * each window to its contents as a {@link Map Map&lt;K, Iterable&lt;V&gt;&gt;}
 * for use as a side input.
 * In contrast to {@link View#asMap()}, it is not required that the keys in the
 * input collection be unique.
 *
 * <pre>{@code
 * PCollection<KV<K, V>> input = ... // maybe more than one occurrence of some keys
 * PCollectionView<Map<K, Iterable<V>>> output = input.apply(View.<K, V>asMultimap());
 * }</pre>
 *
 * <p>Currently, the resulting map is required to fit into memory.
 *
 * @param <K> the type of the keys of the input key/value pairs
 * @param <V> the type of the values of the input key/value pairs
 * @return a new {@link View.AsMultimap} transform
 */
public static <K, V> AsMultimap<K, V> asMultimap() {
  return new AsMultimap<>();
}
/**
* Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner} may
* override its behavior.
*
* See {@link View#asList()}.
*/
public static class AsList extends PTransform, PCollectionView>> {
private AsList() { }
@Override
public void validate(PCollection input) {
try {
GroupByKey.applicableTo(input);
} catch (IllegalStateException e) {
throw new IllegalStateException("Unable to create a side-input view from input", e);
}
}
@Override
public PCollectionView> apply(PCollection input) {
return input.apply(CreatePCollectionView.>of(PCollectionViews.listView(
input.getPipeline(), input.getWindowingStrategy(), input.getCoder())));
}
}
/**
* Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner} may
* override its behavior.
*
* See {@link View#asIterable()}.
*/
public static class AsIterable
extends PTransform, PCollectionView>> {
private AsIterable() { }
@Override
public void validate(PCollection input) {
try {
GroupByKey.applicableTo(input);
} catch (IllegalStateException e) {
throw new IllegalStateException("Unable to create a side-input view from input", e);
}
}
@Override
public PCollectionView> apply(PCollection input) {
return input.apply(CreatePCollectionView.>of(PCollectionViews.iterableView(
input.getPipeline(), input.getWindowingStrategy(), input.getCoder())));
}
}
/**
* Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner} may
* override its behavior.
*
* See {@link View#asSingleton()}.
*/
public static class AsSingleton extends PTransform, PCollectionView> {
private final T defaultValue;
private final boolean hasDefault;
private AsSingleton() {
this.defaultValue = null;
this.hasDefault = false;
}
private AsSingleton(T defaultValue) {
this.defaultValue = defaultValue;
this.hasDefault = true;
}
/**
* Returns whether this transform has a default value.
*/
public boolean hasDefaultValue() {
return hasDefault;
}
/**
* Returns the default value of this transform, or null if there isn't one.
*/
public T defaultValue() {
return defaultValue;
}
/**
* Default value to return for windows with no value in them.
*/
public AsSingleton withDefaultValue(T defaultValue) {
return new AsSingleton<>(defaultValue);
}
@Override
public void validate(PCollection input) {
try {
GroupByKey.applicableTo(input);
} catch (IllegalStateException e) {
throw new IllegalStateException("Unable to create a side-input view from input", e);
}
}
@Override
public PCollectionView apply(PCollection input) {
return input.apply(CreatePCollectionView.of(PCollectionViews.singletonView(
input.getPipeline(),
input.getWindowingStrategy(),
hasDefault,
defaultValue,
input.getCoder())));
}
}
/**
* Not intended for direct use by pipeline authors; public only so a {@link PipelineRunner} may
* override its behavior.
*
* See {@link View#asMultimap()}.
*/
public static class AsMultimap
extends PTransform>, PCollectionView