// Source listing: com.google.cloud.dataflow.sdk.util.Reshuffle
// Artifact: google-cloud-dataflow-java-sdk-all
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.util;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import org.joda.time.Duration;
/**
* A {@link PTransform} that returns a {@link PCollection} equivalent to its input but operationally
* provides some of the side effects of a {@link GroupByKey}, in particular preventing fusion of
* the surrounding transforms, checkpointing and deduplication by id (see
* {@link ValueWithRecordId}).
*
* <p>Performs a {@link GroupByKey} so that the data is key-partitioned. Configures the
* {@link WindowingStrategy} so that no data is dropped, but doesn't affect the need for
* the user to specify allowed lateness and accumulation mode before a user-inserted GroupByKey.
*
* @param <K> The type of key being reshuffled on.
* @param <V> The type of value being reshuffled.
*/
public class Reshuffle extends PTransform>, PCollection>> {
private Reshuffle() {
}
public static Reshuffle of() {
return new Reshuffle();
}
@Override
public PCollection> apply(PCollection> input) {
WindowingStrategy originalStrategy = input.getWindowingStrategy();
// If the input has already had its windows merged, then the GBK that performed the merge
// will have set originalStrategy.getWindowFn() to InvalidWindows, causing the GBK contained
// here to fail. Instead, we install a valid WindowFn that leaves all windows unchanged.
Window.Bound> rewindow =
Window.>into(
new IdentityWindowFn<>(
originalStrategy.getWindowFn().windowCoder(),
originalStrategy.getWindowFn().assignsToSingleWindow()))
.triggering(new ReshuffleTrigger<>())
.discardingFiredPanes()
.withAllowedLateness(Duration.millis(BoundedWindow.TIMESTAMP_MAX_VALUE.getMillis()));
return input.apply(rewindow)
.apply(GroupByKey.create())
// Set the windowing strategy directly, so that it doesn't get counted as the user having
// set allowed lateness.
.setWindowingStrategyInternal(originalStrategy)
.apply(ParDo.named("ExpandIterable").of(
new DoFn>, KV>() {
@Override
public void processElement(ProcessContext c) {
K key = c.element().getKey();
for (V value : c.element().getValue()) {
c.output(KV.of(key, value));
}
}
}));
}
}