// Source listing: com.google.cloud.dataflow.sdk.util.LateDataDroppingDoFnRunner
// (from the google-cloud-dataflow-java-sdk-all artifact).
/*
* Copyright (C) 2016 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.util;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import org.joda.time.Instant;
/**
* A customized {@link DoFnRunner} that handles late data dropping for
* a {@link KeyedWorkItem} input {@link DoFn}.
*
* It expands windows before checking data lateness.
*
*
{@link KeyedWorkItem KeyedWorkItems} are always in empty windows.
*
* @param key type
* @param input value element type
* @param output value element type
* @param window type
*/
public class LateDataDroppingDoFnRunner
implements DoFnRunner, KV> {
private final DoFnRunner, KV> doFnRunner;
private final LateDataFilter lateDataFilter;
public LateDataDroppingDoFnRunner(
DoFnRunner, KV> doFnRunner,
WindowingStrategy, ?> windowingStrategy,
TimerInternals timerInternals,
Aggregator droppedDueToLateness) {
this.doFnRunner = doFnRunner;
lateDataFilter = new LateDataFilter(windowingStrategy, timerInternals, droppedDueToLateness);
}
@Override
public void startBundle() {
doFnRunner.startBundle();
}
@Override
public void processElement(WindowedValue> elem) {
Iterable> nonLateElements = lateDataFilter.filter(
elem.getValue().key(), elem.getValue().elementsIterable());
KeyedWorkItem keyedWorkItem = KeyedWorkItems.workItem(
elem.getValue().key(), elem.getValue().timersIterable(), nonLateElements);
doFnRunner.processElement(elem.withValue(keyedWorkItem));
}
@Override
public void finishBundle() {
doFnRunner.finishBundle();
}
/**
* It filters late data in a {@link KeyedWorkItem}.
*/
@VisibleForTesting
static class LateDataFilter {
private final WindowingStrategy, ?> windowingStrategy;
private final TimerInternals timerInternals;
private final Aggregator droppedDueToLateness;
public LateDataFilter(
WindowingStrategy, ?> windowingStrategy,
TimerInternals timerInternals,
Aggregator droppedDueToLateness) {
this.windowingStrategy = windowingStrategy;
this.timerInternals = timerInternals;
this.droppedDueToLateness = droppedDueToLateness;
}
/**
* Returns an {@code Iterable>} that only contains
* non-late input elements.
*/
public Iterable> filter(
final K key, Iterable> elements) {
Iterable>> windowsExpandedElements = Iterables.transform(
elements,
new Function, Iterable>>() {
@Override
public Iterable> apply(final WindowedValue input) {
return Iterables.transform(
input.getWindows(),
new Function>() {
@Override
public WindowedValue apply(BoundedWindow window) {
return WindowedValue.of(
input.getValue(), input.getTimestamp(), window, input.getPane());
}
});
}});
Iterable> nonLateElements = Iterables.filter(
Iterables.concat(windowsExpandedElements),
new Predicate>() {
@Override
public boolean apply(WindowedValue input) {
BoundedWindow window = Iterables.getOnlyElement(input.getWindows());
if (canDropDueToExpiredWindow(window)) {
// The element is too late for this window.
droppedDueToLateness.addValue(1L);
WindowTracing.debug(
"ReduceFnRunner.processElement: Dropping element at {} for key:{}; window:{} "
+ "since too far behind inputWatermark:{}; outputWatermark:{}",
input.getTimestamp(), key, window, timerInternals.currentInputWatermarkTime(),
timerInternals.currentOutputWatermarkTime());
return false;
} else {
return true;
}
}
});
return nonLateElements;
}
/** Is {@code window} expired w.r.t. the garbage collection watermark? */
private boolean canDropDueToExpiredWindow(BoundedWindow window) {
Instant inputWM = timerInternals.currentInputWatermarkTime();
return inputWM != null
&& window.maxTimestamp().plus(windowingStrategy.getAllowedLateness()).isBefore(inputWM);
}
}
}