All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.util.LateDataDroppingDoFnRunner Maven / Gradle / Ivy

Go to download

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

There is a newer version: 2.5.0
Show newest version
/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.dataflow.sdk.util;

import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;

import org.joda.time.Instant;

/**
 * A customized {@link DoFnRunner} that handles late data dropping for
 * a {@link KeyedWorkItem} input {@link DoFn}.
 *
 * 

It expands windows before checking data lateness. * *

{@link KeyedWorkItem KeyedWorkItems} are always in empty windows. * * @param key type * @param input value element type * @param output value element type * @param window type */ public class LateDataDroppingDoFnRunner implements DoFnRunner, KV> { private final DoFnRunner, KV> doFnRunner; private final LateDataFilter lateDataFilter; public LateDataDroppingDoFnRunner( DoFnRunner, KV> doFnRunner, WindowingStrategy windowingStrategy, TimerInternals timerInternals, Aggregator droppedDueToLateness) { this.doFnRunner = doFnRunner; lateDataFilter = new LateDataFilter(windowingStrategy, timerInternals, droppedDueToLateness); } @Override public void startBundle() { doFnRunner.startBundle(); } @Override public void processElement(WindowedValue> elem) { Iterable> nonLateElements = lateDataFilter.filter( elem.getValue().key(), elem.getValue().elementsIterable()); KeyedWorkItem keyedWorkItem = KeyedWorkItems.workItem( elem.getValue().key(), elem.getValue().timersIterable(), nonLateElements); doFnRunner.processElement(elem.withValue(keyedWorkItem)); } @Override public void finishBundle() { doFnRunner.finishBundle(); } /** * It filters late data in a {@link KeyedWorkItem}. */ @VisibleForTesting static class LateDataFilter { private final WindowingStrategy windowingStrategy; private final TimerInternals timerInternals; private final Aggregator droppedDueToLateness; public LateDataFilter( WindowingStrategy windowingStrategy, TimerInternals timerInternals, Aggregator droppedDueToLateness) { this.windowingStrategy = windowingStrategy; this.timerInternals = timerInternals; this.droppedDueToLateness = droppedDueToLateness; } /** * Returns an {@code Iterable>} that only contains * non-late input elements. 
*/ public Iterable> filter( final K key, Iterable> elements) { Iterable>> windowsExpandedElements = Iterables.transform( elements, new Function, Iterable>>() { @Override public Iterable> apply(final WindowedValue input) { return Iterables.transform( input.getWindows(), new Function>() { @Override public WindowedValue apply(BoundedWindow window) { return WindowedValue.of( input.getValue(), input.getTimestamp(), window, input.getPane()); } }); }}); Iterable> nonLateElements = Iterables.filter( Iterables.concat(windowsExpandedElements), new Predicate>() { @Override public boolean apply(WindowedValue input) { BoundedWindow window = Iterables.getOnlyElement(input.getWindows()); if (canDropDueToExpiredWindow(window)) { // The element is too late for this window. droppedDueToLateness.addValue(1L); WindowTracing.debug( "ReduceFnRunner.processElement: Dropping element at {} for key:{}; window:{} " + "since too far behind inputWatermark:{}; outputWatermark:{}", input.getTimestamp(), key, window, timerInternals.currentInputWatermarkTime(), timerInternals.currentOutputWatermarkTime()); return false; } else { return true; } } }); return nonLateElements; } /** Is {@code window} expired w.r.t. the garbage collection watermark? */ private boolean canDropDueToExpiredWindow(BoundedWindow window) { Instant inputWM = timerInternals.currentInputWatermarkTime(); return inputWM != null && window.maxTimestamp().plus(windowingStrategy.getAllowedLateness()).isBefore(inputWM); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy