All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark Maven / Gradle / Ivy

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms.windowing;

import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.transforms.windowing.Trigger.OnceTrigger;
import com.google.cloud.dataflow.sdk.util.ReduceFn.MergingStateContext;
import com.google.cloud.dataflow.sdk.util.ReduceFn.StateContext;
import com.google.cloud.dataflow.sdk.util.TimeDomain;
import com.google.cloud.dataflow.sdk.util.state.CombiningValueState;

import org.joda.time.Instant;

import java.util.List;
import java.util.Objects;

/**
 * 

{@code AfterWatermark} triggers fire based on progress of the system watermark. This time is a * lower-bound, sometimes heuristically established, on event times that have been fully processed * by the pipeline. * *

For sources that provide non-heuristic watermarks (e.g. * {@link com.google.cloud.dataflow.sdk.io.PubsubIO} when using arrival times as event times), the * watermark is a strict guarantee that no data with an event time earlier than * that watermark will ever be observed in the pipeline. In this case, it's safe to assume that any * pane triggered by an {@code AfterWatermark} trigger with a reference point at or beyond the end * of the window will be the last pane ever for that window. * *

For sources that provide heuristic watermarks (e.g. * {@link com.google.cloud.dataflow.sdk.io.PubsubIO} when using user-supplied event times), the * watermark itself becomes an estimate that no data with an event time earlier than that * watermark (i.e. "late data) will ever be observed in the pipeline. These heuristics can * often be quite accurate, but the chance of seeing late data for any given window is non-zero. * Thus, if absolute correctness over time is important to your use case, you may want to consider * using a trigger that accounts for late data. The default trigger, * {@code Repeatedly.forever(AfterWatermark.pastEndOfWindow())}, which fires * once when the watermark passes the end of the window and then immediately therafter when any * late data arrive, is one such example. * *

The watermark is the clock that defines {@link TimeDomain#EVENT_TIME}. * * @param {@link BoundedWindow} subclass used to represent the windows used. */ @Experimental(Experimental.Kind.TRIGGER) public class AfterWatermark { private static final long serialVersionUID = 0L; // Static factory class. private AfterWatermark() {} /** * Creates a trigger that fires when the watermark passes timestamp of the first element added to * the pane. */ static TimeTrigger pastFirstElementInPane() { return new FromFirstElementInPane(TimeTrigger.IDENTITY); } /** * Creates a trigger that fires when the watermark passes the end of the window. */ public static OnceTrigger pastEndOfWindow() { return new FromEndOfWindow(); } /** * A watermark trigger targeted relative to the event time of the first element in the pane. */ private static class FromFirstElementInPane extends TimeTrigger { private static final long serialVersionUID = 0L; private FromFirstElementInPane( List> delayFunction) { super(delayFunction); } @Override public TriggerResult onElement(OnElementContext c) throws Exception { CombiningValueState delayUntilState = c.state().access(DELAYED_UNTIL_TAG); Instant delayUntil = delayUntilState.get().read(); if (delayUntil == null) { delayUntil = computeTargetTimestamp(c.eventTimestamp()); c.timers().setTimer(delayUntil, TimeDomain.EVENT_TIME); delayUntilState.add(delayUntil); } return TriggerResult.CONTINUE; } @Override public MergeResult onMerge(OnMergeContext c) throws Exception { // If the watermark time timer has fired in any of the windows being merged, it would have // fired at the same point if it had been added to the merged window. So, we just record it as // finished. if (c.trigger().finishedInAnyMergingWindow()) { return MergeResult.ALREADY_FINISHED; } // To have gotten here, we must not have fired in any of the oldWindows. 
Determine the event // timestamp from the minimum (we could also just pick one, or try to record the arrival times // of this first element in each pane). // Determine the earliest point across all the windows, and delay to that. CombiningValueState mergingDelays = c.state().accessAcrossMergingWindows(DELAYED_UNTIL_TAG); Instant earliestTimer = mergingDelays.get().read(); if (earliestTimer != null) { mergingDelays.clear(); mergingDelays.add(earliestTimer); c.timers().setTimer(earliestTimer, TimeDomain.EVENT_TIME); } return MergeResult.CONTINUE; } @Override public TriggerResult onTimer(OnTimerContext c) throws Exception { if (c.timeDomain() != TimeDomain.EVENT_TIME) { return TriggerResult.CONTINUE; } Instant delayedUntil = c.state().access(DELAYED_UNTIL_TAG).get().read(); if (delayedUntil == null || delayedUntil.isAfter(c.timestamp())) { return TriggerResult.CONTINUE; } return TriggerResult.FIRE_AND_FINISH; } @Override public void prefetchOnElement(StateContext state) { state.access(DELAYED_UNTIL_TAG).get(); } @Override public void prefetchOnMerge(MergingStateContext state) { state.accessAcrossMergingWindows(DELAYED_UNTIL_TAG).get(); } @Override public void prefetchOnTimer(StateContext state) { state.access(DELAYED_UNTIL_TAG).get(); } @Override public void clear(TriggerContext c) throws Exception { CombiningValueState delayed = c.state().access(DELAYED_UNTIL_TAG); Instant timestamp = delayed.get().read(); delayed.clear(); if (timestamp != null) { c.timers().deleteTimer(timestamp, TimeDomain.EVENT_TIME); } } @Override public Instant getWatermarkThatGuaranteesFiring(W window) { return computeTargetTimestamp(window.maxTimestamp()); } @Override protected FromFirstElementInPane newWith( List> transforms) { return new FromFirstElementInPane(transforms); } @Override public OnceTrigger getContinuationTrigger(List> continuationTriggers) { return this; } @Override public String toString() { return "AfterWatermark.pastFirstElementInPane(" + timestampMappers + ")"; } @Override 
public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof FromFirstElementInPane)) { return false; } FromFirstElementInPane that = (FromFirstElementInPane) obj; return Objects.equals(this.timestampMappers, that.timestampMappers); } @Override public int hashCode() { return Objects.hashCode(timestampMappers); } } /** * A watermark trigger targeted relative to the end of the window. */ private static class FromEndOfWindow extends OnceTrigger { private static final long serialVersionUID = 0L; private FromEndOfWindow() { super(null); } @Override public TriggerResult onElement(OnElementContext c) throws Exception { c.timers().setTimer(c.window().maxTimestamp(), TimeDomain.EVENT_TIME); return TriggerResult.CONTINUE; } @Override public MergeResult onMerge(OnMergeContext c) throws Exception { // If the watermark was past the end of a window that is past the end of the new window, // then the watermark must also be past the end of this window. What's more, we've already // fired some elements for that trigger firing, so we report FINISHED (without firing). for (W finishedWindow : c.trigger().getFinishedMergingWindows()) { if (finishedWindow.maxTimestamp().isAfter(c.window().maxTimestamp())) { return MergeResult.ALREADY_FINISHED; } } // Otherwise, set a timer for this window, and return. 
c.timers().setTimer(c.window().maxTimestamp(), TimeDomain.EVENT_TIME); return MergeResult.CONTINUE; } @Override public TriggerResult onTimer(OnTimerContext c) throws Exception { if (c.timeDomain() != TimeDomain.EVENT_TIME || c.timestamp().isBefore(c.window().maxTimestamp())) { return TriggerResult.CONTINUE; } else { return TriggerResult.FIRE_AND_FINISH; } } @Override public void clear(TriggerContext c) throws Exception { c.timers().deleteTimer(c.window().maxTimestamp(), TimeDomain.EVENT_TIME); } @Override public Instant getWatermarkThatGuaranteesFiring(W window) { return window.maxTimestamp(); } @Override public FromEndOfWindow getContinuationTrigger(List> continuationTriggers) { return this; } @Override public String toString() { return "AfterWatermark.pastEndOfWindow()"; } @Override public boolean equals(Object obj) { return obj instanceof FromEndOfWindow; } @Override public int hashCode() { return Objects.hash(getClass()); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy