// org.apache.flink.streaming.api.operators.co.IntervalJoinOperator Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.operators.co;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerConfigSnapshot;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerUtil;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility;
import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot;
import org.apache.flink.api.common.typeutils.base.ListSerializer;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.common.typeutils.base.StringSerializer;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataOutputView;
import org.apache.flink.runtime.state.StateInitializationContext;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator;
import org.apache.flink.streaming.api.operators.InternalTimer;
import org.apache.flink.streaming.api.operators.InternalTimerService;
import org.apache.flink.streaming.api.operators.TimestampedCollector;
import org.apache.flink.streaming.api.operators.Triggerable;
import org.apache.flink.streaming.api.operators.TwoInputStreamOperator;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
import org.apache.flink.util.Collector;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.OutputTag;
import org.apache.flink.util.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
* An {@link TwoInputStreamOperator operator} to execute time-bounded stream inner joins.
*
* By using a configurable lower and upper bound this operator will emit exactly those pairs
* (T1, T2) where t2.ts ∈ [T1.ts + lowerBound, T1.ts + upperBound]. Both the lower and the
* upper bound can be configured to be either inclusive or exclusive.
*
*
As soon as elements are joined they are passed to a user-defined {@link ProcessJoinFunction}.
*
*
The basic idea of this implementation is as follows: Whenever we receive an element at
* {@link #processElement1(StreamRecord)} (a.k.a. the left side), we add it to the left buffer.
* We then check the right buffer to see whether there are any elements that can be joined. If
* there are, they are joined and passed to the aforementioned function. The same happens the
* other way around when receiving an element on the right side.
*
*
Whenever a pair of elements is emitted it will be assigned the max timestamp of either of
* the elements.
*
*
In order to avoid the element buffers to grow indefinitely a cleanup timer is registered
* per element. This timer indicates when an element is not considered for joining anymore and can
* be removed from the state.
*
* @param The type of the key based on which we join elements.
* @param The type of the elements in the left stream.
* @param The type of the elements in the right stream.
* @param The output type created by the user-defined function.
*/
@Internal
public class IntervalJoinOperator
extends AbstractUdfStreamOperator>
implements TwoInputStreamOperator, Triggerable {
private static final long serialVersionUID = -5380774605111543454L;
private static final Logger logger = LoggerFactory.getLogger(IntervalJoinOperator.class);
private static final String LEFT_BUFFER = "LEFT_BUFFER";
private static final String RIGHT_BUFFER = "RIGHT_BUFFER";
private static final String CLEANUP_TIMER_NAME = "CLEANUP_TIMER";
private static final String CLEANUP_NAMESPACE_LEFT = "CLEANUP_LEFT";
private static final String CLEANUP_NAMESPACE_RIGHT = "CLEANUP_RIGHT";
private final long lowerBound;
private final long upperBound;
private final TypeSerializer leftTypeSerializer;
private final TypeSerializer rightTypeSerializer;
private transient MapState>> leftBuffer;
private transient MapState>> rightBuffer;
private transient TimestampedCollector collector;
private transient ContextImpl context;
private transient InternalTimerService internalTimerService;
/**
* Creates a new IntervalJoinOperator.
*
* @param lowerBound The lower bound for evaluating if elements should be joined
* @param upperBound The upper bound for evaluating if elements should be joined
* @param lowerBoundInclusive Whether or not to include elements where the timestamp matches
* the lower bound
* @param upperBoundInclusive Whether or not to include elements where the timestamp matches
* the upper bound
* @param udf A user-defined {@link ProcessJoinFunction} that gets called
* whenever two elements of T1 and T2 are joined
*/
public IntervalJoinOperator(
long lowerBound,
long upperBound,
boolean lowerBoundInclusive,
boolean upperBoundInclusive,
TypeSerializer leftTypeSerializer,
TypeSerializer rightTypeSerializer,
ProcessJoinFunction udf) {
super(Preconditions.checkNotNull(udf));
Preconditions.checkArgument(lowerBound <= upperBound,
"lowerBound <= upperBound must be fulfilled");
// Move buffer by +1 / -1 depending on inclusiveness in order not needing
// to check for inclusiveness later on
this.lowerBound = (lowerBoundInclusive) ? lowerBound : lowerBound + 1L;
this.upperBound = (upperBoundInclusive) ? upperBound : upperBound - 1L;
this.leftTypeSerializer = Preconditions.checkNotNull(leftTypeSerializer);
this.rightTypeSerializer = Preconditions.checkNotNull(rightTypeSerializer);
}
@Override
public void open() throws Exception {
super.open();
collector = new TimestampedCollector<>(output);
context = new ContextImpl(userFunction);
internalTimerService =
getInternalTimerService(CLEANUP_TIMER_NAME, StringSerializer.INSTANCE, this);
}
@Override
public void initializeState(StateInitializationContext context) throws Exception {
super.initializeState(context);
this.leftBuffer = context.getKeyedStateStore().getMapState(new MapStateDescriptor<>(
LEFT_BUFFER,
LongSerializer.INSTANCE,
new ListSerializer<>(new BufferEntrySerializer<>(leftTypeSerializer))
));
this.rightBuffer = context.getKeyedStateStore().getMapState(new MapStateDescriptor<>(
RIGHT_BUFFER,
LongSerializer.INSTANCE,
new ListSerializer<>(new BufferEntrySerializer<>(rightTypeSerializer))
));
}
/**
* Process a {@link StreamRecord} from the left stream. Whenever an {@link StreamRecord}
* arrives at the left stream, it will get added to the left buffer. Possible join candidates
* for that element will be looked up from the right buffer and if the pair lies within the
* user defined boundaries, it gets passed to the {@link ProcessJoinFunction}.
*
* @param record An incoming record to be joined
* @throws Exception Can throw an Exception during state access
*/
@Override
public void processElement1(StreamRecord record) throws Exception {
processElement(record, leftBuffer, rightBuffer, lowerBound, upperBound, true);
}
/**
* Process a {@link StreamRecord} from the right stream. Whenever a {@link StreamRecord}
* arrives at the right stream, it will get added to the right buffer. Possible join candidates
* for that element will be looked up from the left buffer and if the pair lies within the user
* defined boundaries, it gets passed to the {@link ProcessJoinFunction}.
*
* @param record An incoming record to be joined
* @throws Exception Can throw an exception during state access
*/
@Override
public void processElement2(StreamRecord record) throws Exception {
processElement(record, rightBuffer, leftBuffer, -upperBound, -lowerBound, false);
}
@SuppressWarnings("unchecked")
private void processElement(
final StreamRecord record,
final MapState>> ourBuffer,
final MapState>> otherBuffer,
final long relativeLowerBound,
final long relativeUpperBound,
final boolean isLeft) throws Exception {
final THIS ourValue = record.getValue();
final long ourTimestamp = record.getTimestamp();
if (ourTimestamp == Long.MIN_VALUE) {
throw new FlinkException("Long.MIN_VALUE timestamp: Elements used in " +
"interval stream joins need to have timestamps meaningful timestamps.");
}
if (isLate(ourTimestamp)) {
return;
}
addToBuffer(ourBuffer, ourValue, ourTimestamp);
for (Map.Entry>> bucket: otherBuffer.entries()) {
final long timestamp = bucket.getKey();
if (timestamp < ourTimestamp + relativeLowerBound ||
timestamp > ourTimestamp + relativeUpperBound) {
continue;
}
for (BufferEntry entry: bucket.getValue()) {
if (isLeft) {
collect((T1) ourValue, (T2) entry.element, ourTimestamp, timestamp);
} else {
collect((T1) entry.element, (T2) ourValue, timestamp, ourTimestamp);
}
}
}
long cleanupTime = (relativeUpperBound > 0L) ? ourTimestamp + relativeUpperBound : ourTimestamp;
if (isLeft) {
internalTimerService.registerEventTimeTimer(CLEANUP_NAMESPACE_LEFT, cleanupTime);
} else {
internalTimerService.registerEventTimeTimer(CLEANUP_NAMESPACE_RIGHT, cleanupTime);
}
}
private boolean isLate(long timestamp) {
long currentWatermark = internalTimerService.currentWatermark();
return currentWatermark != Long.MIN_VALUE && timestamp < currentWatermark;
}
private void collect(T1 left, T2 right, long leftTimestamp, long rightTimestamp) throws Exception {
final long resultTimestamp = Math.max(leftTimestamp, rightTimestamp);
collector.setAbsoluteTimestamp(resultTimestamp);
context.updateTimestamps(leftTimestamp, rightTimestamp, resultTimestamp);
userFunction.processElement(left, right, context, collector);
}
private static void addToBuffer(
final MapState>> buffer,
final T value,
final long timestamp) throws Exception {
List> elemsInBucket = buffer.get(timestamp);
if (elemsInBucket == null) {
elemsInBucket = new ArrayList<>();
}
elemsInBucket.add(new BufferEntry<>(value, false));
buffer.put(timestamp, elemsInBucket);
}
@Override
public void onEventTime(InternalTimer timer) throws Exception {
long timerTimestamp = timer.getTimestamp();
String namespace = timer.getNamespace();
logger.trace("onEventTime @ {}", timerTimestamp);
switch (namespace) {
case CLEANUP_NAMESPACE_LEFT: {
long timestamp = (upperBound <= 0L) ? timerTimestamp : timerTimestamp - upperBound;
logger.trace("Removing from left buffer @ {}", timestamp);
leftBuffer.remove(timestamp);
break;
}
case CLEANUP_NAMESPACE_RIGHT: {
long timestamp = (lowerBound <= 0L) ? timerTimestamp + lowerBound : timerTimestamp;
logger.trace("Removing from right buffer @ {}", timestamp);
rightBuffer.remove(timestamp);
break;
}
default:
throw new RuntimeException("Invalid namespace " + namespace);
}
}
@Override
public void onProcessingTime(InternalTimer timer) throws Exception {
// do nothing.
}
/**
* The context that is available during an invocation of
* {@link ProcessJoinFunction#processElement(Object, Object, ProcessJoinFunction.Context, Collector)}.
*
* It gives access to the timestamps of the left element in the joined pair, the right one, and that of
* the joined pair. In addition, this context allows to emit elements on a side output.
*/
private final class ContextImpl extends ProcessJoinFunction.Context {
private long resultTimestamp = Long.MIN_VALUE;
private long leftTimestamp = Long.MIN_VALUE;
private long rightTimestamp = Long.MIN_VALUE;
private ContextImpl(ProcessJoinFunction func) {
func.super();
}
private void updateTimestamps(long left, long right, long result) {
this.leftTimestamp = left;
this.rightTimestamp = right;
this.resultTimestamp = result;
}
@Override
public long getLeftTimestamp() {
return leftTimestamp;
}
@Override
public long getRightTimestamp() {
return rightTimestamp;
}
@Override
public long getTimestamp() {
return resultTimestamp;
}
@Override
public void output(OutputTag outputTag, X value) {
Preconditions.checkArgument(outputTag != null, "OutputTag must not be null");
output.collect(outputTag, new StreamRecord<>(value, getTimestamp()));
}
}
/**
* A container for elements put in the left/write buffer.
* This will contain the element itself along with a flag indicating
* if it has been joined or not.
*/
@Internal
@VisibleForTesting
static class BufferEntry {
private final T element;
private final boolean hasBeenJoined;
BufferEntry(T element, boolean hasBeenJoined) {
this.element = element;
this.hasBeenJoined = hasBeenJoined;
}
}
/**
* A {@link TypeSerializer serializer} for the {@link BufferEntry}.
*/
@Internal
@VisibleForTesting
static class BufferEntrySerializer extends TypeSerializer> {
private static final long serialVersionUID = -20197698803836236L;
private final TypeSerializer elementSerializer;
BufferEntrySerializer(TypeSerializer elementSerializer) {
this.elementSerializer = Preconditions.checkNotNull(elementSerializer);
}
@Override
public boolean isImmutableType() {
return true;
}
@Override
public TypeSerializer> duplicate() {
return new BufferEntrySerializer<>(elementSerializer.duplicate());
}
@Override
public BufferEntry createInstance() {
return null;
}
@Override
public BufferEntry copy(BufferEntry from) {
return new BufferEntry<>(from.element, from.hasBeenJoined);
}
@Override
public BufferEntry copy(BufferEntry from, BufferEntry reuse) {
return copy(from);
}
@Override
public int getLength() {
return -1;
}
@Override
public void serialize(BufferEntry record, DataOutputView target) throws IOException {
target.writeBoolean(record.hasBeenJoined);
elementSerializer.serialize(record.element, target);
}
@Override
public BufferEntry deserialize(DataInputView source) throws IOException {
boolean hasBeenJoined = source.readBoolean();
T element = elementSerializer.deserialize(source);
return new BufferEntry<>(element, hasBeenJoined);
}
@Override
public BufferEntry deserialize(BufferEntry reuse, DataInputView source) throws IOException {
return deserialize(source);
}
@Override
public void copy(DataInputView source, DataOutputView target) throws IOException {
target.writeBoolean(source.readBoolean());
elementSerializer.copy(source, target);
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
BufferEntrySerializer that = (BufferEntrySerializer) o;
return Objects.equals(elementSerializer, that.elementSerializer);
}
@Override
public int hashCode() {
return Objects.hash(elementSerializer);
}
@Override
public TypeSerializerSnapshot> snapshotConfiguration() {
return new BufferEntrySerializerSnapshot<>(this);
}
}
/**
* The {@link CompositeTypeSerializerConfigSnapshot configuration} of our serializer.
*
* @deprecated this snapshot class is no longer in use, and is maintained only for backwards compatibility.
* It is fully replaced by {@link BufferEntrySerializerSnapshot}.
*/
@Deprecated
public static class BufferSerializerConfigSnapshot extends CompositeTypeSerializerConfigSnapshot> {
private static final int VERSION = 1;
public BufferSerializerConfigSnapshot() {
}
public BufferSerializerConfigSnapshot(final TypeSerializer userTypeSerializer) {
super(userTypeSerializer);
}
@Override
public int getVersion() {
return VERSION;
}
@Override
public TypeSerializerSchemaCompatibility> resolveSchemaCompatibility(TypeSerializer> newSerializer) {
return CompositeTypeSerializerUtil.delegateCompatibilityCheckToNewSnapshot(
newSerializer,
new BufferEntrySerializerSnapshot<>(),
getSingleNestedSerializerAndConfig().f1);
}
}
/**
* A {@link TypeSerializerSnapshot} for {@link BufferEntrySerializer}.
*/
public static final class BufferEntrySerializerSnapshot
extends CompositeTypeSerializerSnapshot, BufferEntrySerializer> {
private static final int VERSION = 2;
@SuppressWarnings({"unused", "WeakerAccess"})
public BufferEntrySerializerSnapshot() {
super(BufferEntrySerializer.class);
}
BufferEntrySerializerSnapshot(BufferEntrySerializer serializerInstance) {
super(serializerInstance);
}
@Override
protected int getCurrentOuterSnapshotVersion() {
return VERSION;
}
@Override
protected TypeSerializer[] getNestedSerializers(BufferEntrySerializer outerSerializer) {
return new TypeSerializer[]{outerSerializer.elementSerializer};
}
@Override
@SuppressWarnings("unchecked")
protected BufferEntrySerializer createOuterSerializerWithNestedSerializers(TypeSerializer[] nestedSerializers) {
return new BufferEntrySerializer<>((TypeSerializer) nestedSerializers[0]);
}
}
@VisibleForTesting
MapState>> getLeftBuffer() {
return leftBuffer;
}
@VisibleForTesting
MapState>> getRightBuffer() {
return rightBuffer;
}
}