// org.apache.flink.streaming.api.datastream.CoGroupedStreams (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.datastream;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerConfigSnapshot;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerUtil;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.TypeSerializerConfigSnapshot;
import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility;
import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.translation.WrappingFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataOutputView;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner;
import org.apache.flink.streaming.api.windowing.evictors.Evictor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import static java.util.Objects.requireNonNull;
/**
* {@code CoGroupedStreams} represents two {@link DataStream DataStreams} that have been co-grouped.
* A streaming co-group operation is evaluated over elements in a window.
*
* To finalize co-group operation you also need to specify a {@link KeySelector} for both the
* first and second input and a {@link WindowAssigner}.
*
*
Note: Right now, the groups are being built in memory so you need to ensure that they don't
* get too big. Otherwise the JVM might crash.
*
*
Example:
*
*
{@code
* DataStream> one = ...;
* DataStream> two = ...;
*
* DataStream result = one.coGroup(two)
* .where(new MyFirstKeySelector())
* .equalTo(new MyFirstKeySelector())
* .window(TumblingEventTimeWindows.of(Time.of(5, TimeUnit.SECONDS)))
* .apply(new MyCoGroupFunction());
* }
*/
@Public
public class CoGroupedStreams {
/** The first input stream. */
private final DataStream input1;
/** The second input stream. */
private final DataStream input2;
/**
* Creates new CoGrouped data streams, which are the first step towards building a streaming
* co-group.
*
* @param input1 The first data stream.
* @param input2 The second data stream.
*/
public CoGroupedStreams(DataStream input1, DataStream input2) {
this.input1 = requireNonNull(input1);
this.input2 = requireNonNull(input2);
}
/**
* Specifies a {@link KeySelector} for elements from the first input.
*
* @param keySelector The KeySelector to be used for extracting the first input's key for
* partitioning.
*/
public Where where(KeySelector keySelector) {
Preconditions.checkNotNull(keySelector);
final TypeInformation keyType =
TypeExtractor.getKeySelectorTypes(keySelector, input1.getType());
return where(keySelector, keyType);
}
/**
* Specifies a {@link KeySelector} for elements from the first input with explicit type
* information.
*
* @param keySelector The KeySelector to be used for extracting the first input's key for
* partitioning.
* @param keyType The type information describing the key type.
*/
public Where where(KeySelector keySelector, TypeInformation keyType) {
Preconditions.checkNotNull(keySelector);
Preconditions.checkNotNull(keyType);
return new Where<>(input1.clean(keySelector), keyType);
}
// ------------------------------------------------------------------------
/**
* CoGrouped streams that have the key for one side defined.
*
* @param The type of the key.
*/
@Public
public class Where {
private final KeySelector keySelector1;
private final TypeInformation keyType;
Where(KeySelector keySelector1, TypeInformation keyType) {
this.keySelector1 = keySelector1;
this.keyType = keyType;
}
/**
* Specifies a {@link KeySelector} for elements from the second input.
*
* @param keySelector The KeySelector to be used for extracting the second input's key for
* partitioning.
*/
public EqualTo equalTo(KeySelector keySelector) {
Preconditions.checkNotNull(keySelector);
final TypeInformation otherKey =
TypeExtractor.getKeySelectorTypes(keySelector, input2.getType());
return equalTo(keySelector, otherKey);
}
/**
* Specifies a {@link KeySelector} for elements from the second input with explicit type
* information for the key type.
*
* @param keySelector The KeySelector to be used for extracting the key for partitioning.
* @param keyType The type information describing the key type.
*/
public EqualTo equalTo(KeySelector keySelector, TypeInformation keyType) {
Preconditions.checkNotNull(keySelector);
Preconditions.checkNotNull(keyType);
if (!keyType.equals(this.keyType)) {
throw new IllegalArgumentException(
"The keys for the two inputs are not equal: "
+ "first key = "
+ this.keyType
+ " , second key = "
+ keyType);
}
return new EqualTo(input2.clean(keySelector));
}
// --------------------------------------------------------------------
/**
* A co-group operation that has {@link KeySelector KeySelectors} defined for both inputs.
*/
@Public
public class EqualTo {
private final KeySelector keySelector2;
EqualTo(KeySelector keySelector2) {
this.keySelector2 = requireNonNull(keySelector2);
}
/** Specifies the window on which the co-group operation works. */
@PublicEvolving
public WithWindow window(
WindowAssigner super TaggedUnion, W> assigner) {
return new WithWindow<>(
input1,
input2,
keySelector1,
keySelector2,
keyType,
assigner,
null,
null,
null);
}
}
}
// ------------------------------------------------------------------------
/**
* A co-group operation that has {@link KeySelector KeySelectors} defined for both inputs as
* well as a {@link WindowAssigner}.
*
* @param Type of the elements from the first input
* @param Type of the elements from the second input
* @param Type of the key. This must be the same for both inputs
* @param Type of {@link Window} on which the co-group operation works.
*/
@Public
public static class WithWindow {
private final DataStream input1;
private final DataStream input2;
private final KeySelector keySelector1;
private final KeySelector keySelector2;
private final TypeInformation keyType;
private final WindowAssigner super TaggedUnion, W> windowAssigner;
private final Trigger super TaggedUnion, ? super W> trigger;
private final Evictor super TaggedUnion, ? super W> evictor;
private final Time allowedLateness;
private WindowedStream, KEY, W> windowedStream;
protected WithWindow(
DataStream input1,
DataStream input2,
KeySelector keySelector1,
KeySelector keySelector2,
TypeInformation keyType,
WindowAssigner super TaggedUnion, W> windowAssigner,
Trigger super TaggedUnion, ? super W> trigger,
Evictor super TaggedUnion, ? super W> evictor,
Time allowedLateness) {
this.input1 = input1;
this.input2 = input2;
this.keySelector1 = keySelector1;
this.keySelector2 = keySelector2;
this.keyType = keyType;
this.windowAssigner = windowAssigner;
this.trigger = trigger;
this.evictor = evictor;
this.allowedLateness = allowedLateness;
}
/** Sets the {@code Trigger} that should be used to trigger window emission. */
@PublicEvolving
public WithWindow trigger(
Trigger super TaggedUnion, ? super W> newTrigger) {
return new WithWindow<>(
input1,
input2,
keySelector1,
keySelector2,
keyType,
windowAssigner,
newTrigger,
evictor,
allowedLateness);
}
/**
* Sets the {@code Evictor} that should be used to evict elements from a window before
* emission.
*
* Note: When using an evictor window performance will degrade significantly, since
* pre-aggregation of window results cannot be used.
*/
@PublicEvolving
public WithWindow evictor(
Evictor super TaggedUnion, ? super W> newEvictor) {
return new WithWindow<>(
input1,
input2,
keySelector1,
keySelector2,
keyType,
windowAssigner,
trigger,
newEvictor,
allowedLateness);
}
/**
* Sets the time by which elements are allowed to be late.
*
* @see WindowedStream#allowedLateness(Time)
*/
@PublicEvolving
public WithWindow allowedLateness(Time newLateness) {
return new WithWindow<>(
input1,
input2,
keySelector1,
keySelector2,
keyType,
windowAssigner,
trigger,
evictor,
newLateness);
}
/**
* Completes the co-group operation with the user function that is executed for windowed
* groups.
*
* Note: This method's return type does not support setting an operator-specific
* parallelism. Due to binary backwards compatibility, this cannot be altered. Use the
* {@link #with(CoGroupFunction)} method to set an operator-specific parallelism.
*/
public DataStream apply(CoGroupFunction function) {
TypeInformation resultType =
TypeExtractor.getCoGroupReturnTypes(
function, input1.getType(), input2.getType(), "CoGroup", false);
return apply(function, resultType);
}
/**
* Completes the co-group operation with the user function that is executed for windowed
* groups.
*
* Note: This is a temporary workaround while the {@link #apply(CoGroupFunction)}
* method has the wrong return type and hence does not allow one to set an operator-specific
* parallelism
*
* @deprecated This method will be removed once the {@link #apply(CoGroupFunction)} method
* is fixed in the next major version of Flink (2.0).
*/
@PublicEvolving
@Deprecated
public SingleOutputStreamOperator with(CoGroupFunction function) {
return (SingleOutputStreamOperator) apply(function);
}
/**
* Completes the co-group operation with the user function that is executed for windowed
* groups.
*
* Note: This method's return type does not support setting an operator-specific
* parallelism. Due to binary backwards compatibility, this cannot be altered. Use the
* {@link #with(CoGroupFunction, TypeInformation)} method to set an operator-specific
* parallelism.
*/
public DataStream apply(
CoGroupFunction function, TypeInformation resultType) {
// clean the closure
function = input1.getExecutionEnvironment().clean(function);
UnionTypeInfo unionType =
new UnionTypeInfo<>(input1.getType(), input2.getType());
UnionKeySelector unionKeySelector =
new UnionKeySelector<>(keySelector1, keySelector2);
DataStream> taggedInput1 =
input1.map(new Input1Tagger())
.setParallelism(input1.getParallelism())
.returns(unionType);
DataStream> taggedInput2 =
input2.map(new Input2Tagger())
.setParallelism(input2.getParallelism())
.returns(unionType);
DataStream> unionStream = taggedInput1.union(taggedInput2);
// we explicitly create the keyed stream to manually pass the key type information in
windowedStream =
new KeyedStream, KEY>(
unionStream, unionKeySelector, keyType)
.window(windowAssigner);
if (trigger != null) {
windowedStream.trigger(trigger);
}
if (evictor != null) {
windowedStream.evictor(evictor);
}
if (allowedLateness != null) {
windowedStream.allowedLateness(allowedLateness);
}
return windowedStream.apply(
new CoGroupWindowFunction(function), resultType);
}
/**
* Completes the co-group operation with the user function that is executed for windowed
* groups.
*
* Note: This is a temporary workaround while the {@link #apply(CoGroupFunction,
* TypeInformation)} method has the wrong return type and hence does not allow one to set an
* operator-specific parallelism
*
* @deprecated This method will be removed once the {@link #apply(CoGroupFunction,
* TypeInformation)} method is fixed in the next major version of Flink (2.0).
*/
@PublicEvolving
@Deprecated
public SingleOutputStreamOperator with(
CoGroupFunction function, TypeInformation resultType) {
return (SingleOutputStreamOperator) apply(function, resultType);
}
@VisibleForTesting
Time getAllowedLateness() {
return allowedLateness;
}
@VisibleForTesting
WindowedStream, KEY, W> getWindowedStream() {
return windowedStream;
}
}
// ------------------------------------------------------------------------
// Data type and type information for Tagged Union
// ------------------------------------------------------------------------
/** Internal class for implementing tagged union co-group. */
@Internal
public static class TaggedUnion {
private final T1 one;
private final T2 two;
private TaggedUnion(T1 one, T2 two) {
this.one = one;
this.two = two;
}
public boolean isOne() {
return one != null;
}
public boolean isTwo() {
return two != null;
}
public T1 getOne() {
return one;
}
public T2 getTwo() {
return two;
}
public static TaggedUnion one(T1 one) {
return new TaggedUnion<>(one, null);
}
public static TaggedUnion two(T2 two) {
return new TaggedUnion<>(null, two);
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof TaggedUnion)) {
return false;
}
TaggedUnion other = (TaggedUnion) obj;
return Objects.equals(one, other.one) && Objects.equals(two, other.two);
}
}
private static class UnionTypeInfo extends TypeInformation> {
private static final long serialVersionUID = 1L;
private final TypeInformation oneType;
private final TypeInformation twoType;
public UnionTypeInfo(TypeInformation oneType, TypeInformation twoType) {
this.oneType = oneType;
this.twoType = twoType;
}
@Override
public boolean isBasicType() {
return false;
}
@Override
public boolean isTupleType() {
return false;
}
@Override
public int getArity() {
return 2;
}
@Override
public int getTotalFields() {
return 2;
}
@Override
@SuppressWarnings("unchecked, rawtypes")
public Class> getTypeClass() {
return (Class) TaggedUnion.class;
}
@Override
public boolean isKeyType() {
return true;
}
@Override
public TypeSerializer> createSerializer(ExecutionConfig config) {
return new UnionSerializer<>(
oneType.createSerializer(config), twoType.createSerializer(config));
}
@Override
public String toString() {
return "TaggedUnion<" + oneType + ", " + twoType + ">";
}
@Override
public boolean equals(Object obj) {
if (obj instanceof UnionTypeInfo) {
@SuppressWarnings("unchecked")
UnionTypeInfo unionTypeInfo = (UnionTypeInfo) obj;
return unionTypeInfo.canEqual(this)
&& oneType.equals(unionTypeInfo.oneType)
&& twoType.equals(unionTypeInfo.twoType);
} else {
return false;
}
}
@Override
public int hashCode() {
return 31 * oneType.hashCode() + twoType.hashCode();
}
@Override
public boolean canEqual(Object obj) {
return obj instanceof UnionTypeInfo;
}
}
/** {@link TypeSerializer} for {@link TaggedUnion}. */
@VisibleForTesting
@Internal
public static class UnionSerializer extends TypeSerializer> {
private static final long serialVersionUID = 1L;
private final TypeSerializer oneSerializer;
private final TypeSerializer twoSerializer;
public UnionSerializer(TypeSerializer oneSerializer, TypeSerializer twoSerializer) {
this.oneSerializer = oneSerializer;
this.twoSerializer = twoSerializer;
}
@Override
public boolean isImmutableType() {
return false;
}
@Override
public TypeSerializer> duplicate() {
TypeSerializer duplicateOne = oneSerializer.duplicate();
TypeSerializer duplicateTwo = twoSerializer.duplicate();
// compare reference of nested serializers, if same instances returned, we can reuse
// this instance as well
if (duplicateOne != oneSerializer || duplicateTwo != twoSerializer) {
return new UnionSerializer<>(duplicateOne, duplicateTwo);
} else {
return this;
}
}
@Override
public TaggedUnion createInstance() {
// we arbitrarily always create instance of one
return TaggedUnion.one(oneSerializer.createInstance());
}
@Override
public TaggedUnion copy(TaggedUnion from) {
if (from.isOne()) {
return TaggedUnion.one(oneSerializer.copy(from.getOne()));
} else {
return TaggedUnion.two(twoSerializer.copy(from.getTwo()));
}
}
@Override
public TaggedUnion copy(TaggedUnion from, TaggedUnion reuse) {
if (from.isOne()) {
return TaggedUnion.one(oneSerializer.copy(from.getOne()));
} else {
return TaggedUnion.two(twoSerializer.copy(from.getTwo()));
}
}
@Override
public int getLength() {
return -1;
}
@Override
public void serialize(TaggedUnion record, DataOutputView target)
throws IOException {
if (record.isOne()) {
target.writeByte(1);
oneSerializer.serialize(record.getOne(), target);
} else {
target.writeByte(2);
twoSerializer.serialize(record.getTwo(), target);
}
}
@Override
public TaggedUnion deserialize(DataInputView source) throws IOException {
byte tag = source.readByte();
if (tag == 1) {
return TaggedUnion.one(oneSerializer.deserialize(source));
} else {
return TaggedUnion.two(twoSerializer.deserialize(source));
}
}
@Override
public TaggedUnion deserialize(TaggedUnion reuse, DataInputView source)
throws IOException {
byte tag = source.readByte();
if (tag == 1) {
return TaggedUnion.one(oneSerializer.deserialize(source));
} else {
return TaggedUnion.two(twoSerializer.deserialize(source));
}
}
@Override
public void copy(DataInputView source, DataOutputView target) throws IOException {
byte tag = source.readByte();
target.writeByte(tag);
if (tag == 1) {
oneSerializer.copy(source, target);
} else {
twoSerializer.copy(source, target);
}
}
@Override
public int hashCode() {
return 31 * oneSerializer.hashCode() + twoSerializer.hashCode();
}
@Override
@SuppressWarnings("unchecked")
public boolean equals(Object obj) {
if (obj instanceof UnionSerializer) {
UnionSerializer other = (UnionSerializer) obj;
return oneSerializer.equals(other.oneSerializer)
&& twoSerializer.equals(other.twoSerializer);
} else {
return false;
}
}
@Override
public TypeSerializerSnapshot> snapshotConfiguration() {
return new UnionSerializerSnapshot<>(this);
}
}
/**
* The {@link TypeSerializerConfigSnapshot} for the {@link UnionSerializer}.
*
* @deprecated this snapshot class is no longer in use, and is maintained only for backwards
* compatibility. It is fully replaced by {@link UnionSerializerSnapshot}.
*/
@Deprecated
public static class UnionSerializerConfigSnapshot
extends CompositeTypeSerializerConfigSnapshot> {
private static final int VERSION = 1;
/** This empty nullary constructor is required for deserializing the configuration. */
public UnionSerializerConfigSnapshot() {}
public UnionSerializerConfigSnapshot(
TypeSerializer oneSerializer, TypeSerializer twoSerializer) {
super(oneSerializer, twoSerializer);
}
@Override
public TypeSerializerSchemaCompatibility> resolveSchemaCompatibility(
TypeSerializer> newSerializer) {
List, TypeSerializerSnapshot>>> nestedSerializersAndConfigs =
getNestedSerializersAndConfigs();
return CompositeTypeSerializerUtil.delegateCompatibilityCheckToNewSnapshot(
newSerializer,
new UnionSerializerSnapshot<>(),
nestedSerializersAndConfigs.get(0).f1,
nestedSerializersAndConfigs.get(1).f1);
}
@Override
public int getVersion() {
return VERSION;
}
}
/** The {@link TypeSerializerSnapshot} for the {@link UnionSerializer}. */
public static class UnionSerializerSnapshot
extends CompositeTypeSerializerSnapshot, UnionSerializer> {
private static final int VERSION = 2;
@SuppressWarnings("WeakerAccess")
public UnionSerializerSnapshot() {
super(UnionSerializer.class);
}
UnionSerializerSnapshot(UnionSerializer serializerInstance) {
super(serializerInstance);
}
@Override
protected int getCurrentOuterSnapshotVersion() {
return VERSION;
}
@Override
protected TypeSerializer>[] getNestedSerializers(
UnionSerializer outerSerializer) {
return new TypeSerializer[] {
outerSerializer.oneSerializer, outerSerializer.twoSerializer
};
}
@SuppressWarnings("unchecked")
@Override
protected UnionSerializer createOuterSerializerWithNestedSerializers(
TypeSerializer>[] nestedSerializers) {
return new UnionSerializer<>(
(TypeSerializer) nestedSerializers[0],
(TypeSerializer) nestedSerializers[1]);
}
}
// ------------------------------------------------------------------------
// Utility functions that implement the CoGroup logic based on the tagged
// union window reduce
// ------------------------------------------------------------------------
private static class Input1Tagger implements MapFunction> {
private static final long serialVersionUID = 1L;
@Override
public TaggedUnion map(T1 value) throws Exception {
return TaggedUnion.one(value);
}
}
private static class Input2Tagger implements MapFunction> {
private static final long serialVersionUID = 1L;
@Override
public TaggedUnion map(T2 value) throws Exception {
return TaggedUnion.two(value);
}
}
private static class UnionKeySelector
implements KeySelector, KEY> {
private static final long serialVersionUID = 1L;
private final KeySelector keySelector1;
private final KeySelector keySelector2;
public UnionKeySelector(
KeySelector keySelector1, KeySelector keySelector2) {
this.keySelector1 = keySelector1;
this.keySelector2 = keySelector2;
}
@Override
public KEY getKey(TaggedUnion value) throws Exception {
if (value.isOne()) {
return keySelector1.getKey(value.getOne());
} else {
return keySelector2.getKey(value.getTwo());
}
}
}
private static class CoGroupWindowFunction
extends WrappingFunction>
implements WindowFunction, T, KEY, W> {
private static final long serialVersionUID = 1L;
public CoGroupWindowFunction(CoGroupFunction userFunction) {
super(userFunction);
}
@Override
public void apply(KEY key, W window, Iterable> values, Collector out)
throws Exception {
List oneValues = new ArrayList<>();
List twoValues = new ArrayList<>();
for (TaggedUnion val : values) {
if (val.isOne()) {
oneValues.add(val.getOne());
} else {
twoValues.add(val.getTwo());
}
}
wrappedFunction.coGroup(oneValues, twoValues, out);
}
}
}