All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.streaming.api.datastream.CoGroupedStreams Maven / Gradle / Ivy

There is a newer version: 2.0-preview1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.datastream;

import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerConfigSnapshot;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerSnapshot;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerUtil;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.TypeSerializerConfigSnapshot;
import org.apache.flink.api.common.typeutils.TypeSerializerSchemaCompatibility;
import org.apache.flink.api.common.typeutils.TypeSerializerSnapshot;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.translation.WrappingFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataOutputView;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner;
import org.apache.flink.streaming.api.windowing.evictors.Evictor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import static java.util.Objects.requireNonNull;

/**
 * {@code CoGroupedStreams} represents two {@link DataStream DataStreams} that have been co-grouped.
 * A streaming co-group operation is evaluated over elements in a window.
 *
 * 

To finalize co-group operation you also need to specify a {@link KeySelector} for both the * first and second input and a {@link WindowAssigner}. * *

Note: Right now, the groups are being built in memory so you need to ensure that they don't * get too big. Otherwise the JVM might crash. * *

Example: * *

{@code
 * DataStream> one = ...;
 * DataStream> two = ...;
 *
 * DataStream result = one.coGroup(two)
 *     .where(new MyFirstKeySelector())
 *     .equalTo(new MyFirstKeySelector())
 *     .window(TumblingEventTimeWindows.of(Time.of(5, TimeUnit.SECONDS)))
 *     .apply(new MyCoGroupFunction());
 * }
*/ @Public public class CoGroupedStreams { /** The first input stream. */ private final DataStream input1; /** The second input stream. */ private final DataStream input2; /** * Creates new CoGrouped data streams, which are the first step towards building a streaming * co-group. * * @param input1 The first data stream. * @param input2 The second data stream. */ public CoGroupedStreams(DataStream input1, DataStream input2) { this.input1 = requireNonNull(input1); this.input2 = requireNonNull(input2); } /** * Specifies a {@link KeySelector} for elements from the first input. * * @param keySelector The KeySelector to be used for extracting the first input's key for * partitioning. */ public Where where(KeySelector keySelector) { Preconditions.checkNotNull(keySelector); final TypeInformation keyType = TypeExtractor.getKeySelectorTypes(keySelector, input1.getType()); return where(keySelector, keyType); } /** * Specifies a {@link KeySelector} for elements from the first input with explicit type * information. * * @param keySelector The KeySelector to be used for extracting the first input's key for * partitioning. * @param keyType The type information describing the key type. */ public Where where(KeySelector keySelector, TypeInformation keyType) { Preconditions.checkNotNull(keySelector); Preconditions.checkNotNull(keyType); return new Where<>(input1.clean(keySelector), keyType); } // ------------------------------------------------------------------------ /** * CoGrouped streams that have the key for one side defined. * * @param The type of the key. */ @Public public class Where { private final KeySelector keySelector1; private final TypeInformation keyType; Where(KeySelector keySelector1, TypeInformation keyType) { this.keySelector1 = keySelector1; this.keyType = keyType; } /** * Specifies a {@link KeySelector} for elements from the second input. * * @param keySelector The KeySelector to be used for extracting the second input's key for * partitioning. */ public EqualTo equalTo(KeySelector keySelector) { Preconditions.checkNotNull(keySelector); final TypeInformation otherKey = TypeExtractor.getKeySelectorTypes(keySelector, input2.getType()); return equalTo(keySelector, otherKey); } /** * Specifies a {@link KeySelector} for elements from the second input with explicit type * information for the key type. * * @param keySelector The KeySelector to be used for extracting the key for partitioning. * @param keyType The type information describing the key type. */ public EqualTo equalTo(KeySelector keySelector, TypeInformation keyType) { Preconditions.checkNotNull(keySelector); Preconditions.checkNotNull(keyType); if (!keyType.equals(this.keyType)) { throw new IllegalArgumentException( "The keys for the two inputs are not equal: " + "first key = " + this.keyType + " , second key = " + keyType); } return new EqualTo(input2.clean(keySelector)); } // -------------------------------------------------------------------- /** * A co-group operation that has {@link KeySelector KeySelectors} defined for both inputs. */ @Public public class EqualTo { private final KeySelector keySelector2; EqualTo(KeySelector keySelector2) { this.keySelector2 = requireNonNull(keySelector2); } /** Specifies the window on which the co-group operation works. */ @PublicEvolving public WithWindow window( WindowAssigner, W> assigner) { return new WithWindow<>( input1, input2, keySelector1, keySelector2, keyType, assigner, null, null, null); } } } // ------------------------------------------------------------------------ /** * A co-group operation that has {@link KeySelector KeySelectors} defined for both inputs as * well as a {@link WindowAssigner}. * * @param Type of the elements from the first input * @param Type of the elements from the second input * @param Type of the key. This must be the same for both inputs * @param Type of {@link Window} on which the co-group operation works. */ @Public public static class WithWindow { private final DataStream input1; private final DataStream input2; private final KeySelector keySelector1; private final KeySelector keySelector2; private final TypeInformation keyType; private final WindowAssigner, W> windowAssigner; private final Trigger, ? super W> trigger; private final Evictor, ? super W> evictor; private final Time allowedLateness; private WindowedStream, KEY, W> windowedStream; protected WithWindow( DataStream input1, DataStream input2, KeySelector keySelector1, KeySelector keySelector2, TypeInformation keyType, WindowAssigner, W> windowAssigner, Trigger, ? super W> trigger, Evictor, ? super W> evictor, Time allowedLateness) { this.input1 = input1; this.input2 = input2; this.keySelector1 = keySelector1; this.keySelector2 = keySelector2; this.keyType = keyType; this.windowAssigner = windowAssigner; this.trigger = trigger; this.evictor = evictor; this.allowedLateness = allowedLateness; } /** Sets the {@code Trigger} that should be used to trigger window emission. */ @PublicEvolving public WithWindow trigger( Trigger, ? super W> newTrigger) { return new WithWindow<>( input1, input2, keySelector1, keySelector2, keyType, windowAssigner, newTrigger, evictor, allowedLateness); } /** * Sets the {@code Evictor} that should be used to evict elements from a window before * emission. * *

Note: When using an evictor window performance will degrade significantly, since * pre-aggregation of window results cannot be used. */ @PublicEvolving public WithWindow evictor( Evictor, ? super W> newEvictor) { return new WithWindow<>( input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, newEvictor, allowedLateness); } /** * Sets the time by which elements are allowed to be late. * * @see WindowedStream#allowedLateness(Time) */ @PublicEvolving public WithWindow allowedLateness(Time newLateness) { return new WithWindow<>( input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, evictor, newLateness); } /** * Completes the co-group operation with the user function that is executed for windowed * groups. * *

Note: This method's return type does not support setting an operator-specific * parallelism. Due to binary backwards compatibility, this cannot be altered. Use the * {@link #with(CoGroupFunction)} method to set an operator-specific parallelism. */ public DataStream apply(CoGroupFunction function) { TypeInformation resultType = TypeExtractor.getCoGroupReturnTypes( function, input1.getType(), input2.getType(), "CoGroup", false); return apply(function, resultType); } /** * Completes the co-group operation with the user function that is executed for windowed * groups. * *

Note: This is a temporary workaround while the {@link #apply(CoGroupFunction)} * method has the wrong return type and hence does not allow one to set an operator-specific * parallelism * * @deprecated This method will be removed once the {@link #apply(CoGroupFunction)} method * is fixed in the next major version of Flink (2.0). */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with(CoGroupFunction function) { return (SingleOutputStreamOperator) apply(function); } /** * Completes the co-group operation with the user function that is executed for windowed * groups. * *

Note: This method's return type does not support setting an operator-specific * parallelism. Due to binary backwards compatibility, this cannot be altered. Use the * {@link #with(CoGroupFunction, TypeInformation)} method to set an operator-specific * parallelism. */ public DataStream apply( CoGroupFunction function, TypeInformation resultType) { // clean the closure function = input1.getExecutionEnvironment().clean(function); UnionTypeInfo unionType = new UnionTypeInfo<>(input1.getType(), input2.getType()); UnionKeySelector unionKeySelector = new UnionKeySelector<>(keySelector1, keySelector2); DataStream> taggedInput1 = input1.map(new Input1Tagger()) .setParallelism(input1.getParallelism()) .returns(unionType); DataStream> taggedInput2 = input2.map(new Input2Tagger()) .setParallelism(input2.getParallelism()) .returns(unionType); DataStream> unionStream = taggedInput1.union(taggedInput2); // we explicitly create the keyed stream to manually pass the key type information in windowedStream = new KeyedStream, KEY>( unionStream, unionKeySelector, keyType) .window(windowAssigner); if (trigger != null) { windowedStream.trigger(trigger); } if (evictor != null) { windowedStream.evictor(evictor); } if (allowedLateness != null) { windowedStream.allowedLateness(allowedLateness); } return windowedStream.apply( new CoGroupWindowFunction(function), resultType); } /** * Completes the co-group operation with the user function that is executed for windowed * groups. * *

Note: This is a temporary workaround while the {@link #apply(CoGroupFunction, * TypeInformation)} method has the wrong return type and hence does not allow one to set an * operator-specific parallelism * * @deprecated This method will be removed once the {@link #apply(CoGroupFunction, * TypeInformation)} method is fixed in the next major version of Flink (2.0). */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with( CoGroupFunction function, TypeInformation resultType) { return (SingleOutputStreamOperator) apply(function, resultType); } @VisibleForTesting Time getAllowedLateness() { return allowedLateness; } @VisibleForTesting WindowedStream, KEY, W> getWindowedStream() { return windowedStream; } } // ------------------------------------------------------------------------ // Data type and type information for Tagged Union // ------------------------------------------------------------------------ /** Internal class for implementing tagged union co-group. */ @Internal public static class TaggedUnion { private final T1 one; private final T2 two; private TaggedUnion(T1 one, T2 two) { this.one = one; this.two = two; } public boolean isOne() { return one != null; } public boolean isTwo() { return two != null; } public T1 getOne() { return one; } public T2 getTwo() { return two; } public static TaggedUnion one(T1 one) { return new TaggedUnion<>(one, null); } public static TaggedUnion two(T2 two) { return new TaggedUnion<>(null, two); } @Override public boolean equals(Object obj) { if (obj == this) { return true; } if (!(obj instanceof TaggedUnion)) { return false; } TaggedUnion other = (TaggedUnion) obj; return Objects.equals(one, other.one) && Objects.equals(two, other.two); } } private static class UnionTypeInfo extends TypeInformation> { private static final long serialVersionUID = 1L; private final TypeInformation oneType; private final TypeInformation twoType; public UnionTypeInfo(TypeInformation oneType, TypeInformation twoType) { this.oneType = oneType; this.twoType = twoType; } @Override public boolean isBasicType() { return false; } @Override public boolean isTupleType() { return false; } @Override public int getArity() { return 2; } @Override public int getTotalFields() { return 2; } @Override @SuppressWarnings("unchecked, rawtypes") public Class> getTypeClass() { return (Class) TaggedUnion.class; } @Override public boolean isKeyType() { return true; } @Override public TypeSerializer> createSerializer(ExecutionConfig config) { return new UnionSerializer<>( oneType.createSerializer(config), twoType.createSerializer(config)); } @Override public String toString() { return "TaggedUnion<" + oneType + ", " + twoType + ">"; } @Override public boolean equals(Object obj) { if (obj instanceof UnionTypeInfo) { @SuppressWarnings("unchecked") UnionTypeInfo unionTypeInfo = (UnionTypeInfo) obj; return unionTypeInfo.canEqual(this) && oneType.equals(unionTypeInfo.oneType) && twoType.equals(unionTypeInfo.twoType); } else { return false; } } @Override public int hashCode() { return 31 * oneType.hashCode() + twoType.hashCode(); } @Override public boolean canEqual(Object obj) { return obj instanceof UnionTypeInfo; } } /** {@link TypeSerializer} for {@link TaggedUnion}. */ @VisibleForTesting @Internal public static class UnionSerializer extends TypeSerializer> { private static final long serialVersionUID = 1L; private final TypeSerializer oneSerializer; private final TypeSerializer twoSerializer; public UnionSerializer(TypeSerializer oneSerializer, TypeSerializer twoSerializer) { this.oneSerializer = oneSerializer; this.twoSerializer = twoSerializer; } @Override public boolean isImmutableType() { return false; } @Override public TypeSerializer> duplicate() { TypeSerializer duplicateOne = oneSerializer.duplicate(); TypeSerializer duplicateTwo = twoSerializer.duplicate(); // compare reference of nested serializers, if same instances returned, we can reuse // this instance as well if (duplicateOne != oneSerializer || duplicateTwo != twoSerializer) { return new UnionSerializer<>(duplicateOne, duplicateTwo); } else { return this; } } @Override public TaggedUnion createInstance() { // we arbitrarily always create instance of one return TaggedUnion.one(oneSerializer.createInstance()); } @Override public TaggedUnion copy(TaggedUnion from) { if (from.isOne()) { return TaggedUnion.one(oneSerializer.copy(from.getOne())); } else { return TaggedUnion.two(twoSerializer.copy(from.getTwo())); } } @Override public TaggedUnion copy(TaggedUnion from, TaggedUnion reuse) { if (from.isOne()) { return TaggedUnion.one(oneSerializer.copy(from.getOne())); } else { return TaggedUnion.two(twoSerializer.copy(from.getTwo())); } } @Override public int getLength() { return -1; } @Override public void serialize(TaggedUnion record, DataOutputView target) throws IOException { if (record.isOne()) { target.writeByte(1); oneSerializer.serialize(record.getOne(), target); } else { target.writeByte(2); twoSerializer.serialize(record.getTwo(), target); } } @Override public TaggedUnion deserialize(DataInputView source) throws IOException { byte tag = source.readByte(); if (tag == 1) { return TaggedUnion.one(oneSerializer.deserialize(source)); } else { return TaggedUnion.two(twoSerializer.deserialize(source)); } } @Override public TaggedUnion deserialize(TaggedUnion reuse, DataInputView source) throws IOException { byte tag = source.readByte(); if (tag == 1) { return TaggedUnion.one(oneSerializer.deserialize(source)); } else { return TaggedUnion.two(twoSerializer.deserialize(source)); } } @Override public void copy(DataInputView source, DataOutputView target) throws IOException { byte tag = source.readByte(); target.writeByte(tag); if (tag == 1) { oneSerializer.copy(source, target); } else { twoSerializer.copy(source, target); } } @Override public int hashCode() { return 31 * oneSerializer.hashCode() + twoSerializer.hashCode(); } @Override @SuppressWarnings("unchecked") public boolean equals(Object obj) { if (obj instanceof UnionSerializer) { UnionSerializer other = (UnionSerializer) obj; return oneSerializer.equals(other.oneSerializer) && twoSerializer.equals(other.twoSerializer); } else { return false; } } @Override public TypeSerializerSnapshot> snapshotConfiguration() { return new UnionSerializerSnapshot<>(this); } } /** * The {@link TypeSerializerConfigSnapshot} for the {@link UnionSerializer}. * * @deprecated this snapshot class is no longer in use, and is maintained only for backwards * compatibility. It is fully replaced by {@link UnionSerializerSnapshot}. */ @Deprecated public static class UnionSerializerConfigSnapshot extends CompositeTypeSerializerConfigSnapshot> { private static final int VERSION = 1; /** This empty nullary constructor is required for deserializing the configuration. */ public UnionSerializerConfigSnapshot() {} public UnionSerializerConfigSnapshot( TypeSerializer oneSerializer, TypeSerializer twoSerializer) { super(oneSerializer, twoSerializer); } @Override public TypeSerializerSchemaCompatibility> resolveSchemaCompatibility( TypeSerializer> newSerializer) { List, TypeSerializerSnapshot>> nestedSerializersAndConfigs = getNestedSerializersAndConfigs(); return CompositeTypeSerializerUtil.delegateCompatibilityCheckToNewSnapshot( newSerializer, new UnionSerializerSnapshot<>(), nestedSerializersAndConfigs.get(0).f1, nestedSerializersAndConfigs.get(1).f1); } @Override public int getVersion() { return VERSION; } } /** The {@link TypeSerializerSnapshot} for the {@link UnionSerializer}. */ public static class UnionSerializerSnapshot extends CompositeTypeSerializerSnapshot, UnionSerializer> { private static final int VERSION = 2; @SuppressWarnings("WeakerAccess") public UnionSerializerSnapshot() { super(UnionSerializer.class); } UnionSerializerSnapshot(UnionSerializer serializerInstance) { super(serializerInstance); } @Override protected int getCurrentOuterSnapshotVersion() { return VERSION; } @Override protected TypeSerializer[] getNestedSerializers( UnionSerializer outerSerializer) { return new TypeSerializer[] { outerSerializer.oneSerializer, outerSerializer.twoSerializer }; } @SuppressWarnings("unchecked") @Override protected UnionSerializer createOuterSerializerWithNestedSerializers( TypeSerializer[] nestedSerializers) { return new UnionSerializer<>( (TypeSerializer) nestedSerializers[0], (TypeSerializer) nestedSerializers[1]); } } // ------------------------------------------------------------------------ // Utility functions that implement the CoGroup logic based on the tagged // union window reduce // ------------------------------------------------------------------------ private static class Input1Tagger implements MapFunction> { private static final long serialVersionUID = 1L; @Override public TaggedUnion map(T1 value) throws Exception { return TaggedUnion.one(value); } } private static class Input2Tagger implements MapFunction> { private static final long serialVersionUID = 1L; @Override public TaggedUnion map(T2 value) throws Exception { return TaggedUnion.two(value); } } private static class UnionKeySelector implements KeySelector, KEY> { private static final long serialVersionUID = 1L; private final KeySelector keySelector1; private final KeySelector keySelector2; public UnionKeySelector( KeySelector keySelector1, KeySelector keySelector2) { this.keySelector1 = keySelector1; this.keySelector2 = keySelector2; } @Override public KEY getKey(TaggedUnion value) throws Exception { if (value.isOne()) { return keySelector1.getKey(value.getOne()); } else { return keySelector2.getKey(value.getTwo()); } } } private static class CoGroupWindowFunction extends WrappingFunction> implements WindowFunction, T, KEY, W> { private static final long serialVersionUID = 1L; public CoGroupWindowFunction(CoGroupFunction userFunction) { super(userFunction); } @Override public void apply(KEY key, W window, Iterable> values, Collector out) throws Exception { List oneValues = new ArrayList<>(); List twoValues = new ArrayList<>(); for (TaggedUnion val : values) { if (val.isOne()) { oneValues.add(val.getOne()); } else { twoValues.add(val.getTwo()); } } wrappedFunction.coGroup(oneValues, twoValues, out); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy