All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.streaming.api.datastream.CoGroupedStreams Maven / Gradle / Ivy

There is a newer version: 1.14.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.datastream;

import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.CompatibilityResult;
import org.apache.flink.api.common.typeutils.CompatibilityUtil;
import org.apache.flink.api.common.typeutils.CompositeTypeSerializerConfigSnapshot;
import org.apache.flink.api.common.typeutils.TypeDeserializerAdapter;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.TypeSerializerConfigSnapshot;
import org.apache.flink.api.common.typeutils.UnloadableDummyTypeSerializer;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.translation.WrappingFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataOutputView;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner;
import org.apache.flink.streaming.api.windowing.evictors.Evictor;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.util.Collector;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static java.util.Objects.requireNonNull;

/**
 *{@code CoGroupedStreams} represents two {@link DataStream DataStreams} that have been co-grouped.
 * A streaming co-group operation is evaluated over elements in a window.
 *
 * 

To finalize co-group operation you also need to specify a {@link KeySelector} for * both the first and second input and a {@link WindowAssigner}. * *

Note: Right now, the groups are being built in memory so you need to ensure that they don't * get too big. Otherwise the JVM might crash. * *

Example: *

 {@code
 * DataStream> one = ...;
 * DataStream> two = ...;
 *
 * DataStream result = one.coGroup(two)
 *     .where(new MyFirstKeySelector())
 *     .equalTo(new MyFirstKeySelector())
 *     .window(TumblingEventTimeWindows.of(Time.of(5, TimeUnit.SECONDS)))
 *     .apply(new MyCoGroupFunction());
 * } 
*/ @Public public class CoGroupedStreams { /** The first input stream. */ private final DataStream input1; /** The second input stream. */ private final DataStream input2; /** * Creates new CoGrouped data streams, which are the first step towards building a streaming * co-group. * * @param input1 The first data stream. * @param input2 The second data stream. */ public CoGroupedStreams(DataStream input1, DataStream input2) { this.input1 = requireNonNull(input1); this.input2 = requireNonNull(input2); } /** * Specifies a {@link KeySelector} for elements from the first input. */ public Where where(KeySelector keySelector) { TypeInformation keyType = TypeExtractor.getKeySelectorTypes(keySelector, input1.getType()); return new Where<>(input1.clean(keySelector), keyType); } // ------------------------------------------------------------------------ /** * CoGrouped streams that have the key for one side defined. * * @param The type of the key. */ @Public public class Where { private final KeySelector keySelector1; private final TypeInformation keyType; Where(KeySelector keySelector1, TypeInformation keyType) { this.keySelector1 = keySelector1; this.keyType = keyType; } /** * Specifies a {@link KeySelector} for elements from the second input. */ public EqualTo equalTo(KeySelector keySelector) { TypeInformation otherKey = TypeExtractor.getKeySelectorTypes(keySelector, input2.getType()); if (!otherKey.equals(this.keyType)) { throw new IllegalArgumentException("The keys for the two inputs are not equal: " + "first key = " + this.keyType + " , second key = " + otherKey); } return new EqualTo(input2.clean(keySelector)); } // -------------------------------------------------------------------- /** * A co-group operation that has {@link KeySelector KeySelectors} defined for both inputs. */ @Public public class EqualTo { private final KeySelector keySelector2; EqualTo(KeySelector keySelector2) { this.keySelector2 = requireNonNull(keySelector2); } /** * Specifies the window on which the co-group operation works. */ @PublicEvolving public WithWindow window(WindowAssigner, W> assigner) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, assigner, null, null); } } } // ------------------------------------------------------------------------ /** * A co-group operation that has {@link KeySelector KeySelectors} defined for both inputs as * well as a {@link WindowAssigner}. * * @param Type of the elements from the first input * @param Type of the elements from the second input * @param Type of the key. This must be the same for both inputs * @param Type of {@link Window} on which the co-group operation works. */ @Public public static class WithWindow { private final DataStream input1; private final DataStream input2; private final KeySelector keySelector1; private final KeySelector keySelector2; private final TypeInformation keyType; private final WindowAssigner, W> windowAssigner; private final Trigger, ? super W> trigger; private final Evictor, ? super W> evictor; protected WithWindow(DataStream input1, DataStream input2, KeySelector keySelector1, KeySelector keySelector2, TypeInformation keyType, WindowAssigner, W> windowAssigner, Trigger, ? super W> trigger, Evictor, ? super W> evictor) { this.input1 = input1; this.input2 = input2; this.keySelector1 = keySelector1; this.keySelector2 = keySelector2; this.keyType = keyType; this.windowAssigner = windowAssigner; this.trigger = trigger; this.evictor = evictor; } /** * Sets the {@code Trigger} that should be used to trigger window emission. */ @PublicEvolving public WithWindow trigger(Trigger, ? super W> newTrigger) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, newTrigger, evictor); } /** * Sets the {@code Evictor} that should be used to evict elements from a window before * emission. * *

Note: When using an evictor window performance will degrade significantly, since * pre-aggregation of window results cannot be used. */ @PublicEvolving public WithWindow evictor(Evictor, ? super W> newEvictor) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, newEvictor); } /** * Completes the co-group operation with the user function that is executed * for windowed groups. * *

Note: This method's return type does not support setting an operator-specific parallelism. * Due to binary backwards compatibility, this cannot be altered. Use the {@link #with(CoGroupFunction)} * method to set an operator-specific parallelism. */ public DataStream apply(CoGroupFunction function) { TypeInformation resultType = TypeExtractor.getCoGroupReturnTypes( function, input1.getType(), input2.getType(), "CoGroup", false); return apply(function, resultType); } /** * Completes the co-group operation with the user function that is executed * for windowed groups. * *

Note: This is a temporary workaround while the {@link #apply(CoGroupFunction)} * method has the wrong return type and hence does not allow one to set an operator-specific * parallelism * * @deprecated This method will be removed once the {@link #apply(CoGroupFunction)} method is fixed * in the next major version of Flink (2.0). */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with(CoGroupFunction function) { return (SingleOutputStreamOperator) apply(function); } /** * Completes the co-group operation with the user function that is executed * for windowed groups. * *

Note: This method's return type does not support setting an operator-specific parallelism. * Due to binary backwards compatibility, this cannot be altered. Use the * {@link #with(CoGroupFunction, TypeInformation)} method to set an operator-specific parallelism. */ public DataStream apply(CoGroupFunction function, TypeInformation resultType) { //clean the closure function = input1.getExecutionEnvironment().clean(function); UnionTypeInfo unionType = new UnionTypeInfo<>(input1.getType(), input2.getType()); UnionKeySelector unionKeySelector = new UnionKeySelector<>(keySelector1, keySelector2); DataStream> taggedInput1 = input1 .map(new Input1Tagger()) .setParallelism(input1.getParallelism()) .returns(unionType); DataStream> taggedInput2 = input2 .map(new Input2Tagger()) .setParallelism(input2.getParallelism()) .returns(unionType); DataStream> unionStream = taggedInput1.union(taggedInput2); // we explicitly create the keyed stream to manually pass the key type information in WindowedStream, KEY, W> windowOp = new KeyedStream, KEY>(unionStream, unionKeySelector, keyType) .window(windowAssigner); if (trigger != null) { windowOp.trigger(trigger); } if (evictor != null) { windowOp.evictor(evictor); } return windowOp.apply(new CoGroupWindowFunction(function), resultType); } /** * Completes the co-group operation with the user function that is executed * for windowed groups. * *

Note: This is a temporary workaround while the {@link #apply(CoGroupFunction, TypeInformation)} * method has the wrong return type and hence does not allow one to set an operator-specific * parallelism * * @deprecated This method will be removed once the {@link #apply(CoGroupFunction, TypeInformation)} * method is fixed in the next major version of Flink (2.0). */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with(CoGroupFunction function, TypeInformation resultType) { return (SingleOutputStreamOperator) apply(function, resultType); } } // ------------------------------------------------------------------------ // Data type and type information for Tagged Union // ------------------------------------------------------------------------ /** * Internal class for implementing tagged union co-group. */ @Internal public static class TaggedUnion { private final T1 one; private final T2 two; private TaggedUnion(T1 one, T2 two) { this.one = one; this.two = two; } public boolean isOne() { return one != null; } public boolean isTwo() { return two != null; } public T1 getOne() { return one; } public T2 getTwo() { return two; } public static TaggedUnion one(T1 one) { return new TaggedUnion<>(one, null); } public static TaggedUnion two(T2 two) { return new TaggedUnion<>(null, two); } } private static class UnionTypeInfo extends TypeInformation> { private static final long serialVersionUID = 1L; private final TypeInformation oneType; private final TypeInformation twoType; public UnionTypeInfo(TypeInformation oneType, TypeInformation twoType) { this.oneType = oneType; this.twoType = twoType; } @Override public boolean isBasicType() { return false; } @Override public boolean isTupleType() { return false; } @Override public int getArity() { return 2; } @Override public int getTotalFields() { return 2; } @Override @SuppressWarnings("unchecked, rawtypes") public Class> getTypeClass() { return (Class) TaggedUnion.class; } @Override public boolean isKeyType() { return true; } @Override public TypeSerializer> createSerializer(ExecutionConfig config) { return new UnionSerializer<>(oneType.createSerializer(config), twoType.createSerializer(config)); } @Override public String toString() { return "TaggedUnion<" + oneType + ", " + twoType + ">"; } @Override public boolean equals(Object obj) { if (obj instanceof UnionTypeInfo) { @SuppressWarnings("unchecked") UnionTypeInfo unionTypeInfo = (UnionTypeInfo) obj; return unionTypeInfo.canEqual(this) && oneType.equals(unionTypeInfo.oneType) && twoType.equals(unionTypeInfo.twoType); } else { return false; } } @Override public int hashCode() { return 31 * oneType.hashCode() + twoType.hashCode(); } @Override public boolean canEqual(Object obj) { return obj instanceof UnionTypeInfo; } } private static class UnionSerializer extends TypeSerializer> { private static final long serialVersionUID = 1L; private final TypeSerializer oneSerializer; private final TypeSerializer twoSerializer; public UnionSerializer(TypeSerializer oneSerializer, TypeSerializer twoSerializer) { this.oneSerializer = oneSerializer; this.twoSerializer = twoSerializer; } @Override public boolean isImmutableType() { return false; } @Override public TypeSerializer> duplicate() { return this; } @Override public TaggedUnion createInstance() { return null; } @Override public TaggedUnion copy(TaggedUnion from) { if (from.isOne()) { return TaggedUnion.one(oneSerializer.copy(from.getOne())); } else { return TaggedUnion.two(twoSerializer.copy(from.getTwo())); } } @Override public TaggedUnion copy(TaggedUnion from, TaggedUnion reuse) { if (from.isOne()) { return TaggedUnion.one(oneSerializer.copy(from.getOne())); } else { return TaggedUnion.two(twoSerializer.copy(from.getTwo())); } } @Override public int getLength() { return -1; } @Override public void serialize(TaggedUnion record, DataOutputView target) throws IOException { if (record.isOne()) { target.writeByte(1); oneSerializer.serialize(record.getOne(), target); } else { target.writeByte(2); twoSerializer.serialize(record.getTwo(), target); } } @Override public TaggedUnion deserialize(DataInputView source) throws IOException { byte tag = source.readByte(); if (tag == 1) { return TaggedUnion.one(oneSerializer.deserialize(source)); } else { return TaggedUnion.two(twoSerializer.deserialize(source)); } } @Override public TaggedUnion deserialize(TaggedUnion reuse, DataInputView source) throws IOException { byte tag = source.readByte(); if (tag == 1) { return TaggedUnion.one(oneSerializer.deserialize(source)); } else { return TaggedUnion.two(twoSerializer.deserialize(source)); } } @Override public void copy(DataInputView source, DataOutputView target) throws IOException { byte tag = source.readByte(); target.writeByte(tag); if (tag == 1) { oneSerializer.copy(source, target); } else { twoSerializer.copy(source, target); } } @Override public int hashCode() { return 31 * oneSerializer.hashCode() + twoSerializer.hashCode(); } @Override @SuppressWarnings("unchecked") public boolean equals(Object obj) { if (obj instanceof UnionSerializer) { UnionSerializer other = (UnionSerializer) obj; return other.canEqual(this) && oneSerializer.equals(other.oneSerializer) && twoSerializer.equals(other.twoSerializer); } else { return false; } } @Override public boolean canEqual(Object obj) { return obj instanceof UnionSerializer; } @Override public TypeSerializerConfigSnapshot snapshotConfiguration() { return new UnionSerializerConfigSnapshot<>(oneSerializer, twoSerializer); } @Override public CompatibilityResult> ensureCompatibility(TypeSerializerConfigSnapshot configSnapshot) { if (configSnapshot instanceof UnionSerializerConfigSnapshot) { List, TypeSerializerConfigSnapshot>> previousSerializersAndConfigs = ((UnionSerializerConfigSnapshot) configSnapshot).getNestedSerializersAndConfigs(); CompatibilityResult oneSerializerCompatResult = CompatibilityUtil.resolveCompatibilityResult( previousSerializersAndConfigs.get(0).f0, UnloadableDummyTypeSerializer.class, previousSerializersAndConfigs.get(0).f1, oneSerializer); CompatibilityResult twoSerializerCompatResult = CompatibilityUtil.resolveCompatibilityResult( previousSerializersAndConfigs.get(1).f0, UnloadableDummyTypeSerializer.class, previousSerializersAndConfigs.get(1).f1, twoSerializer); if (!oneSerializerCompatResult.isRequiresMigration() && !twoSerializerCompatResult.isRequiresMigration()) { return CompatibilityResult.compatible(); } else if (oneSerializerCompatResult.getConvertDeserializer() != null && twoSerializerCompatResult.getConvertDeserializer() != null) { return CompatibilityResult.requiresMigration( new UnionSerializer<>( new TypeDeserializerAdapter<>(oneSerializerCompatResult.getConvertDeserializer()), new TypeDeserializerAdapter<>(twoSerializerCompatResult.getConvertDeserializer()))); } } return CompatibilityResult.requiresMigration(); } } /** * The {@link TypeSerializerConfigSnapshot} for the {@link UnionSerializer}. */ public static class UnionSerializerConfigSnapshot extends CompositeTypeSerializerConfigSnapshot { private static final int VERSION = 1; /** This empty nullary constructor is required for deserializing the configuration. */ public UnionSerializerConfigSnapshot() {} public UnionSerializerConfigSnapshot(TypeSerializer oneSerializer, TypeSerializer twoSerializer) { super(oneSerializer, twoSerializer); } @Override public int getVersion() { return VERSION; } } // ------------------------------------------------------------------------ // Utility functions that implement the CoGroup logic based on the tagged // union window reduce // ------------------------------------------------------------------------ private static class Input1Tagger implements MapFunction> { private static final long serialVersionUID = 1L; @Override public TaggedUnion map(T1 value) throws Exception { return TaggedUnion.one(value); } } private static class Input2Tagger implements MapFunction> { private static final long serialVersionUID = 1L; @Override public TaggedUnion map(T2 value) throws Exception { return TaggedUnion.two(value); } } private static class UnionKeySelector implements KeySelector, KEY> { private static final long serialVersionUID = 1L; private final KeySelector keySelector1; private final KeySelector keySelector2; public UnionKeySelector(KeySelector keySelector1, KeySelector keySelector2) { this.keySelector1 = keySelector1; this.keySelector2 = keySelector2; } @Override public KEY getKey(TaggedUnion value) throws Exception{ if (value.isOne()) { return keySelector1.getKey(value.getOne()); } else { return keySelector2.getKey(value.getTwo()); } } } private static class CoGroupWindowFunction extends WrappingFunction> implements WindowFunction, T, KEY, W> { private static final long serialVersionUID = 1L; public CoGroupWindowFunction(CoGroupFunction userFunction) { super(userFunction); } @Override public void apply(KEY key, W window, Iterable> values, Collector out) throws Exception { List oneValues = new ArrayList<>(); List twoValues = new ArrayList<>(); for (TaggedUnion val: values) { if (val.isOne()) { oneValues.add(val.getOne()); } else { twoValues.add(val.getTwo()); } } wrappedFunction.coGroup(oneValues, twoValues, out); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy