All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.flink.streaming.api.datastream.JoinedStreams Maven / Gradle / Ivy

There is a newer version: 1.14.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.datastream;

import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.FlatJoinFunction;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.translation.WrappingFunction;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.streaming.api.datastream.CoGroupedStreams.TaggedUnion;
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner;
import org.apache.flink.streaming.api.windowing.evictors.Evictor;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.util.Collector;

import static java.util.Objects.requireNonNull;

/**
 *{@code JoinedStreams} represents two {@link DataStream DataStreams} that have been joined.
 * A streaming join operation is evaluated over elements in a window.
 *
 * 

To finalize the join operation you also need to specify a {@link KeySelector} for * both the first and second input and a {@link WindowAssigner}. * *

Note: Right now, the join is being evaluated in memory so you need to ensure that the number * of elements per key does not get too high. Otherwise the JVM might crash. * *

Example: *

 {@code
 * DataStream> one = ...;
 * DataStream> twp = ...;
 *
 * DataStream result = one.join(two)
 *     .where(new MyFirstKeySelector())
 *     .equalTo(new MyFirstKeySelector())
 *     .window(TumblingEventTimeWindows.of(Time.of(5, TimeUnit.SECONDS)))
 *     .apply(new MyJoinFunction());
 * } 
*/ @Public public class JoinedStreams { /** The first input stream. */ private final DataStream input1; /** The second input stream. */ private final DataStream input2; /** * Creates new JoinedStreams data streams, which are the first step towards building a streaming co-group. * * @param input1 The first data stream. * @param input2 The second data stream. */ public JoinedStreams(DataStream input1, DataStream input2) { this.input1 = requireNonNull(input1); this.input2 = requireNonNull(input2); } /** * Specifies a {@link KeySelector} for elements from the first input. */ public Where where(KeySelector keySelector) { TypeInformation keyType = TypeExtractor.getKeySelectorTypes(keySelector, input1.getType()); return new Where<>(input1.clean(keySelector), keyType); } // ------------------------------------------------------------------------ /** * Joined streams that have the key for one side defined. * * @param The type of the key. */ @Public public class Where { private final KeySelector keySelector1; private final TypeInformation keyType; Where(KeySelector keySelector1, TypeInformation keyType) { this.keySelector1 = keySelector1; this.keyType = keyType; } /** * Specifies a {@link KeySelector} for elements from the second input. */ public EqualTo equalTo(KeySelector keySelector) { TypeInformation otherKey = TypeExtractor.getKeySelectorTypes(keySelector, input2.getType()); if (!otherKey.equals(this.keyType)) { throw new IllegalArgumentException("The keys for the two inputs are not equal: " + "first key = " + this.keyType + " , second key = " + otherKey); } return new EqualTo(input2.clean(keySelector)); } // -------------------------------------------------------------------- /** * A join operation that has {@link KeySelector KeySelectors} defined for both inputs. 
*/ @Public public class EqualTo { private final KeySelector keySelector2; EqualTo(KeySelector keySelector2) { this.keySelector2 = requireNonNull(keySelector2); } /** * Specifies the window on which the join operation works. */ @PublicEvolving public WithWindow window(WindowAssigner, W> assigner) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, assigner, null, null); } } } // ------------------------------------------------------------------------ /** * A join operation that has {@link KeySelector KeySelectors} defined for both inputs as * well as a {@link WindowAssigner}. * * @param Type of the elements from the first input * @param Type of the elements from the second input * @param Type of the key. This must be the same for both inputs * @param Type of {@link Window} on which the join operation works. */ @Public public static class WithWindow { private final DataStream input1; private final DataStream input2; private final KeySelector keySelector1; private final KeySelector keySelector2; private final TypeInformation keyType; private final WindowAssigner, W> windowAssigner; private final Trigger, ? super W> trigger; private final Evictor, ? super W> evictor; @PublicEvolving protected WithWindow(DataStream input1, DataStream input2, KeySelector keySelector1, KeySelector keySelector2, TypeInformation keyType, WindowAssigner, W> windowAssigner, Trigger, ? super W> trigger, Evictor, ? super W> evictor) { this.input1 = requireNonNull(input1); this.input2 = requireNonNull(input2); this.keySelector1 = requireNonNull(keySelector1); this.keySelector2 = requireNonNull(keySelector2); this.keyType = requireNonNull(keyType); this.windowAssigner = requireNonNull(windowAssigner); this.trigger = trigger; this.evictor = evictor; } /** * Sets the {@code Trigger} that should be used to trigger window emission. */ @PublicEvolving public WithWindow trigger(Trigger, ? 
super W> newTrigger) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, newTrigger, evictor); } /** * Sets the {@code Evictor} that should be used to evict elements from a window before emission. * *

Note: When using an evictor window performance will degrade significantly, since * pre-aggregation of window results cannot be used. */ @PublicEvolving public WithWindow evictor(Evictor, ? super W> newEvictor) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, newEvictor); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This method's return type does not support setting an operator-specific parallelism. * Due to binary backwards compatibility, this cannot be altered. Use the {@link #with(JoinFunction)} * method to set an operator-specific parallelism. */ public DataStream apply(JoinFunction function) { TypeInformation resultType = TypeExtractor.getBinaryOperatorReturnType( function, JoinFunction.class, 0, 1, 2, new int[]{0}, new int[]{1}, TypeExtractor.NO_INDEX, input1.getType(), input2.getType(), "Join", false); return apply(function, resultType); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This is a temporary workaround while the {@link #apply(JoinFunction)} * method has the wrong return type and hence does not allow one to set an operator-specific * parallelism * * @deprecated This method will be removed once the {@link #apply(JoinFunction)} method is fixed * in the next major version of Flink (2.0). */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with(JoinFunction function) { return (SingleOutputStreamOperator) apply(function); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This method's return type does not support setting an operator-specific parallelism. * Due to binary backwards compatibility, this cannot be altered. Use the * {@link #with(JoinFunction, TypeInformation)}, method to set an operator-specific parallelism. */ public DataStream apply(FlatJoinFunction function, TypeInformation resultType) { //clean the closure function = input1.getExecutionEnvironment().clean(function); return input1.coGroup(input2) .where(keySelector1) .equalTo(keySelector2) .window(windowAssigner) .trigger(trigger) .evictor(evictor) .apply(new FlatJoinCoGroupFunction<>(function), resultType); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This is a temporary workaround while the {@link #apply(JoinFunction, TypeInformation)} * method has the wrong return type and hence does not allow one to set an operator-specific * parallelism * * @deprecated This method will be replaced by {@link #apply(FlatJoinFunction, TypeInformation)} in Flink 2.0. * So use the {@link #apply(FlatJoinFunction, TypeInformation)} in the future. */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with(FlatJoinFunction function, TypeInformation resultType) { return (SingleOutputStreamOperator) apply(function, resultType); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This method's return type does not support setting an operator-specific parallelism. * Due to binary backwards compatibility, this cannot be altered. Use the * {@link #with(FlatJoinFunction)}, method to set an operator-specific parallelism. */ public DataStream apply(FlatJoinFunction function) { TypeInformation resultType = TypeExtractor.getBinaryOperatorReturnType( function, FlatJoinFunction.class, 0, 1, 2, new int[]{0}, new int[]{1}, new int[]{2, 0}, input1.getType(), input2.getType(), "Join", false); return apply(function, resultType); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This is a temporary workaround while the {@link #apply(FlatJoinFunction)} * method has the wrong return type and hence does not allow one to set an operator-specific * parallelism. * * @deprecated This method will be removed once the {@link #apply(FlatJoinFunction)} * method is fixed in the next major version of Flink (2.0). */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with(FlatJoinFunction function) { return (SingleOutputStreamOperator) apply(function); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This method's return type does not support setting an operator-specific parallelism. * Due to binary backwards compatibility, this cannot be altered. Use the * {@link #with(JoinFunction, TypeInformation)}, method to set an operator-specific parallelism. */ public DataStream apply(JoinFunction function, TypeInformation resultType) { //clean the closure function = input1.getExecutionEnvironment().clean(function); return input1.coGroup(input2) .where(keySelector1) .equalTo(keySelector2) .window(windowAssigner) .trigger(trigger) .evictor(evictor) .apply(new JoinCoGroupFunction<>(function), resultType); } /** * Completes the join operation with the user function that is executed * for each combination of elements with the same key in a window. * *

Note: This is a temporary workaround while the {@link #apply(FlatJoinFunction, TypeInformation)} * method has the wrong return type and hence does not allow one to set an operator-specific * parallelism * * @deprecated This method will be removed once the {@link #apply(JoinFunction, TypeInformation)} * method is fixed in the next major version of Flink (2.0). */ @PublicEvolving @Deprecated public SingleOutputStreamOperator with(JoinFunction function, TypeInformation resultType) { return (SingleOutputStreamOperator) apply(function, resultType); } } // ------------------------------------------------------------------------ // Implementation of the functions // ------------------------------------------------------------------------ /** * CoGroup function that does a nested-loop join to get the join result. */ private static class JoinCoGroupFunction extends WrappingFunction> implements CoGroupFunction { private static final long serialVersionUID = 1L; public JoinCoGroupFunction(JoinFunction wrappedFunction) { super(wrappedFunction); } @Override public void coGroup(Iterable first, Iterable second, Collector out) throws Exception { for (T1 val1: first) { for (T2 val2: second) { out.collect(wrappedFunction.join(val1, val2)); } } } } /** * CoGroup function that does a nested-loop join to get the join result. (FlatJoin version) */ private static class FlatJoinCoGroupFunction extends WrappingFunction> implements CoGroupFunction { private static final long serialVersionUID = 1L; public FlatJoinCoGroupFunction(FlatJoinFunction wrappedFunction) { super(wrappedFunction); } @Override public void coGroup(Iterable first, Iterable second, Collector out) throws Exception { for (T1 val1: first) { for (T2 val2: second) { wrappedFunction.join(val1, val2, out); } } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy