All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.streaming.api.scala.JoinedStreams.scala Maven / Gradle / Ivy

There is a newer version: 0.10.2-hadoop1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.scala

import org.apache.flink.api.common.functions.{FlatJoinFunction, JoinFunction}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.streaming.api.datastream.{JoinedStreams => JavaJoinedStreams, CoGroupedStreams => JavaCoGroupedStreams}
import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner
import org.apache.flink.streaming.api.windowing.evictors.Evictor
import org.apache.flink.streaming.api.windowing.triggers.Trigger
import org.apache.flink.streaming.api.windowing.windows.Window
import org.apache.flink.util.Collector

import scala.reflect.ClassTag

/**
 * `JoinedStreams` represents two [[DataStream]]s that have been joined.
 * A streaming join operation is evaluated over elements in a window.
 *
 * To finalize the join operation you also need to specify a [[KeySelector]] for
 * both the first and second input and a [[WindowAssigner]]
 *
 * Note: Right now, the groups are being built in memory so you need to ensure that they don't
 * get too big. Otherwise the JVM might crash.
 *
 * Example:
 *
 * {{{
 * val one: DataStream[(String, Int)]  = ...
 * val two: DataStream[(String, Int)] = ...
 *
 * val result = one.join(two)
 *     .where {t => ... }
 *     .equal {t => ... }
 *     .window(TumblingTimeWindows.of(Time.of(5, TimeUnit.SECONDS)))
 *     .apply(new MyJoinFunction())
 * } }}}
 */
object JoinedStreams {

  /**
   * A join operation that does not yet have its [[KeySelector]]s defined.
   *
   * @tparam T1 Type of the elements from the first input
   * @tparam T2 Type of the elements from the second input
   */
  class Unspecified[T1, T2](input1: DataStream[T1], input2: DataStream[T2]) {

    /**
     * Specifies a [[KeySelector]] for elements from the first input.
     */
    def where[KEY: TypeInformation](keySelector: T1 => KEY): WithKey[T1, T2, KEY] = {
      val cleanFun = clean(keySelector)
      val keyType = implicitly[TypeInformation[KEY]]
      val javaSelector = new KeySelector[T1, KEY] with ResultTypeQueryable[KEY] {
        def getKey(in: T1) = cleanFun(in)
        override def getProducedType: TypeInformation[KEY] = keyType
      }
      new WithKey[T1, T2, KEY](input1, input2, javaSelector, null, keyType)
    }

    /**
     * Specifies a [[KeySelector]] for elements from the second input.
     */
    def equalTo[KEY: TypeInformation](keySelector: T2 => KEY): WithKey[T1, T2, KEY] = {
      val cleanFun = clean(keySelector)
      val keyType = implicitly[TypeInformation[KEY]]
      val javaSelector = new KeySelector[T2, KEY] with ResultTypeQueryable[KEY] {
        def getKey(in: T2) = cleanFun(in)
        override def getProducedType: TypeInformation[KEY] = keyType
      }
      new WithKey[T1, T2, KEY](input1, input2, null, javaSelector, keyType)
    }

    /**
     * Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning
     * is not disabled in the [[org.apache.flink.api.common.ExecutionConfig]].
     */
    private[flink] def clean[F <: AnyRef](f: F): F = {
      new StreamExecutionEnvironment(input1.getJavaStream.getExecutionEnvironment).scalaClean(f)
    }
  }

  /**
   * A join operation that has [[KeySelector]]s defined for either both or
   * one input.
   *
   * You need to specify a [[KeySelector]] for both inputs using [[where()]] and [[equalTo()]]
   * before you can proceeed with specifying a [[WindowAssigner]] using [[window()]].
   *
   * @tparam T1 Type of the elements from the first input
   * @tparam T2 Type of the elements from the second input
   * @tparam KEY Type of the key. This must be the same for both inputs
   */
  class WithKey[T1, T2, KEY](
      input1: DataStream[T1],
      input2: DataStream[T2],
      keySelector1: KeySelector[T1, KEY],
      keySelector2: KeySelector[T2, KEY],
      keyType: TypeInformation[KEY]) {

    /**
     * Specifies a [[KeySelector]] for elements from the first input.
     */
    def where(keySelector: T1 => KEY): JoinedStreams.WithKey[T1, T2, KEY] = {
      val cleanFun = clean(keySelector)
      val localKeyType = keyType
      val javaSelector = new KeySelector[T1, KEY] with ResultTypeQueryable[KEY] {
        def getKey(in: T1) = cleanFun(in)
        override def getProducedType: TypeInformation[KEY] = localKeyType
      }
      new WithKey[T1, T2, KEY](input1, input2, javaSelector, keySelector2, localKeyType)
    }

    /**
     * Specifies a [[KeySelector]] for elements from the second input.
     */
    def equalTo(keySelector: T2 => KEY): JoinedStreams.WithKey[T1, T2, KEY] = {
      val cleanFun = clean(keySelector)
      val localKeyType = keyType
      val javaSelector = new KeySelector[T2, KEY] with ResultTypeQueryable[KEY] {
        def getKey(in: T2) = cleanFun(in)
        override def getProducedType: TypeInformation[KEY] = localKeyType
      }
      new WithKey[T1, T2, KEY](input1, input2, keySelector1, javaSelector, localKeyType)
    }

    /**
     * Specifies the window on which the join operation works.
     */
    def window[W <: Window](
        assigner: WindowAssigner[_ >: JavaCoGroupedStreams.TaggedUnion[T1, T2], W])
        : JoinedStreams.WithWindow[T1, T2, KEY, W] = {
      if (keySelector1 == null || keySelector2 == null) {
        throw new UnsupportedOperationException("You first need to specify KeySelectors for both" +
          "inputs using where() and equalTo().")
      }
      new JoinedStreams.WithWindow[T1, T2, KEY, W](
        input1,
        input2,
        keySelector1,
        keySelector2,
        clean(assigner),
        null,
        null)
    }

    /**
     * Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning
     * is not disabled in the [[org.apache.flink.api.common.ExecutionConfig]].
     */
    private[flink] def clean[F <: AnyRef](f: F): F = {
      new StreamExecutionEnvironment(input1.getJavaStream.getExecutionEnvironment).scalaClean(f)
    }
  }

  /**
   * A join operation that has [[KeySelector]]s defined for both inputs as
   * well as a [[WindowAssigner]].
   *
   * @tparam T1 Type of the elements from the first input
   * @tparam T2 Type of the elements from the second input
   * @tparam KEY Type of the key. This must be the same for both inputs
   * @tparam W Type of { @link Window} on which the join operation works.
   */
  class WithWindow[T1, T2, KEY, W <: Window](
      input1: DataStream[T1],
      input2: DataStream[T2],
      keySelector1: KeySelector[T1, KEY],
      keySelector2: KeySelector[T2, KEY],
      windowAssigner: WindowAssigner[_ >: JavaCoGroupedStreams.TaggedUnion[T1, T2], W],
      trigger: Trigger[_ >: JavaCoGroupedStreams.TaggedUnion[T1, T2], _ >: W],
      evictor: Evictor[_ >: JavaCoGroupedStreams.TaggedUnion[T1, T2], _ >: W]) {


    /**
     * Sets the [[Trigger]] that should be used to trigger window emission.
     */
    def trigger(newTrigger: Trigger[_ >: JavaCoGroupedStreams.TaggedUnion[T1, T2], _ >: W])
    : JoinedStreams.WithWindow[T1, T2, KEY, W] = {
      new WithWindow[T1, T2, KEY, W](
        input1,
        input2,
        keySelector1,
        keySelector2,
        windowAssigner,
        newTrigger,
        evictor)
    }

    /**
     * Sets the [[Evictor]] that should be used to evict elements from a window before emission.
     *
     * Note: When using an evictor window performance will degrade significantly, since
     * pre-aggregation of window results cannot be used.
     */
    def evictor(newEvictor: Evictor[_ >: JavaCoGroupedStreams.TaggedUnion[T1, T2], _ >: W])
    : JoinedStreams.WithWindow[T1, T2, KEY, W] = {
      new WithWindow[T1, T2, KEY, W](
        input1,
        input2,
        keySelector1,
        keySelector2,
        windowAssigner,
        trigger,
        newEvictor)
    }

    /**
     * Completes the join operation with the user function that is executed
     * for windowed groups.
     */
    def apply[O: TypeInformation: ClassTag](fun: (T1, T2) => O): DataStream[O] = {
      require(fun != null, "Join function must not be null.")

      val joiner = new FlatJoinFunction[T1, T2, O] {
        val cleanFun = clean(fun)
        def join(left: T1, right: T2, out: Collector[O]) = {
          out.collect(cleanFun(left, right))
        }
      }
      apply(joiner)
    }

    /**
     * Completes the join operation with the user function that is executed
     * for windowed groups.
     */

    def apply[O: TypeInformation: ClassTag](fun: (T1, T2, Collector[O]) => Unit): DataStream[O] = {
      require(fun != null, "Join function must not be null.")

      val joiner = new FlatJoinFunction[T1, T2, O] {
        val cleanFun = clean(fun)
        def join(left: T1, right: T2, out: Collector[O]) = {
          cleanFun(left, right, out)
        }
      }
      apply(joiner)
    }

    /**
     * Completes the join operation with the user function that is executed
     * for windowed groups.
     */
    def apply[T: TypeInformation](function: JoinFunction[T1, T2, T]): DataStream[T] = {

      val join = new JavaJoinedStreams[T1, T2](input1.getJavaStream, input2.getJavaStream)

      join
        .where(keySelector1)
        .equalTo(keySelector2)
        .window(windowAssigner)
        .trigger(trigger)
        .evictor(evictor)
        .apply(clean(function), implicitly[TypeInformation[T]])
    }

    /**
     * Completes the join operation with the user function that is executed
     * for windowed groups.
     */
    def apply[T: TypeInformation](function: FlatJoinFunction[T1, T2, T]): DataStream[T] = {

      val join = new JavaJoinedStreams[T1, T2](input1.getJavaStream, input2.getJavaStream)

      join
        .where(keySelector1)
        .equalTo(keySelector2)
        .window(windowAssigner)
        .trigger(trigger)
        .evictor(evictor)
        .apply(clean(function), implicitly[TypeInformation[T]])
    }

    /**
     * Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning
     * is not disabled in the [[org.apache.flink.api.common.ExecutionConfig]].
     */
    private[flink] def clean[F <: AnyRef](f: F): F = {
      new StreamExecutionEnvironment(input1.getJavaStream.getExecutionEnvironment).scalaClean(f)
    }
  }


  /**
   * Creates a new join operation from the two given inputs.
   */
  def createJoin[T1, T2](input1: DataStream[T1], input2: DataStream[T2])
      : JoinedStreams.Unspecified[T1, T2] = {
    new JoinedStreams.Unspecified[T1, T2](input1, input2)
  }

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy