
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.scala

import java.util
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.api.scala.ClosureCleaner
import org.apache.flink.streaming.api.datastream.{ConnectedDataStream => JavaCStream, DataStream => JavaStream}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction, CoReduceFunction, CoWindowFunction}
import org.apache.flink.util.Collector
import scala.collection.JavaConverters._
import scala.reflect.ClassTag

class ConnectedDataStream[IN1, IN2](javaStream: JavaCStream[IN1, IN2]) {

  /**
   * Applies a CoMap transformation on a {@link ConnectedDataStream} and maps
   * the output to a common type. The transformation calls fun1 for each
   * element of the first input and fun2 for each element of the second
   * input. Each function call returns exactly one element.
   *
   * @param fun1
   * The function used to transform elements of the first input
   * @param fun2
   * The function used to transform elements of the second input
   * @return The transformed {@link DataStream}
   */
  def map[R: TypeInformation: ClassTag](fun1: IN1 => R, fun2: IN2 => R): 
  DataStream[R] = {
    if (fun1 == null || fun2 == null) {
      throw new NullPointerException("Map function must not be null.")
    }
    val cleanFun1 = clean(fun1)
    val cleanFun2 = clean(fun2)
    val comapper = new CoMapFunction[IN1, IN2, R] {
      def map1(in1: IN1): R = cleanFun1(in1)
      def map2(in2: IN2): R = cleanFun2(in2)
    }

    map(comapper)
  }
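  // Hypothetical usage sketch (not part of the original file): assuming
  // `connected` is a ConnectedDataStream[Int, String] obtained via
  // `ints.connect(strings)`, both inputs can be mapped to a common output type:
  //
  //   val merged: DataStream[String] =
  //     connected.map(i => i.toString, s => s.toUpperCase)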

  /**
   * Applies a CoMap transformation on a {@link ConnectedDataStream} and maps
   * the output to a common type. The transformation calls
   * {@link CoMapFunction#map1} for each element of the first input and
   * {@link CoMapFunction#map2} for each element of the second input. Each
   * CoMapFunction call returns exactly one element. The user can also extend
   * {@link RichCoMapFunction} to gain access to other features provided by
   * the {@link RichFunction} interface.
   *
   * @param coMapper
   * The CoMapFunction used to jointly transform the two input
   * DataStreams
   * @return The transformed {@link DataStream}
   */
  def map[R: TypeInformation: ClassTag](coMapper: CoMapFunction[IN1, IN2, R]): 
  DataStream[R] = {
    if (coMapper == null) {
      throw new NullPointerException("Map function must not be null.")
    }

    val outType: TypeInformation[R] = implicitly[TypeInformation[R]]
    javaStream.map(coMapper).returns(outType).asInstanceOf[JavaStream[R]]
  }
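  // Hypothetical usage sketch: the same transformation expressed with an
  // explicit CoMapFunction, useful when the logic outgrows inline lambdas:
  //
  //   val merged: DataStream[String] =
  //     connected.map(new CoMapFunction[Int, String, String] {
  //       def map1(in1: Int): String = in1.toString
  //       def map2(in2: String): String = in2.toUpperCase
  //     })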

  /**
   * Applies a CoFlatMap transformation on a {@link ConnectedDataStream} and
   * maps the output to a common type. The transformation calls
   * {@link CoFlatMapFunction#flatMap1} for each element of the first input
   * and {@link CoFlatMapFunction#flatMap2} for each element of the second
   * input. Each CoFlatMapFunction call can return any number of elements
   * including none. The user can also extend {@link RichCoFlatMapFunction} to
   * gain access to other features provided by the {@link RichFunction}
   * interface.
   *
   * @param coFlatMapper
   * The CoFlatMapFunction used to jointly transform the two input
   * DataStreams
   * @return The transformed {@link DataStream}
   */
  def flatMap[R: TypeInformation: ClassTag](coFlatMapper: CoFlatMapFunction[IN1, IN2, R]): 
  DataStream[R] = {
    if (coFlatMapper == null) {
      throw new NullPointerException("FlatMap function must not be null.")
    }
    
    val outType: TypeInformation[R] = implicitly[TypeInformation[R]]
    javaStream.flatMap(coFlatMapper).returns(outType).asInstanceOf[JavaStream[R]]
  }

  /**
   * Applies a CoFlatMap transformation on a {@link ConnectedDataStream} and
   * maps the output to a common type. The transformation calls fun1 for each
   * element of the first input and fun2 for each element of the second
   * input. Each function call can return any number of elements
   * including none.
   *
   * @param fun1
   * The function used to transform elements of the first input
   * @param fun2
   * The function used to transform elements of the second input
   * @return The transformed {@link DataStream}
   */
  def flatMap[R: TypeInformation: ClassTag](fun1: (IN1, Collector[R]) => Unit, 
      fun2: (IN2, Collector[R]) => Unit): DataStream[R] = {
    if (fun1 == null || fun2 == null) {
      throw new NullPointerException("FlatMap functions must not be null.")
    }
    val cleanFun1 = clean(fun1)
    val cleanFun2 = clean(fun2)
    val flatMapper = new CoFlatMapFunction[IN1, IN2, R] {
      def flatMap1(value: IN1, out: Collector[R]): Unit = cleanFun1(value, out)
      def flatMap2(value: IN2, out: Collector[R]): Unit = cleanFun2(value, out)
    }
    flatMap(flatMapper)
  }
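  // Hypothetical usage sketch: assuming a ConnectedDataStream[String, String],
  // tokenize the first input and forward the second unchanged using the
  // Collector-based variant above:
  //
  //   val words: DataStream[String] = connected.flatMap(
  //     (line: String, out: Collector[String]) => line.split(" ").foreach(out.collect),
  //     (word: String, out: Collector[String]) => out.collect(word))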

  /**
   * Applies a CoFlatMap transformation on a {@link ConnectedDataStream} and
   * maps the output to a common type. The transformation calls fun1 for each
   * element of the first input and fun2 for each element of the second
   * input. Each function call can return any number of elements
   * including none.
   *
   * @param fun1
   * The function used to transform elements of the first input
   * @param fun2
   * The function used to transform elements of the second input
   * @return The transformed {@link DataStream}
   */
  def flatMap[R: TypeInformation: ClassTag](fun1: IN1 => TraversableOnce[R],
      fun2: IN2 => TraversableOnce[R]): DataStream[R] = {
    if (fun1 == null || fun2 == null) {
      throw new NullPointerException("FlatMap functions must not be null.")
    }
    val cleanFun1 = clean(fun1)
    val cleanFun2 = clean(fun2)
    val flatMapper = new CoFlatMapFunction[IN1, IN2, R] {
      def flatMap1(value: IN1, out: Collector[R]) = { cleanFun1(value) foreach out.collect }
      def flatMap2(value: IN2, out: Collector[R]) = { cleanFun2(value) foreach out.collect }
    }
    flatMap(flatMapper)
  }
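  // Hypothetical usage sketch: the same tokenization with the TraversableOnce
  // variant above, which collects every element of the returned collection:
  //
  //   val words: DataStream[String] =
  //     connected.flatMap(line => line.split(" "), word => Seq(word))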

  /**
   * GroupBy operation for connected data stream. Groups the elements of
   * input1 and input2 according to keyPosition1 and keyPosition2. Used for
   * applying functions on grouped data streams, for example
   * {@link ConnectedDataStream#reduce}.
   *
   * @param keyPosition1
   * The field used to compute the hashcode of the elements in the
   * first input stream.
   * @param keyPosition2
   * The field used to compute the hashcode of the elements in the
   * second input stream.
   * @return The grouped {@link ConnectedDataStream}
   */
  def groupBy(keyPosition1: Int, keyPosition2: Int): ConnectedDataStream[IN1, IN2] = {
    javaStream.groupBy(keyPosition1, keyPosition2)
  }
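  // Hypothetical usage sketch: assuming two tuple streams, group both inputs
  // on their first field before applying a keyed co-reduce:
  //
  //   val grouped = connectedTuples.groupBy(0, 0)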

  /**
   * GroupBy operation for connected data stream. Groups the elements of
   * input1 and input2 according to keyPositions1 and keyPositions2. Used for
   * applying functions on grouped data streams, for example
   * {@link ConnectedDataStream#reduce}.
   *
   * @param keyPositions1
   * The fields used to group the first input stream.
   * @param keyPositions2
   * The fields used to group the second input stream.
   * @return The grouped {@link ConnectedDataStream}
   */
  def groupBy(keyPositions1: Array[Int], keyPositions2: Array[Int]): 
  ConnectedDataStream[IN1, IN2] = {
    javaStream.groupBy(keyPositions1, keyPositions2)
  }

  /**
   * GroupBy operation for connected data stream using key expressions. Groups
   * the elements of input1 and input2 according to field1 and field2. A field
   * expression is either the name of a public field or a getter method with
   * parentheses of the {@link DataStream}'s underlying type. A dot can be used
   * to drill down into objects, as in {@code "field1.getInnerField2()" }.
   *
   * @param field1
   * The grouping expression for the first input
   * @param field2
   * The grouping expression for the second input
   * @return The grouped {@link ConnectedDataStream}
   */
  def groupBy(field1: String, field2: String): ConnectedDataStream[IN1, IN2] = {
    javaStream.groupBy(field1, field2)
  }
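  // Hypothetical usage sketch: assuming case classes with a `word` field on
  // the first input and a `name` field on the second:
  //
  //   val grouped = connected.groupBy("word", "name")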

  /**
   * GroupBy operation for connected data stream using key expressions. Groups
   * the elements of input1 and input2 according to fields1 and fields2. A
   * field expression is either the name of a public field or a getter method
   * with parentheses of the {@link DataStream}'s underlying type. A dot can be
   * used to drill down into objects, as in {@code "field1.getInnerField2()" }.
   *
   * @param fields1
   * The grouping expressions for the first input
   * @param fields2
   * The grouping expressions for the second input
   * @return The grouped {@link ConnectedDataStream}
   */
  def groupBy(fields1: Array[String], fields2: Array[String]): 
  ConnectedDataStream[IN1, IN2] = {
    javaStream.groupBy(fields1, fields2)
  }

  /**
   * GroupBy operation for connected data stream. Groups the elements of
   * input1 and input2 using fun1 and fun2. Used for applying
   * functions on grouped data streams, for example
   * {@link ConnectedDataStream#reduce}.
   *
   * @param fun1
   * The function used for grouping the first input
   * @param fun2
   * The function used for grouping the second input
   * @return The grouped {@link ConnectedDataStream}
   */
  def groupBy[K: TypeInformation, L: TypeInformation](fun1: IN1 => K, fun2: IN2 => L):
  ConnectedDataStream[IN1, IN2] = {

    val cleanFun1 = clean(fun1)
    val cleanFun2 = clean(fun2)
    val keyExtractor1 = new KeySelector[IN1, K] {
      def getKey(in: IN1) = cleanFun1(in)
    }
    val keyExtractor2 = new KeySelector[IN2, L] {
      def getKey(in: IN2) = cleanFun2(in)
    }

    javaStream.groupBy(keyExtractor1, keyExtractor2)
  }
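  // Hypothetical usage sketch: assuming case classes Event(id: Long, ...) and
  // Metric(key: Long, ...) on the two inputs, group both by their key field:
  //
  //   val grouped = connected.groupBy(event => event.id, metric => metric.key)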

  /**
   * PartitionBy operation for connected data stream. Partitions the elements of
   * input1 and input2 according to keyPosition1 and keyPosition2.
   *
   * @param keyPosition1
   * The field used to compute the hashcode of the elements in the
   * first input stream.
   * @param keyPosition2
   * The field used to compute the hashcode of the elements in the
   * second input stream.
   * @return The partitioned {@link ConnectedDataStream}
   */
  def partitionByHash(keyPosition1: Int, keyPosition2: Int): ConnectedDataStream[IN1, IN2] = {
    javaStream.partitionByHash(keyPosition1, keyPosition2)
  }

  /**
   * PartitionBy operation for connected data stream. Partitions the elements of
   * input1 and input2 according to keyPositions1 and keyPositions2.
   *
   * @param keyPositions1
   * The fields used to partition the first input stream.
   * @param keyPositions2
   * The fields used to partition the second input stream.
   * @return The partitioned {@link ConnectedDataStream}
   */
  def partitionByHash(keyPositions1: Array[Int], keyPositions2: Array[Int]):
  ConnectedDataStream[IN1, IN2] = {
    javaStream.partitionByHash(keyPositions1, keyPositions2)
  }

  /**
   * PartitionBy operation for connected data stream using key expressions. Partitions
   * the elements of input1 and input2 according to field1 and field2. A field
   * expression is either the name of a public field or a getter method with
   * parentheses of the {@link DataStream}'s underlying type. A dot can be used
   * to drill down into objects, as in {@code "field1.getInnerField2()" }.
   *
   * @param field1
   * The partitioning expression for the first input
   * @param field2
   * The partitioning expression for the second input
   * @return The partitioned {@link ConnectedDataStream}
   */
  def partitionByHash(field1: String, field2: String): ConnectedDataStream[IN1, IN2] = {
    javaStream.partitionByHash(field1, field2)
  }

  /**
   * PartitionBy operation for connected data stream using key expressions. Partitions
   * the elements of input1 and input2 according to fields1 and fields2.
   *
   * @param fields1
   * The partitioning expressions for the first input
   * @param fields2
   * The partitioning expressions for the second input
   * @return The partitioned {@link ConnectedDataStream}
   */
  def partitionByHash(fields1: Array[String], fields2: Array[String]):
  ConnectedDataStream[IN1, IN2] = {
    javaStream.partitionByHash(fields1, fields2)
  }

  /**
   * PartitionBy operation for connected data stream. Partitions the elements of
   * input1 and input2 using fun1 and fun2.
   *
   * @param fun1
   * The function used for partitioning the first input
   * @param fun2
   * The function used for partitioning the second input
   * @return The partitioned {@link ConnectedDataStream}
   */
  def partitionByHash[K: TypeInformation, L: TypeInformation](fun1: IN1 => K, fun2: IN2 => L):
  ConnectedDataStream[IN1, IN2] = {

    val cleanFun1 = clean(fun1)
    val cleanFun2 = clean(fun2)

    val keyExtractor1 = new KeySelector[IN1, K] {
      def getKey(in: IN1) = cleanFun1(in)
    }
    val keyExtractor2 = new KeySelector[IN2, L] {
      def getKey(in: IN2) = cleanFun2(in)
    }

    javaStream.partitionByHash(keyExtractor1, keyExtractor2)
  }
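  // Hypothetical usage sketch: hash-partition both inputs by the same
  // (hypothetical) key fields, without creating grouped semantics:
  //
  //   val partitioned =
  //     connected.partitionByHash(event => event.id, metric => metric.key)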

  /**
   * Applies a reduce transformation on a {@link ConnectedDataStream} and maps
   * the outputs to a common type. If the {@link ConnectedDataStream} is
   * batched or windowed then the reduce transformation is applied on every
   * sliding batch/window of the data stream. If the connected data stream is
   * grouped then the reducer is applied on every group of elements sharing
   * the same key. This type of reduce is much faster than reduceGroup since
   * the reduce function can be applied incrementally.
   *
   * @param coReducer
   * The {@link CoReduceFunction} that will be called for every
   * element of the inputs.
   * @return The transformed {@link DataStream}.
   */
  def reduce[R: TypeInformation: ClassTag](coReducer: CoReduceFunction[IN1, IN2, R]): 
  DataStream[R] = {
    if (coReducer == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    
    val outType: TypeInformation[R] = implicitly[TypeInformation[R]]
    javaStream.reduce(coReducer).returns(outType).asInstanceOf[JavaStream[R]]
  }
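  // Hypothetical usage sketch: assuming a grouped ConnectedDataStream[Int, Long],
  // incrementally sum the first input, take the max of the second, and map both
  // to a common String output:
  //
  //   val reduced: DataStream[String] =
  //     grouped.reduce(new CoReduceFunction[Int, Long, String] {
  //       def reduce1(a: Int, b: Int): Int = a + b
  //       def reduce2(a: Long, b: Long): Long = a max b
  //       def map1(v: Int): String = "sum=" + v
  //       def map2(v: Long): String = "max=" + v
  //     })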

  /**
   * Applies a reduce transformation on a {@link ConnectedDataStream} and maps
   * the outputs to a common type. If the {@link ConnectedDataStream} is
   * batched or windowed then the reduce transformation is applied on every
   * sliding batch/window of the data stream. If the connected data stream is
   * grouped then the reducer is applied on every group of elements sharing
   * the same key. This type of reduce is much faster than reduceGroup since
   * the reduce function can be applied incrementally.
   *
   * @param reducer1
   * The reduce function applied to elements of the first input
   * @param reducer2
   * The reduce function applied to elements of the second input
   * @param mapper1
   * The function mapping reduced elements of the first input to the output type
   * @param mapper2
   * The function mapping reduced elements of the second input to the output type
   * @return The transformed {@link DataStream}.
   */
  def reduce[R: TypeInformation: ClassTag](reducer1: (IN1, IN1) => IN1,
      reducer2: (IN2, IN2) => IN2, mapper1: IN1 => R, mapper2: IN2 => R): DataStream[R] = {
    if (mapper1 == null || mapper2 == null) {
      throw new NullPointerException("Map functions must not be null.")
    }
    if (reducer1 == null || reducer2 == null) {
      throw new NullPointerException("Reduce functions must not be null.")
    }

    val cleanReducer1 = clean(reducer1)
    val cleanReducer2 = clean(reducer2)
    val cleanMapper1 = clean(mapper1)
    val cleanMapper2 = clean(mapper2)

    val reducer = new CoReduceFunction[IN1, IN2, R] {
      def reduce1(value1: IN1, value2: IN1): IN1 = cleanReducer1(value1, value2)
      def reduce2(value1: IN2, value2: IN2): IN2 = cleanReducer2(value1, value2)
      def map1(value: IN1): R = cleanMapper1(value)
      def map2(value: IN2): R = cleanMapper2(value)
    }
    reduce(reducer)
  }
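  // Hypothetical usage sketch: the same reduce expressed with plain functions,
  // matching the (reducer1, reducer2, mapper1, mapper2) parameter order above:
  //
  //   val reduced: DataStream[String] = grouped.reduce(
  //     (a: Int, b: Int) => a + b,       // reduce elements of the first input
  //     (a: Long, b: Long) => a max b,   // reduce elements of the second input
  //     (sum: Int) => "sum=" + sum,      // map the first input to the output type
  //     (max: Long) => "max=" + max)     // map the second input to the output type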

  /**
   * Applies a CoWindow transformation on the connected DataStreams. The
   * transformation calls the {@link CoWindowFunction#coWindow} method for
   * time aligned windows of the two data streams. System time is used as
   * default to compute windows.
   *
   * @param coWindowFunction
   * The {@link CoWindowFunction} that will be applied for the time
   * windows.
   * @param windowSize
   * Size of the windows that will be aligned for both streams in
   * milliseconds.
   * @param slideInterval
   * After every function call the windows will be slid by this
   * interval.
   *
   * @return The transformed {@link DataStream}.
   */
  def windowReduce[R: TypeInformation: ClassTag](coWindowFunction: 
      CoWindowFunction[IN1, IN2, R], windowSize: Long, slideInterval: Long):
      DataStream[R] = {
    if (coWindowFunction == null) {
      throw new NullPointerException("CoWindow function must not be null.")
    }
    
    val outType: TypeInformation[R] = implicitly[TypeInformation[R]]
    
    javaStream.windowReduce(coWindowFunction, windowSize, slideInterval).
    returns(outType).asInstanceOf[JavaStream[R]]
  }

  /**
   * Applies a CoWindow transformation on the connected DataStreams. The
   * transformation calls the {@link CoWindowFunction#coWindow} method for
   * time aligned windows of the two data streams. System time is used as
   * default to compute windows.
   *
   * @param coWindower
   * The coWindowing function to be applied for the time windows.
   * @param windowSize
   * Size of the windows that will be aligned for both streams in
   * milliseconds.
   * @param slideInterval
   * After every function call the windows will be slid by this
   * interval.
   *
   * @return The transformed {@link DataStream}.
   */
  def windowReduce[R: TypeInformation: ClassTag](coWindower: (Seq[IN1], Seq[IN2], 
      Collector[R]) => Unit, windowSize: Long, slideInterval: Long):
      DataStream[R] = {
    if (coWindower == null) {
      throw new NullPointerException("CoWindow function must not be null.")
    }

    val cleanCoWindower = clean(coWindower)

    val coWindowFun = new CoWindowFunction[IN1, IN2, R] {
      def coWindow(first: util.List[IN1], second: util.List[IN2], 
          out: Collector[R]): Unit = cleanCoWindower(first.asScala, second.asScala, out)
    }

    windowReduce(coWindowFun, windowSize, slideInterval)
  }
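  // Hypothetical usage sketch: assuming a ConnectedDataStream[Int, String],
  // count the elements of both inputs in 5 second windows sliding every second
  // (sizes given in milliseconds):
  //
  //   val counts: DataStream[Int] = connected.windowReduce(
  //     (first: Seq[Int], second: Seq[String], out: Collector[Int]) =>
  //       out.collect(first.size + second.size),
  //     5000L,    // window size: 5 seconds
  //     1000L)    // slide interval: 1 second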

  /**
   * Returns the first {@link DataStream}.
   *
   * @return The first DataStream.
   */
  def getFirst(): DataStream[IN1] = {
    javaStream.getFirst
  }

  /**
   * Returns the second {@link DataStream}.
   *
   * @return The second DataStream.
   */
  def getSecond(): DataStream[IN2] = {
    javaStream.getSecond
  }

  /**
   * Gets the type of the first input
   *
   * @return The type of the first input
   */
  def getInputType1(): TypeInformation[IN1] = {
    javaStream.getType1
  }

  /**
   * Gets the type of the second input
   *
   * @return The type of the second input
   */
  def getInputType2(): TypeInformation[IN2] = {
    javaStream.getType2
  }

  /**
   * Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning
   * is not disabled in the {@link org.apache.flink.api.common.ExecutionConfig}.
   */
  private[flink] def clean[F <: AnyRef](f: F): F = {
    new StreamExecutionEnvironment(javaStream.getExecutionEnvironment).scalaClean(f)
  }

}