All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flinkx.api.AllWindowedStream.scala Maven / Gradle / Ivy

The newest version!
package org.apache.flinkx.api

import org.apache.flink.annotation.{Public, PublicEvolving}
import org.apache.flink.api.common.functions.{AggregateFunction, ReduceFunction}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.{AllWindowedStream => JavaAllWStream}
import org.apache.flink.streaming.api.functions.aggregation.AggregationFunction.AggregationType
import org.apache.flink.streaming.api.functions.aggregation.{ComparableAggregator, SumAggregator}
import org.apache.flinkx.api.function.util.{
  ScalaAllWindowFunction,
  ScalaAllWindowFunctionWrapper,
  ScalaProcessAllWindowFunctionWrapper,
  ScalaReduceFunction
}
import org.apache.flinkx.api.function.{AllWindowFunction, ProcessAllWindowFunction}
import org.apache.flink.streaming.api.windowing.evictors.Evictor
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.Trigger
import org.apache.flink.streaming.api.windowing.windows.Window
import org.apache.flink.util.Collector
import org.apache.flink.util.Preconditions.checkNotNull
import ScalaStreamOps._

/** A [[AllWindowedStream]] represents a data stream where the stream of elements is split into windows based on a
  * [[org.apache.flink.streaming.api.windowing.assigners.WindowAssigner]]. Window emission is triggered based on a
  * [[Trigger]].
  *
  * If an [[Evictor]] is specified it will be used to evict elements from the window after evaluation was triggered by
  * the [[Trigger]] but before the actual evaluation of the window. When using an evictor window performance will
  * degrade significantly, since pre-aggregation of window results cannot be used.
  *
  * Note that the [[AllWindowedStream()]] is purely and API construct, during runtime the [[AllWindowedStream()]] will
  * be collapsed together with the operation over the window into one single operation.
  *
  * @tparam T
  *   The type of elements in the stream.
  * @tparam W
  *   The type of [[Window]] that the [[org.apache.flink.streaming.api.windowing.assigners.WindowAssigner]] assigns the
  *   elements to.
  */
@Public
class AllWindowedStream[T, W <: Window](javaStream: JavaAllWStream[T, W]) {

  /** Sets the allowed lateness to a user-specified value. If not explicitly set, the allowed lateness is [[0L]].
    * Setting the allowed lateness is only valid for event-time windows. If a value different than 0 is provided with a
    * processing-time [[org.apache.flink.streaming.api.windowing.assigners.WindowAssigner]], then an exception is
    * thrown.
    */
  @PublicEvolving
  def allowedLateness(lateness: Time): AllWindowedStream[T, W] = {
    javaStream.allowedLateness(lateness)
    this
  }

  /** Send late arriving data to the side output identified by the given [[OutputTag]]. Data is considered late after
    * the watermark has passed the end of the window plus the allowed lateness set using [[allowedLateness(Time)]].
    *
    * You can get the stream of late data using [[DataStream.getSideOutput()]] on the [[DataStream]] resulting from the
    * windowed operation with the same [[OutputTag]].
    */
  @PublicEvolving
  def sideOutputLateData(outputTag: OutputTag[T]): AllWindowedStream[T, W] = {
    javaStream.sideOutputLateData(outputTag)
    this
  }

  /** Sets the [[Trigger]] that should be used to trigger window emission.
    */
  @PublicEvolving
  def trigger(trigger: Trigger[_ >: T, _ >: W]): AllWindowedStream[T, W] = {
    javaStream.trigger(trigger)
    this
  }

  /** Sets the [[Evictor]] that should be used to evict elements from a window before emission.
    *
    * Note: When using an evictor window performance will degrade significantly, since pre-aggregation of window results
    * cannot be used.
    */
  @PublicEvolving
  def evictor(evictor: Evictor[_ >: T, _ >: W]): AllWindowedStream[T, W] = {
    javaStream.evictor(evictor)
    this
  }

  // ------------------------------------------------------------------------
  //  Operations on the windows
  // ------------------------------------------------------------------------

  // ---------------------------- reduce() ------------------------------------

  /** Applies a reduce function to the window. The window function is called for each evaluation of the window for each
    * key individually. The output of the reduce function is interpreted as a regular non-windowed stream.
    *
    * This window will try and pre-aggregate data as much as the window policies permit. For example, tumbling time
    * windows can perfectly pre-aggregate the data, meaning that only one element per key is stored. Sliding time
    * windows will pre-aggregate on the granularity of the slide interval, so a few elements are stored per key (one per
    * slide interval). Custom windows may not be able to pre-aggregate, or may need to store extra values in an
    * aggregation tree.
    *
    * @param function
    *   The reduce function.
    * @return
    *   The data stream that is the result of applying the reduce function to the window.
    */
  def reduce(function: ReduceFunction[T]): DataStream[T] = {
    asScalaStream(javaStream.reduce(clean(function)))
  }

  /** Applies a reduce function to the window. The window function is called for each evaluation of the window for each
    * key individually. The output of the reduce function is interpreted as a regular non-windowed stream.
    *
    * This window will try and pre-aggregate data as much as the window policies permit. For example, tumbling time
    * windows can perfectly pre-aggregate the data, meaning that only one element per key is stored. Sliding time
    * windows will pre-aggregate on the granularity of the slide interval, so a few elements are stored per key (one per
    * slide interval). Custom windows may not be able to pre-aggregate, or may need to store extra values in an
    * aggregation tree.
    *
    * @param function
    *   The reduce function.
    * @return
    *   The data stream that is the result of applying the reduce function to the window.
    */
  def reduce(function: (T, T) => T): DataStream[T] = {
    if (function == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    val cleanFun = clean(function)
    val reducer  = new ScalaReduceFunction[T](cleanFun)

    reduce(reducer)
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Arriving data is pre-aggregated using the given pre-aggregation reducer.
    *
    * @param preAggregator
    *   The reduce function that is used for pre-aggregation
    * @param windowFunction
    *   The window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  def reduce[R: TypeInformation](
      preAggregator: ReduceFunction[T],
      windowFunction: AllWindowFunction[T, R, W]
  ): DataStream[R] = {

    val cleanedReducer        = clean(preAggregator)
    val cleanedWindowFunction = clean(windowFunction)

    val applyFunction = new ScalaAllWindowFunctionWrapper[T, R, W](cleanedWindowFunction)

    val returnType: TypeInformation[R] = implicitly[TypeInformation[R]]
    asScalaStream(javaStream.reduce(cleanedReducer, applyFunction, returnType))
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Arriving data is pre-aggregated using the given pre-aggregation reducer.
    *
    * @param preAggregator
    *   The reduce function that is used for pre-aggregation
    * @param windowFunction
    *   The window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  def reduce[R: TypeInformation](
      preAggregator: (T, T) => T,
      windowFunction: (W, Iterable[T], Collector[R]) => Unit
  ): DataStream[R] = {

    if (preAggregator == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    if (windowFunction == null) {
      throw new NullPointerException("WindowApply function must not be null.")
    }

    val cleanReducer        = clean(preAggregator)
    val cleanWindowFunction = clean(windowFunction)

    val reducer       = new ScalaReduceFunction[T](cleanReducer)
    val applyFunction = new ScalaAllWindowFunction[T, R, W](cleanWindowFunction)

    val returnType: TypeInformation[R] = implicitly[TypeInformation[R]]
    asScalaStream(javaStream.reduce(reducer, applyFunction, returnType))
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Arriving data is pre-aggregated using the given pre-aggregation reducer.
    *
    * @param preAggregator
    *   The reduce function that is used for pre-aggregation
    * @param windowFunction
    *   The process window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  @PublicEvolving
  def reduce[R: TypeInformation](
      preAggregator: ReduceFunction[T],
      windowFunction: ProcessAllWindowFunction[T, R, W]
  ): DataStream[R] = {

    val cleanedReducer        = clean(preAggregator)
    val cleanedWindowFunction = clean(windowFunction)

    val applyFunction = new ScalaProcessAllWindowFunctionWrapper[T, R, W](cleanedWindowFunction)

    val returnType: TypeInformation[R] = implicitly[TypeInformation[R]]
    asScalaStream(javaStream.reduce(cleanedReducer, applyFunction, returnType))
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Arriving data is pre-aggregated using the given pre-aggregation reducer.
    *
    * @param preAggregator
    *   The reduce function that is used for pre-aggregation
    * @param windowFunction
    *   The process window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  @PublicEvolving
  def reduce[R: TypeInformation](
      preAggregator: (T, T) => T,
      windowFunction: ProcessAllWindowFunction[T, R, W]
  ): DataStream[R] = {

    if (preAggregator == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    if (windowFunction == null) {
      throw new NullPointerException("WindowApply function must not be null.")
    }

    val cleanReducer        = clean(preAggregator)
    val cleanWindowFunction = clean(windowFunction)

    val reducer       = new ScalaReduceFunction[T](cleanReducer)
    val applyFunction = new ScalaProcessAllWindowFunctionWrapper[T, R, W](cleanWindowFunction)

    val returnType: TypeInformation[R] = implicitly[TypeInformation[R]]
    asScalaStream(javaStream.reduce(reducer, applyFunction, returnType))
  }

  // --------------------------- aggregate() ----------------------------------

  /** Applies the given aggregation function to each window. The aggregation function is called for each element,
    * aggregating values incrementally and keeping the state to one accumulator per window.
    *
    * @param aggregateFunction
    *   The aggregation function.
    * @return
    *   The data stream that is the result of applying the aggregate function to the window.
    */
  @PublicEvolving
  def aggregate[ACC: TypeInformation, R: TypeInformation](
      aggregateFunction: AggregateFunction[T, ACC, R]
  ): DataStream[R] = {

    checkNotNull(aggregateFunction, "AggregationFunction must not be null")

    val accumulatorType: TypeInformation[ACC] = implicitly[TypeInformation[ACC]]
    val resultType: TypeInformation[R]        = implicitly[TypeInformation[R]]

    asScalaStream(javaStream.aggregate(clean(aggregateFunction), accumulatorType, resultType))
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Arriving data is pre-aggregated using the given aggregation function.
    *
    * @param preAggregator
    *   The aggregation function that is used for pre-aggregation
    * @param windowFunction
    *   The window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  @PublicEvolving
  def aggregate[ACC: TypeInformation, V: TypeInformation, R: TypeInformation](
      preAggregator: AggregateFunction[T, ACC, V],
      windowFunction: AllWindowFunction[V, R, W]
  ): DataStream[R] = {

    checkNotNull(preAggregator, "AggregationFunction must not be null")
    checkNotNull(windowFunction, "Window function must not be null")

    val cleanedPreAggregator  = clean(preAggregator)
    val cleanedWindowFunction = clean(windowFunction)

    val applyFunction = new ScalaAllWindowFunctionWrapper[V, R, W](cleanedWindowFunction)

    val accumulatorType: TypeInformation[ACC] = implicitly[TypeInformation[ACC]]
    val resultType: TypeInformation[R]        = implicitly[TypeInformation[R]]

    asScalaStream(javaStream.aggregate(cleanedPreAggregator, applyFunction, accumulatorType, resultType))
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Arriving data is pre-aggregated using the given aggregation function.
    *
    * @param preAggregator
    *   The aggregation function that is used for pre-aggregation
    * @param windowFunction
    *   The process window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  @PublicEvolving
  def aggregate[ACC: TypeInformation, V: TypeInformation, R: TypeInformation](
      preAggregator: AggregateFunction[T, ACC, V],
      windowFunction: ProcessAllWindowFunction[V, R, W]
  ): DataStream[R] = {

    checkNotNull(preAggregator, "AggregationFunction must not be null")
    checkNotNull(windowFunction, "Window function must not be null")

    val cleanedPreAggregator  = clean(preAggregator)
    val cleanedWindowFunction = clean(windowFunction)

    val applyFunction = new ScalaProcessAllWindowFunctionWrapper[V, R, W](cleanedWindowFunction)

    val accumulatorType: TypeInformation[ACC]     = implicitly[TypeInformation[ACC]]
    val aggregationResultType: TypeInformation[V] = implicitly[TypeInformation[V]]
    val resultType: TypeInformation[R]            = implicitly[TypeInformation[R]]

    asScalaStream(
      javaStream.aggregate(cleanedPreAggregator, applyFunction, accumulatorType, aggregationResultType, resultType)
    )
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window.
    * The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Arriving data is pre-aggregated using the given aggregation function.
    *
    * @param preAggregator
    *   The aggregation function that is used for pre-aggregation
    * @param windowFunction
    *   The window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  @PublicEvolving
  def aggregate[ACC: TypeInformation, V: TypeInformation, R: TypeInformation](
      preAggregator: AggregateFunction[T, ACC, V],
      windowFunction: (W, Iterable[V], Collector[R]) => Unit
  ): DataStream[R] = {

    checkNotNull(preAggregator, "AggregationFunction must not be null")
    checkNotNull(windowFunction, "Window function must not be null")

    val cleanPreAggregator  = clean(preAggregator)
    val cleanWindowFunction = clean(windowFunction)

    val applyFunction = new ScalaAllWindowFunction[V, R, W](cleanWindowFunction)

    val accumulatorType: TypeInformation[ACC] = implicitly[TypeInformation[ACC]]
    val resultType: TypeInformation[R]        = implicitly[TypeInformation[R]]

    asScalaStream(javaStream.aggregate(cleanPreAggregator, applyFunction, accumulatorType, resultType))
  }

  // ---------------------------- apply() -------------------------------------

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Note that this function requires that all data in the windows is buffered until the window is evaluated, as the
    * function provides no means of pre-aggregation.
    *
    * @param function
    *   The process window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  @PublicEvolving
  def process[R: TypeInformation](function: ProcessAllWindowFunction[T, R, W]): DataStream[R] = {

    val cleanedFunction = clean(function)
    val javaFunction    = new ScalaProcessAllWindowFunctionWrapper[T, R, W](cleanedFunction)

    asScalaStream(javaStream.process(javaFunction, implicitly[TypeInformation[R]]))
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Note that this function requires that all data in the windows is buffered until the window is evaluated, as the
    * function provides no means of pre-aggregation.
    *
    * @param function
    *   The window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  def apply[R: TypeInformation](function: AllWindowFunction[T, R, W]): DataStream[R] = {

    val cleanedFunction = clean(function)
    val javaFunction    = new ScalaAllWindowFunctionWrapper[T, R, W](cleanedFunction)

    asScalaStream(javaStream.apply(javaFunction, implicitly[TypeInformation[R]]))
  }

  /** Applies the given window function to each window. The window function is called for each evaluation of the window
    * for each key individually. The output of the window function is interpreted as a regular non-windowed stream.
    *
    * Note that this function requires that all data in the windows is buffered until the window is evaluated, as the
    * function provides no means of pre-aggregation.
    *
    * @param function
    *   The window function.
    * @return
    *   The data stream that is the result of applying the window function to the window.
    */
  def apply[R: TypeInformation](function: (W, Iterable[T], Collector[R]) => Unit): DataStream[R] = {

    val cleanedFunction = clean(function)
    val applyFunction   = new ScalaAllWindowFunction[T, R, W](cleanedFunction)

    asScalaStream(javaStream.apply(applyFunction, implicitly[TypeInformation[R]]))
  }

  // ------------------------------------------------------------------------
  //  Aggregations on the keyed windows
  // ------------------------------------------------------------------------

  /** Applies an aggregation that that gives the maximum of the elements in the window at the given position.
    */
  def max(position: Int): DataStream[T] = aggregate(AggregationType.MAX, position)

  /** Applies an aggregation that that gives the maximum of the elements in the window at the given field.
    */
  def max(field: String): DataStream[T] = aggregate(AggregationType.MAX, field)

  /** Applies an aggregation that that gives the minimum of the elements in the window at the given position.
    */
  def min(position: Int): DataStream[T] = aggregate(AggregationType.MIN, position)

  /** Applies an aggregation that that gives the minimum of the elements in the window at the given field.
    */
  def min(field: String): DataStream[T] = aggregate(AggregationType.MIN, field)

  /** Applies an aggregation that sums the elements in the window at the given position.
    */
  def sum(position: Int): DataStream[T] = aggregate(AggregationType.SUM, position)

  /** Applies an aggregation that sums the elements in the window at the given field.
    */
  def sum(field: String): DataStream[T] = aggregate(AggregationType.SUM, field)

  /** Applies an aggregation that that gives the maximum element of the window by the given position. When equality,
    * returns the first.
    */
  def maxBy(position: Int): DataStream[T] = aggregate(AggregationType.MAXBY, position)

  /** Applies an aggregation that that gives the maximum element of the window by the given field. When equality,
    * returns the first.
    */
  def maxBy(field: String): DataStream[T] = aggregate(AggregationType.MAXBY, field)

  /** Applies an aggregation that that gives the minimum element of the window by the given position. When equality,
    * returns the first.
    */
  def minBy(position: Int): DataStream[T] = aggregate(AggregationType.MINBY, position)

  /** Applies an aggregation that that gives the minimum element of the window by the given field. When equality,
    * returns the first.
    */
  def minBy(field: String): DataStream[T] = aggregate(AggregationType.MINBY, field)

  private def aggregate(aggregationType: AggregationType, field: String): DataStream[T] = {
    val position = fieldNames2Indices(getInputType(), Array(field))(0)
    aggregate(aggregationType, position)
  }

  def aggregate(aggregationType: AggregationType, position: Int): DataStream[T] = {

    val jStream = javaStream.asInstanceOf[JavaAllWStream[Product, W]]

    val reducer = aggregationType match {
      case AggregationType.SUM =>
        new SumAggregator(position, jStream.getInputType, jStream.getExecutionEnvironment.getConfig)

      case _ =>
        new ComparableAggregator(
          position,
          jStream.getInputType,
          aggregationType,
          true,
          jStream.getExecutionEnvironment.getConfig
        )
    }

    new DataStream[Product](jStream.reduce(reducer)).asInstanceOf[DataStream[T]]
  }

  // ------------------------------------------------------------------------
  //  Utilities
  // ------------------------------------------------------------------------

  /** Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning is not disabled in the
    * [[org.apache.flink.api.common.ExecutionConfig]].
    */
  private[flinkx] def clean[F <: AnyRef](f: F): F = {
    new StreamExecutionEnvironment(javaStream.getExecutionEnvironment).scalaClean(f)
  }

  /** Gets the output type.
    */
  private def getInputType(): TypeInformation[T] = javaStream.getInputType
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy