
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.streaming.api.scala

import org.apache.flink.api.scala.ClosureCleaner

import scala.Array.canBuildFrom
import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.flink.api.common.functions.{FoldFunction, ReduceFunction}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.api.java.typeutils.TupleTypeInfoBase
import org.apache.flink.api.streaming.scala.ScalaStreamingAggregator
import org.apache.flink.streaming.api.datastream.{WindowedDataStream => JavaWStream, DiscretizedStream}
import org.apache.flink.streaming.api.functions.WindowMapFunction
import org.apache.flink.streaming.api.functions.aggregation.AggregationFunction.AggregationType
import org.apache.flink.streaming.api.functions.aggregation.SumFunction
import org.apache.flink.streaming.api.windowing.StreamWindow
import org.apache.flink.streaming.api.windowing.helper.WindowingHelper
import org.apache.flink.util.Collector

class WindowedDataStream[T](javaStream: JavaWStream[T]) {

  /**
   * Gets the name of the current data stream. This name is
   * used by the visualization and logging during runtime.
   *
   * @return Name of the stream.
   */
  def getName: String = javaStream match {
    case stream: DiscretizedStream[_] => stream.getName
    case _ => throw new UnsupportedOperationException(
      "Only supported for windowing operators.")
  }

  /**
   * Sets the name of the current data stream. This name is
   * used by the visualization and logging during runtime.
   *
   * @return The named operator
   */
  def name(name: String): WindowedDataStream[T] = javaStream match {
    case stream: DiscretizedStream[T] => stream.name(name)
    case _ => throw new UnsupportedOperationException(
      "Only supported for windowing operators.")
  }

  /**
   * Defines the slide size (trigger frequency) for the windowed data stream.
   * This controls how often the user defined function will be triggered on
   * the window.
   */
  def every(windowingHelper: WindowingHelper[_]): WindowedDataStream[T] =
    javaStream.every(windowingHelper)
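
  // A minimal usage sketch (illustrative names; assumes a `stream: DataStream[Int]`
  // and the Time helper from org.apache.flink.streaming.api.windowing.helper):
  //
  //   // evict a 5-second window, trigger the user function once per second
  //   val windowed = stream.window(Time.of(5, TimeUnit.SECONDS))
  //                        .every(Time.of(1, TimeUnit.SECONDS))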

  /**
   * Groups the elements of the WindowedDataStream using the given
   * field positions. The window sizes (evictions) and slide sizes
   * (triggers) will be calculated on the whole stream (in a global fashion),
   * but the user defined functions will be applied on a per group basis.
   * To get windows and triggers on a per group basis apply the
   * DataStream.window(...) operator on an already grouped data stream.
   */
  def groupBy(fields: Int*): WindowedDataStream[T] = javaStream.groupBy(fields: _*)

  /**
   * Groups the elements of the WindowedDataStream using the given
   * field expressions. The window sizes (evictions) and slide sizes
   * (triggers) will be calculated on the whole stream (in a global fashion),
   * but the user defined functions will be applied on a per group basis.
   * To get windows and triggers on a per group basis apply the
   * DataStream.window(...) operator on an already grouped data stream.
   */
  def groupBy(firstField: String, otherFields: String*): WindowedDataStream[T] =
    javaStream.groupBy(firstField +: otherFields.toArray: _*)

  /**
   * Groups the elements of the WindowedDataStream using the given
   * KeySelector function. The window sizes (evictions) and slide sizes
   * (triggers) will be calculated on the whole stream (in a global fashion),
   * but the user defined functions will be applied on a per group basis.
   * To get windows and triggers on a per group basis apply the
   * DataStream.window(...) operator on an already grouped data stream.
   */
  def groupBy[K: TypeInformation](fun: T => K): WindowedDataStream[T] = {
    val cleanFun = clean(fun)
    val keyExtractor = new KeySelector[T, K] {
      def getKey(in: T) = cleanFun(in)
    }
    javaStream.groupBy(keyExtractor)
  }
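
  // Usage sketch (illustrative): per-key aggregation over globally computed
  // windows. Assuming `words: DataStream[(String, Int)]`, the three groupBy
  // variants below are equivalent:
  //
  //   words.window(Count.of(100)).groupBy(0).sum(1)
  //   words.window(Count.of(100)).groupBy("_1").sum(1)
  //   words.window(Count.of(100)).groupBy(_._1).sum(1)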

  /**
   * Makes the window discretisation local, meaning that windows will be
   * created in parallel, at the parallelism of the environment.
   */
  def local(): WindowedDataStream[T] = javaStream.local

  /**
   * Flattens the result of a window transformation, returning the contents
   * of the windows as a regular element-wise stream.
   */
  def flatten(): DataStream[T] = javaStream.flatten()

  /**
   * Returns the stream of StreamWindows created by the window transformation.
   */
  def getDiscretizedStream(): DataStream[StreamWindow[T]] = javaStream.getDiscretizedStream()
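
  // The two ways back out of the windowed representation (sketch, assuming a
  // `windowed: WindowedDataStream[Int]`):
  //
  //   val elements: DataStream[Int]              = windowed.flatten()
  //   val windows: DataStream[StreamWindow[Int]] = windowed.getDiscretizedStream()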

  /**
   * Applies a reduce transformation on the windowed data stream by reducing
   * the current window at every trigger.
   */
  def reduceWindow(reducer: ReduceFunction[T]): WindowedDataStream[T] = {
    if (reducer == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    javaStream.reduceWindow(reducer)
  }

  /**
   * Applies a reduce transformation on the windowed data stream by reducing
   * the current window at every trigger.
   */
  def reduceWindow(fun: (T, T) => T): WindowedDataStream[T] = {
    if (fun == null) {
      throw new NullPointerException("Reduce function must not be null.")
    }
    val cleanFun = clean(fun)
    val reducer = new ReduceFunction[T] {
      def reduce(v1: T, v2: T) = cleanFun(v1, v2)
    }
    reduceWindow(reducer)
  }
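
  // Usage sketch (hypothetical `nums: DataStream[Int]`): sum each 10-element
  // window with the closure variant.
  //
  //   nums.window(Count.of(10)).reduceWindow((a, b) => a + b)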

  /**
   * Applies a fold transformation on the windowed data stream by folding
   * the current window at every trigger.
   */
  def foldWindow[R: TypeInformation: ClassTag](initialValue: R, folder: FoldFunction[T, R]):
      WindowedDataStream[R] = {
    if (folder == null) {
      throw new NullPointerException("Fold function must not be null.")
    }
    javaStream.foldWindow(initialValue, folder, implicitly[TypeInformation[R]])
  }

  /**
   * Applies a fold transformation on the windowed data stream by folding
   * the current window at every trigger.
   */
  def foldWindow[R: TypeInformation: ClassTag](initialValue: R, fun: (R, T) => R):
      WindowedDataStream[R] = {
    if (fun == null) {
      throw new NullPointerException("Fold function must not be null.")
    }
    val cleanFun = clean(fun)
    val folder = new FoldFunction[T, R] {
      def fold(acc: R, v: T) = cleanFun(acc, v)
    }
    foldWindow(initialValue, folder)
  }
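
  // Usage sketch (hypothetical `lines: DataStream[String]`): fold each
  // 5-element window into one concatenated string, starting from "".
  //
  //   lines.window(Count.of(5)).foldWindow("", (acc: String, line: String) => acc + line)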

  /**
   * Applies a mapWindow transformation on the windowed data stream by calling the mapWindow
   * function on the current window at every trigger. In contrast with the simple binary reduce
   * operator, mapWindow exposes the whole window through the Iterable interface.
   *
   * Whenever possible, prefer reduceWindow over mapWindow for increased efficiency.
   */
  def mapWindow[R: ClassTag: TypeInformation](reducer: WindowMapFunction[T, R]):
      WindowedDataStream[R] = {
    if (reducer == null) {
      throw new NullPointerException("GroupReduce function must not be null.")
    }
    javaStream.mapWindow(reducer, implicitly[TypeInformation[R]])
  }

  /**
   * Applies a mapWindow transformation on the windowed data stream by calling the mapWindow
   * function on the current window at every trigger. In contrast with the simple binary reduce
   * operator, mapWindow exposes the whole window through the Iterable interface.
   *
   * Whenever possible, prefer reduceWindow over mapWindow for increased efficiency.
   */
  def mapWindow[R: ClassTag: TypeInformation](fun: (Iterable[T], Collector[R]) => Unit):
      WindowedDataStream[R] = {
    if (fun == null) {
      throw new NullPointerException("GroupReduce function must not be null.")
    }
    val cleanFun = clean(fun)
    val reducer = new WindowMapFunction[T, R] {
      def mapWindow(in: java.lang.Iterable[T], out: Collector[R]) = cleanFun(in.asScala, out)
    }
    mapWindow(reducer)
  }
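
  // Usage sketch (illustrative): emit the size of every window, something a
  // binary reduce cannot express because it needs the whole window at once.
  //
  //   nums.window(Count.of(10)).mapWindow { (window: Iterable[Int], out: Collector[Int]) =>
  //     out.collect(window.size)
  //   }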

  /**
   * Applies an aggregation that gives the maximum of the elements in the window at
   * the given position.
   */
  def max(position: Int): WindowedDataStream[T] = aggregate(AggregationType.MAX, position)

  /**
   * Applies an aggregation that gives the maximum of the elements in the window at
   * the given field.
   */
  def max(field: String): WindowedDataStream[T] = aggregate(AggregationType.MAX, field)

  /**
   * Applies an aggregation that gives the minimum of the elements in the window at
   * the given position.
   */
  def min(position: Int): WindowedDataStream[T] = aggregate(AggregationType.MIN, position)

  /**
   * Applies an aggregation that gives the minimum of the elements in the window at
   * the given field.
   */
  def min(field: String): WindowedDataStream[T] = aggregate(AggregationType.MIN, field)

  /**
   * Applies an aggregation that sums the elements in the window at the given position.
   */
  def sum(position: Int): WindowedDataStream[T] = aggregate(AggregationType.SUM, position)

  /**
   * Applies an aggregation that sums the elements in the window at the given field.
   */
  def sum(field: String): WindowedDataStream[T] = aggregate(AggregationType.SUM, field)
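
  // Usage sketch (hypothetical `readings: DataStream[(String, Double)]`):
  //
  //   val windowed = readings.window(Time.of(10, TimeUnit.SECONDS))
  //   windowed.max(1)      // aggregate by tuple position
  //   windowed.sum("_2")   // aggregate by field expression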

  /**
   * Applies an aggregation that gives the maximum element of the window by
   * the given position. In case of equality, the first element is returned.
   */
  def maxBy(position: Int): WindowedDataStream[T] =
    aggregate(AggregationType.MAXBY, position)

  /**
   * Applies an aggregation that gives the maximum element of the window by
   * the given field. In case of equality, the first element is returned.
   */
  def maxBy(field: String): WindowedDataStream[T] =
    aggregate(AggregationType.MAXBY, field)

  /**
   * Applies an aggregation that gives the minimum element of the window by
   * the given position. In case of equality, the first element is returned.
   */
  def minBy(position: Int): WindowedDataStream[T] =
    aggregate(AggregationType.MINBY, position)

  /**
   * Applies an aggregation that gives the minimum element of the window by
   * the given field. In case of equality, the first element is returned.
   */
  def minBy(field: String): WindowedDataStream[T] =
    aggregate(AggregationType.MINBY, field)
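
  // Unlike max/min, which only track the extreme value of the chosen field,
  // maxBy/minBy return the whole element carrying it. Sketch (hypothetical
  // `readings: DataStream[(String, Double)]`):
  //
  //   readings.window(Count.of(20)).maxBy(1)  // full (id, value) tuple with the max value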

  private def aggregate(aggregationType: AggregationType, field: String):
      WindowedDataStream[T] = {
    // Translate the field expression into a tuple position first.
    val position = fieldNames2Indices(getType(), Array(field))(0)
    aggregate(aggregationType, position)
  }

  def aggregate(aggregationType: AggregationType, position: Int):
      WindowedDataStream[T] = {
    val jStream = javaStream.asInstanceOf[JavaWStream[Product]]
    val outType = jStream.getType().asInstanceOf[TupleTypeInfoBase[_]]

    val agg = new ScalaStreamingAggregator[Product](
      jStream.getType().createSerializer(javaStream.getExecutionConfig),
      position)

    // SUM needs a type-specific add function; the comparison-based aggregations
    // (MIN, MAX, MINBY, MAXBY) share one comparable aggregator.
    val reducer = aggregationType match {
      case AggregationType.SUM => new agg.Sum(
        SumFunction.getForClass(outType.getTypeAt(position).getTypeClass()))
      case _ => new agg.ProductComparableAggregator(aggregationType, true)
    }

    new WindowedDataStream[Product](
      jStream.reduceWindow(reducer)).asInstanceOf[WindowedDataStream[T]]
  }

  /**
   * Gets the output type.
   *
   * @return The output type.
   */
  def getType(): TypeInformation[T] = javaStream.getType

  /**
   * Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning
   * is not disabled in the [[org.apache.flink.api.common.ExecutionConfig]].
   */
  private[flink] def clean[F <: AnyRef](f: F): F = {
    new StreamExecutionEnvironment(
      javaStream.getDiscretizedStream.getExecutionEnvironment).scalaClean(f)
  }
}