
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.table.api
import org.apache.calcite.rel.RelNode
import org.apache.flink.api.java.operators.join.JoinType
import org.apache.flink.table.api.functions.TemporalTableFunction
import org.apache.flink.table.api.types._
import org.apache.flink.table.calcite.{FlinkRelBuilder, FlinkTypeFactory}
import org.apache.flink.table.expressions.{Alias, Asc, Expression, ExpressionParser, Literal, Ordering, ResolvedFieldReference, UnresolvedAlias, UnresolvedFieldReference, WindowProperty}
import org.apache.flink.table.functions.utils.UserDefinedFunctionUtils
import org.apache.flink.table.plan.ProjectionTranslator._
import org.apache.flink.table.plan.logical.{Minus, _}
import org.apache.flink.table.plan.schema.TableSourceSinkTable
import org.apache.flink.table.sinks.{CollectRowTableSink, CollectTableSink, TableSink}
import org.apache.flink.types.Row
import _root_.scala.annotation.varargs
import _root_.scala.collection.JavaConversions._
import _root_.scala.collection.JavaConverters._
/**
* A Table is the core component of the Table API.
* Similar to how the batch and streaming APIs have DataSet and DataStream,
* the Table API is built around [[Table]].
*
* Use the methods of [[Table]] to transform data. Use [[TableEnvironment]] to convert a [[Table]]
* back to a DataSet or DataStream.
*
* When using Scala, a [[Table]] can also be converted using implicit conversions.
*
* Example:
*
* {{{
* val env = ExecutionEnvironment.getExecutionEnvironment
* val tEnv = TableEnvironment.getTableEnvironment(env)
*
* val set: DataSet[(String, Int)] = ...
* val table = set.toTable(tEnv, 'a, 'b)
* ...
* val table2 = ...
* val set2: DataSet[MyType] = table2.toDataSet[MyType]
* }}}
*
* Operations such as [[join]], [[select]], [[where]] and [[groupBy]] take arguments either
* in a Scala DSL or as an expression String. Please refer to the documentation for the expression
* syntax.
*
* @param tableEnv The [[TableEnvironment]] to which the table is bound.
* @param logicalPlan The logical plan that this table represents.
*/
class Table(
private[flink] val tableEnv: TableEnvironment,
private[flink] val logicalPlan: LogicalNode) {
// Check if the plan has an unbounded TableFunctionCall as child node.
// A TableFunctionCall is tolerated as root node because the Table holds the initial call.
if (containsUnboundedUDTFCall(logicalPlan) &&
!logicalPlan.isInstanceOf[LogicalTableFunctionCall]) {
throw new ValidationException("TableFunction can only be used in join and leftOuterJoin.")
}
/**
* Creates a [[Table]] for a TableFunction call from a String expression.
*
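* For example, assuming a table function registered under the name `split`:
*
* {{{
*   val table = new Table(tEnv, "split(c)")
* }}}
*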
* @param tableEnv The TableEnvironment in which the call is created.
* @param udtfCall A String expression of the TableFunction call.
*/
def this(tableEnv: TableEnvironment, udtfCall: String) {
this(tableEnv, UserDefinedFunctionUtils.createLogicalFunctionCall(tableEnv, udtfCall))
}
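// Derives the table schema. For tables backed by a registered table source, the schema is
// taken from the source; otherwise it is derived from the logical plan's output.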
private lazy val tableSchema: TableSchema = logicalPlan match {
case CatalogNode(tablePath, _) if tableEnv.getTable(tablePath.toArray)
.exists(_.isInstanceOf[TableSourceSinkTable[_]]) =>
val tableSourceSinkTable = tableEnv.getTable(tablePath.toArray).get
.asInstanceOf[TableSourceSinkTable[_]]
if (tableSourceSinkTable.isSourceTable) {
tableSourceSinkTable.tableSourceTable.get.tableSource.getTableSchema
} else {
new TableSchema(
logicalPlan.output.map(_.name).toArray,
logicalPlan.output.map(_.resultType).toArray,
getRelNode.getRowType.getFieldList.map(_.getType.isNullable).toArray)
}
case _ =>
new TableSchema(
logicalPlan.output.map(_.name).toArray,
logicalPlan.output.map(_.resultType).toArray,
getRelNode.getRowType.getFieldList.map(_.getType.isNullable).toArray)
}
def relBuilder: FlinkRelBuilder = tableEnv.getRelBuilder
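/**
* Returns the Calcite [[RelNode]] representation of this table's logical plan.
* Throws a [[ValidationException]] if the plan contains an unbounded table function call.
*/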
def getRelNode: RelNode = if (containsUnboundedUDTFCall(logicalPlan)) {
throw new ValidationException("Cannot translate a query with an unbounded table function call.")
} else {
logicalPlan.toRelNode(relBuilder)
}
/**
* Returns the schema of this table.
*/
def getSchema: TableSchema = tableSchema
/**
* Prints the schema of this table to the console in a tree format.
*/
def printSchema(): Unit = _root_.scala.Predef.print(tableSchema.toString)
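/**
* Collects the contents of this table into the given [[CollectTableSink]] and returns the
* result as a Seq.
*
* @param sink The sink used to collect the results.
* @param jobName An optional name for the job that computes the result.
*/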
def collectSink[T](sink: CollectTableSink[T], jobName: Option[String] = None): Seq[T] = {
// get schema information of table
val rowType = getRelNode.getRowType
val fieldNames: Array[String] = rowType.getFieldNames.asScala.toArray
val fieldTypes: Array[DataType] = rowType.getFieldList.asScala
.map(field => FlinkTypeFactory.toInternalType(field.getType)).toArray
val configuredSink = sink.configure(fieldNames, fieldTypes)
tableEnv.collect(this, configuredSink.asInstanceOf[CollectTableSink[T]], jobName)
}
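/**
* Collects the contents of this table as a Seq of [[Row]]s.
*/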
def collect(): Seq[Row] = collectSink(new CollectRowTableSink, None)
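/**
* Collects the contents of this table as a Seq of elements of the given [[DataType]].
*/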
def collectAsT[T](t: DataType, jobName: String = null): Seq[T] =
collectSink(new CollectTableSink(_ => t), Option(jobName))
/**
* Caches this table in the built-in table service or in the specified customized table service.
*
* This method provides a hint to Flink that the current table may be reused later, so a
* cache should be created to avoid regenerating this table.
*
* The following code snippet gives an example of how this method could be used.
*
* {{{
* val t = tEnv.fromCollection(data).as('country, 'color, 'count)
*
* val t1 = t.filter('count < 100)
* t1.cache()
* // t1 is cached after it is generated for the first time.
* val x = t1.collect().size
*
* // When t1 is used again to generate t2, it may not be regenerated.
* val t2 = t1.groupBy('country).select('country, 'count.sum as 'sum)
* val res2 = t2.collect()
* res2.foreach(println)
*
* // Similarly, when t1 is used again to generate t3, it may not be regenerated.
* val t3 = t1.groupBy('color).select('color, 'count.avg as 'avg)
* val res3 = t3.collect()
* res3.foreach(println)
*
* }}}
*
* @note The Flink optimizer may decide not to use the cache if doing so would accelerate the
* processing, or if the cache is no longer available, e.g., because the table service
* has failed.
* @note The table cache is created lazily. That means the cache is only created the first
* time the cached table is computed.
*/
def cache(): Unit = {
// Enable the subsection optimization.
tableEnv.getConfig.setSubsectionOptimization(true)
// check if it has been already cached.
(tableEnv.tableServiceManager.getToBeCachedTableName(logicalPlan) orElse
tableEnv.tableServiceManager.getCachedTableName(logicalPlan)) match {
case None => tableEnv.tableServiceManager.cacheTable(this)
case Some(_) =>
}
}
private def print(jobName: Option[String] = None): Unit = {
for (row <- collectSink(new CollectRowTableSink, jobName)) {
System.out.println(row)
}
}
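/**
* Prints the contents of this table to stdout, executing the query as a job with the given
* name.
*/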
def print(jobName: String): Unit = print(Option.apply(jobName))
def print(): Unit = print(None)
/**
* Performs a selection operation. Similar to an SQL SELECT statement. The field expressions
* can contain complex expressions and aggregations.
*
* Example:
*
* {{{
* tab.select('key, 'value.avg + " The average" as 'average)
* }}}
*/
def select(fields: Expression*): Table = {
val expandedFields = expandProjectList(fields, logicalPlan, tableEnv)
val (aggNames, propNames) = extractAggregationsAndProperties(expandedFields, tableEnv)
if (propNames.nonEmpty) {
throw new ValidationException("Window properties can only be used on windowed tables.")
}
if (aggNames.nonEmpty) {
val projectsOnAgg = replaceAggregationsAndProperties(
expandedFields, tableEnv, aggNames, propNames)
val projectFields = extractFieldReferences(expandedFields)
new Table(tableEnv,
Project(projectsOnAgg,
Aggregate(Nil, aggNames.map(a => Alias(a._1, a._2)).toSeq,
Project(projectFields, logicalPlan).validate(tableEnv)
).validate(tableEnv)
).validate(tableEnv)
)
} else {
new Table(tableEnv,
Project(expandedFields.map(UnresolvedAlias), logicalPlan).validate(tableEnv))
}
}
/**
* Performs a selection operation. Similar to an SQL SELECT statement. The field expressions
* can contain complex expressions and aggregations.
*
* Example:
*
* {{{
* tab.select("key, value.avg + ' The average' as average")
* }}}
*/
def select(fields: String): Table = {
val fieldExprs = ExpressionParser.parseExpressionList(fields)
// get the correct expression for AggFunctionCall
val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(_, tableEnv))
select(withResolvedAggFunctionCall: _*)
}
/**
* Creates a [[TemporalTableFunction]] backed by this table as a history table.
* Temporal Tables represent a concept of a table that changes over time and for which
* Flink keeps track of those changes. A [[TemporalTableFunction]] provides a way to access
* this data.
*
* For more information please check Flink's documentation on Temporal Tables.
*
* Currently [[TemporalTableFunction]]s are only supported in streaming.
*
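* A usage sketch, assuming a hypothetical history table `ratesHistory` with a time
* attribute "r_proctime" and a key field "r_currency":
*
* {{{
*   val rates = ratesHistory.createTemporalTableFunction("r_proctime", "r_currency")
* }}}
*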
* @param timeAttribute Must point to a time attribute. Provides a way to compare which
* records are newer or older versions.
* @param primaryKey Defines the primary key. With a primary key it is possible to update
* a row or to delete it.
* @return A [[TemporalTableFunction]], which is an instance of
* [[org.apache.flink.table.api.functions.TableFunction]]. It takes a single argument,
* the `timeAttribute`, for which it returns the matching version of the [[Table]] from
* which the [[TemporalTableFunction]] was created.
*/
def createTemporalTableFunction(
timeAttribute: String,
primaryKey: String): TemporalTableFunction = {
createTemporalTableFunction(
ExpressionParser.parseExpression(timeAttribute),
ExpressionParser.parseExpression(primaryKey))
}
/**
* Creates a [[TemporalTableFunction]] backed by this table as a history table.
* Temporal Tables represent a concept of a table that changes over time and for which
* Flink keeps track of those changes. A [[TemporalTableFunction]] provides a way to access
* this data.
*
* For more information please check Flink's documentation on Temporal Tables.
*
* Currently [[TemporalTableFunction]]s are only supported in streaming.
*
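* A usage sketch, assuming a hypothetical history table `ratesHistory` with a time
* attribute 'r_proctime and a key field 'r_currency:
*
* {{{
*   val rates = ratesHistory.createTemporalTableFunction('r_proctime, 'r_currency)
* }}}
*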
* @param timeAttribute Must point to a time indicator. Provides a way to compare which
* records are newer or older versions.
* @param primaryKey Defines the primary key. With a primary key it is possible to update
* a row or to delete it.
* @return A [[TemporalTableFunction]], which is an instance of
* [[org.apache.flink.table.api.functions.TableFunction]]. It takes a single argument,
* the `timeAttribute`, for which it returns the matching version of the [[Table]] from
* which the [[TemporalTableFunction]] was created.
*/
def createTemporalTableFunction(
timeAttribute: Expression,
primaryKey: Expression): TemporalTableFunction = {
val temporalTable = TemporalTable(timeAttribute, primaryKey, logicalPlan)
.validate(tableEnv)
.asInstanceOf[TemporalTable]
TemporalTableFunction.create(
this,
temporalTable.timeAttribute,
validatePrimaryKeyExpression(temporalTable.primaryKey))
}
private def validatePrimaryKeyExpression(expression: Expression): String = {
expression match {
case fieldReference: ResolvedFieldReference =>
fieldReference.name
case _ => throw new ValidationException(
s"Unsupported expression [$expression] as primary key. " +
s"Only top-level (not nested) field references are supported.")
}
}
/**
* Renames the fields of the expression result. Use this to disambiguate fields before
* joining two tables.
*
* Example:
*
* {{{
* tab.as('a, 'b)
* }}}
*/
def as(fields: Expression*): Table = {
logicalPlan match {
case functionCall: LogicalTableFunctionCall if functionCall.input == null =>
// If the logical plan is a TableFunctionCall, we replace its field names to avoid special
// cases during the validation.
if (fields.length != functionCall.output.length) {
throw new ValidationException(
"List of column aliases must have same degree as TableFunction's output")
}
if (!fields.forall(_.isInstanceOf[UnresolvedFieldReference])) {
throw new ValidationException(
"Alias field must be an instance of UnresolvedFieldReference"
)
}
new Table(
tableEnv,
LogicalTableFunctionCall(
functionCall.functionName,
functionCall.tableFunction,
functionCall.parameters,
functionCall.externalResultType,
fields.map(_.asInstanceOf[UnresolvedFieldReference].name).toArray,
functionCall.input)
)
case _ =>
// prepend an AliasNode
new Table(tableEnv, AliasNode(fields, logicalPlan).validate(tableEnv))
}
}
/**
* Renames the fields of the expression result. Use this to disambiguate fields before
* joining two tables.
*
* Example:
*
* {{{
* tab.as("a, b")
* }}}
*/
def as(fields: String): Table = {
val fieldExprs = ExpressionParser.parseExpressionList(fields)
as(fieldExprs: _*)
}
/**
* Filters out elements that don't pass the filter predicate. Similar to a SQL WHERE
* clause.
*
* Example:
*
* {{{
* tab.filter('name === "Fred")
* }}}
*/
def filter(predicate: Expression): Table = {
new Table(tableEnv, Filter(predicate, logicalPlan).validate(tableEnv))
}
/**
* Filters out elements that don't pass the filter predicate. Similar to a SQL WHERE
* clause.
*
* Example:
*
* {{{
* tab.filter("name = 'Fred'")
* }}}
*/
def filter(predicate: String): Table = {
val predicateExpr = ExpressionParser.parseExpression(predicate)
filter(predicateExpr)
}
/**
* Filters out elements that don't pass the filter predicate. Similar to a SQL WHERE
* clause.
*
* Example:
*
* {{{
* tab.where('name === "Fred")
* }}}
*/
def where(predicate: Expression): Table = {
filter(predicate)
}
/**
* Filters out elements that don't pass the filter predicate. Similar to a SQL WHERE
* clause.
*
* Example:
*
* {{{
* tab.where("name = 'Fred'")
* }}}
*/
def where(predicate: String): Table = {
filter(predicate)
}
/**
* Groups the elements on some grouping keys. Use this before a selection with aggregations
* to perform the aggregation on a per-group basis. Similar to a SQL GROUP BY statement.
*
* Example:
*
* {{{
* tab.groupBy('key).select('key, 'value.avg)
* }}}
*/
def groupBy(fields: Expression*): GroupedTable = {
new GroupedTable(this, fields)
}
/**
* Groups the elements on some grouping keys. Use this before a selection with aggregations
* to perform the aggregation on a per-group basis. Similar to a SQL GROUP BY statement.
*
* Example:
*
* {{{
* tab.groupBy("key").select("key, value.avg")
* }}}
*/
def groupBy(fields: String): GroupedTable = {
val fieldsExpr = ExpressionParser.parseExpressionList(fields)
groupBy(fieldsExpr: _*)
}
/**
* Removes duplicate values and returns only distinct (different) values.
*
* Example:
*
* {{{
* tab.select("key, value").distinct()
* }}}
*/
def distinct(): Table = {
new Table(tableEnv, Distinct(logicalPlan).validate(tableEnv))
}
/**
* Joins two [[Table]]s. Similar to an SQL join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary. You can use
* where and select clauses after a join to further specify the behaviour of the join.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.join(right).where('a === 'b && 'c > 3).select('a, 'b, 'd)
* }}}
*/
def join(right: Table): Table = {
join(right, None, JoinType.INNER)
}
/**
* Joins two [[Table]]s. Similar to an SQL join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.join(right, "a = b")
* }}}
*/
def join(right: Table, joinPredicate: String): Table = {
join(right, joinPredicate, JoinType.INNER)
}
/**
* Joins two [[Table]]s. Similar to an SQL join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.join(right, 'a === 'b).select('a, 'b, 'd)
* }}}
*/
def join(right: Table, joinPredicate: Expression): Table = {
join(right, Some(joinPredicate), JoinType.INNER)
}
/**
* Joins this [[Table]] with a user-defined [[org.apache.flink.table.api.functions.TableFunction]].
* This join is similar to a SQL left outer join with an ON TRUE predicate, but it works with a
* table function. Each row of the outer table is joined with all rows produced by the table
* function. If the table function does not produce any row, the outer row is padded with nulls.
*
* Scala Example:
* {{{
* class MySplitUDTF extends TableFunction[String] {
* def eval(str: String): Unit = {
* str.split("#").foreach(collect)
* }
* }
*
* val split = new MySplitUDTF()
* table.leftOuterJoin(split('c) as ('s)).select('a, 'b, 'c, 's)
* }}}
*
* Java Example:
* {{{
* class MySplitUDTF extends TableFunction<String> {
*   public void eval(String str) {
*     for (String s : str.split("#")) {
*       collect(s);
*     }
*   }
* }
*
* TableFunction split = new MySplitUDTF();
* tableEnv.registerFunction("split", split);
* table.leftOuterJoin(new Table(tableEnv, "split(c)").as("s")).select("a, b, c, s");
* }}}
*/
def leftOuterJoin(right: Table): Table = {
join(right, None, JoinType.LEFT_OUTER)
}
/**
* Joins two [[Table]]s. Similar to an SQL left outer join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]] and its [[TableConfig]] must
* have nullCheck enabled.
*
* Example:
*
* {{{
* left.leftOuterJoin(right, "a = b").select('a, 'b, 'd)
* }}}
*/
def leftOuterJoin(right: Table, joinPredicate: String): Table = {
join(right, joinPredicate, JoinType.LEFT_OUTER)
}
/**
* Joins two [[Table]]s. Similar to an SQL left outer join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]] and its [[TableConfig]] must
* have nullCheck enabled.
*
* Example:
*
* {{{
* left.leftOuterJoin(right, 'a === 'b).select('a, 'b, 'd)
* }}}
*/
def leftOuterJoin(right: Table, joinPredicate: Expression): Table = {
join(right, Some(joinPredicate), JoinType.LEFT_OUTER)
}
/**
* Joins two [[Table]]s. Similar to an SQL right outer join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]] and its [[TableConfig]] must
* have nullCheck enabled.
*
* Example:
*
* {{{
* left.rightOuterJoin(right, "a = b").select('a, 'b, 'd)
* }}}
*/
def rightOuterJoin(right: Table, joinPredicate: String): Table = {
join(right, joinPredicate, JoinType.RIGHT_OUTER)
}
/**
* Joins two [[Table]]s. Similar to an SQL right outer join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]] and its [[TableConfig]] must
* have nullCheck enabled.
*
* Example:
*
* {{{
* left.rightOuterJoin(right, 'a === 'b).select('a, 'b, 'd)
* }}}
*/
def rightOuterJoin(right: Table, joinPredicate: Expression): Table = {
join(right, Some(joinPredicate), JoinType.RIGHT_OUTER)
}
/**
* Joins two [[Table]]s. Similar to an SQL full outer join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]] and its [[TableConfig]] must
* have nullCheck enabled.
*
* Example:
*
* {{{
* left.fullOuterJoin(right, "a = b").select('a, 'b, 'd)
* }}}
*/
def fullOuterJoin(right: Table, joinPredicate: String): Table = {
join(right, joinPredicate, JoinType.FULL_OUTER)
}
/**
* Joins two [[Table]]s. Similar to an SQL full outer join. The fields of the two joined
* operations must not overlap; use [[as]] to rename fields if necessary.
*
* Note: Both tables must be bound to the same [[TableEnvironment]] and its [[TableConfig]] must
* have nullCheck enabled.
*
* Example:
*
* {{{
* left.fullOuterJoin(right, 'a === 'b).select('a, 'b, 'd)
* }}}
*/
def fullOuterJoin(right: Table, joinPredicate: Expression): Table = {
join(right, Some(joinPredicate), JoinType.FULL_OUTER)
}
private def join(right: Table, joinPredicate: String, joinType: JoinType): Table = {
val joinPredicateExpr = ExpressionParser.parseExpression(joinPredicate)
join(right, Some(joinPredicateExpr), joinType)
}
private def join(right: Table, joinPredicate: Option[Expression], joinType: JoinType): Table = {
// check if we join with a table or a table function
if (!containsUnboundedUDTFCall(right.logicalPlan)) {
// regular table-table join
// check that the TableEnvironment of right table is not null
// and right table belongs to the same TableEnvironment
if (right.tableEnv != this.tableEnv) {
throw new ValidationException("Only tables from the same TableEnvironment can be joined.")
}
new Table(
tableEnv,
Join(this.logicalPlan, right.logicalPlan, joinType, joinPredicate, correlated = false)
.validate(tableEnv))
} else {
// join with a table function
// check join type
if (joinType != JoinType.INNER && joinType != JoinType.LEFT_OUTER) {
throw new ValidationException(
"TableFunctions are currently supported for join and leftOuterJoin.")
}
val udtf = right.logicalPlan.asInstanceOf[LogicalTableFunctionCall]
val udtfCall = LogicalTableFunctionCall(
udtf.functionName,
udtf.tableFunction,
udtf.parameters,
udtf.externalResultType,
udtf.fieldNames,
this.logicalPlan
).validate(tableEnv)
// correlated is false only if the parameter list is empty or all parameters are Literals
val correlated = !udtf.parameters.forall(_.isInstanceOf[Literal])
new Table(
tableEnv,
Join(this.logicalPlan, udtfCall, joinType, joinPredicate, correlated)
.validate(tableEnv))
}
}
/**
* Minus of two [[Table]]s with duplicate records removed.
* Similar to a SQL EXCEPT clause. Minus returns records from the left table that do not
* exist in the right table. Duplicate records in the left table are returned
* exactly once, i.e., duplicates are removed. Both tables must have identical field types.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.minus(right)
* }}}
*/
def minus(right: Table): Table = {
// check that right table belongs to the same TableEnvironment
if (right.tableEnv != this.tableEnv) {
throw new ValidationException("Only tables from the same TableEnvironment can be " +
"subtracted.")
}
new Table(tableEnv, Minus(logicalPlan, right.logicalPlan, all = false)
.validate(tableEnv))
}
/**
* Minus of two [[Table]]s. Similar to a SQL EXCEPT ALL clause.
* MinusAll returns the records that do not exist in
* the right table. A record that is present n times in the left table and m times
* in the right table is returned (n - m) times, i.e., as many duplicates as are present
* in the right table are removed. Both tables must have identical field types.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.minusAll(right)
* }}}
*/
def minusAll(right: Table): Table = {
// check that right table belongs to the same TableEnvironment
if (right.tableEnv != this.tableEnv) {
throw new ValidationException("Only tables from the same TableEnvironment can be " +
"subtracted.")
}
new Table(tableEnv, Minus(logicalPlan, right.logicalPlan, all = true)
.validate(tableEnv))
}
/**
* Unions two [[Table]]s with duplicate records removed.
* Similar to an SQL UNION. The fields of the two union operations must fully overlap.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.union(right)
* }}}
*/
def union(right: Table): Table = {
// check that right table belongs to the same TableEnvironment
if (right.tableEnv != this.tableEnv) {
throw new ValidationException("Only tables from the same TableEnvironment can be unioned.")
}
new Table(tableEnv, Union(logicalPlan, right.logicalPlan, all = false).validate(tableEnv))
}
/**
* Unions two [[Table]]s. Similar to an SQL UNION ALL. The fields of the two union operations
* must fully overlap.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.unionAll(right)
* }}}
*/
def unionAll(right: Table): Table = {
// check that right table belongs to the same TableEnvironment
if (right.tableEnv != this.tableEnv) {
throw new ValidationException("Only tables from the same TableEnvironment can be unioned.")
}
new Table(tableEnv, Union(logicalPlan, right.logicalPlan, all = true).validate(tableEnv))
}
/**
* Intersects two [[Table]]s with duplicate records removed. Intersect returns records that
* exist in both tables. If a record is present in one or both tables more than once, it is
* returned just once, i.e., the resulting table has no duplicate records. Similar to an
* SQL INTERSECT. The fields of the two intersect operations must fully overlap.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.intersect(right)
* }}}
*/
def intersect(right: Table): Table = {
// check that right table belongs to the same TableEnvironment
if (right.tableEnv != this.tableEnv) {
throw new ValidationException(
"Only tables from the same TableEnvironment can be intersected.")
}
new Table(tableEnv, Intersect(logicalPlan, right.logicalPlan, all = false).validate(tableEnv))
}
/**
* Intersects two [[Table]]s. IntersectAll returns records that exist in both tables.
* If a record is present in both tables more than once, it is returned as many times as it
* is present in both tables, i.e., the resulting table might have duplicate records. Similar
* to an SQL INTERSECT ALL. The fields of the two intersect operations must fully overlap.
*
* Note: Both tables must be bound to the same [[TableEnvironment]].
*
* Example:
*
* {{{
* left.intersectAll(right)
* }}}
*/
def intersectAll(right: Table): Table = {
// check that right table belongs to the same TableEnvironment
if (right.tableEnv != this.tableEnv) {
throw new ValidationException(
"Only tables from the same TableEnvironment can be intersected.")
}
new Table(tableEnv, Intersect(logicalPlan, right.logicalPlan, all = true).validate(tableEnv))
}
/**
* Sorts the given [[Table]]. Similar to SQL ORDER BY.
* The resulting Table is globally sorted across all parallel partitions.
*
* Example:
*
* {{{
* tab.orderBy('name.desc)
* }}}
*/
def orderBy(fields: Expression*): Table = {
val order: Seq[Ordering] = fields.map {
case o: Ordering => o
case e => Asc(e)
}
new Table(tableEnv, Sort(order, logicalPlan).validate(tableEnv))
}
/**
* Sorts the given [[Table]]. Similar to SQL ORDER BY.
* The resulting Table is globally sorted across all parallel partitions.
*
* Example:
*
* {{{
* tab.orderBy("name.desc")
* }}}
*/
def orderBy(fields: String): Table = {
val parsedFields = ExpressionParser.parseExpressionList(fields)
orderBy(parsedFields: _*)
}
/**
* Limits a sorted result from an offset position.
* Similar to a SQL OFFSET clause. Offset is technically part of the Order By operator and
* thus must be preceded by it.
*
* [[Table.offset(o)]] can be combined with a subsequent [[Table.fetch(n)]] call to return the
* first n rows starting from the offset position o.
*
* {{{
* // returns an unlimited number of records beginning with the 4th record
* tab.orderBy('name.desc).offset(3)
* // returns the first 5 records starting from the 10th record
* tab.orderBy('name.desc).offset(10).fetch(5)
* }}}
*
* @param offset number of records to skip
*/
def offset(offset: Int): Table = {
new Table(tableEnv, Limit(offset, -1, logicalPlan).validate(tableEnv))
}
/**
* Limits a sorted result to the first n rows.
* Similar to a SQL FETCH clause. Fetch is technically part of the Order By operator and
* thus must be preceded by it.
*
* [[Table.fetch(n)]] can be combined with a preceding [[Table.offset(o)]] call to return the
* first n rows starting from the offset position o.
*
* {{{
* // returns the first 3 records.
* tab.orderBy('name.desc).fetch(3)
* // returns the first 5 records starting from the 10th record
* tab.orderBy('name.desc).offset(10).fetch(5)
* }}}
*
* @param fetch the number of records to return
*/
def fetch(fetch: Int): Table = {
this.logicalPlan match {
case Limit(o, -1, c) =>
// replace LIMIT without FETCH by LIMIT with FETCH
new Table(tableEnv, Limit(o, fetch, c).validate(tableEnv))
case Limit(_, _, _) =>
throw new ValidationException("FETCH is already defined.")
case _ =>
new Table(tableEnv, Limit(0, fetch, logicalPlan).validate(tableEnv))
}
}
/**
* Writes the [[Table]] to a [[TableSink]]. A [[TableSink]] defines an external storage location.
*
* A batch [[Table]] can only be written to a
* [[org.apache.flink.table.sinks.BatchTableSink]], a streaming [[Table]] requires a
* [[org.apache.flink.table.sinks.AppendStreamTableSink]], a
* [[org.apache.flink.table.sinks.RetractStreamTableSink]], or an
* [[org.apache.flink.table.sinks.UpsertStreamTableSink]].
*
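* A usage sketch, assuming a [[org.apache.flink.table.sinks.CsvTableSink]] with a
* (path, fieldDelim) constructor; the output path is hypothetical:
*
* {{{
*   table.writeToSink(new CsvTableSink("/path/to/output", "|"))
* }}}
*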
* @param sink The [[TableSink]] to which the [[Table]] is written.
* @tparam T The data type that the [[TableSink]] expects.
*/
def writeToSink[T](sink: TableSink[T]): Unit = {
// get schema information of table
val rowType = getRelNode.getRowType
val fieldNames: Array[String] = rowType.getFieldNames.asScala.toArray
val fieldTypes: Array[DataType] = rowType.getFieldList.asScala
.map(field => FlinkTypeFactory.toInternalType(field.getType))
.map {
// replace time indicator types by SQL_TIMESTAMP
case t: TimestampType if FlinkTypeFactory.isTimeIndicatorType(t) => TimestampType.TIMESTAMP
case t: InternalType => t
}.toArray
// configure the table sink
val configuredSink = sink.configure(fieldNames, fieldTypes)
// emit the table to the configured table sink
tableEnv.writeToSink(this, configuredSink)
}
/**
* Writes the [[Table]] to a [[TableSink]]. A [[TableSink]] defines an external storage location.
*
* A batch [[Table]] can only be written to a
* [[org.apache.flink.table.sinks.BatchTableSink]], a streaming [[Table]] requires a
* [[org.apache.flink.table.sinks.AppendStreamTableSink]], a
* [[org.apache.flink.table.sinks.RetractStreamTableSink]], or an
* [[org.apache.flink.table.sinks.UpsertStreamTableSink]].
*
* @param sink The [[TableSink]] to which the [[Table]] is written.
* @param conf The configuration for the query that writes to the sink.
* @tparam T The data type that the [[TableSink]] expects.
*/
def writeToSink[T](sink: TableSink[T], conf: QueryConfig): Unit = {
conf.overrideTableConfig(tableEnv.getConfig)
writeToSink(sink)
}
/**
* Writes the [[Table]] to a [[TableSink]] that was registered under the specified name.
*
* A batch [[Table]] can only be written to a
* [[org.apache.flink.table.sinks.BatchTableSink]], a streaming [[Table]] requires a
* [[org.apache.flink.table.sinks.AppendStreamTableSink]], a
* [[org.apache.flink.table.sinks.RetractStreamTableSink]], or an
* [[org.apache.flink.table.sinks.UpsertStreamTableSink]].
*
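* A usage sketch, assuming a sink has been registered under the (hypothetical) name
* "OutputTable":
*
* {{{
*   table.insertInto("OutputTable")
* }}}
*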
* @param tableName Name of the registered [[TableSink]] to which the [[Table]] is written.
*/
def insertInto(tableName: String): Unit = {
tableEnv.insertInto(this, tableName)
}
/**
* Writes the [[Table]] to a [[TableSink]] that was registered under the specified name.
*
* A batch [[Table]] can only be written to a
* [[org.apache.flink.table.sinks.BatchTableSink]], a streaming [[Table]] requires a
* [[org.apache.flink.table.sinks.AppendStreamTableSink]], a
* [[org.apache.flink.table.sinks.RetractStreamTableSink]], or an
* [[org.apache.flink.table.sinks.UpsertStreamTableSink]].
*
* @param tableName Name of the [[TableSink]] to which the [[Table]] is written.
* @param conf The [[QueryConfig]] to use.
*/
def insertInto(tableName: String, conf: QueryConfig): Unit = {
conf.overrideTableConfig(tableEnv.getConfig)
tableEnv.insertInto(this, tableName)
}
/**
* Groups the records of a table by assigning them to windows defined by a time or row interval.
*
* For streaming tables of infinite size, grouping into windows is required to define finite
* groups on which group-based aggregates can be computed.
*
* For batch tables of finite size, windowing essentially provides shortcuts for time-based
* groupBy.
*
* __Note__: Computing windowed aggregates on a streaming table is only a parallel operation
* if additional grouping attributes are added to the `groupBy(...)` clause.
* If the `groupBy(...)` only references a window alias, the streamed table will be processed
* by a single task, i.e., with parallelism 1.
*
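* A usage sketch, assuming the Scala expression DSL is imported and the table has a time
* attribute 'rowtime and fields 'key and 'value:
*
* {{{
*   tab.window(Tumble over 10.minutes on 'rowtime as 'w)
*      .groupBy('w, 'key)
*      .select('key, 'value.avg)
* }}}
*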
* @param window window that specifies how elements are grouped.
* @return A windowed table.
*/
def window(window: WindowExpression): WindowedTable = {
new WindowedTable(this, window)
}
/**
* Defines over-windows on the records of a table.
*
* An over-window defines for each record an interval of records over which aggregation
* functions can be computed.
*
* Example:
*
* {{{
* table
* .window(Over partitionBy 'c orderBy 'rowTime preceding 10.seconds as 'ow)
* .select('c, 'b.count over 'ow, 'e.sum over 'ow)
* }}}
*
* __Note__: Computing over window aggregates on a streaming table is only a parallel operation
* if the window is partitioned. Otherwise, the whole stream will be processed by a single
* task, i.e., with parallelism 1.
*
* __Note__: Over-windows for batch tables are currently not supported.
*
* @param overWindows windows that specify the record interval over which aggregations are
* computed.
* @return An OverWindowedTable to specify the aggregations.
*/
@varargs
def window(overWindows: OverWindow*): OverWindowedTable = {
new OverWindowedTable(this, overWindows.toArray)
}
var tableName: String = _
/**
* Registers a unique table name for this table in the table environment
* and returns the registered table name.
*/
override def toString: String = {
if (tableName == null) {
tableName = "UnnamedTable$" + tableEnv.attrNameCntr.getAndIncrement()
tableEnv.registerTable(tableName, this)
}
tableName
}
/**
* Checks if the plan represented by a [[LogicalNode]] contains an unbounded UDTF call.
* @param n the node to check
* @return true if the plan contains an unbounded UDTF call, false otherwise.
*/
private def containsUnboundedUDTFCall(n: LogicalNode): Boolean = {
n match {
case functionCall: LogicalTableFunctionCall if functionCall.input == null => true
case u: UnaryNode => containsUnboundedUDTFCall(u.child)
case b: BinaryNode => containsUnboundedUDTFCall(b.left) || containsUnboundedUDTFCall(b.right)
case _: LeafNode => false
}
}
}
/**
* A table that has been grouped on a set of grouping keys.
*/
class GroupedTable(
private[flink] val table: Table,
private[flink] val groupKey: Seq[Expression]) {
/**
* Performs a selection operation on a grouped table. Similar to an SQL SELECT statement.
* The field expressions can contain complex expressions and aggregations.
*
* Example:
*
* {{{
* tab.groupBy('key).select('key, 'value.avg + " The average" as 'average)
* }}}
*/
def select(fields: Expression*): Table = {
val expandedFields = expandProjectList(fields, table.logicalPlan, table.tableEnv)
val (aggNames, propNames) = extractAggregationsAndProperties(expandedFields, table.tableEnv)
if (propNames.nonEmpty) {
throw new ValidationException("Window properties can only be used on windowed tables.")
}
val projectsOnAgg = replaceAggregationsAndProperties(
expandedFields, table.tableEnv, aggNames, propNames)
val projectFields = extractFieldReferences(expandedFields ++ groupKey)
new Table(table.tableEnv,
Project(projectsOnAgg,
Aggregate(groupKey, aggNames.map(a => Alias(a._1, a._2)).toSeq,
Project(projectFields, table.logicalPlan).validate(table.tableEnv)
).validate(table.tableEnv)
).validate(table.tableEnv))
}
/**
* Performs a selection operation on a grouped table. Similar to an SQL SELECT statement.
* The field expressions can contain complex expressions and aggregations.
*
* Example:
*
* {{{
* tab.groupBy("key").select("key, value.avg + ' The average' as average")
* }}}
*/
def select(fields: String): Table = {
val fieldExprs = ExpressionParser.parseExpressionList(fields)
// get the correct expression for AggFunctionCall
val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(_, table.tableEnv))
select(withResolvedAggFunctionCall: _*)
}
}
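/**
* A table that has been windowed by a grouping [[WindowExpression]].
*/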
class WindowedTable(
private[flink] val table: Table,
private[flink] val window: WindowExpression) {
/**
* Groups the elements by a mandatory window and one or more optional grouping attributes.
* The window is specified by referring to its alias.
*
* If no additional grouping attribute is specified and if the input is a streaming table,
* the aggregation will be performed by a single task, i.e., with parallelism 1.
*
* Aggregations are performed per group and defined by a subsequent `select(...)` clause similar
* to a SQL SELECT-GROUP-BY query.
*
* Example:
*
* {{{
* tab.window([window] as 'w).groupBy('w, 'key).select('key, 'value.avg)
* }}}
*/
def groupBy(fields: Expression*): WindowGroupedTable = {
val fieldsWithoutWindow = fields.filterNot(window.alias.equals(_))
if (fields.size != fieldsWithoutWindow.size + 1) {
throw new ValidationException("GroupBy must contain exactly one window alias.")
}
new WindowGroupedTable(table, fieldsWithoutWindow, window)
}
/**
* Groups the elements by a mandatory window and one or more optional grouping attributes.
* The window is specified by referring to its alias.
*
* If no additional grouping attribute is specified and if the input is a streaming table,
* the aggregation will be performed by a single task, i.e., with parallelism 1.
*
* Aggregations are performed per group and defined by a subsequent `select(...)` clause similar
* to a SQL SELECT-GROUP-BY query.
*
* Example:
*
* {{{
* tab.window([window].as("w")).groupBy("w, key").select("key, value.avg")
* }}}
*/
def groupBy(fields: String): WindowGroupedTable = {
val fieldsExpr = ExpressionParser.parseExpressionList(fields)
groupBy(fieldsExpr: _*)
}
}
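/**
* A table on which one or more [[OverWindow]]s are defined.
*/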
class OverWindowedTable(
private[flink] val table: Table,
private[flink] val overWindows: Array[OverWindow]) {
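/**
* Performs a selection operation on an over-windowed table. Similar to an SQL SELECT
* statement. The field expressions can contain complex expressions and over-window
* aggregations.
*
* Example:
*
* {{{
* overWindowedTable.select('c, 'b.count over 'ow, 'e.sum over 'ow)
* }}}
*/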
def select(fields: Expression*): Table = {
val expandedFields = expandProjectList(
fields,
table.logicalPlan,
table.tableEnv)
if (fields.exists(_.isInstanceOf[WindowProperty])) {
throw new ValidationException(
"Window start and end properties are not available for Over windows.")
}
val expandedOverFields = resolveOverWindows(expandedFields, overWindows, table.tableEnv)
new Table(
table.tableEnv,
Project(expandedOverFields.map(UnresolvedAlias), table.logicalPlan).validate(table.tableEnv))
}
def select(fields: String): Table = {
val fieldExprs = ExpressionParser.parseExpressionList(fields)
// get the correct expression for AggFunctionCall
val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(_, table.tableEnv))
select(withResolvedAggFunctionCall: _*)
}
}
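/**
* A table that has been windowed and grouped by a window alias and optional grouping keys.
*/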
class WindowGroupedTable(
private[flink] val table: Table,
private[flink] val groupKeys: Seq[Expression],
private[flink] val window: WindowExpression) {
/**
* Performs a selection operation on a window grouped table. Similar to an SQL SELECT statement.
* The field expressions can contain complex expressions and aggregations.
*
* Example:
*
* {{{
* windowGroupedTable.select('key, 'window.start, 'value.avg as 'valavg)
* }}}
*/
def select(fields: Expression*): Table = {
val expandedFields = expandProjectList(fields, table.logicalPlan, table.tableEnv)
val (aggNames, propNames) = extractAggregationsAndProperties(expandedFields, table.tableEnv)
val projectsOnAgg = replaceAggregationsAndProperties(
expandedFields, table.tableEnv, aggNames, propNames)
val projectFields = extractFieldReferences(expandedFields ++ groupKeys :+ window.timeField)
new Table(table.tableEnv,
Project(
projectsOnAgg,
WindowAggregate(
groupKeys,
window.toLogicalWindow,
propNames.map(a => Alias(a._1, a._2)).toSeq,
aggNames.map(a => Alias(a._1, a._2)).toSeq,
Project(projectFields, table.logicalPlan).validate(table.tableEnv)
).validate(table.tableEnv)
).validate(table.tableEnv))
}
/**
* Performs a selection operation on a window grouped table. Similar to an SQL SELECT statement.
* The field expressions can contain complex expressions and aggregations.
*
* Example:
*
* {{{
* windowGroupedTable.select("key, window.start, value.avg as valavg")
* }}}
*/
def select(fields: String): Table = {
val fieldExprs = ExpressionParser.parseExpressionList(fields)
// get the correct expression for AggFunctionCall
val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(_, table.tableEnv))
select(withResolvedAggFunctionCall: _*)
}
}