
// org.apache.flink.table.functions.AggregateFunction Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.table.functions;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.api.dataview.ListView;
import org.apache.flink.table.api.dataview.MapView;
import org.apache.flink.table.catalog.DataTypeFactory;
import org.apache.flink.table.types.extraction.TypeInferenceExtractor;
import org.apache.flink.table.types.inference.TypeInference;
/**
* Base class for a user-defined aggregate function. A user-defined aggregate function maps scalar
* values of multiple rows to a new scalar value.
*
* <p>The behavior of an {@link AggregateFunction} is centered around the concept of an
* accumulator. The accumulator is an intermediate data structure that stores the aggregated
* values until a final aggregation result is computed.
*
* <p>For each set of rows that needs to be aggregated, the runtime will create an empty
* accumulator by calling {@link #createAccumulator()}. Subsequently, the {@code accumulate()}
* method of the function is called for each input row to update the accumulator. Once all rows
* have been processed, the {@link #getValue(Object)} method of the function is called to compute
* and return the final result.
*
* <p>The main behavior of an {@link AggregateFunction} can be defined by implementing a custom
* accumulate method. An accumulate method must be declared publicly, not static, and named
* {@code accumulate}. Accumulate methods can also be overloaded by implementing multiple methods
* named {@code accumulate}.
*
* <p>By default, input, accumulator, and output data types are automatically extracted using
* reflection. This includes the generic argument {@code ACC} of the class for determining an
* accumulator data type and the generic argument {@code T} for determining an output data
* type. Input arguments are derived from one or more {@code accumulate()} methods. If the
* reflective information is not sufficient, it can be supported and enriched with {@link
* DataTypeHint} and {@link FunctionHint} annotations.
*
* <p>An {@link AggregateFunction} needs at least three methods:
*
* <ul>
*   <li>{@code createAccumulator}
*   <li>{@code accumulate}
*   <li>{@code getValue}
* </ul>
*
* <p>There are a few other methods that are optional:
*
* <ul>
*   <li>{@code retract}
*   <li>{@code merge}
* </ul>
*
* <p>All these methods must be declared publicly, not static, and named exactly as the names
* mentioned above to be called by generated code.
*
* <p>For storing a user-defined function in a catalog, the class must have a default constructor
* and must be instantiable during runtime.
*
* <pre>{@code
* Processes the input values and updates the provided accumulator instance. The method
* accumulate can be overloaded with different custom types and arguments. An aggregate function
* requires at least one accumulate() method.
*
* param: accumulator           the accumulator which contains the current aggregated results
* param: [user defined inputs] the input value (usually obtained from newly arrived data).
*
* public void accumulate(ACC accumulator, [user defined inputs])
* }</pre>
*
* <pre>{@code
* Retracts the input values from the accumulator instance. The current design assumes the
* inputs are the values that have been previously accumulated. The method retract can be
* overloaded with different custom types and arguments. This method must be implemented for
* bounded OVER aggregates over unbounded tables.
*
* param: accumulator           the accumulator which contains the current aggregated results
* param: [user defined inputs] the input value to retract (a value that has been previously
*                              accumulated).
*
* public void retract(ACC accumulator, [user defined inputs])
* }</pre>
*
* <pre>{@code
* Merges a group of accumulator instances into one accumulator instance. This method must be
* implemented for unbounded session window and hop window grouping aggregates and
* bounded grouping aggregates. Besides, implementing this method will be helpful for
* optimizations. For example, the two-phase aggregation optimization requires all the
* AggregateFunctions to support the "merge" method.
*
* param: accumulator the accumulator which will keep the merged aggregate results. It should
*                    be noted that the accumulator may contain the previous aggregated
*                    results. Therefore the user should not replace or clean this instance in
*                    the custom merge method.
* param: iterable    a java.lang.Iterable pointing to a group of accumulators that will be
*                    merged.
*
* public void merge(ACC accumulator, java.lang.Iterable<ACC> iterable)
* }</pre>
*
* <p>If this aggregate function can only be applied in an OVER window, this can be declared by
* returning the requirement {@link FunctionRequirement#OVER_WINDOW_ONLY} in {@link
* #getRequirements()}.
*
* <p>If an accumulator needs to store large amounts of data, {@link ListView} and {@link MapView}
* provide advanced features for leveraging Flink's state backends in unbounded data scenarios.
*
* <p>The following examples show how to specify an aggregate function:
*
* <pre>{@code
* // a function that sums INT arguments that are not null and emits the total as STRING
* // the accumulator holds a BIGINT counter
* public static class CountFunction extends AggregateFunction<String, CountFunction.MyAccumulator> {
*   public static class MyAccumulator {
*     public long count = 0L;
*   }
*
*   {@literal @}Override
*   public MyAccumulator createAccumulator() {
*     return new MyAccumulator();
*   }
*
*   public void accumulate(MyAccumulator accumulator, Integer i) {
*     if (i != null) {
*       accumulator.count += i;
*     }
*   }
*
*   {@literal @}Override
*   public String getValue(MyAccumulator accumulator) {
*     return "Result: " + accumulator.count;
*   }
* }
*
* // a function that determines the maximum of either BIGINT or STRING arguments
* // the accumulator and the output is either BIGINT or STRING
* public static class MaxFunction extends AggregateFunction {
*   // NOTE(review): the type arguments and the body of this example are missing from this
*   // copy of the file -- restore them from the upstream Flink sources.
* }
* }</pre>
*
* @param <T> final result type of the aggregation
* @param <ACC> intermediate result type during the aggregation
*/
@PublicEvolving
public abstract class AggregateFunction<T, ACC> extends ImperativeAggregateFunction<T, ACC> {

    /**
     * Called every time when an aggregation result should be materialized. The returned value could
     * be either an early and incomplete result (periodically emitted as data arrives) or the final
     * result of the aggregation.
     *
     * @param accumulator the accumulator which contains the current intermediate results
     * @return the aggregation result
     */
    public abstract T getValue(ACC accumulator);

    /**
     * Identifies this function as an aggregate function. Declared {@code final}, so subclasses
     * cannot report a different kind.
     *
     * @return always {@link FunctionKind#AGGREGATE}
     */
    @Override
    public final FunctionKind getKind() {
        return FunctionKind.AGGREGATE;
    }

    /**
     * Builds the type inference for this function by passing the concrete runtime class of this
     * instance to {@link TypeInferenceExtractor#forAggregateFunction}.
     *
     * @param typeFactory factory handed through to the extractor
     * @return the type inference produced by the extractor for this function's class
     */
    @Override
    @SuppressWarnings({"unchecked", "rawtypes"})
    public TypeInference getTypeInference(DataTypeFactory typeFactory) {
        // getClass() yields Class<? extends AggregateFunction>; the raw cast lets it be passed to
        // the extractor, which is why the unchecked/rawtypes warnings are suppressed here.
        return TypeInferenceExtractor.forAggregateFunction(typeFactory, (Class) getClass());
    }
}