org.apache.flink.table.runtime.operators.deduplicate.DeduplicateFunctionHelper Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of flink-table-runtime-blink_2.12 Show documentation
This module contains classes that are required by a task manager for execution of table programs. The content of this module is work-in-progress. It will replace the runtime classes contained in flink-table-planner once it is stable. See FLINK-11439 and FLIP-32 for more details.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.table.runtime.operators.deduplicate;

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.generated.RecordEqualiser;
import org.apache.flink.types.RowKind;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;

/**
 * Utility for deduplicate function.
 *
 * <p>All methods are static; each one inspects the current row (and, where applicable, the
 * previous row kept in state), sets the appropriate {@link RowKind} on the rows and forwards them
 * to the given collector.
 */
class DeduplicateFunctionHelper {

    /**
     * Processes element to deduplicate on keys with process time semantic, sends current element as
     * last row, retracts previous element if needed.
     *
     * <p>If neither {@code generateUpdateBefore} nor {@code generateInsert} is set, the state is
     * not accessed and every row is sent as UPDATE_AFTER.
     *
     * @param currentRow latest row received by deduplicate function, must be an INSERT message
     * @param generateUpdateBefore whether need to send UPDATE_BEFORE message for updates
     * @param generateInsert whether need to send INSERT message for the first row of a key
     * @param state state holding the previous row; it is not accessed when both {@code
     *     generateUpdateBefore} and {@code generateInsert} are false
     * @param out underlying collector
     * @param isStateTtlEnabled whether state ttl is enabled; if enabled, a row equal to the
     *     previous row is still emitted
     * @param equaliser the record equaliser used to equal RowData.
     * @throws IllegalArgumentException if {@code currentRow} is not an INSERT message
     * @throws Exception if accessing {@code state} fails
     */
    static void processLastRowOnProcTime(
            RowData currentRow,
            boolean generateUpdateBefore,
            boolean generateInsert,
            ValueState<RowData> state,
            Collector<RowData> out,
            boolean isStateTtlEnabled,
            RecordEqualiser equaliser)
            throws Exception {

        checkInsertOnly(currentRow);
        if (generateUpdateBefore || generateInsert) {
            // use state to keep the previous row content if we need to generate UPDATE_BEFORE
            // or use to distinguish the first row, if we need to generate INSERT
            RowData preRow = state.value();
            state.update(currentRow);
            if (preRow == null) {
                // the first row, send INSERT message
                currentRow.setRowKind(RowKind.INSERT);
                out.collect(currentRow);
            } else {
                if (!isStateTtlEnabled && equaliser.equals(preRow, currentRow)) {
                    // currentRow is the same as preRow and state cleaning is not enabled.
                    // We do not emit retraction and update message.
                    // If state cleaning is enabled, we have to emit messages to prevent too early
                    // state eviction of downstream operators.
                    return;
                } else {
                    if (generateUpdateBefore) {
                        preRow.setRowKind(RowKind.UPDATE_BEFORE);
                        out.collect(preRow);
                    }
                    currentRow.setRowKind(RowKind.UPDATE_AFTER);
                    out.collect(currentRow);
                }
            }
        } else {
            // always send UPDATE_AFTER if INSERT is not needed
            currentRow.setRowKind(RowKind.UPDATE_AFTER);
            out.collect(currentRow);
        }
    }

    /**
     * Processes element to deduplicate on keys, sends current element as last row, retracts
     * previous element if needed.
     *
     * <p>Note: we don't support stateless mode yet. Because this is not safe for Kafka tombstone
     * messages which doesn't contain full content. This can be a future improvement if the
     * downstream (e.g. sink) doesn't require full content for DELETE messages.
     *
     * @param currentRow latest row received by deduplicate function
     * @param generateUpdateBefore whether need to send UPDATE_BEFORE message for updates
     * @param state state of function, holds the previous row normalized to INSERT kind
     * @param out underlying collector
     * @param isStateTtlEnabled whether state ttl is enabled; if enabled, a row equal to the
     *     previous row is still emitted
     * @param equaliser the record equaliser used to equal RowData.
     * @throws Exception if accessing {@code state} fails
     */
    static void processLastRowOnChangelog(
            RowData currentRow,
            boolean generateUpdateBefore,
            ValueState<RowData> state,
            Collector<RowData> out,
            boolean isStateTtlEnabled,
            RecordEqualiser equaliser)
            throws Exception {
        RowData preRow = state.value();
        RowKind currentKind = currentRow.getRowKind();
        if (currentKind == RowKind.INSERT || currentKind == RowKind.UPDATE_AFTER) {
            if (preRow == null) {
                // the first row, send INSERT message
                currentRow.setRowKind(RowKind.INSERT);
                out.collect(currentRow);
            } else {
                if (!isStateTtlEnabled && equaliser.equals(preRow, currentRow)) {
                    // currentRow is the same as preRow and state cleaning is not enabled.
                    // We do not emit retraction and update message.
                    // If state cleaning is enabled, we have to emit messages to prevent too early
                    // state eviction of downstream operators.
                    // The state is left untouched as it already holds an equal row.
                    return;
                } else {
                    if (generateUpdateBefore) {
                        preRow.setRowKind(RowKind.UPDATE_BEFORE);
                        out.collect(preRow);
                    }
                    currentRow.setRowKind(RowKind.UPDATE_AFTER);
                    out.collect(currentRow);
                }
            }
            // normalize row kind
            currentRow.setRowKind(RowKind.INSERT);
            // save to state
            state.update(currentRow);
        } else {
            // DELETE or UPDATE_BEFORE
            if (preRow != null) {
                // always set to DELETE because this row has been removed
                // even if the input is UPDATE_BEFORE, there may be no UPDATE_AFTER after it.
                preRow.setRowKind(RowKind.DELETE);
                // output the preRow instead of currentRow,
                // because preRow always contains the full content.
                // currentRow may only contain key parts (e.g. Kafka tombstone records).
                out.collect(preRow);
                // clear state as the row has been removed
                state.clear();
            }
            // nothing to do if removing a non-existed row
        }
    }

    /**
     * Processes element to deduplicate on keys with process time semantic, sends current element if
     * it is first row.
     *
     * @param currentRow latest row received by deduplicate function, must be an INSERT message
     * @param state state of function; a non-null value marks that a first row was already emitted
     * @param out underlying collector
     * @throws IllegalArgumentException if {@code currentRow} is not an INSERT message
     * @throws Exception if accessing {@code state} fails
     */
    static void processFirstRowOnProcTime(
            RowData currentRow, ValueState<Boolean> state, Collector<RowData> out)
            throws Exception {

        checkInsertOnly(currentRow);
        // ignore record if it is not first row
        if (state.value() != null) {
            return;
        }
        state.update(true);
        // emit the first row which is INSERT message
        out.collect(currentRow);
    }

    /**
     * Collect the updated result for duplicate row.
     *
     * <p>The row kind of {@code preRow} is only changed temporarily while it is collected as
     * UPDATE_BEFORE and is restored afterwards; the row kind of {@code currentRow} is overwritten.
     *
     * @param generateUpdateBefore flag to generate UPDATE_BEFORE message or not
     * @param generateInsert flag to generate INSERT message or not
     * @param preRow previous row under the key, null if there is no previous row
     * @param currentRow current row under the key which is the duplicate row
     * @param out underlying collector
     */
    static void updateDeduplicateResult(
            boolean generateUpdateBefore,
            boolean generateInsert,
            RowData preRow,
            RowData currentRow,
            Collector<RowData> out) {

        if (generateUpdateBefore || generateInsert) {
            if (preRow == null) {
                // the first row, send INSERT message
                currentRow.setRowKind(RowKind.INSERT);
                out.collect(currentRow);
            } else {
                if (generateUpdateBefore) {
                    // remember the original kind so that preRow is handed back unchanged
                    final RowKind preRowKind = preRow.getRowKind();
                    preRow.setRowKind(RowKind.UPDATE_BEFORE);
                    out.collect(preRow);
                    preRow.setRowKind(preRowKind);
                }
                currentRow.setRowKind(RowKind.UPDATE_AFTER);
                out.collect(currentRow);
            }
        } else {
            // always send UPDATE_AFTER if neither UPDATE_BEFORE nor INSERT is needed
            currentRow.setRowKind(RowKind.UPDATE_AFTER);
            out.collect(currentRow);
        }
    }

    /**
     * Returns current row is duplicate row or not compared to previous row.
     *
     * <p>A missing previous row always yields {@code true}. Otherwise, when keeping the last row,
     * the result is {@code true} if the rowtime of {@code preRow} is less than or equal to the
     * rowtime of {@code currentRow}; when keeping the first row, the result is {@code true} if the
     * rowtime of {@code currentRow} is strictly less than the rowtime of {@code preRow}.
     *
     * @param preRow previous row under the key, may be null
     * @param currentRow current row under the key
     * @param rowtimeIndex index of the field that is read as a long rowtime value
     * @param keepLastRow whether the last row (true) or the first row (false) is kept
     */
    static boolean isDuplicate(
            RowData preRow, RowData currentRow, int rowtimeIndex, boolean keepLastRow) {
        if (keepLastRow) {
            return preRow == null
                    || getRowtime(preRow, rowtimeIndex) <= getRowtime(currentRow, rowtimeIndex);
        } else {
            return preRow == null
                    || getRowtime(currentRow, rowtimeIndex) < getRowtime(preRow, rowtimeIndex);
        }
    }

    /** Reads the field at {@code rowtimeIndex} of the given row as a long value. */
    private static long getRowtime(RowData input, int rowtimeIndex) {
        return input.getLong(rowtimeIndex);
    }

    /**
     * check message should be insert only.
     *
     * @param currentRow the row whose kind is verified
     * @throws IllegalArgumentException if the kind of {@code currentRow} is not INSERT
     */
    static void checkInsertOnly(RowData currentRow) {
        // A constant message is used on purpose: this check runs for every record, so nothing
        // should be formatted or allocated on the success path.
        Preconditions.checkArgument(
                currentRow.getRowKind() == RowKind.INSERT,
                "Only insert-only input is supported here, but the received row is not an INSERT message.");
    }

    /** Utility class, not meant to be instantiated. */
    private DeduplicateFunctionHelper() {}
}