All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.table.runtime.operators.deduplicate.DeduplicateFunctionHelper Maven / Gradle / Ivy

Go to download

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.table.runtime.operators.deduplicate;

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.generated.RecordEqualiser;
import org.apache.flink.types.RowKind;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;

/** Utility for deduplicate function. */
class DeduplicateFunctionHelper {

    /**
     * Processes element to deduplicate on keys with process time semantic, sends current element as
     * last row, retracts previous element if needed.
     *
     * @param currentRow latest row received by deduplicate function
     * @param generateUpdateBefore whether need to send UPDATE_BEFORE message for updates
     * @param state state of function, null if generateUpdateBefore is false
     * @param out underlying collector
     * @param isStateTtlEnabled whether state ttl is disabled
     * @param equaliser the record equaliser used to equal RowData.
     */
    static void processLastRowOnProcTime(
            RowData currentRow,
            boolean generateUpdateBefore,
            boolean generateInsert,
            ValueState state,
            Collector out,
            boolean isStateTtlEnabled,
            RecordEqualiser equaliser)
            throws Exception {

        checkInsertOnly(currentRow);
        if (generateUpdateBefore || generateInsert) {
            // use state to keep the previous row content if we need to generate UPDATE_BEFORE
            // or use to distinguish the first row, if we need to generate INSERT
            RowData preRow = state.value();
            state.update(currentRow);
            if (preRow == null) {
                // the first row, send INSERT message
                currentRow.setRowKind(RowKind.INSERT);
                out.collect(currentRow);
            } else {
                if (!isStateTtlEnabled && equaliser.equals(preRow, currentRow)) {
                    // currentRow is the same as preRow and state cleaning is not enabled.
                    // We do not emit retraction and update message.
                    // If state cleaning is enabled, we have to emit messages to prevent too early
                    // state eviction of downstream operators.
                    return;
                } else {
                    if (generateUpdateBefore) {
                        preRow.setRowKind(RowKind.UPDATE_BEFORE);
                        out.collect(preRow);
                    }
                    currentRow.setRowKind(RowKind.UPDATE_AFTER);
                    out.collect(currentRow);
                }
            }
        } else {
            // always send UPDATE_AFTER if INSERT is not needed
            currentRow.setRowKind(RowKind.UPDATE_AFTER);
            out.collect(currentRow);
        }
    }

    /**
     * Processes element to deduplicate on keys, sends current element as last row, retracts
     * previous element if needed.
     *
     * 

Note: we don't support stateless mode yet. Because this is not safe for Kafka tombstone * messages which doesn't contain full content. This can be a future improvement if the * downstream (e.g. sink) doesn't require full content for DELETE messages. * * @param currentRow latest row received by deduplicate function * @param generateUpdateBefore whether need to send UPDATE_BEFORE message for updates * @param state state of function * @param out underlying collector */ static void processLastRowOnChangelog( RowData currentRow, boolean generateUpdateBefore, ValueState state, Collector out, boolean isStateTtlEnabled, RecordEqualiser equaliser) throws Exception { RowData preRow = state.value(); RowKind currentKind = currentRow.getRowKind(); if (currentKind == RowKind.INSERT || currentKind == RowKind.UPDATE_AFTER) { if (preRow == null) { // the first row, send INSERT message currentRow.setRowKind(RowKind.INSERT); out.collect(currentRow); } else { if (!isStateTtlEnabled && equaliser.equals(preRow, currentRow)) { // currentRow is the same as preRow and state cleaning is not enabled. // We do not emit retraction and update message. // If state cleaning is enabled, we have to emit messages to prevent too early // state eviction of downstream operators. return; } else { if (generateUpdateBefore) { preRow.setRowKind(RowKind.UPDATE_BEFORE); out.collect(preRow); } currentRow.setRowKind(RowKind.UPDATE_AFTER); out.collect(currentRow); } } // normalize row kind currentRow.setRowKind(RowKind.INSERT); // save to state state.update(currentRow); } else { // DELETE or UPDATER_BEFORE if (preRow != null) { // always set to DELETE because this row has been removed // even the the input is UPDATE_BEFORE, there may no UPDATE_AFTER after it. preRow.setRowKind(RowKind.DELETE); // output the preRow instead of currentRow, // because preRow always contains the full content. // currentRow may only contain key parts (e.g. Kafka tombstone records). out.collect(preRow); // clear state as the row has been removed state.clear(); } // nothing to do if removing a non-existed row } } /** * Processes element to deduplicate on keys with process time semantic, sends current element if * it is first row. * * @param currentRow latest row received by deduplicate function * @param state state of function * @param out underlying collector */ static void processFirstRowOnProcTime( RowData currentRow, ValueState state, Collector out) throws Exception { checkInsertOnly(currentRow); // ignore record if it is not first row if (state.value() != null) { return; } state.update(true); // emit the first row which is INSERT message out.collect(currentRow); } /** * Collect the updated result for duplicate row. * * @param generateUpdateBefore flag to generate UPDATE_BEFORE message or not * @param generateInsert flag to generate INSERT message or not * @param preRow previous row under the key * @param currentRow current row under the key which is the duplicate row * @param out underlying collector */ static void updateDeduplicateResult( boolean generateUpdateBefore, boolean generateInsert, RowData preRow, RowData currentRow, Collector out) { if (generateUpdateBefore || generateInsert) { if (preRow == null) { // the first row, send INSERT message currentRow.setRowKind(RowKind.INSERT); out.collect(currentRow); } else { if (generateUpdateBefore) { final RowKind preRowKind = preRow.getRowKind(); preRow.setRowKind(RowKind.UPDATE_BEFORE); out.collect(preRow); preRow.setRowKind(preRowKind); } currentRow.setRowKind(RowKind.UPDATE_AFTER); out.collect(currentRow); } } else { currentRow.setRowKind(RowKind.UPDATE_AFTER); out.collect(currentRow); } } /** Returns current row is duplicate row or not compared to previous row. */ static boolean isDuplicate( RowData preRow, RowData currentRow, int rowtimeIndex, boolean keepLastRow) { if (keepLastRow) { return preRow == null || getRowtime(preRow, rowtimeIndex) <= getRowtime(currentRow, rowtimeIndex); } else { return preRow == null || getRowtime(currentRow, rowtimeIndex) < getRowtime(preRow, rowtimeIndex); } } private static long getRowtime(RowData input, int rowtimeIndex) { return input.getLong(rowtimeIndex); } /** check message should be insert only. */ static void checkInsertOnly(RowData currentRow) { Preconditions.checkArgument(currentRow.getRowKind() == RowKind.INSERT); } private DeduplicateFunctionHelper() {} }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy