org.apache.paimon.flink.sink.CombinedTableCompactorSink Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.flink.sink;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.append.MultiTableUnawareAppendCompactionTask;
import org.apache.paimon.catalog.CatalogLoader;
import org.apache.paimon.manifest.WrappedManifestCommittable;
import org.apache.paimon.options.Options;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.configuration.ExecutionOptions;
import org.apache.flink.configuration.ReadableConfig;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink;
import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;
import org.apache.flink.table.data.RowData;

import java.io.Serializable;
import java.util.Map;

import static org.apache.paimon.CoreOptions.createCommitUser;
import static org.apache.paimon.flink.FlinkConnectorOptions.END_INPUT_WATERMARK;
import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_COMMITTER_OPERATOR_CHAINING;
import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_MANAGED_WRITER_BUFFER_MEMORY;
import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_USE_MANAGED_MEMORY;
import static org.apache.paimon.flink.sink.FlinkSink.assertBatchAdaptiveParallelism;
import static org.apache.paimon.flink.sink.FlinkSink.assertStreamingConfiguration;
import static org.apache.paimon.flink.utils.ManagedMemoryUtils.declareManagedMemory;

/** A sink for processing multiple tables in a dedicated compaction job. */
public class CombinedTableCompactorSink implements Serializable {
    private static final long serialVersionUID = 1L;

    private static final String WRITER_NAME = "Writer";
    private static final String GLOBAL_COMMITTER_NAME = "Global Committer";

    private final CatalogLoader catalogLoader;
    private final boolean ignorePreviousFiles;
    private final boolean fullCompaction;

    private final Options options;

    public CombinedTableCompactorSink(
            CatalogLoader catalogLoader, Options options, boolean fullCompaction) {
        this.catalogLoader = catalogLoader;
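        // Compaction must rewrite existing data files, so previous files are never ignored.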
        this.ignorePreviousFiles = false;
        this.fullCompaction = fullCompaction;
        this.options = options;
    }

    public DataStreamSink<?> sinkFrom(
            DataStream<RowData> awareBucketTableSource,
            DataStream<MultiTableUnawareAppendCompactionTask> unawareBucketTableSource) {
        // This commitUser is valid only for new jobs.
        // After the job starts, this commitUser will be recorded into the states of write and
        // commit operators.
        // When the job restarts, commitUser will be recovered from states and this value is
        // ignored.
        return sinkFrom(
                awareBucketTableSource, unawareBucketTableSource, createCommitUser(options));
    }

    public DataStreamSink<?> sinkFrom(
            DataStream<RowData> awareBucketTableSource,
            DataStream<MultiTableUnawareAppendCompactionTask> unawareBucketTableSource,
            String initialCommitUser) {
        // do the actual write action; no snapshot is generated in this stage
        DataStream<MultiTableCommittable> written =
                doWrite(awareBucketTableSource, unawareBucketTableSource, initialCommitUser);

        // commit the committable to generate a new snapshot
        return doCommit(written, initialCommitUser);
    }

    public DataStream<MultiTableCommittable> doWrite(
            DataStream<RowData> awareBucketTableSource,
            DataStream<MultiTableUnawareAppendCompactionTask> unawareBucketTableSource,
            String commitUser) {
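        // Two rewrite paths feed the committer: tables with buckets are compacted by
        // MultiTablesStoreCompactOperator, unaware-bucket (append) tables by
        // AppendOnlyMultiTableCompactionWorkerOperator; their committables are unioned below.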
        StreamExecutionEnvironment env = awareBucketTableSource.getExecutionEnvironment();
        boolean isStreaming =
                env.getConfiguration().get(ExecutionOptions.RUNTIME_MODE)
                        == RuntimeExecutionMode.STREAMING;

        SingleOutputStreamOperator<MultiTableCommittable> multiBucketTableRewriter =
                awareBucketTableSource
                        .transform(
                                String.format("%s-%s", "Multi-Bucket-Table", WRITER_NAME),
                                new MultiTableCommittableTypeInfo(),
                                combinedMultiComacptionWriteOperator(
                                        env.getCheckpointConfig(),
                                        isStreaming,
                                        fullCompaction,
                                        commitUser))
                        .setParallelism(awareBucketTableSource.getParallelism());

        SingleOutputStreamOperator<MultiTableCommittable> unawareBucketTableRewriter =
                unawareBucketTableSource
                        .transform(
                                String.format("%s-%s", "Unaware-Bucket-Table", WRITER_NAME),
                                new MultiTableCommittableTypeInfo(),
                                new AppendOnlyMultiTableCompactionWorkerOperator.Factory(
                                        catalogLoader, commitUser, options))
                        .setParallelism(unawareBucketTableSource.getParallelism());

        if (!isStreaming) {
            assertBatchAdaptiveParallelism(env, multiBucketTableRewriter.getParallelism());
            assertBatchAdaptiveParallelism(env, unawareBucketTableRewriter.getParallelism());
        }

        if (options.get(SINK_USE_MANAGED_MEMORY)) {
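            // Reserve Flink managed memory for the writer buffers instead of using heap memory.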
            declareManagedMemory(
                    multiBucketTableRewriter, options.get(SINK_MANAGED_WRITER_BUFFER_MEMORY));
            declareManagedMemory(
                    unawareBucketTableRewriter, options.get(SINK_MANAGED_WRITER_BUFFER_MEMORY));
        }
        return multiBucketTableRewriter.union(unawareBucketTableRewriter);
    }

    protected DataStreamSink<?> doCommit(
            DataStream<MultiTableCommittable> written, String commitUser) {
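        // Committables are partitioned per table by MultiTableCommittableChannelComputer, so
        // all changes of one table are committed by the same committer subtask.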
        StreamExecutionEnvironment env = written.getExecutionEnvironment();
        ReadableConfig conf = env.getConfiguration();
        CheckpointConfig checkpointConfig = env.getCheckpointConfig();
        boolean isStreaming =
                conf.get(ExecutionOptions.RUNTIME_MODE) == RuntimeExecutionMode.STREAMING;
        boolean streamingCheckpointEnabled =
                isStreaming && checkpointConfig.isCheckpointingEnabled();
        if (streamingCheckpointEnabled) {
            assertStreamingConfiguration(env);
        }

        DataStream<MultiTableCommittable> partitioned =
                FlinkStreamPartitioner.partition(
                        written,
                        new MultiTableCommittableChannelComputer(),
                        written.getParallelism());
        SingleOutputStreamOperator<?> committed =
                partitioned
                        .transform(
                                GLOBAL_COMMITTER_NAME,
                                new MultiTableCommittableTypeInfo(),
                                new CommitterOperatorFactory<>(
                                        streamingCheckpointEnabled,
                                        false,
                                        commitUser,
                                        createCommitterFactory(isStreaming),
                                        createCommittableStateManager(),
                                        options.get(END_INPUT_WATERMARK)))
                        .setParallelism(written.getParallelism());
        if (!options.get(SINK_COMMITTER_OPERATOR_CHAINING)) {
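            // Break operator chaining so the committer runs as a separate task from the writers.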
            committed = committed.startNewChain();
        }
        return committed.sinkTo(new DiscardingSink<>()).name("end").setParallelism(1);
    }

    // TODO: refactor FlinkSink to adopt this sink
    protected OneInputStreamOperatorFactory<RowData, MultiTableCommittable>
            combinedMultiComacptionWriteOperator(
                    CheckpointConfig checkpointConfig,
                    boolean isStreaming,
                    boolean fullCompaction,
                    String commitUser) {
        return new MultiTablesStoreCompactOperator.Factory(
                catalogLoader,
                commitUser,
                checkpointConfig,
                isStreaming,
                ignorePreviousFiles,
                fullCompaction,
                options);
    }

    protected Committer.Factory<MultiTableCommittable, WrappedManifestCommittable>
            createCommitterFactory(boolean isStreaming) {
        Map<String, String> dynamicOptions = options.toMap();
        dynamicOptions.put(CoreOptions.WRITE_ONLY.key(), "false");
        if (isStreaming) {
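            // Streaming compaction overrides: never stall writes on the number of sorted runs
            // (Integer.MAX_VALUE), spill the merge sort once more than 10 runs are read, and
            // do not make commits wait for lookup compaction.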
            dynamicOptions.put(CoreOptions.NUM_SORTED_RUNS_STOP_TRIGGER.key(), "2147483647");
            dynamicOptions.put(CoreOptions.SORT_SPILL_THRESHOLD.key(), "10");
            dynamicOptions.put(CoreOptions.LOOKUP_WAIT.key(), "false");
        }
        return context -> new StoreMultiCommitter(catalogLoader, context, true, dynamicOptions);
    }

    protected CommittableStateManager<WrappedManifestCommittable> createCommittableStateManager() {
        return new RestoreAndFailCommittableStateManager<>(
                WrappedManifestCommittableSerializer::new);
    }
}
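For context, here is a minimal wiring sketch (not part of the class above, and not the official CompactDatabaseAction code) showing how the sink might be driven. It assumes the two compaction source streams are built elsewhere, e.g. by the combined compaction source that Paimon's dedicated compaction job uses; the class name CombinedCompactorSinkSketch and the warehouse path are placeholders.

import org.apache.paimon.append.MultiTableUnawareAppendCompactionTask;
import org.apache.paimon.catalog.CatalogContext;
import org.apache.paimon.catalog.CatalogFactory;
import org.apache.paimon.catalog.CatalogLoader;
import org.apache.paimon.flink.sink.CombinedTableCompactorSink;
import org.apache.paimon.fs.Path;
import org.apache.paimon.options.Options;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;

public class CombinedCompactorSinkSketch {

    public static void wire(
            StreamExecutionEnvironment env,
            DataStream<RowData> awareBucketTableSource,
            DataStream<MultiTableUnawareAppendCompactionTask> unawareBucketTableSource)
            throws Exception {
        // Hypothetical warehouse location; replace with the real catalog configuration.
        String warehouse = "/path/to/warehouse";
        CatalogLoader catalogLoader =
                () -> CatalogFactory.createCatalog(CatalogContext.create(new Path(warehouse)));

        // Empty options here; a real job would carry the connector/sink options it needs.
        Options options = new Options();

        // fullCompaction = false: trigger normal compaction rather than full compaction.
        CombinedTableCompactorSink sink =
                new CombinedTableCompactorSink(catalogLoader, options, false);
        sink.sinkFrom(awareBucketTableSource, unawareBucketTableSource);

        env.execute("combined-table-compactor-sketch");
    }
}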



