/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.flink.internal.table;
import java.util.LinkedHashMap;
import java.util.Map;
import io.delta.flink.internal.options.DeltaConnectorConfiguration;
import io.delta.flink.sink.DeltaSink;
import io.delta.flink.sink.internal.DeltaBucketAssigner;
import io.delta.flink.sink.internal.DeltaPartitionComputer.DeltaRowDataPartitionComputer;
import io.delta.flink.sink.internal.DeltaSinkBuilder;
import io.delta.flink.source.internal.builder.RowDataFormat;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.row.ParquetRowDataBuilder;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.BasePathBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;
import org.apache.flink.table.catalog.CatalogTable;
import org.apache.flink.table.connector.ChangelogMode;
import org.apache.flink.table.connector.sink.DynamicTableSink;
import org.apache.flink.table.connector.sink.SinkProvider;
import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.conf.Configuration;
/**
* Sink of a dynamic Flink table to a Delta Lake table.
*
* <p>
* It utilizes the new Flink Sink API (available for {@code Flink >= 1.12}) and the interfaces
* (available for {@code Flink >= 1.13}) provided for interoperability between this new Sink API
* and the Table API. It also supports static partitioning.
*
* <p>
* In regular batch scenarios, the sink can solely accept insert-only rows and writes out bounded
* streams.
*
* <p>
* In regular streaming scenarios, the sink can solely accept insert-only rows and writes out
* unbounded streams.
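*
* <p>
* For illustration, a table backed by this sink would typically be declared and written to via
* Flink SQL roughly as follows (a sketch; the {@code 'connector' = 'delta'} identifier and the
* {@code 'table-path'} option are assumed here, and the path is an example value):
*
* <pre>{@code
* CREATE TABLE sinkTable (col1 VARCHAR, col2 INT)
*     PARTITIONED BY (col1)
*     WITH (
*         'connector' = 'delta',
*         'table-path' = '/path/to/delta-table'
*     );
*
* INSERT INTO sinkTable VALUES ('a', 1);
* }</pre>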
*/
public class DeltaDynamicTableSink implements DynamicTableSink, SupportsPartitioning {
/**
* Hardcoded option for {@link RowDataFormat} to treat timestamps as UTC timestamps.
*/
private static final boolean PARQUET_UTC_TIMESTAMP = true;
/**
* Delta's mergeSchema option is not supported in the Flink SQL/Table API due to the fact that
* Flink's table planner validates every query against the table schema.
* If the query schema does not match the table's schema, the query will fail.
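*
* <p>
* For example (a sketch with a hypothetical table {@code t(a INT, b STRING)}), the planner
* rejects a widening insert before Delta's schema merging could ever be triggered:
*
* <pre>{@code
* INSERT INTO t SELECT a, b, c FROM source_view; -- fails: column count does not match
* }</pre>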
*/
private static final boolean MERGE_SCHEMA = false;
private final Path basePath;
private final Configuration hadoopConf;
private final RowType rowType;
private final CatalogTable catalogTable;
/**
* Flink provides the connector with the partition values derived from the {@code PARTITION}
* clause, e.g.
*
* <pre>{@code
* INSERT INTO x PARTITION(col1='val1') ...
* }</pre>
*
* Those partition values will be populated into this map via
* {@link #applyStaticPartition(Map)}.
*/
private LinkedHashMap<String, String> staticPartitionSpec;
/**
* Constructor for creating a sink of a Flink dynamic table to a Delta table.
*
* @param basePath full Delta table path
* @param hadoopConf Hadoop's configuration
* @param rowType Flink's logical type with the structure of the events in the stream
* @param catalogTable represents the unresolved metadata derived by the Flink framework from
*                     the table's DDL
*/
public DeltaDynamicTableSink(
Path basePath,
Configuration hadoopConf,
RowType rowType,
CatalogTable catalogTable) {
this(basePath, hadoopConf, rowType, catalogTable, new LinkedHashMap<>());
}
private DeltaDynamicTableSink(
Path basePath,
Configuration hadoopConf,
RowType rowType,
CatalogTable catalogTable,
LinkedHashMap<String, String> staticPartitionSpec) {
this.basePath = basePath;
this.rowType = rowType;
this.hadoopConf = hadoopConf;
this.catalogTable = catalogTable;
this.staticPartitionSpec = staticPartitionSpec;
}
/**
* Returns the set of changes that the sink accepts during runtime.
*
* @param requestedMode the set of changes expected by the current plan
* @return {@link ChangelogMode} only allowing for inserts to the Delta table
*/
@Override
public ChangelogMode getChangelogMode(ChangelogMode requestedMode) {
return ChangelogMode.insertOnly();
}
/**
* Utility method for the transition between Flink's Table API and the DataStream-based
* {@link DeltaSink} implementation.
*
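* <p>
* The returned provider wraps the same {@link DeltaSink} that is available through the
* DataStream API. A roughly equivalent hand-written DataStream setup is sketched below
* ({@code hadoopConf}, {@code rowType}, {@code stream} and the table path are placeholder
* values):
*
* <pre>{@code
* DeltaSink<RowData> deltaSink = DeltaSink
*     .forRowData(new Path("/path/to/delta-table"), hadoopConf, rowType)
*     .build();
* stream.sinkTo(deltaSink);
* }</pre>
*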
* @param context Context for creating runtime implementation via a {@link
* SinkRuntimeProvider}.
* @return provider representing {@link DeltaSink} implementation for writing the data to a
* Delta table.
*/
@Override
public SinkRuntimeProvider getSinkRuntimeProvider(Context context) {
DeltaSinkBuilder<RowData> builder =
new DeltaSinkBuilder.DefaultDeltaFormatBuilder<>(
this.basePath,
this.hadoopConf,
ParquetRowDataBuilder.createWriterFactory(
this.rowType,
this.hadoopConf,
PARQUET_UTC_TIMESTAMP
),
new BasePathBucketAssigner<>(),
OnCheckpointRollingPolicy.build(),
this.rowType,
MERGE_SCHEMA, // mergeSchema = false
new DeltaConnectorConfiguration()
);
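// For partitioned tables, replace the default base-path bucket assigner with one that
// routes every row to its partition directory, combining static partition values from
// the PARTITION clause with values extracted from each individual record.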
if (catalogTable.isPartitioned()) {
DeltaRowDataPartitionComputer partitionComputer =
new DeltaRowDataPartitionComputer(
rowType,
catalogTable.getPartitionKeys().toArray(new String[0]),
staticPartitionSpec
);
DeltaBucketAssigner<RowData> partitionAssigner =
new DeltaBucketAssigner<>(partitionComputer);
builder.withBucketAssigner(partitionAssigner);
}
return SinkProvider.of(builder.build());
}
@Override
public DynamicTableSink copy() {
return new DeltaDynamicTableSink(
this.basePath,
this.hadoopConf,
this.rowType,
this.catalogTable,
new LinkedHashMap<>(this.staticPartitionSpec));
}
@Override
public String asSummaryString() {
return "DeltaSink";
}
/**
* Static values for partitions that should be set explicitly instead of being derived from the
* content of the records.
*
*
* <p>
* If all partition keys get a value assigned in the {@code PARTITION} clause, the operation is
* considered an "insertion into a static partition". In the example below, the query result
* should be written into the static partition {@code region='europe', month='2020-01'}, which
* will be passed by the planner into {@link #applyStaticPartition(Map)}.
*
* <pre>{@code
* INSERT INTO t PARTITION (region='europe', month='2020-01') SELECT a, b, c FROM my_view;
* }</pre>
*
* <p>
* If only a subset of all partition keys get a static value assigned in the {@code PARTITION}
* clause or with a constant part in a {@code SELECT} clause, the operation is considered an
* "insertion into a dynamic partition". In the example below, the static partition part is
* {@code region='europe'}, which will be passed by the planner into
* {@link #applyStaticPartition(Map)}. The remaining values for partition keys should be
* obtained from each individual record by the sink during runtime.
*
* <pre>{@code
* INSERT INTO t PARTITION (region='europe') SELECT a, b, c, month FROM another_view;
* }</pre>
*
* @param partition map of static partitions and their values.
*/
@Override
public void applyStaticPartition(Map<String, String> partition) {
// make it a LinkedHashMap to maintain partition column order
LinkedHashMap<String, String> staticPartitions = new LinkedHashMap<>();
for (String partitionCol : catalogTable.getPartitionKeys()) {
if (partition.containsKey(partitionCol)) {
staticPartitions.put(partitionCol, partition.get(partitionCol));
}
}
this.staticPartitionSpec = staticPartitions;
}
@VisibleForTesting
Configuration getHadoopConf() {
return new Configuration(hadoopConf);
}
}