/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.flink.source;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.CoreOptions.StartupMode;
import org.apache.paimon.CoreOptions.StreamingReadMode;
import org.apache.paimon.flink.FlinkConnectorOptions;
import org.apache.paimon.flink.NestedProjectedRowData;
import org.apache.paimon.flink.Projection;
import org.apache.paimon.flink.log.LogSourceProvider;
import org.apache.paimon.flink.sink.FlinkSink;
import org.apache.paimon.flink.source.align.AlignedContinuousFileStoreSource;
import org.apache.paimon.flink.source.operator.MonitorSource;
import org.apache.paimon.flink.utils.TableScanUtils;
import org.apache.paimon.options.Options;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.table.BucketMode;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.Table;
import org.apache.paimon.table.source.ReadBuilder;
import org.apache.paimon.utils.StringUtils;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.connector.source.Boundedness;
import org.apache.flink.api.connector.source.Source;
import org.apache.flink.connector.base.source.hybrid.HybridSource;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.util.DataFormatConverters;
import org.apache.flink.table.runtime.typeutils.ExternalTypeInfo;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.types.Row;

import javax.annotation.Nullable;

import java.util.List;
import java.util.Optional;

import static org.apache.flink.table.types.utils.TypeConversions.fromLogicalToDataType;
import static org.apache.paimon.CoreOptions.StreamingReadMode.FILE;
import static org.apache.paimon.flink.FlinkConnectorOptions.SOURCE_OPERATOR_UID_SUFFIX;
import static org.apache.paimon.flink.FlinkConnectorOptions.generateCustomUid;
import static org.apache.paimon.flink.LogicalTypeConversion.toLogicalType;
import static org.apache.paimon.utils.Preconditions.checkArgument;
import static org.apache.paimon.utils.Preconditions.checkState;

/**
 * DataStream API for building Flink Source.
 *
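 * <p>A minimal usage sketch (illustrative only; assumes {@code table} was loaded from a Paimon
 * catalog):
 *
 * <pre>{@code
 * StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 * DataStream<RowData> stream =
 *         new FlinkSourceBuilder(table)
 *                 .env(env)
 *                 .sourceBounded(false) // streaming read
 *                 .build();
 * }</pre>
 *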
 * @since 0.8
 */
public class FlinkSourceBuilder {
    private static final String SOURCE_NAME = "Source";

    private final Table table;
    private final Options conf;
    private final BucketMode bucketMode;
    private String sourceName;
    private Boolean sourceBounded;
    private StreamExecutionEnvironment env;
    @Nullable private int[][] projectedFields;
    @Nullable private Predicate predicate;
    @Nullable private LogSourceProvider logSourceProvider;
    @Nullable private Integer parallelism;
    @Nullable private Long limit;
    @Nullable private WatermarkStrategy<RowData> watermarkStrategy;
    @Nullable private DynamicPartitionFilteringInfo dynamicPartitionFilteringInfo;

    public FlinkSourceBuilder(Table table) {
        this.table = table;
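        // Fall back to HASH_FIXED when the table is not a FileStoreTable and therefore exposes no
        // bucket mode of its own.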
        this.bucketMode =
                table instanceof FileStoreTable
                        ? ((FileStoreTable) table).bucketMode()
                        : BucketMode.HASH_FIXED;
        this.sourceName = table.name();
        this.conf = Options.fromMap(table.options());
    }

    public FlinkSourceBuilder env(StreamExecutionEnvironment env) {
        this.env = env;
        if (sourceBounded == null) {
            sourceBounded = !FlinkSink.isStreaming(env);
        }
        return this;
    }

    public FlinkSourceBuilder sourceName(String name) {
        this.sourceName = name;
        return this;
    }

    public FlinkSourceBuilder sourceBounded(boolean bounded) {
        this.sourceBounded = bounded;
        return this;
    }

    public FlinkSourceBuilder projection(int[] projectedFields) {
        return projection(Projection.of(projectedFields).toNestedIndexes());
    }

    public FlinkSourceBuilder projection(int[][] projectedFields) {
        this.projectedFields = projectedFields;
        return this;
    }

    public FlinkSourceBuilder predicate(Predicate predicate) {
        this.predicate = predicate;
        return this;
    }

    public FlinkSourceBuilder limit(@Nullable Long limit) {
        this.limit = limit;
        return this;
    }

    public FlinkSourceBuilder sourceParallelism(@Nullable Integer parallelism) {
        this.parallelism = parallelism;
        return this;
    }

    public FlinkSourceBuilder watermarkStrategy(
            @Nullable WatermarkStrategy<RowData> watermarkStrategy) {
        this.watermarkStrategy = watermarkStrategy;
        return this;
    }

    public FlinkSourceBuilder dynamicPartitionFilteringFields(
            List<String> dynamicPartitionFilteringFields) {
        if (dynamicPartitionFilteringFields != null && !dynamicPartitionFilteringFields.isEmpty()) {
            checkState(
                    table instanceof FileStoreTable,
                    "Only Paimon FileStoreTable supports dynamic filtering but get %s.",
                    table.getClass().getName());

            this.dynamicPartitionFilteringInfo =
                    new DynamicPartitionFilteringInfo(
                            ((FileStoreTable) table).schema().logicalPartitionType(),
                            dynamicPartitionFilteringFields);
        }
        return this;
    }

    @Deprecated
    FlinkSourceBuilder logSourceProvider(LogSourceProvider logSourceProvider) {
        this.logSourceProvider = logSourceProvider;
        return this;
    }

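    // Pushes the projection (read type), filter, and limit down into the table read, and drops
    // split statistics.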
    private ReadBuilder createReadBuilder(@Nullable org.apache.paimon.types.RowType readType) {
        ReadBuilder readBuilder = table.newReadBuilder();
        if (readType != null) {
            readBuilder.withReadType(readType);
        }
        readBuilder.withFilter(predicate);
        if (limit != null) {
            readBuilder.withLimit(limit.intValue());
        }
        return readBuilder.dropStats();
    }

    private DataStream<RowData> buildStaticFileSource() {
        Options options = Options.fromMap(table.options());
        return toDataStream(
                new StaticFileStoreSource(
                        createReadBuilder(projectedRowType()),
                        limit,
                        options.get(FlinkConnectorOptions.SCAN_SPLIT_ENUMERATOR_BATCH_SIZE),
                        options.get(FlinkConnectorOptions.SCAN_SPLIT_ENUMERATOR_ASSIGN_MODE),
                        dynamicPartitionFilteringInfo,
                        outerProject()));
    }

    private DataStream<RowData> buildContinuousFileSource() {
        return toDataStream(
                new ContinuousFileStoreSource(
                        createReadBuilder(projectedRowType()),
                        table.options(),
                        limit,
                        bucketMode,
                        outerProject()));
    }

    private DataStream<RowData> buildAlignedContinuousFileSource() {
        assertStreamingConfigurationForAlignMode(env);
        return toDataStream(
                new AlignedContinuousFileStoreSource(
                        createReadBuilder(projectedRowType()),
                        table.options(),
                        limit,
                        bucketMode,
                        outerProject()));
    }

    private DataStream<RowData> toDataStream(Source<RowData, ?, ?> source) {
        DataStreamSource<RowData> dataStream =
                env.fromSource(
                        source,
                        watermarkStrategy == null
                                ? WatermarkStrategy.noWatermarks()
                                : watermarkStrategy,
                        sourceName,
                        produceTypeInfo());

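        // An explicitly configured uid suffix pins the source operator's uid, e.g. so state stays
        // addressable across job graph changes.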
        String uidSuffix = table.options().get(SOURCE_OPERATOR_UID_SUFFIX.key());
        if (!StringUtils.isNullOrWhitespaceOnly(uidSuffix)) {
            dataStream =
                            (DataStreamSource<RowData>)
                            dataStream.uid(generateCustomUid(SOURCE_NAME, table.name(), uidSuffix));
        }

        if (parallelism != null) {
            dataStream.setParallelism(parallelism);
        }
        return dataStream;
    }

    private TypeInformation<RowData> produceTypeInfo() {
        RowType rowType = toLogicalType(table.rowType());
        LogicalType produceType =
                Optional.ofNullable(projectedFields)
                        .map(Projection::of)
                        .map(p -> p.project(rowType))
                        .orElse(rowType);
        return InternalTypeInfo.of(produceType);
    }

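    // The read schema after projection pushdown, or null to read the table's full row type.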
    private @Nullable org.apache.paimon.types.RowType projectedRowType() {
        return Optional.ofNullable(projectedFields)
                .map(Projection::of)
                .map(p -> p.project(table.rowType()))
                .orElse(null);
    }

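    // The outer re-projection applied to emitted rows when the projection contains nested fields;
    // null when no outer projection is needed.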
    private @Nullable NestedProjectedRowData outerProject() {
        return Optional.ofNullable(projectedFields)
                .map(Projection::of)
                .map(p -> p.getOuterProjectRow(table.rowType()))
                .orElse(null);
    }

    /** Build source {@link DataStream} with {@link Row}. */
    public DataStream<Row> buildForRow() {
        DataType rowType = fromLogicalToDataType(toLogicalType(table.rowType()));
        DataType[] fieldDataTypes = rowType.getChildren().toArray(new DataType[0]);

        DataFormatConverters.RowConverter converter =
                new DataFormatConverters.RowConverter(fieldDataTypes);
        DataStream<RowData> source = build();
        return source.map((MapFunction<RowData, Row>) converter::toExternal)
                .setParallelism(source.getParallelism())
                .returns(ExternalTypeInfo.of(rowType));
    }

    /** Build source {@link DataStream} with {@link RowData}. */
    public DataStream<RowData> build() {
        if (env == null) {
            throw new IllegalArgumentException("StreamExecutionEnvironment should not be null.");
        }
        if (conf.contains(CoreOptions.CONSUMER_ID)
                && !conf.contains(CoreOptions.CONSUMER_EXPIRATION_TIME)) {
            throw new IllegalArgumentException(
                    "To use the consumer-id feature, you must configure 'consumer.expiration-time' (ALTER TABLE)"
                            + " and restart your write job for it to take effect. This prevents consumers from"
                            + " leaving too many snapshots, which could pose a risk to the file system.");
        }

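        // Bounded (batch) execution reads a static set of splits; streaming falls through to the
        // continuous sources below.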
        if (sourceBounded) {
            return buildStaticFileSource();
        }
        TableScanUtils.streamingReadingValidate(table);

        // TODO visit all options through CoreOptions
        StartupMode startupMode = CoreOptions.startupMode(conf);
        StreamingReadMode streamingReadMode = CoreOptions.streamReadType(conf);

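        // A configured log system source (e.g. Kafka) takes precedence unless streaming reads are
        // forced to files; with LATEST_FULL, a HybridSource first reads a full snapshot from files
        // and then switches to the log.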
        if (logSourceProvider != null && streamingReadMode != FILE) {
            logSourceProvider.preCreateSource();
            if (startupMode != StartupMode.LATEST_FULL) {
                return toDataStream(logSourceProvider.createSource(null));
            } else {
                return toDataStream(
                        HybridSource.builder(
                                        LogHybridSourceFactory.buildHybridFirstSource(
                                                table,
                                                projectedRowType(),
                                                predicate,
                                                outerProject()))
                                .addSource(
                                        new LogHybridSourceFactory(logSourceProvider),
                                        Boundedness.CONTINUOUS_UNBOUNDED)
                                .build());
            }
        } else {
            if (conf.get(FlinkConnectorOptions.SOURCE_CHECKPOINT_ALIGN_ENABLED)) {
                return buildAlignedContinuousFileSource();
            } else if (conf.contains(CoreOptions.CONSUMER_ID)
                    && conf.get(CoreOptions.CONSUMER_CONSISTENCY_MODE)
                            == CoreOptions.ConsumerMode.EXACTLY_ONCE) {
                return buildContinuousStreamOperator();
            } else {
                return buildContinuousFileSource();
            }
        }
    }

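    // Operator-based streaming topology (MonitorSource) instead of a FLIP-27 source; chosen in
    // build() when consumer-id with exactly-once consistency is configured.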
    private DataStream<RowData> buildContinuousStreamOperator() {
        DataStream<RowData> dataStream;
        if (limit != null) {
            throw new IllegalArgumentException(
                    "Cannot limit streaming source, please use batch execution mode.");
        }
        dataStream =
                MonitorSource.buildSource(
                        env,
                        sourceName,
                        produceTypeInfo(),
                        createReadBuilder(projectedRowType()),
                        conf.get(CoreOptions.CONTINUOUS_DISCOVERY_INTERVAL).toMillis(),
                        watermarkStrategy == null,
                        conf.get(
                                FlinkConnectorOptions.STREAMING_READ_SHUFFLE_BUCKET_WITH_PARTITION),
                        bucketMode,
                        outerProject());
        if (parallelism != null) {
            dataStream.getTransformation().setParallelism(parallelism);
        }
        if (watermarkStrategy != null) {
            dataStream = dataStream.assignTimestampsAndWatermarks(watermarkStrategy);
        }
        return dataStream;
    }

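    // The aligned source ties snapshot boundaries to checkpoints, so it requires the checkpointing
    // setup validated below.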
    private void assertStreamingConfigurationForAlignMode(StreamExecutionEnvironment env) {
        CheckpointConfig checkpointConfig = env.getCheckpointConfig();
        checkArgument(
                checkpointConfig.isCheckpointingEnabled(),
                "The align mode of paimon source is only supported when checkpoint enabled. Please set "
                        + "execution.checkpointing.interval larger than 0");
        checkArgument(
                checkpointConfig.getMaxConcurrentCheckpoints() == 1,
                "The align mode of paimon source supports at most one ongoing checkpoint at the same time. Please set "
                        + "execution.checkpointing.max-concurrent-checkpoints to 1");
        checkArgument(
                checkpointConfig.getCheckpointTimeout()
                        > conf.get(FlinkConnectorOptions.SOURCE_CHECKPOINT_ALIGN_TIMEOUT)
                                .toMillis(),
                "The align mode of paimon source requires that the timeout of checkpoint is greater than the timeout of the source's snapshot alignment. Please increase "
                        + "execution.checkpointing.timeout or decrease "
                        + FlinkConnectorOptions.SOURCE_CHECKPOINT_ALIGN_TIMEOUT.key());
        checkArgument(
                !env.getCheckpointConfig().isUnalignedCheckpointsEnabled(),
                "The align mode of paimon source currently does not support unaligned checkpoints. Please set "
                        + "execution.checkpointing.unaligned.enabled to false.");
        checkArgument(
                env.getCheckpointConfig().getCheckpointingMode() == CheckpointingMode.EXACTLY_ONCE,
                "The align mode of paimon source currently only supports EXACTLY_ONCE checkpoint mode. Please set "
                        + "execution.checkpointing.mode to exactly-once");
    }
}