// SPDX-FileCopyrightText: 2023 LakeSoul Contributors
//
// SPDX-License-Identifier: Apache-2.0
package org.apache.flink.lakesoul.source;
import com.dmetasoul.lakesoul.meta.DBUtil;
import com.dmetasoul.lakesoul.meta.DataFileInfo;
import com.dmetasoul.lakesoul.meta.DataOperation;
import com.dmetasoul.lakesoul.meta.LakeSoulOptions;
import com.dmetasoul.lakesoul.meta.entity.TableInfo;
import org.apache.flink.api.connector.source.*;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.flink.lakesoul.tool.FlinkUtil;
import org.apache.flink.lakesoul.tool.LakeSoulSinkOptions;
import org.apache.flink.lakesoul.types.TableId;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import javax.annotation.Nullable;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.*;
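/**
 * A FLIP-27 {@link Source} for reading LakeSoul tables, usable both as a bounded batch
 * source and as a continuous streaming source, with optional partition pruning
 * ({@code remainingPartitions}) and parquet filter push-down ({@code filter}).
 *
 * <p>Illustrative construction only; the schema and primary-key column below are assumed
 * for the example (this source is normally instantiated by the table planner):
 * <pre>{@code
 * LakeSoulSource source = new LakeSoulSource(
 *         tableId,                               // e.g. a hypothetical "default.user_events"
 *         rowType,                               // projected physical row type
 *         rowTypeWithPk,                         // row type including primary-key columns
 *         true,                                  // streaming read
 *         Collections.singletonList("user_id"),  // hypothetical primary-key column
 *         Collections.emptyMap(),                // no extra read options
 *         null,                                  // no partition pruning
 *         null);                                 // no filter push-down
 * }</pre>
 */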
public class LakeSoulSource implements Source<RowData, LakeSoulSplit, LakeSoulPendingSplits> {
    TableId tableId;
    RowType rowType;
    RowType rowTypeWithPk;
    boolean isStreaming;
    List<String> pkColumns;
    Map<String, String> optionParams;
    @Nullable
    List<Map<String, String>> remainingPartitions;
    @Nullable
    FilterPredicate filter;

    public LakeSoulSource(TableId tableId,
                          RowType rowType,
                          RowType rowTypeWithPk,
                          boolean isStreaming,
                          List<String> pkColumns,
                          Map<String, String> optionParams,
                          @Nullable List<Map<String, String>> remainingPartitions,
                          @Nullable FilterPredicate filter) {
        this.tableId = tableId;
        this.rowType = rowType;
        this.rowTypeWithPk = rowTypeWithPk;
        this.isStreaming = isStreaming;
        this.pkColumns = pkColumns;
        this.optionParams = optionParams;
        this.remainingPartitions = remainingPartitions;
        this.filter = filter;
    }
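    /** A streaming read is an unbounded changelog; a batch read is a bounded snapshot. */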
    @Override
    public Boundedness getBoundedness() {
        if (this.isStreaming) {
            return Boundedness.CONTINUOUS_UNBOUNDED;
        } else {
            return Boundedness.BOUNDED;
        }
    }
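    /**
     * Creates the per-subtask reader. Splits assigned by the enumerator are read through a
     * {@link LakeSoulSplitReader} supplier so each fetcher thread can obtain its own split
     * reader, and records are emitted as {@link RowData} via {@link LakeSoulRecordEmitter}.
     */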
    @Override
    public SourceReader<RowData, LakeSoulSplit> createReader(SourceReaderContext readerContext) throws Exception {
        Configuration conf = Configuration.fromMap(optionParams);
        conf.addAll(readerContext.getConfiguration());
        return new LakeSoulSourceReader(
                () -> new LakeSoulSplitReader(conf,
                        this.rowType,
                        this.rowTypeWithPk,
                        this.pkColumns,
                        this.isStreaming,
                        this.optionParams.getOrDefault(LakeSoulSinkOptions.CDC_CHANGE_COLUMN, ""),
                        this.filter),
                new LakeSoulRecordEmitter(),
                readerContext.getConfiguration(),
                readerContext);
    }
    @Override
    public SplitEnumerator<LakeSoulSplit, LakeSoulPendingSplits> createEnumerator(
            SplitEnumeratorContext<LakeSoulSplit> enumContext) {
        TableInfo tif = DataOperation.dbManager().getTableInfoByNameAndNamespace(tableId.table(),
                tableId.schema());
        List<String> readStartTimestampWithTimeZone =
                Arrays.asList(optionParams.getOrDefault(LakeSoulOptions.READ_START_TIME(), ""),
                        optionParams.getOrDefault(LakeSoulOptions.TIME_ZONE(), ""));
        String readType = optionParams.getOrDefault(LakeSoulOptions.READ_TYPE(), "");
        if (this.isStreaming) {
            String partDesc = optionParams.getOrDefault(LakeSoulOptions.PARTITION_DESC(), "");
            if (partDesc.isEmpty()) {
                if (remainingPartitions != null && !remainingPartitions.isEmpty()) {
                    // fall back to the partition pruned by the planner
                    if (remainingPartitions.size() > 1) {
                        throw new RuntimeException("Streaming read allows only one specified partition," +
                                " or no specified partition to incrementally read the entire table");
                    }
                    partDesc = DBUtil.formatPartitionDesc(remainingPartitions.get(0));
                }
            }
            return new LakeSoulDynamicSplitEnumerator(enumContext,
                    new LakeSoulDynSplitAssigner(optionParams.getOrDefault(LakeSoulOptions.HASH_BUCKET_NUM(), "-1")),
                    Long.parseLong(optionParams.getOrDefault(LakeSoulOptions.DISCOVERY_INTERVAL(), "30000")),
                    convertTimeFormatWithTimeZone(readStartTimestampWithTimeZone),
                    tif.getTableId(),
                    partDesc,
                    optionParams.getOrDefault(LakeSoulOptions.HASH_BUCKET_NUM(), "-1"));
        } else {
            return staticSplitEnumerator(enumContext,
                    tif,
                    readStartTimestampWithTimeZone,
                    readType);
        }
    }
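    /**
     * Builds the split list for a bounded read. An empty or "fullread" READ_TYPE lists all
     * live files of the selected partitions; any other read type lists files committed
     * between READ_START_TIME and READ_END_TIME per partition. Files are grouped into one
     * split per hash bucket when the table has a hash partition, otherwise one split per file.
     */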
    private LakeSoulStaticSplitEnumerator staticSplitEnumerator(SplitEnumeratorContext<LakeSoulSplit> enumContext,
                                                                TableInfo tif,
                                                                List<String> readStartTimestampWithTimeZone,
                                                                String readType) {
        List<String> readEndTimestampWithTimeZone =
                Arrays.asList(optionParams.getOrDefault(LakeSoulOptions.READ_END_TIME(), ""),
                        optionParams.getOrDefault(LakeSoulOptions.TIME_ZONE(), ""));
        List<DataFileInfo> dfinfos;
        if (readType.equals("") || readType.equals("fullread")) {
            dfinfos = Arrays.asList(getTargetDataFileInfo(tif));
        } else {
            dfinfos = new ArrayList<>();
            List<String> partDescs = new ArrayList<>();
            String partitionDescOpt = optionParams.getOrDefault(LakeSoulOptions.PARTITION_DESC(), "");
            if (partitionDescOpt.isEmpty() && remainingPartitions != null) {
                for (Map<String, String> part : remainingPartitions) {
                    String desc = DBUtil.formatPartitionDesc(part);
                    partDescs.add(desc);
                }
            } else {
                partDescs.add(partitionDescOpt);
            }
            for (String desc : partDescs) {
                dfinfos.addAll(Arrays.asList(DataOperation.getIncrementalPartitionDataInfo(tif.getTableId(),
                        desc,
                        convertTimeFormatWithTimeZone(readStartTimestampWithTimeZone),
                        convertTimeFormatWithTimeZone(readEndTimestampWithTimeZone),
                        readType)));
            }
        }
        int capacity = 100;
        ArrayList<LakeSoulSplit> splits = new ArrayList<>(capacity);
        if (!FlinkUtil.isExistHashPartition(tif)) {
            // no hash partition: one split per data file
            for (DataFileInfo dfinfo : dfinfos) {
                ArrayList<Path> tmp = new ArrayList<>();
                tmp.add(new Path(dfinfo.path()));
                splits.add(new LakeSoulSplit(String.valueOf(dfinfo.hashCode()),
                        tmp,
                        0));
            }
        } else {
            // hash partition: group files by range partition and hash bucket, one split per bucket
            Map<String, Map<Integer, List<Path>>> splitByRangeAndHashPartition =
                    FlinkUtil.splitDataInfosToRangeAndHashPartition(tif.getTableId(),
                            dfinfos.toArray(new DataFileInfo[0]));
            for (Map.Entry<String, Map<Integer, List<Path>>> entry : splitByRangeAndHashPartition.entrySet()) {
                for (Map.Entry<Integer, List<Path>> split : entry.getValue().entrySet()) {
                    splits.add(new LakeSoulSplit(String.valueOf(split.hashCode()),
                            split.getValue(),
                            0));
                }
            }
        }
        return new LakeSoulStaticSplitEnumerator(enumContext,
                new LakeSoulSimpleSplitAssigner(splits));
    }
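    /** Lists the table's live data files, restricted to {@code remainingPartitions} when set. */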
    private DataFileInfo[] getTargetDataFileInfo(TableInfo tif) {
        return FlinkUtil.getTargetDataFileInfo(tif,
                this.remainingPartitions);
    }
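    /**
     * Parses a "yyyy-MM-dd HH:mm:ss" timestamp in the given zone into epoch milliseconds.
     * Falls back to the JVM default zone when the zone is empty or unrecognized, and returns 0
     * when the timestamp is empty. For example, ("2024-01-30 12:00:00", "Asia/Shanghai")
     * yields 1706587200000L.
     */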
    private long convertTimeFormatWithTimeZone(List<String> readTimestampWithTimeZone) {
        String time = readTimestampWithTimeZone.get(0);
        String timeZone = readTimestampWithTimeZone.get(1);
        if (timeZone.equals("") || !Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZone)) {
            timeZone = TimeZone.getDefault().getID();
        }
        long readTimeStamp = 0;
        if (!time.equals("")) {
            readTimeStamp = LocalDateTime.parse(time,
                            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))
                    .atZone(ZoneId.of(timeZone)).toInstant().toEpochMilli();
        }
        return readTimeStamp;
    }
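    /**
     * Restores the streaming enumerator from checkpointed state: the splits still pending at
     * checkpoint time plus the last read timestamp, so file discovery resumes where it left off.
     */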
    @Override
    public SplitEnumerator<LakeSoulSplit, LakeSoulPendingSplits> restoreEnumerator(
            SplitEnumeratorContext<LakeSoulSplit> enumContext,
            LakeSoulPendingSplits checkpoint) throws Exception {
        return new LakeSoulDynamicSplitEnumerator(enumContext,
                new LakeSoulDynSplitAssigner(checkpoint.getSplits(),
                        String.valueOf(checkpoint.getHashBucketNum())),
                checkpoint.getDiscoverInterval(),
                checkpoint.getLastReadTimestamp(),
                checkpoint.getTableid(),
                checkpoint.getParDesc(),
                String.valueOf(checkpoint.getHashBucketNum()));
    }
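    /** Serializer for the {@link LakeSoulSplit}s shipped from the enumerator to readers. */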
    @Override
    public SimpleVersionedSerializer<LakeSoulSplit> getSplitSerializer() {
        return new SimpleLakeSoulSerializer();
    }
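    /** Serializer for the enumerator's {@link LakeSoulPendingSplits} checkpoint state. */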
    @Override
    public SimpleVersionedSerializer<LakeSoulPendingSplits> getEnumeratorCheckpointSerializer() {
        return new SimpleLakeSoulPendingSplitsSerializer();
    }
}