org.dinky.shaded.paimon.table.source.snapshot.SnapshotReaderImpl

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dinky.shaded.paimon.table.source.snapshot;

import org.dinky.shaded.paimon.CoreOptions;
import org.dinky.shaded.paimon.Snapshot;
import org.dinky.shaded.paimon.codegen.CodeGenUtils;
import org.dinky.shaded.paimon.codegen.RecordComparator;
import org.dinky.shaded.paimon.consumer.ConsumerManager;
import org.dinky.shaded.paimon.data.BinaryRow;
import org.dinky.shaded.paimon.io.DataFileMeta;
import org.dinky.shaded.paimon.manifest.FileKind;
import org.dinky.shaded.paimon.manifest.ManifestEntry;
import org.dinky.shaded.paimon.metrics.MetricRegistry;
import org.dinky.shaded.paimon.operation.DefaultValueAssigner;
import org.dinky.shaded.paimon.operation.FileStoreScan;
import org.dinky.shaded.paimon.operation.metrics.ScanMetrics;
import org.dinky.shaded.paimon.predicate.Predicate;
import org.dinky.shaded.paimon.predicate.PredicateBuilder;
import org.dinky.shaded.paimon.schema.TableSchema;
import org.dinky.shaded.paimon.table.source.DataSplit;
import org.dinky.shaded.paimon.table.source.RawFile;
import org.dinky.shaded.paimon.table.source.ScanMode;
import org.dinky.shaded.paimon.table.source.Split;
import org.dinky.shaded.paimon.table.source.SplitGenerator;
import org.dinky.shaded.paimon.types.RowType;
import org.dinky.shaded.paimon.utils.FileStorePathFactory;
import org.dinky.shaded.paimon.utils.Filter;
import org.dinky.shaded.paimon.utils.SnapshotManager;
import org.dinky.shaded.paimon.utils.TypeUtils;

import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;

import static org.dinky.shaded.paimon.operation.FileStoreScan.Plan.groupByPartFiles;
import static org.dinky.shaded.paimon.predicate.PredicateBuilder.transformFieldMapping;

/** Implementation of {@link SnapshotReader}. */
public class SnapshotReaderImpl implements SnapshotReader {

    private final FileStoreScan scan;
    private final TableSchema tableSchema;
    private final CoreOptions options;
    private final SnapshotManager snapshotManager;
    private final ConsumerManager consumerManager;
    private final SplitGenerator splitGenerator;
    private final BiConsumer<FileStoreScan, Predicate> nonPartitionFilterConsumer;
    private final DefaultValueAssigner defaultValueAssigner;
    private final FileStorePathFactory pathFactory;

    private ScanMode scanMode = ScanMode.ALL;
    private RecordComparator lazyPartitionComparator;

    private final String tableName;

    public SnapshotReaderImpl(
            FileStoreScan scan,
            TableSchema tableSchema,
            CoreOptions options,
            SnapshotManager snapshotManager,
            SplitGenerator splitGenerator,
            BiConsumer<FileStoreScan, Predicate> nonPartitionFilterConsumer,
            DefaultValueAssigner defaultValueAssigner,
            FileStorePathFactory pathFactory,
            String tableName) {
        this.scan = scan;
        this.tableSchema = tableSchema;
        this.options = options;
        this.snapshotManager = snapshotManager;
        this.consumerManager =
                new ConsumerManager(snapshotManager.fileIO(), snapshotManager.tablePath());
        this.splitGenerator = splitGenerator;
        this.nonPartitionFilterConsumer = nonPartitionFilterConsumer;
        this.defaultValueAssigner = defaultValueAssigner;
        this.pathFactory = pathFactory;

        this.tableName = tableName;
    }

    @Override
    public SnapshotManager snapshotManager() {
        return snapshotManager;
    }

    @Override
    public ConsumerManager consumerManager() {
        return consumerManager;
    }

    @Override
    public SplitGenerator splitGenerator() {
        return splitGenerator;
    }

    @Override
    public SnapshotReader withSnapshot(long snapshotId) {
        scan.withSnapshot(snapshotId);
        return this;
    }

    @Override
    public SnapshotReader withSnapshot(Snapshot snapshot) {
        scan.withSnapshot(snapshot);
        return this;
    }

    @Override
    public SnapshotReader withPartitionFilter(Map<String, String> partitionSpec) {
        if (partitionSpec != null) {
            List<String> partitionKeys = tableSchema.partitionKeys();
            RowType rowType = tableSchema.logicalPartitionType();
            PredicateBuilder predicateBuilder = new PredicateBuilder(rowType);
            List<Predicate> partitionFilters =
                    partitionSpec.entrySet().stream()
                            .map(
                                    m -> {
                                        int index = partitionKeys.indexOf(m.getKey());
                                        Object value =
                                                TypeUtils.castFromStringInternal(
                                                        m.getValue(),
                                                        rowType.getTypeAt(index),
                                                        false);
                                        return predicateBuilder.equal(index, value);
                                    })
                            .collect(Collectors.toList());
            scan.withPartitionFilter(PredicateBuilder.and(partitionFilters));
        }
        return this;
    }

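    /**
     * Splits the given predicate into partition and non-partition parts: conjuncts that map
     * entirely onto partition fields are pushed down as a partition filter, the rest are handed
     * to {@code nonPartitionFilterConsumer}.
     */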
    @Override
    public SnapshotReader withFilter(Predicate predicate) {
        List<String> partitionKeys = tableSchema.partitionKeys();
        int[] fieldIdxToPartitionIdx =
                tableSchema.fields().stream()
                        .mapToInt(f -> partitionKeys.indexOf(f.name()))
                        .toArray();

        List<Predicate> partitionFilters = new ArrayList<>();
        List<Predicate> nonPartitionFilters = new ArrayList<>();
        for (Predicate p :
                PredicateBuilder.splitAnd(defaultValueAssigner.handlePredicate(predicate))) {
            Optional<Predicate> mapped = transformFieldMapping(p, fieldIdxToPartitionIdx);
            if (mapped.isPresent()) {
                partitionFilters.add(mapped.get());
            } else {
                nonPartitionFilters.add(p);
            }
        }

        if (partitionFilters.size() > 0) {
            scan.withPartitionFilter(PredicateBuilder.and(partitionFilters));
        }

        if (nonPartitionFilters.size() > 0) {
            nonPartitionFilterConsumer.accept(scan, PredicateBuilder.and(nonPartitionFilters));
        }
        return this;
    }

    @Override
    public SnapshotReader withMode(ScanMode scanMode) {
        this.scanMode = scanMode;
        scan.withKind(scanMode);
        return this;
    }

    @Override
    public SnapshotReader withLevelFilter(Filter<Integer> levelFilter) {
        scan.withLevelFilter(levelFilter);
        return this;
    }

    @Override
    public SnapshotReader withBucket(int bucket) {
        scan.withBucket(bucket);
        return this;
    }

    @Override
    public SnapshotReader withBucketFilter(Filter<Integer> bucketFilter) {
        scan.withBucketFilter(bucketFilter);
        return this;
    }

    @Override
    public SnapshotReader withMetricRegistry(MetricRegistry registry) {
        scan.withMetrics(new ScanMetrics(registry, tableName));
        return this;
    }

    /** Get splits from {@link FileKind#ADD} files. */
    @Override
    public Plan read() {
        FileStoreScan.Plan plan = scan.plan();
        Long snapshotId = plan.snapshotId();

        Map<BinaryRow, Map<Integer, List<DataFileMeta>>> files =
                groupByPartFiles(plan.files(FileKind.ADD));
        if (options.scanPlanSortPartition()) {
            Map<BinaryRow, Map<Integer, List<DataFileMeta>>> newFiles = new LinkedHashMap<>();
            files.entrySet().stream()
                    .sorted((o1, o2) -> partitionComparator().compare(o1.getKey(), o2.getKey()))
                    .forEach(entry -> newFiles.put(entry.getKey(), entry.getValue()));
            files = newFiles;
        }
        List<DataSplit> splits =
                generateSplits(
                        snapshotId == null ? Snapshot.FIRST_SNAPSHOT_ID - 1 : snapshotId,
                        scanMode != ScanMode.ALL,
                        splitGenerator,
                        files);
        return new Plan() {
            @Nullable
            @Override
            public Long watermark() {
                return plan.watermark();
            }

            @Nullable
            @Override
            public Long snapshotId() {
                return plan.snapshotId();
            }

            @Override
            public List<Split> splits() {
                return (List) splits;
            }
        };
    }

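    /**
     * Builds {@link DataSplit}s from data files grouped by partition and bucket, splitting each
     * bucket with {@code splitForStreaming} or {@code splitForBatch} depending on the read mode.
     */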
    private List<DataSplit> generateSplits(
            long snapshotId,
            boolean isStreaming,
            SplitGenerator splitGenerator,
            Map<BinaryRow, Map<Integer, List<DataFileMeta>>> groupedDataFiles) {
        List<DataSplit> splits = new ArrayList<>();
        for (Map.Entry<BinaryRow, Map<Integer, List<DataFileMeta>>> entry :
                groupedDataFiles.entrySet()) {
            BinaryRow partition = entry.getKey();
            Map<Integer, List<DataFileMeta>> buckets = entry.getValue();
            for (Map.Entry<Integer, List<DataFileMeta>> bucketEntry : buckets.entrySet()) {
                int bucket = bucketEntry.getKey();
                List<DataFileMeta> bucketFiles = bucketEntry.getValue();
                DataSplit.Builder builder =
                        DataSplit.builder()
                                .withSnapshot(snapshotId)
                                .withPartition(partition)
                                .withBucket(bucket)
                                .isStreaming(isStreaming);
                List<List<DataFileMeta>> splitGroups =
                        isStreaming
                                ? splitGenerator.splitForStreaming(bucketFiles)
                                : splitGenerator.splitForBatch(bucketFiles);
                for (List<DataFileMeta> dataFiles : splitGroups) {
                    splits.add(
                            builder.withDataFiles(dataFiles)
                                    .rawFiles(convertToRawFiles(partition, bucket, dataFiles))
                                    .build());
                }
            }
        }
        return splits;
    }

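    /** Lists the distinct partitions of the current plan, preserving their scan order. */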
    @Override
    public List<BinaryRow> partitions() {
        List<ManifestEntry> entryList = scan.plan().files();

        return entryList.stream()
                .collect(
                        Collectors.groupingBy(
                                ManifestEntry::partition,
                                LinkedHashMap::new,
                                Collectors.reducing((a, b) -> b)))
                .values()
                .stream()
                .map(Optional::get)
                .map(ManifestEntry::partition)
                .collect(Collectors.toList());
    }

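    /**
     * Get change splits under {@link ScanMode#DELTA}: {@link FileKind#DELETE} entries become
     * before files and {@link FileKind#ADD} entries become data files.
     */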
    @Override
    public Plan readChanges() {
        withMode(ScanMode.DELTA);
        FileStoreScan.Plan plan = scan.plan();

        Map<BinaryRow, Map<Integer, List<DataFileMeta>>> beforeFiles =
                groupByPartFiles(plan.files(FileKind.DELETE));
        Map<BinaryRow, Map<Integer, List<DataFileMeta>>> dataFiles =
                groupByPartFiles(plan.files(FileKind.ADD));

        return toChangesPlan(true, plan, beforeFiles, dataFiles);
    }

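    /**
     * Pairs before files with data files per partition and bucket; files appearing on both sides
     * cancel out so only real changes remain in the resulting splits.
     */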
    private Plan toChangesPlan(
            boolean isStreaming,
            FileStoreScan.Plan plan,
            Map<BinaryRow, Map<Integer, List<DataFileMeta>>> beforeFiles,
            Map<BinaryRow, Map<Integer, List<DataFileMeta>>> dataFiles) {
        List<DataSplit> splits = new ArrayList<>();
        Map<BinaryRow, Set<Integer>> buckets = new HashMap<>();
        beforeFiles.forEach(
                (part, bucketMap) ->
                        buckets.computeIfAbsent(part, k -> new HashSet<>())
                                .addAll(bucketMap.keySet()));
        dataFiles.forEach(
                (part, bucketMap) ->
                        buckets.computeIfAbsent(part, k -> new HashSet<>())
                                .addAll(bucketMap.keySet()));

        for (Map.Entry<BinaryRow, Set<Integer>> entry : buckets.entrySet()) {
            BinaryRow part = entry.getKey();
            for (Integer bucket : entry.getValue()) {
                List<DataFileMeta> before =
                        beforeFiles
                                .getOrDefault(part, Collections.emptyMap())
                                .getOrDefault(bucket, Collections.emptyList());
                List<DataFileMeta> data =
                        dataFiles
                                .getOrDefault(part, Collections.emptyMap())
                                .getOrDefault(bucket, Collections.emptyList());

                // deduplicate
                before.removeIf(data::remove);

                DataSplit split =
                        DataSplit.builder()
                                .withSnapshot(plan.snapshotId())
                                .withPartition(part)
                                .withBucket(bucket)
                                .withBeforeFiles(before)
                                .withDataFiles(data)
                                .isStreaming(isStreaming)
                                .rawFiles(convertToRawFiles(part, bucket, data))
                                .build();
                splits.add(split);
            }
        }

        return new Plan() {
            @Nullable
            @Override
            public Long watermark() {
                return plan.watermark();
            }

            @Nullable
            @Override
            public Long snapshotId() {
                return plan.snapshotId();
            }

            @Override
            public List<Split> splits() {
                return (List) splits;
            }
        };
    }

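    /**
     * Get the diff between the current snapshot and the given earlier snapshot by comparing
     * their full sets of {@link FileKind#ADD} files.
     */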
    @Override
    public Plan readIncrementalDiff(Snapshot before) {
        withMode(ScanMode.ALL);
        FileStoreScan.Plan plan = scan.plan();
        Map<BinaryRow, Map<Integer, List<DataFileMeta>>> dataFiles =
                groupByPartFiles(plan.files(FileKind.ADD));
        Map<BinaryRow, Map<Integer, List<DataFileMeta>>> beforeFiles =
                groupByPartFiles(scan.withSnapshot(before).plan().files(FileKind.ADD));
        return toChangesPlan(false, plan, beforeFiles, dataFiles);
    }

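    /** Lazily code-generates a comparator over the partition row type for sorting partitions. */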
    private RecordComparator partitionComparator() {
        if (lazyPartitionComparator == null) {
            lazyPartitionComparator =
                    CodeGenUtils.newRecordComparator(
                            tableSchema.logicalPartitionType().getFieldTypes(),
                            "PartitionComparator");
        }
        return lazyPartitionComparator;
    }

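    /**
     * Converts data files to {@link RawFile}s when they can be read directly: a single file per
     * bucket, an append-only table, or files all on one non-zero level of a primary key table;
     * otherwise returns an empty list.
     */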
    private List<RawFile> convertToRawFiles(
            BinaryRow partition, int bucket, List<DataFileMeta> dataFiles) {
        String bucketPath = pathFactory.bucketPath(partition, bucket).toString();

        // bucket with only one file can be returned
        if (dataFiles.size() == 1) {
            return Collections.singletonList(makeRawTableFile(bucketPath, dataFiles.get(0)));
        }

        // append only files can be returned
        if (tableSchema.primaryKeys().isEmpty()) {
            return makeRawTableFiles(bucketPath, dataFiles);
        }

        // bucket containing only one level (except level 0) can be returned
        Set<Integer> levels =
                dataFiles.stream().map(DataFileMeta::level).collect(Collectors.toSet());
        if (levels.size() == 1 && !levels.contains(0)) {
            return makeRawTableFiles(bucketPath, dataFiles);
        }

        return Collections.emptyList();
    }

    private List<RawFile> makeRawTableFiles(String bucketPath, List<DataFileMeta> dataFiles) {
        return dataFiles.stream()
                .map(f -> makeRawTableFile(bucketPath, f))
                .collect(Collectors.toList());
    }

    private RawFile makeRawTableFile(String bucketPath, DataFileMeta meta) {
        return new RawFile(
                bucketPath + "/" + meta.fileName(),
                0,
                meta.fileSize(),
                meta.fileFormat()
                        .map(t -> t.toString().toLowerCase())
                        .orElse(
                                new CoreOptions(tableSchema.options())
                                        .formatType()
                                        .toString()
                                        .toLowerCase()),
                meta.schemaId(),
                meta.rowCount());
    }
}