org.apache.paimon.spark.procedure.CompactProcedure
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.spark.procedure;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.annotation.VisibleForTesting;
import org.apache.paimon.append.UnawareAppendCompactionTask;
import org.apache.paimon.append.UnawareAppendTableCompactionCoordinator;
import org.apache.paimon.data.BinaryRow;
import org.apache.paimon.disk.IOManager;
import org.apache.paimon.manifest.PartitionEntry;
import org.apache.paimon.operation.AppendOnlyFileStoreWrite;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.spark.PaimonSplitScan;
import org.apache.paimon.spark.SparkUtils;
import org.apache.paimon.spark.catalyst.Compatibility;
import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionUtils;
import org.apache.paimon.spark.commands.PaimonSparkWriter;
import org.apache.paimon.spark.sort.TableSorter;
import org.apache.paimon.table.BucketMode;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.sink.BatchTableCommit;
import org.apache.paimon.table.sink.BatchTableWrite;
import org.apache.paimon.table.sink.BatchWriteBuilder;
import org.apache.paimon.table.sink.CommitMessage;
import org.apache.paimon.table.sink.CommitMessageSerializer;
import org.apache.paimon.table.sink.CompactionTaskSerializer;
import org.apache.paimon.table.sink.TableCommitImpl;
import org.apache.paimon.table.source.DataSplit;
import org.apache.paimon.table.source.EndOfScanException;
import org.apache.paimon.table.source.snapshot.SnapshotReader;
import org.apache.paimon.utils.Pair;
import org.apache.paimon.utils.ParameterUtils;
import org.apache.paimon.utils.SerializationUtils;
import org.apache.paimon.utils.StringUtils;
import org.apache.paimon.utils.TimeUtils;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.PaimonUtils;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.IOException;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

import static org.apache.paimon.CoreOptions.createCommitUser;
import static org.apache.paimon.utils.Preconditions.checkArgument;
import static org.apache.spark.sql.types.DataTypes.StringType;

/**
 * Compact procedure. Usage:
 *
 * <pre><code>
 *  CALL sys.compact(table => 'tableId', [partitions => 'p1=0,p2=0;p1=0,p2=1'], [order_strategy => 'xxx'], [order_by => 'xxx'], [where => 'p1>0'])
 * </code></pre>
 */
public class CompactProcedure extends BaseProcedure {

    private static final Logger LOG = LoggerFactory.getLogger(CompactProcedure.class);

    private static final ProcedureParameter[] PARAMETERS =
            new ProcedureParameter[] {
                ProcedureParameter.required("table", StringType),
                ProcedureParameter.optional("partitions", StringType),
                ProcedureParameter.optional("compact_strategy", StringType),
                ProcedureParameter.optional("order_strategy", StringType),
                ProcedureParameter.optional("order_by", StringType),
                ProcedureParameter.optional("where", StringType),
                ProcedureParameter.optional("options", StringType),
                ProcedureParameter.optional("partition_idle_time", StringType),
            };

    private static final StructType OUTPUT_TYPE =
            new StructType(
                    new StructField[] {
                        new StructField("result", DataTypes.BooleanType, true, Metadata.empty())
                    });

    private static final String MINOR = "minor";
    private static final String FULL = "full";

    protected CompactProcedure(TableCatalog tableCatalog) {
        super(tableCatalog);
    }

    @Override
    public ProcedureParameter[] parameters() {
        return PARAMETERS;
    }

    @Override
    public StructType outputType() {
        return OUTPUT_TYPE;
    }

    @Override
    public InternalRow[] call(InternalRow args) {
        Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
        String partitions = blank(args, 1) ? null : args.getString(1);
        // make full compact strategy as default.
        String compactStrategy = blank(args, 2) ? FULL : args.getString(2);
        String sortType = blank(args, 3) ? TableSorter.OrderType.NONE.name() : args.getString(3);
        List<String> sortColumns =
                blank(args, 4)
                        ? Collections.emptyList()
                        : Arrays.asList(args.getString(4).split(","));
        String where = blank(args, 5) ? null : args.getString(5);
        String options = args.isNullAt(6) ? null : args.getString(6);
        Duration partitionIdleTime =
                blank(args, 7) ? null : TimeUtils.parseDuration(args.getString(7));
        if (TableSorter.OrderType.NONE.name().equals(sortType) && !sortColumns.isEmpty()) {
            throw new IllegalArgumentException(
                    "order_strategy \"none\" cannot work with order_by columns.");
        }
        if (partitionIdleTime != null && (!TableSorter.OrderType.NONE.name().equals(sortType))) {
            throw new IllegalArgumentException(
                    "sort compact do not support 'partition_idle_time'.");
        }

        if (!(compactStrategy.equalsIgnoreCase(FULL) || compactStrategy.equalsIgnoreCase(MINOR))) {
            throw new IllegalArgumentException(
                    String.format(
                            "The compact strategy only supports 'full' or 'minor', but '%s' is configured.",
                            compactStrategy));
        }

        checkArgument(
                partitions == null || where == null,
                "partitions and where cannot be used together.");
        String finalWhere = partitions != null ? toWhere(partitions) : where;
        return modifyPaimonTable(
                tableIdent,
                table -> {
                    checkArgument(table instanceof FileStoreTable);
                    checkArgument(
                            sortColumns.stream().noneMatch(table.partitionKeys()::contains),
                            "order_by should not contain partition cols, because it is meaningless, your order_by cols are %s, and partition cols are %s",
                            sortColumns,
                            table.partitionKeys());
                    DataSourceV2Relation relation = createRelation(tableIdent);
                    Expression condition = null;
                    if (!StringUtils.isNullOrWhitespaceOnly(finalWhere)) {
                        condition = ExpressionUtils.resolveFilter(spark(), relation, finalWhere);
                        checkArgument(
                                ExpressionUtils.isValidPredicate(
                                        spark(),
                                        condition,
                                        table.partitionKeys().toArray(new String[0])),
                                "Only partition predicate is supported, your predicate is %s, but partition keys are %s",
                                condition,
                                table.partitionKeys());
                    }

                    Map<String, String> dynamicOptions = new HashMap<>();
                    dynamicOptions.put(CoreOptions.WRITE_ONLY.key(), "false");
                    if (!StringUtils.isNullOrWhitespaceOnly(options)) {
                        dynamicOptions.putAll(ParameterUtils.parseCommaSeparatedKeyValues(options));
                    }
                    table = table.copy(dynamicOptions);

                    InternalRow internalRow =
                            newInternalRow(
                                    execute(
                                            (FileStoreTable) table,
                                            compactStrategy,
                                            sortType,
                                            sortColumns,
                                            relation,
                                            condition,
                                            partitionIdleTime));
                    return new InternalRow[] {internalRow};
                });
    }

    @Override
    public String description() {
        return "This procedure execute compact action on paimon table.";
    }

    private boolean blank(InternalRow args, int index) {
        return args.isNullAt(index) || StringUtils.isNullOrWhitespaceOnly(args.getString(index));
    }

    private boolean execute(
            FileStoreTable table,
            String compactStrategy,
            String sortType,
            List<String> sortColumns,
            DataSourceV2Relation relation,
            @Nullable Expression condition,
            @Nullable Duration partitionIdleTime) {
        BucketMode bucketMode = table.bucketMode();
        TableSorter.OrderType orderType = TableSorter.OrderType.of(sortType);
        boolean fullCompact = compactStrategy.equalsIgnoreCase(FULL);
        Predicate filter =
                condition == null
                        ? null
                        : ExpressionUtils.convertConditionToPaimonPredicate(
                                        condition,
                                        ((LogicalPlan) relation).output(),
                                        table.rowType(),
                                        false)
                                .getOrElse(null);
        if (orderType.equals(TableSorter.OrderType.NONE)) {
            JavaSparkContext javaSparkContext = new JavaSparkContext(spark().sparkContext());
            switch (bucketMode) {
                case HASH_FIXED:
                case HASH_DYNAMIC:
                    compactAwareBucketTable(
                            table, fullCompact, filter, partitionIdleTime, javaSparkContext);
                    break;
                case BUCKET_UNAWARE:
                    compactUnAwareBucketTable(table, filter, partitionIdleTime, javaSparkContext);
                    break;
                default:
                    throw new UnsupportedOperationException(
                            "Spark compact with " + bucketMode + " is not support yet.");
            }
        } else {
            switch (bucketMode) {
                case BUCKET_UNAWARE:
                    sortCompactUnAwareBucketTable(table, orderType, sortColumns, relation, filter);
                    break;
                default:
                    throw new UnsupportedOperationException(
                            "Spark compact with sort_type "
                                    + sortType
                                    + " only support unaware-bucket append-only table yet.");
            }
        }

        return true;
    }

    private void compactAwareBucketTable(
            FileStoreTable table,
            boolean fullCompact,
            @Nullable Predicate filter,
            @Nullable Duration partitionIdleTime,
            JavaSparkContext javaSparkContext) {
        SnapshotReader snapshotReader = table.newSnapshotReader();
        if (filter != null) {
            snapshotReader.withFilter(filter);
        }
        Set<BinaryRow> partitionToBeCompacted =
                getHistoryPartition(snapshotReader, partitionIdleTime);
        List<Pair<byte[], Integer>> partitionBuckets =
                snapshotReader.bucketEntries().stream()
                        .map(entry -> Pair.of(entry.partition(), entry.bucket()))
                        .distinct()
                        .filter(pair -> partitionToBeCompacted.contains(pair.getKey()))
                        .map(
                                p ->
                                        Pair.of(
                                                SerializationUtils.serializeBinaryRow(p.getLeft()),
                                                p.getRight()))
                        .collect(Collectors.toList());

        if (partitionBuckets.isEmpty()) {
            LOG.info("Partition bucket is empty, no compact job to execute.");
            return;
        }

        int readParallelism = readParallelism(partitionBuckets, spark());
        BatchWriteBuilder writeBuilder = table.newBatchWriteBuilder();
        JavaRDD<byte[]> commitMessageJavaRDD =
                javaSparkContext
                        .parallelize(partitionBuckets, readParallelism)
                        .mapPartitions(
                                (FlatMapFunction<Iterator<Pair<byte[], Integer>>, byte[]>)
                                        pairIterator -> {
                                            IOManager ioManager = SparkUtils.createIOManager();
                                            BatchTableWrite write = writeBuilder.newWrite();
                                            write.withIOManager(ioManager);
                                            try {
                                                while (pairIterator.hasNext()) {
                                                    Pair<byte[], Integer> pair =
                                                            pairIterator.next();
                                                    write.compact(
                                                            SerializationUtils.deserializeBinaryRow(
                                                                    pair.getLeft()),
                                                            pair.getRight(),
                                                            fullCompact);
                                                }
                                                CommitMessageSerializer serializer =
                                                        new CommitMessageSerializer();
                                                List<CommitMessage> messages =
                                                        write.prepareCommit();
                                                List<byte[]> serializedMessages =
                                                        new ArrayList<>(messages.size());
                                                for (CommitMessage commitMessage : messages) {
                                                    serializedMessages.add(
                                                            serializer.serialize(commitMessage));
                                                }
                                                return serializedMessages.iterator();
                                            } finally {
                                                write.close();
                                                ioManager.close();
                                            }
                                        });

        try (BatchTableCommit commit = writeBuilder.newCommit()) {
            CommitMessageSerializer serializer = new CommitMessageSerializer();
            List<byte[]> serializedMessages = commitMessageJavaRDD.collect();
            List<CommitMessage> messages = new ArrayList<>(serializedMessages.size());
            for (byte[] serializedMessage : serializedMessages) {
                messages.add(serializer.deserialize(serializer.getVersion(), serializedMessage));
            }
            commit.commit(messages);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    private void compactUnAwareBucketTable(
            FileStoreTable table,
            @Nullable Predicate filter,
            @Nullable Duration partitionIdleTime,
            JavaSparkContext javaSparkContext) {
        List<UnawareAppendCompactionTask> compactionTasks;
        try {
            compactionTasks =
                    new UnawareAppendTableCompactionCoordinator(table, false, filter).run();
        } catch (EndOfScanException e) {
            compactionTasks = new ArrayList<>();
        }
        if (partitionIdleTime != null) {
            Map<BinaryRow, Long> partitionInfo =
                    table.newSnapshotReader().partitionEntries().stream()
                            .collect(
                                    Collectors.toMap(
                                            PartitionEntry::partition,
                                            PartitionEntry::lastFileCreationTime));
            long historyMilli =
                    LocalDateTime.now()
                            .minus(partitionIdleTime)
                            .atZone(ZoneId.systemDefault())
                            .toInstant()
                            .toEpochMilli();
            compactionTasks =
                    compactionTasks.stream()
                            .filter(task -> partitionInfo.get(task.partition()) <= historyMilli)
                            .collect(Collectors.toList());
        }
        if (compactionTasks.isEmpty()) {
            LOG.info("Task plan is empty, no compact job to execute.");
            return;
        }

        CompactionTaskSerializer serializer = new CompactionTaskSerializer();
        List<byte[]> serializedTasks = new ArrayList<>();
        try {
            for (UnawareAppendCompactionTask compactionTask : compactionTasks) {
                serializedTasks.add(serializer.serialize(compactionTask));
            }
        } catch (IOException e) {
            throw new RuntimeException("serialize compaction task failed");
        }

        int readParallelism = readParallelism(serializedTasks, spark());
        String commitUser = createCommitUser(table.coreOptions().toConfiguration());
        JavaRDD<byte[]> commitMessageJavaRDD =
                javaSparkContext
                        .parallelize(serializedTasks, readParallelism)
                        .mapPartitions(
                                (FlatMapFunction<Iterator<byte[]>, byte[]>)
                                        taskIterator -> {
                                            AppendOnlyFileStoreWrite write =
                                                    (AppendOnlyFileStoreWrite)
                                                            table.store().newWrite(commitUser);
                                            CompactionTaskSerializer ser =
                                                    new CompactionTaskSerializer();
                                            List<byte[]> messages = new ArrayList<>();
                                            try {
                                                CommitMessageSerializer messageSer =
                                                        new CommitMessageSerializer();
                                                while (taskIterator.hasNext()) {
                                                    UnawareAppendCompactionTask task =
                                                            ser.deserialize(
                                                                    ser.getVersion(),
                                                                    taskIterator.next());
                                                    messages.add(
                                                            messageSer.serialize(
                                                                    task.doCompact(table, write)));
                                                }
                                                return messages.iterator();
                                            } finally {
                                                write.close();
                                            }
                                        });

        try (TableCommitImpl commit = table.newCommit(commitUser)) {
            CommitMessageSerializer messageSerializerser = new CommitMessageSerializer();
            List<byte[]> serializedMessages = commitMessageJavaRDD.collect();
            List<CommitMessage> messages = new ArrayList<>(serializedMessages.size());
            for (byte[] serializedMessage : serializedMessages) {
                messages.add(
                        messageSerializerser.deserialize(
                                messageSerializerser.getVersion(), serializedMessage));
            }
            commit.commit(messages);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    private Set<BinaryRow> getHistoryPartition(
            SnapshotReader snapshotReader, @Nullable Duration partitionIdleTime) {
        Set<Pair<BinaryRow, Long>> partitionInfo =
                snapshotReader.partitionEntries().stream()
                        .map(
                                partitionEntry ->
                                        Pair.of(
                                                partitionEntry.partition(),
                                                partitionEntry.lastFileCreationTime()))
                        .collect(Collectors.toSet());
        if (partitionIdleTime != null) {
            long historyMilli =
                    LocalDateTime.now()
                            .minus(partitionIdleTime)
                            .atZone(ZoneId.systemDefault())
                            .toInstant()
                            .toEpochMilli();
            partitionInfo =
                    partitionInfo.stream()
                            .filter(partition -> partition.getValue() <= historyMilli)
                            .collect(Collectors.toSet());
        }
        return partitionInfo.stream().map(Pair::getKey).collect(Collectors.toSet());
    }

    private void sortCompactUnAwareBucketTable(
            FileStoreTable table,
            TableSorter.OrderType orderType,
            List<String> sortColumns,
            DataSourceV2Relation relation,
            @Nullable Predicate filter) {
        SnapshotReader snapshotReader = table.newSnapshotReader();
        if (filter != null) {
            snapshotReader.withFilter(filter);
        }
        Map<BinaryRow, DataSplit[]> packedSplits = packForSort(snapshotReader.read().dataSplits());
        TableSorter sorter = TableSorter.getSorter(table, orderType, sortColumns);
        Dataset<Row> datasetForWrite =
                packedSplits.values().stream()
                        .map(
                                split -> {
                                    Dataset<Row> dataset =
                                            PaimonUtils.createDataset(
                                                    spark(),
                                                    Compatibility.createDataSourceV2ScanRelation(
                                                            relation,
                                                            PaimonSplitScan.apply(table, split),
                                                            relation.output()));
                                    return sorter.sort(dataset);
                                })
                        .reduce(Dataset::union)
                        .orElse(null);
        if (datasetForWrite != null) {
            PaimonSparkWriter writer = new PaimonSparkWriter(table);
            // Use dynamic partition overwrite
            writer.writeBuilder().withOverwrite();
            writer.commit(writer.write(datasetForWrite));
        }
    }

    private Map<BinaryRow, DataSplit[]> packForSort(List<DataSplit> dataSplits) {
        // Make a single partition as a compact group
        return dataSplits.stream()
                .collect(
                        Collectors.groupingBy(
                                DataSplit::partition,
                                Collectors.collectingAndThen(
                                        Collectors.toList(),
                                        list -> list.toArray(new DataSplit[0]))));
    }

    private int readParallelism(List<?> groupedTasks, SparkSession spark) {
        int sparkParallelism =
                Math.max(
                        spark.sparkContext().defaultParallelism(),
                        spark.sessionState().conf().numShufflePartitions());
        int readParallelism = Math.min(groupedTasks.size(), sparkParallelism);
        if (sparkParallelism > readParallelism) {
            LOG.warn(
                    String.format(
                            "Spark default parallelism (%s) is greater than bucket or task parallelism (%s),"
                                    + "we use %s as the final read parallelism",
                            sparkParallelism, readParallelism, readParallelism));
        }
        return readParallelism;
    }

    @VisibleForTesting
    static String toWhere(String partitions) {
        List<Map<String, String>> maps = ParameterUtils.getPartitions(partitions.split(";"));

        return maps.stream()
                .map(
                        a ->
                                a.entrySet().stream()
                                        .map(entry -> entry.getKey() + "=" + entry.getValue())
                                        .reduce((s0, s1) -> s0 + " AND " + s1))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(a -> "(" + a + ")")
                .reduce((a, b) -> a + " OR " + b)
                .orElse(null);
    }

    public static ProcedureBuilder builder() {
        return new BaseProcedure.Builder<CompactProcedure>() {
            @Override
            public CompactProcedure doBuild() {
                return new CompactProcedure(tableCatalog());
            }
        };
    }
}
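Below is a minimal, illustrative sketch (not part of the source above) of how this procedure can be invoked through Spark SQL, mirroring the usage string in the class Javadoc. It assumes a SparkSession whose current catalog is a Paimon catalog and a partitioned table named default.T; the table name, partition spec, sort columns, and the 'zorder' order strategy are placeholders, not values taken from the source.

import org.apache.spark.sql.SparkSession;

public class CompactProcedureExample {

    public static void main(String[] args) {
        // Assumes the Spark catalog configuration already points the current catalog at Paimon.
        SparkSession spark = SparkSession.builder().appName("compact-example").getOrCreate();

        // Full compaction of two partitions; note that 'partitions' and 'where' cannot be
        // used together, per the checkArgument in CompactProcedure#call.
        spark.sql(
                "CALL sys.compact("
                        + "table => 'default.T', "
                        + "partitions => 'p1=0,p2=0;p1=0,p2=1')");

        // Sort compaction on an unaware-bucket append-only table; 'zorder' stands in for one of
        // the strategies handled by TableSorter, and order_by must list non-partition columns.
        spark.sql(
                "CALL sys.compact("
                        + "table => 'default.T', "
                        + "order_strategy => 'zorder', "
                        + "order_by => 'a,b')");

        spark.stop();
    }
}

Each call returns a single boolean result column, matching OUTPUT_TYPE in the class above.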



