org.apache.paimon.spark.procedure.CompactProcedure
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.spark.procedure;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.annotation.VisibleForTesting;
import org.apache.paimon.append.UnawareAppendCompactionTask;
import org.apache.paimon.append.UnawareAppendTableCompactionCoordinator;
import org.apache.paimon.data.BinaryRow;
import org.apache.paimon.disk.IOManager;
import org.apache.paimon.manifest.PartitionEntry;
import org.apache.paimon.operation.AppendOnlyFileStoreWrite;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.spark.PaimonSplitScan;
import org.apache.paimon.spark.SparkUtils;
import org.apache.paimon.spark.catalyst.Compatibility;
import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionUtils;
import org.apache.paimon.spark.commands.PaimonSparkWriter;
import org.apache.paimon.spark.sort.TableSorter;
import org.apache.paimon.table.BucketMode;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.sink.BatchTableCommit;
import org.apache.paimon.table.sink.BatchTableWrite;
import org.apache.paimon.table.sink.BatchWriteBuilder;
import org.apache.paimon.table.sink.CommitMessage;
import org.apache.paimon.table.sink.CommitMessageSerializer;
import org.apache.paimon.table.sink.CompactionTaskSerializer;
import org.apache.paimon.table.sink.TableCommitImpl;
import org.apache.paimon.table.source.DataSplit;
import org.apache.paimon.table.source.EndOfScanException;
import org.apache.paimon.table.source.snapshot.SnapshotReader;
import org.apache.paimon.utils.Pair;
import org.apache.paimon.utils.ParameterUtils;
import org.apache.paimon.utils.SerializationUtils;
import org.apache.paimon.utils.StringUtils;
import org.apache.paimon.utils.TimeUtils;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.PaimonUtils;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.IOException;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

import static org.apache.paimon.CoreOptions.createCommitUser;
import static org.apache.paimon.utils.Preconditions.checkArgument;
import static org.apache.spark.sql.types.DataTypes.StringType;

/**
 * Compact procedure. Usage:
 *
 * <pre><code>
 *  CALL sys.compact(table => 'tableId', [partitions => 'p1=0,p2=0;p1=0,p2=1'], [order_strategy => 'xxx'], [order_by => 'xxx'], [where => 'p1>0'])
 * </code></pre>
 */
public class CompactProcedure extends BaseProcedure {

    private static final Logger LOG = LoggerFactory.getLogger(CompactProcedure.class);

    private static final ProcedureParameter[] PARAMETERS =
            new ProcedureParameter[] {
                ProcedureParameter.required("table", StringType),
                ProcedureParameter.optional("partitions", StringType),
                ProcedureParameter.optional("compact_strategy", StringType),
                ProcedureParameter.optional("order_strategy", StringType),
                ProcedureParameter.optional("order_by", StringType),
                ProcedureParameter.optional("where", StringType),
                ProcedureParameter.optional("options", StringType),
                ProcedureParameter.optional("partition_idle_time", StringType),
            };

    private static final StructType OUTPUT_TYPE =
            new StructType(
                    new StructField[] {
                        new StructField("result", DataTypes.BooleanType, true, Metadata.empty())
                    });

    private static final String MINOR = "minor";
    private static final String FULL = "full";

    protected CompactProcedure(TableCatalog tableCatalog) {
        super(tableCatalog);
    }

    @Override
    public ProcedureParameter[] parameters() {
        return PARAMETERS;
    }

    @Override
    public StructType outputType() {
        return OUTPUT_TYPE;
    }

    @Override
    public InternalRow[] call(InternalRow args) {
        Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
        String partitions = blank(args, 1) ? null : args.getString(1);
        // make full compact strategy as default.
        String compactStrategy = blank(args, 2) ? FULL : args.getString(2);
        String sortType = blank(args, 3) ? TableSorter.OrderType.NONE.name() : args.getString(3);
        List<String> sortColumns =
                blank(args, 4)
                        ? Collections.emptyList()
                        : Arrays.asList(args.getString(4).split(","));
        String where = blank(args, 5) ? null : args.getString(5);
        String options = args.isNullAt(6) ? null : args.getString(6);
        Duration partitionIdleTime =
                blank(args, 7) ? null : TimeUtils.parseDuration(args.getString(7));
        if (TableSorter.OrderType.NONE.name().equals(sortType) && !sortColumns.isEmpty()) {
            throw new IllegalArgumentException(
                    "order_strategy \"none\" cannot work with order_by columns.");
        }
        if (partitionIdleTime != null && (!TableSorter.OrderType.NONE.name().equals(sortType))) {
            throw new IllegalArgumentException(
                    "sort compact do not support 'partition_idle_time'.");
        }

        if (!(compactStrategy.equalsIgnoreCase(FULL) || compactStrategy.equalsIgnoreCase(MINOR))) {
            throw new IllegalArgumentException(
                    String.format(
                            "The compact strategy only supports 'full' or 'minor', but '%s' is configured.",
                            compactStrategy));
        }

        checkArgument(
                partitions == null || where == null,
                "partitions and where cannot be used together.");
        String finalWhere = partitions != null ? toWhere(partitions) : where;
        return modifyPaimonTable(
                tableIdent,
                table -> {
                    checkArgument(table instanceof FileStoreTable);
                    checkArgument(
                            sortColumns.stream().noneMatch(table.partitionKeys()::contains),
                            "order_by should not contain partition cols, because it is meaningless, your order_by cols are %s, and partition cols are %s",
                            sortColumns,
                            table.partitionKeys());
                    DataSourceV2Relation relation = createRelation(tableIdent);
                    Expression condition = null;
                    if (!StringUtils.isNullOrWhitespaceOnly(finalWhere)) {
                        condition = ExpressionUtils.resolveFilter(spark(), relation, finalWhere);
                        checkArgument(
                                ExpressionUtils.isValidPredicate(
                                        spark(),
                                        condition,
                                        table.partitionKeys().toArray(new String[0])),
                                "Only partition predicate is supported, your predicate is %s, but partition keys are %s",
                                condition,
                                table.partitionKeys());
                    }

                    Map<String, String> dynamicOptions = new HashMap<>();
                    dynamicOptions.put(CoreOptions.WRITE_ONLY.key(), "false");
                    if (!StringUtils.isNullOrWhitespaceOnly(options)) {
                        dynamicOptions.putAll(ParameterUtils.parseCommaSeparatedKeyValues(options));
                    }
                    table = table.copy(dynamicOptions);

                    InternalRow internalRow =
                            newInternalRow(
                                    execute(
                                            (FileStoreTable) table,
                                            compactStrategy,
                                            sortType,
                                            sortColumns,
                                            relation,
                                            condition,
                                            partitionIdleTime));
                    return new InternalRow[] {internalRow};
                });
    }

    @Override
    public String description() {
        return "This procedure execute compact action on paimon table.";
    }

    private boolean blank(InternalRow args, int index) {
        return args.isNullAt(index) || StringUtils.isNullOrWhitespaceOnly(args.getString(index));
    }

    private boolean execute(
            FileStoreTable table,
            String compactStrategy,
            String sortType,
            List<String> sortColumns,
            DataSourceV2Relation relation,
            @Nullable Expression condition,
            @Nullable Duration partitionIdleTime) {
        BucketMode bucketMode = table.bucketMode();
        TableSorter.OrderType orderType = TableSorter.OrderType.of(sortType);
        boolean fullCompact = compactStrategy.equalsIgnoreCase(FULL);
        Predicate filter =
                condition == null
                        ? null
                        : ExpressionUtils.convertConditionToPaimonPredicate(
                                        condition,
                                        ((LogicalPlan) relation).output(),
                                        table.rowType(),
                                        false)
                                .getOrElse(null);
        if (orderType.equals(TableSorter.OrderType.NONE)) {
            JavaSparkContext javaSparkContext = new JavaSparkContext(spark().sparkContext());
            switch (bucketMode) {
                case HASH_FIXED:
                case HASH_DYNAMIC:
                    compactAwareBucketTable(
                            table, fullCompact, filter, partitionIdleTime, javaSparkContext);
                    break;
                case BUCKET_UNAWARE:
                    compactUnAwareBucketTable(table, filter, partitionIdleTime, javaSparkContext);
                    break;
                default:
                    throw new UnsupportedOperationException(
                            "Spark compact with " + bucketMode + " is not support yet.");
            }
        } else {
            switch (bucketMode) {
                case BUCKET_UNAWARE:
                    sortCompactUnAwareBucketTable(table, orderType, sortColumns, relation, filter);
                    break;
                default:
                    throw new UnsupportedOperationException(
                            "Spark compact with sort_type "
                                    + sortType
                                    + " only support unaware-bucket append-only table yet.");
            }
        }

        return true;
    }

    private void compactAwareBucketTable(
            FileStoreTable table,
            boolean fullCompact,
            @Nullable Predicate filter,
            @Nullable Duration partitionIdleTime,
            JavaSparkContext javaSparkContext) {
        SnapshotReader snapshotReader = table.newSnapshotReader();
        if (filter != null) {
            snapshotReader.withFilter(filter);
        }
        Set<BinaryRow> partitionToBeCompacted =
                getHistoryPartition(snapshotReader, partitionIdleTime);
        List<Pair<byte[], Integer>> partitionBuckets =
                snapshotReader.bucketEntries().stream()
                        .map(entry -> Pair.of(entry.partition(), entry.bucket()))
                        .distinct()
                        .filter(pair -> partitionToBeCompacted.contains(pair.getKey()))
                        .map(
                                p ->
                                        Pair.of(
                                                SerializationUtils.serializeBinaryRow(p.getLeft()),
                                                p.getRight()))
                        .collect(Collectors.toList());

        if (partitionBuckets.isEmpty()) {
            LOG.info("Partition bucket is empty, no compact job to execute.");
            return;
        }

        int readParallelism = readParallelism(partitionBuckets, spark());
        BatchWriteBuilder writeBuilder = table.newBatchWriteBuilder();
        JavaRDD<byte[]> commitMessageJavaRDD =
                javaSparkContext
                        .parallelize(partitionBuckets, readParallelism)
                        .mapPartitions(
                                (FlatMapFunction<Iterator<Pair<byte[], Integer>>, byte[]>)
                                        pairIterator -> {
                                            IOManager ioManager = SparkUtils.createIOManager();
                                            BatchTableWrite write = writeBuilder.newWrite();
                                            write.withIOManager(ioManager);
                                            try {
                                                while (pairIterator.hasNext()) {
                                                    Pair<byte[], Integer> pair =
                                                            pairIterator.next();
                                                    write.compact(
                                                            SerializationUtils.deserializeBinaryRow(
                                                                    pair.getLeft()),
                                                            pair.getRight(),
                                                            fullCompact);
                                                }
                                                CommitMessageSerializer serializer =
                                                        new CommitMessageSerializer();
                                                List<CommitMessage> messages =
                                                        write.prepareCommit();
                                                List<byte[]> serializedMessages =
                                                        new ArrayList<>(messages.size());
                                                for (CommitMessage commitMessage : messages) {
                                                    serializedMessages.add(
                                                            serializer.serialize(commitMessage));
                                                }
                                                return serializedMessages.iterator();
                                            } finally {
                                                write.close();
                                                ioManager.close();
                                            }
                                        });

        try (BatchTableCommit commit = writeBuilder.newCommit()) {
            CommitMessageSerializer serializer = new CommitMessageSerializer();
            List<byte[]> serializedMessages = commitMessageJavaRDD.collect();
            List<CommitMessage> messages = new ArrayList<>(serializedMessages.size());
            for (byte[] serializedMessage : serializedMessages) {
                messages.add(serializer.deserialize(serializer.getVersion(), serializedMessage));
            }
            commit.commit(messages);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    private void compactUnAwareBucketTable(
            FileStoreTable table,
            @Nullable Predicate filter,
            @Nullable Duration partitionIdleTime,
            JavaSparkContext javaSparkContext) {
        List<UnawareAppendCompactionTask> compactionTasks;
        try {
            compactionTasks =
                    new UnawareAppendTableCompactionCoordinator(table, false, filter).run();
        } catch (EndOfScanException e) {
            compactionTasks = new ArrayList<>();
        }
        if (partitionIdleTime != null) {
            Map<BinaryRow, Long> partitionInfo =
                    table.newSnapshotReader().partitionEntries().stream()
                            .collect(
                                    Collectors.toMap(
                                            PartitionEntry::partition,
                                            PartitionEntry::lastFileCreationTime));
            long historyMilli =
                    LocalDateTime.now()
                            .minus(partitionIdleTime)
                            .atZone(ZoneId.systemDefault())
                            .toInstant()
                            .toEpochMilli();
            compactionTasks =
                    compactionTasks.stream()
                            .filter(task -> partitionInfo.get(task.partition()) <= historyMilli)
                            .collect(Collectors.toList());
        }
        if (compactionTasks.isEmpty()) {
            LOG.info("Task plan is empty, no compact job to execute.");
            return;
        }

        CompactionTaskSerializer serializer = new CompactionTaskSerializer();
        List<byte[]> serializedTasks = new ArrayList<>();
        try {
            for (UnawareAppendCompactionTask compactionTask : compactionTasks) {
                serializedTasks.add(serializer.serialize(compactionTask));
            }
        } catch (IOException e) {
            throw new RuntimeException("serialize compaction task failed");
        }

        int readParallelism = readParallelism(serializedTasks, spark());
        String commitUser = createCommitUser(table.coreOptions().toConfiguration());
        JavaRDD<byte[]> commitMessageJavaRDD =
                javaSparkContext
                        .parallelize(serializedTasks, readParallelism)
                        .mapPartitions(
                                (FlatMapFunction<Iterator<byte[]>, byte[]>)
                                        taskIterator -> {
                                            AppendOnlyFileStoreWrite write =
                                                    (AppendOnlyFileStoreWrite)
                                                            table.store().newWrite(commitUser);
                                            CompactionTaskSerializer ser =
                                                    new CompactionTaskSerializer();
                                            List<byte[]> messages = new ArrayList<>();
                                            try {
                                                CommitMessageSerializer messageSer =
                                                        new CommitMessageSerializer();
                                                while (taskIterator.hasNext()) {
                                                    UnawareAppendCompactionTask task =
                                                            ser.deserialize(
                                                                    ser.getVersion(),
                                                                    taskIterator.next());
                                                    messages.add(
                                                            messageSer.serialize(
                                                                    task.doCompact(table, write)));
                                                }
                                                return messages.iterator();
                                            } finally {
                                                write.close();
                                            }
                                        });

        try (TableCommitImpl commit = table.newCommit(commitUser)) {
            CommitMessageSerializer messageSerializerser = new CommitMessageSerializer();
            List<byte[]> serializedMessages = commitMessageJavaRDD.collect();
            List<CommitMessage> messages = new ArrayList<>(serializedMessages.size());
            for (byte[] serializedMessage : serializedMessages) {
                messages.add(
                        messageSerializerser.deserialize(
                                messageSerializerser.getVersion(), serializedMessage));
            }
            commit.commit(messages);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    private Set<BinaryRow> getHistoryPartition(
            SnapshotReader snapshotReader, @Nullable Duration partitionIdleTime) {
        Set<Pair<BinaryRow, Long>> partitionInfo =
                snapshotReader.partitionEntries().stream()
                        .map(
                                partitionEntry ->
                                        Pair.of(
                                                partitionEntry.partition(),
                                                partitionEntry.lastFileCreationTime()))
                        .collect(Collectors.toSet());
        if (partitionIdleTime != null) {
            long historyMilli =
                    LocalDateTime.now()
                            .minus(partitionIdleTime)
                            .atZone(ZoneId.systemDefault())
                            .toInstant()
                            .toEpochMilli();
            partitionInfo =
                    partitionInfo.stream()
                            .filter(partition -> partition.getValue() <= historyMilli)
                            .collect(Collectors.toSet());
        }
        return partitionInfo.stream().map(Pair::getKey).collect(Collectors.toSet());
    }

    private void sortCompactUnAwareBucketTable(
            FileStoreTable table,
            TableSorter.OrderType orderType,
            List<String> sortColumns,
            DataSourceV2Relation relation,
            @Nullable Predicate filter) {
        SnapshotReader snapshotReader = table.newSnapshotReader();
        if (filter != null) {
            snapshotReader.withFilter(filter);
        }
        Map<BinaryRow, DataSplit[]> packedSplits = packForSort(snapshotReader.read().dataSplits());
        TableSorter sorter = TableSorter.getSorter(table, orderType, sortColumns);
        Dataset<Row> datasetForWrite =
                packedSplits.values().stream()
                        .map(
                                split -> {
                                    Dataset<Row> dataset =
                                            PaimonUtils.createDataset(
                                                    spark(),
                                                    Compatibility.createDataSourceV2ScanRelation(
                                                            relation,
                                                            PaimonSplitScan.apply(table, split),
                                                            relation.output()));
                                    return sorter.sort(dataset);
                                })
                        .reduce(Dataset::union)
                        .orElse(null);
        if (datasetForWrite != null) {
            PaimonSparkWriter writer = new PaimonSparkWriter(table);
            // Use dynamic partition overwrite
            writer.writeBuilder().withOverwrite();
            writer.commit(writer.write(datasetForWrite));
        }
    }

    private Map<BinaryRow, DataSplit[]> packForSort(List<DataSplit> dataSplits) {
        // Make a single partition as a compact group
        return dataSplits.stream()
                .collect(
                        Collectors.groupingBy(
                                DataSplit::partition,
                                Collectors.collectingAndThen(
                                        Collectors.toList(),
                                        list -> list.toArray(new DataSplit[0]))));
    }

    private int readParallelism(List<?> groupedTasks, SparkSession spark) {
        int sparkParallelism =
                Math.max(
                        spark.sparkContext().defaultParallelism(),
                        spark.sessionState().conf().numShufflePartitions());
        int readParallelism = Math.min(groupedTasks.size(), sparkParallelism);
        if (sparkParallelism > readParallelism) {
            LOG.warn(
                    String.format(
                            "Spark default parallelism (%s) is greater than bucket or task parallelism (%s),"
                                    + "we use %s as the final read parallelism",
                            sparkParallelism, readParallelism, readParallelism));
        }
        return readParallelism;
    }

    @VisibleForTesting
    static String toWhere(String partitions) {
        List<Map<String, String>> maps = ParameterUtils.getPartitions(partitions.split(";"));

        return maps.stream()
                .map(
                        a ->
                                a.entrySet().stream()
                                        .map(entry -> entry.getKey() + "=" + entry.getValue())
                                        .reduce((s0, s1) -> s0 + " AND " + s1))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(a -> "(" + a + ")")
                .reduce((a, b) -> a + " OR " + b)
                .orElse(null);
    }

    public static ProcedureBuilder builder() {
        return new BaseProcedure.Builder<CompactProcedure>() {
            @Override
            public CompactProcedure doBuild() {
                return new CompactProcedure(tableCatalog());
            }
        };
    }
}
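Below is a minimal, illustrative sketch (not part of the source above) of how this procedure can be invoked through Spark SQL, mirroring the usage string in the class Javadoc. It assumes a SparkSession whose current catalog is a Paimon catalog and a partitioned table named default.T; the table name, partition spec, sort columns, and the 'zorder' order strategy are placeholders, not values taken from the source.

import org.apache.spark.sql.SparkSession;

public class CompactProcedureExample {

    public static void main(String[] args) {
        // Assumes the Spark catalog configuration already points the current catalog at Paimon.
        SparkSession spark = SparkSession.builder().appName("compact-example").getOrCreate();

        // Full compaction of two partitions; note that 'partitions' and 'where' cannot be
        // used together, per the checkArgument in CompactProcedure#call.
        spark.sql(
                "CALL sys.compact("
                        + "table => 'default.T', "
                        + "partitions => 'p1=0,p2=0;p1=0,p2=1')");

        // Sort compaction on an unaware-bucket append-only table; 'zorder' stands in for one of
        // the strategies handled by TableSorter, and order_by must list non-partition columns.
        spark.sql(
                "CALL sys.compact("
                        + "table => 'default.T', "
                        + "order_strategy => 'zorder', "
                        + "order_by => 'a,b')");

        spark.stop();
    }
}

Each call returns a single boolean result column, matching OUTPUT_TYPE in the class above.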



