/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.paimon.spark.procedure;
import org.apache.paimon.CoreOptions;
import org.apache.paimon.annotation.VisibleForTesting;
import org.apache.paimon.append.UnawareAppendCompactionTask;
import org.apache.paimon.append.UnawareAppendTableCompactionCoordinator;
import org.apache.paimon.data.BinaryRow;
import org.apache.paimon.disk.IOManager;
import org.apache.paimon.manifest.PartitionEntry;
import org.apache.paimon.operation.AppendOnlyFileStoreWrite;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.spark.PaimonSplitScan;
import org.apache.paimon.spark.SparkUtils;
import org.apache.paimon.spark.catalyst.Compatibility;
import org.apache.paimon.spark.catalyst.analysis.expressions.ExpressionUtils;
import org.apache.paimon.spark.commands.PaimonSparkWriter;
import org.apache.paimon.spark.sort.TableSorter;
import org.apache.paimon.table.BucketMode;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.sink.BatchTableCommit;
import org.apache.paimon.table.sink.BatchTableWrite;
import org.apache.paimon.table.sink.BatchWriteBuilder;
import org.apache.paimon.table.sink.CommitMessage;
import org.apache.paimon.table.sink.CommitMessageSerializer;
import org.apache.paimon.table.sink.CompactionTaskSerializer;
import org.apache.paimon.table.sink.TableCommitImpl;
import org.apache.paimon.table.source.DataSplit;
import org.apache.paimon.table.source.EndOfScanException;
import org.apache.paimon.table.source.snapshot.SnapshotReader;
import org.apache.paimon.utils.Pair;
import org.apache.paimon.utils.ParameterUtils;
import org.apache.paimon.utils.SerializationUtils;
import org.apache.paimon.utils.StringUtils;
import org.apache.paimon.utils.TimeUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.PaimonUtils;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import static org.apache.paimon.CoreOptions.createCommitUser;
import static org.apache.paimon.utils.Preconditions.checkArgument;
import static org.apache.spark.sql.types.DataTypes.StringType;
/**
* Compact procedure. Usage:
*
 * <pre><code>
* CALL sys.compact(table => 'tableId', [partitions => 'p1=0,p2=0;p1=0,p2=1'], [order_strategy => 'xxx'], [order_by => 'xxx'], [where => 'p1>0'])
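 *
 *  -- e.g. full compaction of the whole table (illustrative table name 'T')
 *  CALL sys.compact(table => 'T')
 *
 *  -- e.g. sort compaction of an unaware-bucket append-only table (illustrative column names)
 *  CALL sys.compact(table => 'T', order_strategy => 'zorder', order_by => 'col1,col2')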
 * </code></pre>
*/
public class CompactProcedure extends BaseProcedure {
private static final Logger LOG = LoggerFactory.getLogger(CompactProcedure.class);
private static final ProcedureParameter[] PARAMETERS =
new ProcedureParameter[] {
ProcedureParameter.required("table", StringType),
ProcedureParameter.optional("partitions", StringType),
ProcedureParameter.optional("compact_strategy", StringType),
ProcedureParameter.optional("order_strategy", StringType),
ProcedureParameter.optional("order_by", StringType),
ProcedureParameter.optional("where", StringType),
ProcedureParameter.optional("options", StringType),
ProcedureParameter.optional("partition_idle_time", StringType),
};
private static final StructType OUTPUT_TYPE =
new StructType(
new StructField[] {
new StructField("result", DataTypes.BooleanType, true, Metadata.empty())
});
private static final String MINOR = "minor";
private static final String FULL = "full";
protected CompactProcedure(TableCatalog tableCatalog) {
super(tableCatalog);
}
@Override
public ProcedureParameter[] parameters() {
return PARAMETERS;
}
@Override
public StructType outputType() {
return OUTPUT_TYPE;
}
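    /**
     * Parses and validates the procedure arguments, resolves the optional partitions/where filter
     * against the table's partition keys, applies dynamic options (with 'write-only' forced to
     * false), and delegates the actual work to {@link #execute}.
     */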
@Override
public InternalRow[] call(InternalRow args) {
Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
String partitions = blank(args, 1) ? null : args.getString(1);
// make full compact strategy as default.
String compactStrategy = blank(args, 2) ? FULL : args.getString(2);
String sortType = blank(args, 3) ? TableSorter.OrderType.NONE.name() : args.getString(3);
        List<String> sortColumns =
blank(args, 4)
? Collections.emptyList()
: Arrays.asList(args.getString(4).split(","));
String where = blank(args, 5) ? null : args.getString(5);
String options = args.isNullAt(6) ? null : args.getString(6);
Duration partitionIdleTime =
blank(args, 7) ? null : TimeUtils.parseDuration(args.getString(7));
if (TableSorter.OrderType.NONE.name().equals(sortType) && !sortColumns.isEmpty()) {
throw new IllegalArgumentException(
"order_strategy \"none\" cannot work with order_by columns.");
}
if (partitionIdleTime != null && (!TableSorter.OrderType.NONE.name().equals(sortType))) {
throw new IllegalArgumentException(
"sort compact do not support 'partition_idle_time'.");
}
if (!(compactStrategy.equalsIgnoreCase(FULL) || compactStrategy.equalsIgnoreCase(MINOR))) {
throw new IllegalArgumentException(
String.format(
"The compact strategy only supports 'full' or 'minor', but '%s' is configured.",
compactStrategy));
}
checkArgument(
partitions == null || where == null,
"partitions and where cannot be used together.");
String finalWhere = partitions != null ? toWhere(partitions) : where;
return modifyPaimonTable(
tableIdent,
table -> {
checkArgument(table instanceof FileStoreTable);
checkArgument(
sortColumns.stream().noneMatch(table.partitionKeys()::contains),
"order_by should not contain partition cols, because it is meaningless, your order_by cols are %s, and partition cols are %s",
sortColumns,
table.partitionKeys());
DataSourceV2Relation relation = createRelation(tableIdent);
Expression condition = null;
if (!StringUtils.isNullOrWhitespaceOnly(finalWhere)) {
condition = ExpressionUtils.resolveFilter(spark(), relation, finalWhere);
checkArgument(
ExpressionUtils.isValidPredicate(
spark(),
condition,
table.partitionKeys().toArray(new String[0])),
"Only partition predicate is supported, your predicate is %s, but partition keys are %s",
condition,
table.partitionKeys());
}
                    Map<String, String> dynamicOptions = new HashMap<>();
dynamicOptions.put(CoreOptions.WRITE_ONLY.key(), "false");
if (!StringUtils.isNullOrWhitespaceOnly(options)) {
dynamicOptions.putAll(ParameterUtils.parseCommaSeparatedKeyValues(options));
}
table = table.copy(dynamicOptions);
InternalRow internalRow =
newInternalRow(
execute(
(FileStoreTable) table,
compactStrategy,
sortType,
sortColumns,
relation,
condition,
partitionIdleTime));
return new InternalRow[] {internalRow};
});
}
@Override
public String description() {
return "This procedure execute compact action on paimon table.";
}
private boolean blank(InternalRow args, int index) {
return args.isNullAt(index) || StringUtils.isNullOrWhitespaceOnly(args.getString(index));
}
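    /**
     * Dispatches the compaction according to bucket mode and order strategy: with order_strategy
     * 'none', fixed/dynamic-bucket tables run a distributed bucket compaction and unaware-bucket
     * tables run coordinator-planned compaction tasks; any other order strategy runs a sort
     * compaction, which is only supported for unaware-bucket tables.
     */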
private boolean execute(
FileStoreTable table,
String compactStrategy,
String sortType,
            List<String> sortColumns,
DataSourceV2Relation relation,
@Nullable Expression condition,
@Nullable Duration partitionIdleTime) {
BucketMode bucketMode = table.bucketMode();
TableSorter.OrderType orderType = TableSorter.OrderType.of(sortType);
boolean fullCompact = compactStrategy.equalsIgnoreCase(FULL);
Predicate filter =
condition == null
? null
: ExpressionUtils.convertConditionToPaimonPredicate(
condition,
((LogicalPlan) relation).output(),
table.rowType(),
false)
.getOrElse(null);
if (orderType.equals(TableSorter.OrderType.NONE)) {
JavaSparkContext javaSparkContext = new JavaSparkContext(spark().sparkContext());
switch (bucketMode) {
case HASH_FIXED:
case HASH_DYNAMIC:
compactAwareBucketTable(
table, fullCompact, filter, partitionIdleTime, javaSparkContext);
break;
case BUCKET_UNAWARE:
compactUnAwareBucketTable(table, filter, partitionIdleTime, javaSparkContext);
break;
default:
throw new UnsupportedOperationException(
"Spark compact with " + bucketMode + " is not support yet.");
}
} else {
switch (bucketMode) {
case BUCKET_UNAWARE:
sortCompactUnAwareBucketTable(table, orderType, sortColumns, relation, filter);
break;
default:
throw new UnsupportedOperationException(
"Spark compact with sort_type "
+ sortType
+ " only support unaware-bucket append-only table yet.");
}
}
return true;
}
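    /**
     * Compacts a fixed/dynamic-bucket table: collects the distinct (partition, bucket) pairs to
     * compact (optionally only for partitions idle longer than partition_idle_time), runs
     * {@code write.compact} for each pair on the executors, and commits the collected
     * {@link CommitMessage}s on the driver. Messages travel between executors and driver in
     * serialized form.
     */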
private void compactAwareBucketTable(
FileStoreTable table,
boolean fullCompact,
@Nullable Predicate filter,
@Nullable Duration partitionIdleTime,
JavaSparkContext javaSparkContext) {
SnapshotReader snapshotReader = table.newSnapshotReader();
if (filter != null) {
snapshotReader.withFilter(filter);
}
        Set<BinaryRow> partitionToBeCompacted =
                getHistoryPartition(snapshotReader, partitionIdleTime);
        List<Pair<byte[], Integer>> partitionBuckets =
snapshotReader.bucketEntries().stream()
.map(entry -> Pair.of(entry.partition(), entry.bucket()))
.distinct()
.filter(pair -> partitionToBeCompacted.contains(pair.getKey()))
.map(
p ->
Pair.of(
SerializationUtils.serializeBinaryRow(p.getLeft()),
p.getRight()))
.collect(Collectors.toList());
if (partitionBuckets.isEmpty()) {
LOG.info("Partition bucket is empty, no compact job to execute.");
return;
}
int readParallelism = readParallelism(partitionBuckets, spark());
BatchWriteBuilder writeBuilder = table.newBatchWriteBuilder();
        JavaRDD<byte[]> commitMessageJavaRDD =
javaSparkContext
.parallelize(partitionBuckets, readParallelism)
.mapPartitions(
                                (FlatMapFunction<Iterator<Pair<byte[], Integer>>, byte[]>)
pairIterator -> {
IOManager ioManager = SparkUtils.createIOManager();
BatchTableWrite write = writeBuilder.newWrite();
write.withIOManager(ioManager);
try {
while (pairIterator.hasNext()) {
                                                    Pair<byte[], Integer> pair =
pairIterator.next();
write.compact(
SerializationUtils.deserializeBinaryRow(
pair.getLeft()),
pair.getRight(),
fullCompact);
}
CommitMessageSerializer serializer =
new CommitMessageSerializer();
                                                List<CommitMessage> messages =
write.prepareCommit();
                                                List<byte[]> serializedMessages =
new ArrayList<>(messages.size());
for (CommitMessage commitMessage : messages) {
serializedMessages.add(
serializer.serialize(commitMessage));
}
return serializedMessages.iterator();
} finally {
write.close();
ioManager.close();
}
});
try (BatchTableCommit commit = writeBuilder.newCommit()) {
CommitMessageSerializer serializer = new CommitMessageSerializer();
            List<byte[]> serializedMessages = commitMessageJavaRDD.collect();
            List<CommitMessage> messages = new ArrayList<>(serializedMessages.size());
for (byte[] serializedMessage : serializedMessages) {
messages.add(serializer.deserialize(serializer.getVersion(), serializedMessage));
}
commit.commit(messages);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
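    /**
     * Compacts an unaware-bucket append-only table: plans compaction tasks with
     * {@link UnawareAppendTableCompactionCoordinator}, optionally filters out tasks whose
     * partition has been written to within partition_idle_time, executes the tasks on the
     * executors, and commits the resulting messages on the driver.
     */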
private void compactUnAwareBucketTable(
FileStoreTable table,
@Nullable Predicate filter,
@Nullable Duration partitionIdleTime,
JavaSparkContext javaSparkContext) {
        List<UnawareAppendCompactionTask> compactionTasks;
try {
compactionTasks =
new UnawareAppendTableCompactionCoordinator(table, false, filter).run();
} catch (EndOfScanException e) {
compactionTasks = new ArrayList<>();
}
if (partitionIdleTime != null) {
            Map<BinaryRow, Long> partitionInfo =
table.newSnapshotReader().partitionEntries().stream()
.collect(
Collectors.toMap(
PartitionEntry::partition,
PartitionEntry::lastFileCreationTime));
long historyMilli =
LocalDateTime.now()
.minus(partitionIdleTime)
.atZone(ZoneId.systemDefault())
.toInstant()
.toEpochMilli();
compactionTasks =
compactionTasks.stream()
.filter(task -> partitionInfo.get(task.partition()) <= historyMilli)
.collect(Collectors.toList());
}
if (compactionTasks.isEmpty()) {
LOG.info("Task plan is empty, no compact job to execute.");
return;
}
CompactionTaskSerializer serializer = new CompactionTaskSerializer();
        List<byte[]> serializedTasks = new ArrayList<>();
try {
for (UnawareAppendCompactionTask compactionTask : compactionTasks) {
serializedTasks.add(serializer.serialize(compactionTask));
}
        } catch (IOException e) {
            throw new RuntimeException("Failed to serialize compaction task.", e);
}
int readParallelism = readParallelism(serializedTasks, spark());
String commitUser = createCommitUser(table.coreOptions().toConfiguration());
        JavaRDD<byte[]> commitMessageJavaRDD =
javaSparkContext
.parallelize(serializedTasks, readParallelism)
.mapPartitions(
                                (FlatMapFunction<Iterator<byte[]>, byte[]>)
taskIterator -> {
AppendOnlyFileStoreWrite write =
(AppendOnlyFileStoreWrite)
table.store().newWrite(commitUser);
CompactionTaskSerializer ser =
new CompactionTaskSerializer();
                                            List<byte[]> messages = new ArrayList<>();
try {
CommitMessageSerializer messageSer =
new CommitMessageSerializer();
while (taskIterator.hasNext()) {
UnawareAppendCompactionTask task =
ser.deserialize(
ser.getVersion(),
taskIterator.next());
messages.add(
messageSer.serialize(
task.doCompact(table, write)));
}
return messages.iterator();
} finally {
write.close();
}
});
try (TableCommitImpl commit = table.newCommit(commitUser)) {
            CommitMessageSerializer messageSerializer = new CommitMessageSerializer();
            List<byte[]> serializedMessages = commitMessageJavaRDD.collect();
            List<CommitMessage> messages = new ArrayList<>(serializedMessages.size());
            for (byte[] serializedMessage : serializedMessages) {
                messages.add(
                        messageSerializer.deserialize(
                                messageSerializer.getVersion(), serializedMessage));
}
commit.commit(messages);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
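    /**
     * Returns the partitions eligible for compaction: all partitions when
     * {@code partitionIdleTime} is null, otherwise only those whose last file creation time is
     * older than now minus the idle time.
     */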
    private Set<BinaryRow> getHistoryPartition(
            SnapshotReader snapshotReader, @Nullable Duration partitionIdleTime) {
        Set<Pair<BinaryRow, Long>> partitionInfo =
snapshotReader.partitionEntries().stream()
.map(
partitionEntry ->
Pair.of(
partitionEntry.partition(),
partitionEntry.lastFileCreationTime()))
.collect(Collectors.toSet());
if (partitionIdleTime != null) {
long historyMilli =
LocalDateTime.now()
.minus(partitionIdleTime)
.atZone(ZoneId.systemDefault())
.toInstant()
.toEpochMilli();
partitionInfo =
partitionInfo.stream()
.filter(partition -> partition.getValue() <= historyMilli)
.collect(Collectors.toSet());
}
return partitionInfo.stream().map(Pair::getKey).collect(Collectors.toSet());
}
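    /**
     * Sort compaction for an unaware-bucket table: reads each partition's splits as a dataset,
     * sorts it with the requested {@link TableSorter} strategy, unions the per-partition results,
     * and rewrites them using dynamic partition overwrite.
     */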
private void sortCompactUnAwareBucketTable(
FileStoreTable table,
TableSorter.OrderType orderType,
            List<String> sortColumns,
DataSourceV2Relation relation,
@Nullable Predicate filter) {
SnapshotReader snapshotReader = table.newSnapshotReader();
if (filter != null) {
snapshotReader.withFilter(filter);
}
        Map<BinaryRow, DataSplit[]> packedSplits = packForSort(snapshotReader.read().dataSplits());
TableSorter sorter = TableSorter.getSorter(table, orderType, sortColumns);
        Dataset<Row> datasetForWrite =
packedSplits.values().stream()
.map(
split -> {
                                    Dataset<Row> dataset =
PaimonUtils.createDataset(
spark(),
Compatibility.createDataSourceV2ScanRelation(
relation,
PaimonSplitScan.apply(table, split),
relation.output()));
return sorter.sort(dataset);
})
.reduce(Dataset::union)
.orElse(null);
if (datasetForWrite != null) {
PaimonSparkWriter writer = new PaimonSparkWriter(table);
// Use dynamic partition overwrite
writer.writeBuilder().withOverwrite();
writer.commit(writer.write(datasetForWrite));
}
}
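    /** Groups the data splits by partition so that each partition forms a single sort group. */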
    private Map<BinaryRow, DataSplit[]> packForSort(List<DataSplit> dataSplits) {
// Make a single partition as a compact group
return dataSplits.stream()
.collect(
Collectors.groupingBy(
DataSplit::partition,
Collectors.collectingAndThen(
Collectors.toList(),
list -> list.toArray(new DataSplit[0]))));
}
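    /**
     * Caps the read parallelism at the number of grouped tasks: the result is the smaller of the
     * task count and Spark's parallelism (the larger of the default parallelism and the shuffle
     * partition count), with a warning when Spark's parallelism exceeds it.
     */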
    private int readParallelism(List<?> groupedTasks, SparkSession spark) {
int sparkParallelism =
Math.max(
spark.sparkContext().defaultParallelism(),
spark.sessionState().conf().numShufflePartitions());
int readParallelism = Math.min(groupedTasks.size(), sparkParallelism);
if (sparkParallelism > readParallelism) {
LOG.warn(
String.format(
"Spark default parallelism (%s) is greater than bucket or task parallelism (%s),"
+ "we use %s as the final read parallelism",
sparkParallelism, readParallelism, readParallelism));
}
return readParallelism;
}
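    /**
     * Converts a partition spec such as {@code p1=0,p2=0;p1=0,p2=1} into an equivalent filter
     * string, e.g. {@code (p1=0 AND p2=0) OR (p1=0 AND p2=1)}.
     */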
@VisibleForTesting
static String toWhere(String partitions) {
        // Sketch of the intended behavior (the original listing is truncated here), assuming
        // ParameterUtils.getPartitions parses the ';'-separated spec into key/value maps:
        // build "(p1=0 AND p2=0) OR (p1=0 AND p2=1)" from the partition spec.
        List<Map<String, String>> maps = ParameterUtils.getPartitions(partitions.split(";"));
        return maps.stream()
                .map(
                        partition ->
                                partition.entrySet().stream()
                                        .map(entry -> entry.getKey() + "=" + entry.getValue())
                                        .reduce((s0, s1) -> s0 + " AND " + s1))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(group -> "(" + group + ")")
                .reduce((left, right) -> left + " OR " + right)
                .orElse(null);
    }
}