// com.netease.arctic.op.OverwriteBaseFiles Maven / Gradle / Ivy
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netease.arctic.op;
import com.netease.arctic.scan.CombinedScanTask;
import com.netease.arctic.table.KeyedTable;
import com.netease.arctic.table.UnkeyedTable;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import com.netease.arctic.shade.org.apache.iceberg.DataFile;
import com.netease.arctic.shade.org.apache.iceberg.DeleteFile;
import com.netease.arctic.shade.org.apache.iceberg.OverwriteFiles;
import com.netease.arctic.shade.org.apache.iceberg.RewriteFiles;
import com.netease.arctic.shade.org.apache.iceberg.RowDelta;
import com.netease.arctic.shade.org.apache.iceberg.StructLike;
import com.netease.arctic.shade.org.apache.iceberg.Transaction;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Expression;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Expressions;
import com.netease.arctic.shade.org.apache.iceberg.io.CloseableIterable;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Lists;
import com.netease.arctic.shade.org.apache.iceberg.util.StructLikeMap;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
/**
 * Overwrite {@link com.netease.arctic.table.BaseTable} and change max transaction id map.
 *
 * <p>Callers collect data files and delete files to add or remove (either explicitly or through a row
 * filter) and choose exactly one of two ways to update the optimized sequence:
 * <ul>
 *   <li>{@link #updateOptimizedSequence(StructLike, long)} - an explicit sequence per partition;</li>
 *   <li>{@link #updateOptimizedSequenceDynamically(long)} - one sequence applied to every partition
 *       touched by the added/removed files.</li>
 * </ul>
 * The two modes are mutually exclusive; mixing them fails with an {@link IllegalArgumentException}.
 */
public class OverwriteBaseFiles extends PartitionTransactionOperation {

  public static final String PROPERTIES_TRANSACTION_ID = "txId";

  // Data files to remove from / add to the base store.
  private final List<DataFile> deleteFiles;
  private final List<DataFile> addFiles;
  // Delete files to remove from / add to the base store.
  private final List<DeleteFile> deleteDeleteFiles;
  private final List<DeleteFile> addDeleteFiles;

  // Starts as alwaysFalse and is OR-ed with every filter passed to overwriteByRowFilter().
  private Expression deleteExpression = Expressions.alwaysFalse();
  // Set once the row filter has been resolved into concrete files, so the scan runs at most once.
  private boolean deleteExpressionApplied = false;

  // Explicit partition -> optimized sequence entries (static mode).
  private final StructLikeMap<Long> partitionOptimizedSequence;
  // Single optimized sequence applied to all changed partitions (dynamic mode).
  private Long optimizedSequence;
  // dynamic indicate that the optimized sequence should be applied to the changed partitions;
  // null until one of the updateOptimizedSequence*() methods has been called.
  private Boolean dynamic;
  private Expression conflictDetectionFilter = null;

  /**
   * Creates an operation with no pending file changes.
   *
   * @param table keyed table whose base store is overwritten
   */
  public OverwriteBaseFiles(KeyedTable table) {
    super(table);
    this.deleteFiles = Lists.newArrayList();
    this.addFiles = Lists.newArrayList();
    this.deleteDeleteFiles = Lists.newArrayList();
    this.addDeleteFiles = Lists.newArrayList();
    this.partitionOptimizedSequence = StructLikeMap.create(table.spec().partitionType());
  }

  /**
   * Adds a row filter; files planned by a scan with the combined filter are deleted on commit.
   * Multiple filters are combined with OR. A {@code null} filter is ignored.
   *
   * @param expr - row filter, may be null
   * @return this for chain
   */
  public OverwriteBaseFiles overwriteByRowFilter(Expression expr) {
    if (expr != null) {
      deleteExpression = Expressions.or(deleteExpression, expr);
    }
    return this;
  }

  /**
   * Registers a data file to add.
   *
   * @param file - data file to add
   * @return this for chain
   */
  public OverwriteBaseFiles addFile(DataFile file) {
    addFiles.add(file);
    return this;
  }

  /**
   * Registers a delete file to add.
   *
   * @param file - delete file to add
   * @return this for chain
   */
  public OverwriteBaseFiles addFile(DeleteFile file) {
    addDeleteFiles.add(file);
    return this;
  }

  /**
   * Registers a data file to remove.
   *
   * @param file - data file to remove
   * @return this for chain
   */
  public OverwriteBaseFiles deleteFile(DataFile file) {
    deleteFiles.add(file);
    return this;
  }

  /**
   * Registers a delete file to remove.
   *
   * @param file - delete file to remove
   * @return this for chain
   */
  public OverwriteBaseFiles deleteFile(DeleteFile file) {
    deleteDeleteFiles.add(file);
    return this;
  }

  /**
   * Update optimized sequence for partition.
   * The files of ChangeStore whose sequence is bigger than optimized sequence should migrate to BaseStore later.
   *
   * @param partitionData - partition
   * @param sequence      - optimized sequence
   * @return this for chain
   * @throws IllegalArgumentException if {@link #updateOptimizedSequenceDynamically(long)} was already used
   */
  public OverwriteBaseFiles updateOptimizedSequence(StructLike partitionData, long sequence) {
    Preconditions.checkArgument(this.dynamic == null || !this.dynamic,
        "updateOptimizedSequenceDynamically() and updateOptimizedSequence() can't be used simultaneously");
    this.partitionOptimizedSequence.put(partitionData, sequence);
    this.dynamic = false;
    return this;
  }

  /**
   * Update optimized sequence for changed partitions.
   * The files of ChangeStore whose sequence is bigger than optimized sequence should migrate to BaseStore later.
   *
   * @param sequence - optimized sequence
   * @return this for chain
   * @throws IllegalArgumentException if {@link #updateOptimizedSequence(StructLike, long)} was already used
   */
  public OverwriteBaseFiles updateOptimizedSequenceDynamically(long sequence) {
    Preconditions.checkArgument(this.dynamic == null || this.dynamic,
        "updateOptimizedSequenceDynamically() and updateOptimizedSequence() can't be used simultaneously");
    this.optimizedSequence = sequence;
    this.dynamic = true;
    return this;
  }

  /**
   * Enables conflict validation for the data-file overwrite step using the given filter.
   *
   * @param newConflictDetectionFilter - filter handed to the overwrite's conflict validation, not null
   * @return this for chain
   * @throws IllegalArgumentException if the filter is null
   */
  public OverwriteBaseFiles validateNoConflictingAppends(Expression newConflictDetectionFilter) {
    Preconditions.checkArgument(newConflictDetectionFilter != null, "Conflict detection filter cannot be null");
    this.conflictDetectionFilter = newConflictDetectionFilter;
    return this;
  }

  /**
   * Resolves the row filter into files first, then reports whether there is nothing to commit:
   * no files to add or remove and no explicit per-partition sequence entries.
   */
  @Override
  protected boolean isEmptyCommit() {
    applyDeleteExpression();
    return deleteFiles.isEmpty() && addFiles.isEmpty() && deleteDeleteFiles.isEmpty() && addDeleteFiles.isEmpty() &&
        partitionOptimizedSequence.isEmpty();
  }

  /**
   * Applies the pending changes inside the given transaction.
   *
   * <p>Step 1 overwrites data files, step 2 adds delete files through a row delta (or rewrites them when
   * delete files are also removed), step 3 merges the optimized sequences into the supplied map.
   *
   * @param transaction                - transaction the sub-operations are committed into
   * @param partitionOptimizedSequence - partition -> optimized sequence map to update; note that it shadows
   *                                   the field of the same name, which is always accessed via {@code this.}
   * @return the supplied map after the new sequences have been put into it
   * @throws IllegalStateException if neither updateOptimizedSequence() nor
   *                               updateOptimizedSequenceDynamically() has been invoked
   */
  @Override
  protected StructLikeMap<Long> apply(Transaction transaction, StructLikeMap<Long> partitionOptimizedSequence) {
    Preconditions.checkState(this.dynamic != null,
        "updateOptimizedSequence() or updateOptimizedSequenceDynamically() must be invoked");
    applyDeleteExpression();

    // Only populated in dynamic mode: collects every partition touched by an added/removed file.
    StructLikeMap<Long> sequenceForChangedPartitions = null;
    if (this.dynamic) {
      sequenceForChangedPartitions = StructLikeMap.create(transaction.table().spec().partitionType());
    }
    // keyedTable is inherited from the superclass (declared outside this file).
    UnkeyedTable baseTable = keyedTable.baseTable();

    // step1: overwrite data files
    if (!this.addFiles.isEmpty() || !this.deleteFiles.isEmpty()) {
      OverwriteFiles overwriteFiles = transaction.newOverwrite();

      if (conflictDetectionFilter != null && baseTable.currentSnapshot() != null) {
        overwriteFiles.validateNoConflictingAppends(conflictDetectionFilter);
        overwriteFiles.validateFromSnapshot(baseTable.currentSnapshot().snapshotId());
      }
      if (this.dynamic) {
        for (DataFile d : this.addFiles) {
          sequenceForChangedPartitions.put(d.partition(), this.optimizedSequence);
        }
        for (DataFile d : this.deleteFiles) {
          sequenceForChangedPartitions.put(d.partition(), this.optimizedSequence);
        }
      }
      this.addFiles.forEach(overwriteFiles::addFile);
      this.deleteFiles.forEach(overwriteFiles::deleteFile);
      // optimizedSequence is only set in dynamic mode; record it in the snapshot summary when positive.
      if (optimizedSequence != null && optimizedSequence > 0) {
        overwriteFiles.set(PROPERTIES_TRANSACTION_ID, String.valueOf(optimizedSequence));
      }

      // properties is inherited from the superclass (declared outside this file).
      if (MapUtils.isNotEmpty(properties)) {
        properties.forEach(overwriteFiles::set);
      }
      overwriteFiles.commit();
    }

    // step2: RowDelta/Rewrite pos-delete files
    if (CollectionUtils.isNotEmpty(addDeleteFiles) || CollectionUtils.isNotEmpty(deleteDeleteFiles)) {
      if (CollectionUtils.isEmpty(deleteDeleteFiles)) {
        // Only new delete files: a row delta is sufficient.
        RowDelta rowDelta = transaction.newRowDelta();
        if (baseTable.currentSnapshot() != null) {
          rowDelta.validateFromSnapshot(baseTable.currentSnapshot().snapshotId());
        }

        if (this.dynamic) {
          for (DeleteFile d : this.addDeleteFiles) {
            sequenceForChangedPartitions.put(d.partition(), this.optimizedSequence);
          }
        }
        addDeleteFiles.forEach(rowDelta::addDeletes);
        if (MapUtils.isNotEmpty(properties)) {
          properties.forEach(rowDelta::set);
        }
        rowDelta.commit();
      } else {
        // Existing delete files are removed as well: use a rewrite that touches delete files only.
        RewriteFiles rewriteFiles = transaction.newRewrite();
        if (baseTable.currentSnapshot() != null) {
          rewriteFiles.validateFromSnapshot(baseTable.currentSnapshot().snapshotId());
        }

        if (this.dynamic) {
          for (DeleteFile d : this.addDeleteFiles) {
            sequenceForChangedPartitions.put(d.partition(), this.optimizedSequence);
          }
          for (DeleteFile d : this.deleteDeleteFiles) {
            sequenceForChangedPartitions.put(d.partition(), this.optimizedSequence);
          }
        }
        rewriteFiles.rewriteFiles(Collections.emptySet(), new HashSet<>(deleteDeleteFiles),
            Collections.emptySet(), new HashSet<>(addDeleteFiles));
        if (MapUtils.isNotEmpty(properties)) {
          properties.forEach(rewriteFiles::set);
        }
        rewriteFiles.commit();
      }
    }

    // step3: set optimized sequence id
    if (this.dynamic) {
      partitionOptimizedSequence.putAll(sequenceForChangedPartitions);
    } else {
      partitionOptimizedSequence.putAll(this.partitionOptimizedSequence);
    }

    return partitionOptimizedSequence;
  }

  /**
   * Resolves the accumulated row filter into concrete files by planning a scan of the keyed table and
   * registering every data task file and equality-delete task file for deletion.
   *
   * <p>The scan runs at most once per operation; the applied flag is only set when planning succeeds.
   *
   * @throws IllegalStateException if closing the planned tasks fails with an {@link IOException}
   */
  private void applyDeleteExpression() {
    if (this.deleteExpressionApplied) {
      return;
    }
    // Defensive only: the field starts as alwaysFalse and overwriteByRowFilter() ignores null input.
    // NOTE(review): with no row filter the scan below still runs with an alwaysFalse filter - confirm
    // whether it can be skipped safely.
    if (this.deleteExpression == null) {
      return;
    }
    try (CloseableIterable<CombinedScanTask> combinedScanTasks
             = keyedTable.newScan().filter(deleteExpression).planTasks()) {
      combinedScanTasks.forEach(combinedTask -> combinedTask.tasks().forEach(
          t -> {
            t.dataTasks().forEach(ft -> deleteFiles.add(ft.file()));
            t.arcticEquityDeletes().forEach(ft -> deleteFiles.add(ft.file()));
          }
      ));
      this.deleteExpressionApplied = true;
    } catch (IOException e) {
      throw new IllegalStateException("failed when apply delete expression when overwrite files", e);
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy