/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metadata;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.exception.HoodieMetadataException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME;
import static org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME;
/**
* A utility to convert timeline information to metadata table records.
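*
* <p>A typical flow (illustrative sketch; the instant time below is hypothetical):
* <pre>{@code
* HoodieCommitMetadata commitMetadata = ...; // metadata of a completed commit on the dataset
* List<HoodieRecord> records =
*     HoodieTableMetadataUtil.convertMetadataToRecords(commitMetadata, "20210101120000");
* // records can then be committed to the metadata table as a delta commit
* }</pre>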
*/
public class HoodieTableMetadataUtil {
private static final Logger LOG = LogManager.getLogger(HoodieTableMetadataUtil.class);
/**
* Deletes the metadata table for the dataset. This will be invoked during an upgrade/downgrade operation,
* during which no other process should be running.
*
* @param basePath base path of the dataset
* @param context instance of {@link HoodieEngineContext}.
*/
public static void deleteMetadataTable(String basePath, HoodieEngineContext context) {
final String metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);
FileSystem fs = FSUtils.getFs(metadataTablePath, context.getHadoopConf().get());
try {
fs.delete(new Path(metadataTablePath), true);
} catch (Exception e) {
throw new HoodieMetadataException("Failed to remove metadata table from path " + metadataTablePath, e);
}
}
/**
* Finds all new files/partitions created as part of a commit and creates metadata table records for them.
*
* @param commitMetadata {@code HoodieCommitMetadata} of the completed commit
* @param instantTime    timestamp of the commit instant
* @return a list of metadata table records
*/
public static List<HoodieRecord> convertMetadataToRecords(HoodieCommitMetadata commitMetadata, String instantTime) {
List<HoodieRecord> records = new LinkedList<>();
List<String> allPartitions = new LinkedList<>();
commitMetadata.getPartitionToWriteStats().forEach((partitionStatName, writeStats) -> {
final String partition = partitionStatName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionStatName;
allPartitions.add(partition);
Map<String, Long> newFiles = new HashMap<>(writeStats.size());
writeStats.forEach(hoodieWriteStat -> {
String pathWithPartition = hoodieWriteStat.getPath();
if (pathWithPartition == null) {
// Write stat does not carry a path (e.g. an empty partition); nothing to record for this file
LOG.warn("Unable to find path in write stat to update metadata table " + hoodieWriteStat);
return;
}
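// Strip the partition prefix from the stored path to get the file name relative to the partition.
// For example (hypothetical values): pathWithPartition = "2020/01/01/abc.parquet" with
// partition = "2020/01/01" yields filename = "abc.parquet". For a non-partitioned table only an
// optional leading "/" needs to be stripped.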
int offset = partition.equals(NON_PARTITIONED_NAME) ? (pathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
String filename = pathWithPartition.substring(offset);
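// The same file may appear in multiple write stats within one commit (e.g. repeated appends to a
// log file), so accumulate the sizes rather than overwrite.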
long totalWriteBytes = newFiles.containsKey(filename)
? newFiles.get(filename) + hoodieWriteStat.getTotalWriteBytes()
: hoodieWriteStat.getTotalWriteBytes();
newFiles.put(filename, totalWriteBytes);
});
// New files added to a partition
HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(
partition, Option.of(newFiles), Option.empty());
records.add(record);
});
// New partitions created
HoodieRecord record = HoodieMetadataPayload.createPartitionListRecord(new ArrayList<>(allPartitions));
records.add(record);
LOG.info("Updating at " + instantTime + " from Commit/" + commitMetadata.getOperationType()
+ ". #partitions_updated=" + records.size());
return records;
}
/**
* Finds all files that were deleted as part of a clean and creates metadata table records for them.
*
* @param cleanMetadata {@code HoodieCleanMetadata} of the completed clean
* @param instantTime   timestamp of the clean instant
* @return a list of metadata table records
*/
public static List<HoodieRecord> convertMetadataToRecords(HoodieCleanMetadata cleanMetadata, String instantTime) {
List<HoodieRecord> records = new LinkedList<>();
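// Single-element array so the count can be mutated from within the lambda below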
int[] fileDeleteCount = {0};
cleanMetadata.getPartitionMetadata().forEach((partitionName, partitionMetadata) -> {
final String partition = partitionName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionName;
// Files deleted from a partition
List<String> deletedFiles = partitionMetadata.getDeletePathPatterns();
HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.empty(),
Option.of(new ArrayList<>(deletedFiles)));
records.add(record);
fileDeleteCount[0] += deletedFiles.size();
});
LOG.info("Updating at " + instantTime + " from Clean. #partitions_updated=" + records.size()
+ ", #files_deleted=" + fileDeleteCount[0]);
return records;
}
/**
* Aggregates all files deleted and appended to across all rollbacks associated with a restore operation,
* then creates metadata table records for them.
*
* @param metadataTableTimeline current active timeline of the metadata table
* @param restoreMetadata       {@code HoodieRestoreMetadata} of the completed restore
* @param instantTime           timestamp of the restore instant
* @param lastSyncTs            timestamp of the instant last synced to the metadata table, if any
* @return a list of metadata table records
*/
public static List<HoodieRecord> convertMetadataToRecords(HoodieActiveTimeline metadataTableTimeline,
HoodieRestoreMetadata restoreMetadata, String instantTime, Option<String> lastSyncTs) {
Map<String, Map<String, Long>> partitionToAppendedFiles = new HashMap<>();
Map<String, List<String>> partitionToDeletedFiles = new HashMap<>();
restoreMetadata.getHoodieRestoreMetadata().values().forEach(rms -> {
rms.forEach(rm -> processRollbackMetadata(metadataTableTimeline, rm, partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs));
});
return convertFilesToRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Restore");
}
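/**
 * Finds all files deleted or appended to as part of a rollback and creates metadata table records for them.
 *
 * @param metadataTableTimeline current active timeline of the metadata table
 * @param rollbackMetadata      {@code HoodieRollbackMetadata} of the completed rollback
 * @param instantTime           timestamp of the rollback instant
 * @param lastSyncTs            timestamp of the instant last synced to the metadata table, if any
 * @param wasSynced             whether the instant being rolled back had been synced to the metadata table
 * @return a list of metadata table records
 */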
public static List<HoodieRecord> convertMetadataToRecords(HoodieActiveTimeline metadataTableTimeline,
HoodieRollbackMetadata rollbackMetadata, String instantTime,
Option<String> lastSyncTs, boolean wasSynced) {
Map<String, Map<String, Long>> partitionToAppendedFiles = new HashMap<>();
Map<String, List<String>> partitionToDeletedFiles = new HashMap<>();
processRollbackMetadata(metadataTableTimeline, rollbackMetadata, partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs);
if (!wasSynced) {
// Since the instant being rolled back was never committed to the metadata table, the files it added
// were never recorded there and need not be deleted. For a MOR table, the rollback appends log blocks,
// so the appended files must still be kept.
partitionToDeletedFiles.clear();
}
return convertFilesToRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Rollback");
}
/**
* Extracts information about the deleted and appended files from the {@code HoodieRollbackMetadata}.
*
* During a rollback, files may be deleted (COW, MOR) or have rollback blocks appended to them (MOR only). This
* function extracts these changed files for each partition.
*
* @param metadataTableTimeline Current timeline of the Metadata Table
* @param rollbackMetadata {@code HoodieRollbackMetadata}
* @param partitionToDeletedFiles The {@code Map} to fill with files deleted per partition.
* @param partitionToAppendedFiles The {@code Map} to fill with files appended per partition and their sizes.
* @param lastSyncTs Timestamp of the instant last synced to the Metadata Table, if any.
*/
private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTimeline, HoodieRollbackMetadata rollbackMetadata,
Map<String, List<String>> partitionToDeletedFiles,
Map<String, Map<String, Long>> partitionToAppendedFiles,
Option<String> lastSyncTs) {
rollbackMetadata.getPartitionMetadata().values().forEach(pm -> {
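// A rollback typically targets a single instant; the first entry is the instant being rolled back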
final String instantToRollback = rollbackMetadata.getCommitsRollback().get(0);
// Has this rollback produced new files?
boolean hasRollbackLogFiles = pm.getRollbackLogFiles() != null && !pm.getRollbackLogFiles().isEmpty();
boolean hasNonZeroRollbackLogFiles = hasRollbackLogFiles && pm.getRollbackLogFiles().values().stream().mapToLong(Long::longValue).sum() > 0;
// If instant-to-rollback has not been synced to metadata table yet then there is no need to update metadata
// This can happen in two cases:
// Case 1: Metadata Table timeline is behind the instant-to-rollback.
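// i.e. the instant being rolled back is strictly newer than the last instant synced to the metadata table.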
boolean shouldSkip = lastSyncTs.isPresent()
&& HoodieTimeline.compareTimestamps(instantToRollback, HoodieTimeline.GREATER_THAN, lastSyncTs.get());
if (!hasNonZeroRollbackLogFiles && shouldSkip) {
LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, given metadata table is already synced upto to %s",
instantToRollback, lastSyncTs.get()));
return;
}
// Case 2: The instant-to-rollback was never committed to the Metadata Table. This can happen if the
// instant-to-rollback was a failed commit (never completed), as only completed instants are synced to
// the Metadata Table. The Metadata Table instants required to verify this must not have been archived.
HoodieInstant syncedInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantToRollback);
if (metadataTableTimeline.getCommitsTimeline().isBeforeTimelineStarts(syncedInstant.getTimestamp())) {
throw new HoodieMetadataException(String.format("The instant %s required to sync rollback of %s has been archived",
syncedInstant, instantToRollback));
}
shouldSkip = !metadataTableTimeline.containsInstant(syncedInstant);
if (!hasNonZeroRollbackLogFiles && shouldSkip) {
LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, since this instant was never committed to Metadata Table",
instantToRollback));
return;
}
final String partition = pm.getPartitionPath();
if ((!pm.getSuccessDeleteFiles().isEmpty() || !pm.getFailedDeleteFiles().isEmpty()) && !shouldSkip) {
if (!partitionToDeletedFiles.containsKey(partition)) {
partitionToDeletedFiles.put(partition, new ArrayList<>());
}
// Extract deleted file name from the absolute paths saved in getSuccessDeleteFiles()
List<String> deletedFiles = pm.getSuccessDeleteFiles().stream().map(p -> new Path(p).getName())
.collect(Collectors.toList());
if (!pm.getFailedDeleteFiles().isEmpty()) {
deletedFiles.addAll(pm.getFailedDeleteFiles().stream().map(p -> new Path(p).getName())
.collect(Collectors.toList()));
}
partitionToDeletedFiles.get(partition).addAll(deletedFiles);
}
BiFunction<Long, Long, Long> fileMergeFn = (oldSize, newSizeCopy) -> {
// If a file exists in both written log files and rollback log files, pick the larger size, as the
// rollback log file could have been appended to after the written log file sizes were computed.
return oldSize > newSizeCopy ? oldSize : newSizeCopy;
};
if (hasRollbackLogFiles) {
if (!partitionToAppendedFiles.containsKey(partition)) {
partitionToAppendedFiles.put(partition, new HashMap<>());
}
// Extract appended file name from the absolute paths saved in getRollbackLogFiles()
pm.getRollbackLogFiles().forEach((path, size) -> {
partitionToAppendedFiles.get(partition).merge(new Path(path).getName(), size, fileMergeFn);
});
}
if (pm.getWrittenLogFiles() != null && !pm.getWrittenLogFiles().isEmpty()) {
if (!partitionToAppendedFiles.containsKey(partition)) {
partitionToAppendedFiles.put(partition, new HashMap<>());
}
// Extract appended file name from the absolute paths saved in getWrittenLogFiles()
pm.getWrittenLogFiles().forEach((path, size) -> {
partitionToAppendedFiles.get(partition).merge(new Path(path).getName(), size, fileMergeFn);
});
}
});
}
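/**
 * Converts the maps of deleted and appended files into metadata table records, one per affected partition.
 *
 * @param partitionToDeletedFiles  files deleted per partition
 * @param partitionToAppendedFiles files appended to per partition, with their sizes
 * @param instantTime              timestamp of the instant being synced
 * @param operation                operation name ("Restore" or "Rollback") used for logging
 * @return a list of metadata table records
 */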
private static List<HoodieRecord> convertFilesToRecords(Map<String, List<String>> partitionToDeletedFiles,
Map<String, Map<String, Long>> partitionToAppendedFiles, String instantTime,
String operation) {
List<HoodieRecord> records = new LinkedList<>();
int[] fileChangeCount = {0, 0}; // deletes, appends
partitionToDeletedFiles.forEach((partitionName, deletedFiles) -> {
fileChangeCount[0] += deletedFiles.size();
final String partition = partitionName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionName;
Option<Map<String, Long>> filesAdded = Option.empty();
if (partitionToAppendedFiles.containsKey(partitionName)) {
filesAdded = Option.of(partitionToAppendedFiles.remove(partitionName));
}
HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, filesAdded,
Option.of(new ArrayList<>(deletedFiles)));
records.add(record);
});
partitionToAppendedFiles.forEach((partitionName, appendedFileMap) -> {
fileChangeCount[1] += appendedFileMap.size();
final String partition = partitionName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionName;
// Validate that no appended file has also been deleted
ValidationUtils.checkState(
!appendedFileMap.keySet().removeAll(partitionToDeletedFiles.getOrDefault(partition, Collections.emptyList())),
"Rollback file cannot both be appended and deleted");
// New files added to a partition
HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.of(appendedFileMap),
Option.empty());
records.add(record);
});
LOG.info("Updating at " + instantTime + " from " + operation + ". #partitions_updated=" + records.size()
+ ", #files_deleted=" + fileChangeCount[0] + ", #files_appended=" + fileChangeCount[1]);
return records;
}
}