

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.table.timeline;

import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.ClusteringUtils;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieTimeTravelException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.AbstractMap;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.hudi.common.config.HoodieCommonConfig.INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.DELTA_COMMIT_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.REPLACE_COMMIT_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.SAVEPOINT_ACTION;
import static org.apache.hudi.common.table.timeline.HoodieTimeline.compareTimestamps;

/**
 * TimelineUtils provides a common way to query incremental meta-data changes for a hoodie table.
 *
 * This is useful in multiple places including:
 * 1) HiveSync - this can be used to query partitions that changed since previous sync.
 * 2) Incremental reads - InputFormats can use this API to query
 */
public class TimelineUtils {
  private static final Logger LOG = LoggerFactory.getLogger(TimelineUtils.class);

  /**
   * Returns partitions that have new data strictly after commitTime.
   * Does not include internal operations such as clean in the timeline.
   */
  public static List<String> getWrittenPartitions(HoodieTimeline timeline) {
    HoodieTimeline timelineToSync = timeline.getWriteTimeline();
    return getAffectedPartitions(timelineToSync);
  }

  /**
   * Returns partitions that have been deleted or marked for deletion in the timeline between given commit time range.
   * Does not include internal operations such as clean in the timeline.
   */
  public static List<String> getDroppedPartitions(HoodieTableMetaClient metaClient,
                                                  Option<String> lastCommitTimeSynced,
                                                  Option<String> lastCommitCompletionTimeSynced) {
    HoodieTimeline timeline = lastCommitTimeSynced.isPresent()
        ? TimelineUtils.getCommitsTimelineAfter(metaClient, lastCommitTimeSynced.get(), lastCommitCompletionTimeSynced)
        : metaClient.getActiveTimeline();
    HoodieTimeline completedTimeline = timeline.getWriteTimeline().filterCompletedInstants();
    HoodieTimeline replaceCommitTimeline = completedTimeline.getCompletedReplaceTimeline();
    Map<String, String> partitionToLatestDeleteTimestamp = replaceCommitTimeline.getInstantsAsStream()
        .map(instant -> {
          try {
            HoodieReplaceCommitMetadata commitMetadata = HoodieReplaceCommitMetadata.fromBytes(
                replaceCommitTimeline.getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class);
            return Pair.of(instant, commitMetadata);
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions modified at " + instant, e);
          }
        })
        .filter(pair -> isDeletePartition(pair.getRight().getOperationType()))
        .flatMap(pair -> pair.getRight().getPartitionToReplaceFileIds().keySet().stream()
            .map(partition -> new AbstractMap.SimpleEntry<>(partition, pair.getLeft().getTimestamp()))
        ).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (existing, replace) -> replace));

    // cleaner could delete a partition when there are no active filegroups in the partition
    HoodieTimeline cleanerTimeline = metaClient.getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
    cleanerTimeline.getInstantsAsStream()
        .forEach(instant -> {
          try {
            HoodieCleanMetadata cleanMetadata =
                TimelineMetadataUtils.deserializeHoodieCleanMetadata(cleanerTimeline.getInstantDetails(instant).get());
            cleanMetadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> {
              if (partitionMetadata.getIsPartitionDeleted()) {
                partitionToLatestDeleteTimestamp.put(partition, instant.getTimestamp());
              }
            });
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions cleaned at " + instant, e);
          }
        });

    if (partitionToLatestDeleteTimestamp.isEmpty()) {
      // There is no dropped partitions
      return Collections.emptyList();
    }
    String earliestDeleteTimestamp = partitionToLatestDeleteTimestamp.values().stream()
        .reduce((left, right) -> compareTimestamps(left, LESSER_THAN, right) ? left : right)
        .get();
    Map<String, String> partitionToLatestWriteTimestamp = completedTimeline.getInstantsAsStream()
        .filter(instant -> compareTimestamps(instant.getTimestamp(), GREATER_THAN_OR_EQUALS, earliestDeleteTimestamp))
        .flatMap(instant -> {
          try {
            HoodieCommitMetadata commitMetadata = getCommitMetadata(instant, completedTimeline);
            return commitMetadata.getWritePartitionPaths().stream()
                .map(partition -> new AbstractMap.SimpleEntry<>(partition, instant.getTimestamp()));
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions writes at " + instant, e);
          }
        }).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (existing, replace) -> replace));

    return partitionToLatestDeleteTimestamp.entrySet().stream()
        .filter(entry -> !partitionToLatestWriteTimestamp.containsKey(entry.getKey())
            || compareTimestamps(entry.getValue(), GREATER_THAN, partitionToLatestWriteTimestamp.get(entry.getKey()))
        ).map(Map.Entry::getKey).filter(partition -> !partition.isEmpty()).collect(Collectors.toList());
  }
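
  // Illustrative usage sketch (not part of the upstream class): how a catalog/Hive sync flow
  // might combine the helpers above. The base path and the instant time "20240101000000" are
  // placeholders, and `hadoopConf` is assumed to be an existing Hadoop Configuration.
  //
  //   HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
  //       .setConf(hadoopConf)
  //       .setBasePath("/tmp/hudi/example_table")
  //       .build();
  //   HoodieTimeline sinceLastSync =
  //       TimelineUtils.getCommitsTimelineAfter(metaClient, "20240101000000", Option.empty());
  //   List<String> changedPartitions = TimelineUtils.getWrittenPartitions(sinceLastSync);
  //   List<String> droppedPartitions = TimelineUtils.getDroppedPartitions(
  //       metaClient, Option.of("20240101000000"), Option.empty());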

  /**
   * Returns partitions that have been modified including internal operations such as clean in the passed timeline.
   */
  public static List<String> getAffectedPartitions(HoodieTimeline timeline) {
    return timeline.filterCompletedInstants().getInstantsAsStream().flatMap(s -> {
      switch (s.getAction()) {
        case COMMIT_ACTION:
        case DELTA_COMMIT_ACTION:
          try {
            HoodieCommitMetadata commitMetadata =
                HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(s).get(), HoodieCommitMetadata.class);
            return commitMetadata.getPartitionToWriteStats().keySet().stream();
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions written at " + s, e);
          }
        case REPLACE_COMMIT_ACTION:
          try {
            HoodieReplaceCommitMetadata commitMetadata = HoodieReplaceCommitMetadata.fromBytes(
                timeline.getInstantDetails(s).get(), HoodieReplaceCommitMetadata.class);
            Set<String> partitions = new HashSet<>();
            partitions.addAll(commitMetadata.getPartitionToReplaceFileIds().keySet());
            partitions.addAll(commitMetadata.getPartitionToWriteStats().keySet());
            return partitions.stream();
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions modified at " + s, e);
          }
        case HoodieTimeline.CLEAN_ACTION:
          try {
            HoodieCleanMetadata cleanMetadata =
                TimelineMetadataUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(s).get());
            return cleanMetadata.getPartitionMetadata().keySet().stream();
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions cleaned at " + s, e);
          }
        case HoodieTimeline.ROLLBACK_ACTION:
          try {
            return TimelineMetadataUtils.deserializeHoodieRollbackMetadata(timeline.getInstantDetails(s).get())
                .getPartitionMetadata().keySet().stream();
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions rolledback at " + s, e);
          }
        case HoodieTimeline.RESTORE_ACTION:
          try {
            HoodieRestoreMetadata restoreMetadata = TimelineMetadataUtils.deserializeAvroMetadata(
                timeline.getInstantDetails(s).get(), HoodieRestoreMetadata.class);
            return restoreMetadata.getHoodieRestoreMetadata().values().stream()
                .flatMap(Collection::stream)
                .flatMap(rollbackMetadata -> rollbackMetadata.getPartitionMetadata().keySet().stream());
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions restored at " + s, e);
          }
        case HoodieTimeline.SAVEPOINT_ACTION:
          try {
            return TimelineMetadataUtils.deserializeHoodieSavepointMetadata(timeline.getInstantDetails(s).get())
                .getPartitionMetadata().keySet().stream();
          } catch (IOException e) {
            throw new HoodieIOException("Failed to get partitions savepoint at " + s, e);
          }
        case HoodieTimeline.COMPACTION_ACTION:
          // compaction is not a completed instant. So no need to consider this action.
          return Stream.empty();
        default:
          throw new HoodieIOException("unknown action in timeline " + s.getAction());
      }
    }).distinct().filter(s -> !s.isEmpty()).collect(Collectors.toList());
  }
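
  // Illustrative sketch: unlike getWrittenPartitions, getAffectedPartitions also reports
  // partitions touched by internal actions such as clean, rollback and restore. Reuses the
  // hypothetical `metaClient` and `sinceLastSync` timeline from the sketch above.
  //
  //   List<String> affectedPartitions = TimelineUtils.getAffectedPartitions(sinceLastSync);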

  /**
   * Get extra metadata for specified key from latest commit/deltacommit/replacecommit(eg. insert_overwrite) instant.
   */
  public static Option<String> getExtraMetadataFromLatest(HoodieTableMetaClient metaClient, String extraMetadataKey) {
    return metaClient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants()
        // exclude clustering commits for returning user stored extra metadata
        .filter(instant -> !isClusteringCommit(metaClient, instant))
        .findFirst().map(instant -> getMetadataValue(metaClient, extraMetadataKey, instant)).orElse(Option.empty());
  }

  /**
   * Get extra metadata for specified key from latest commit/deltacommit/replacecommit instant including internal commits
   * such as clustering.
   */
  public static Option<String> getExtraMetadataFromLatestIncludeClustering(HoodieTableMetaClient metaClient, String extraMetadataKey) {
    return metaClient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants()
        .findFirst().map(instant -> getMetadataValue(metaClient, extraMetadataKey, instant)).orElse(Option.empty());
  }

  /**
   * Get extra metadata for specified key from all active commit/deltacommit instants.
   */
  public static Map<String, Option<String>> getAllExtraMetadataForKey(HoodieTableMetaClient metaClient, String extraMetadataKey) {
    return metaClient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants().collect(Collectors.toMap(
        HoodieInstant::getTimestamp, instant -> getMetadataValue(metaClient, extraMetadataKey, instant)));
  }

  private static Option<String> getMetadataValue(HoodieTableMetaClient metaClient, String extraMetadataKey, HoodieInstant instant) {
    try {
      LOG.info("reading checkpoint info for:" + instant + " key: " + extraMetadataKey);
      HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
          metaClient.getCommitsTimeline().getInstantDetails(instant).get(), HoodieCommitMetadata.class);
      return Option.ofNullable(commitMetadata.getExtraMetadata().get(extraMetadataKey));
    } catch (IOException e) {
      throw new HoodieIOException("Unable to parse instant metadata " + instant, e);
    }
  }

  public static boolean isClusteringCommit(HoodieTableMetaClient metaClient, HoodieInstant instant) {
    try {
      if (REPLACE_COMMIT_ACTION.equals(instant.getAction())) {
        // replacecommit is used for multiple operations: insert_overwrite/cluster etc.
        // Check operation type to see if this instant is related to clustering.
        HoodieReplaceCommitMetadata replaceMetadata = HoodieReplaceCommitMetadata.fromBytes(
            metaClient.getActiveTimeline().getInstantDetails(instant).get(), HoodieReplaceCommitMetadata.class);
        return WriteOperationType.CLUSTER.equals(replaceMetadata.getOperationType());
      }
      return false;
    } catch (IOException e) {
      throw new HoodieIOException("Unable to read instant information: " + instant + " for " + metaClient.getBasePath(), e);
    }
  }
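
  // Illustrative sketch: reading a user-stored extra-metadata value (e.g. an ingestion
  // checkpoint) from the latest non-clustering commit. The key name below is only an example;
  // any key written via commit extra metadata can be looked up this way.
  //
  //   Option<String> checkpoint =
  //       TimelineUtils.getExtraMetadataFromLatest(metaClient, "deltastreamer.checkpoint.key");
  //   Map<String, Option<String>> checkpointHistory =
  //       TimelineUtils.getAllExtraMetadataForKey(metaClient, "deltastreamer.checkpoint.key");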

  public static HoodieDefaultTimeline getTimeline(HoodieTableMetaClient metaClient, boolean includeArchivedTimeline) {
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    if (includeArchivedTimeline) {
      HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline();
      return archivedTimeline.mergeTimeline(activeTimeline);
    }
    return activeTimeline;
  }

  /**
   * Returns a Hudi timeline with commits after the given instant time (exclusive).
   *
   * @param metaClient                {@link HoodieTableMetaClient} instance.
   * @param exclusiveStartInstantTime Start instant time (exclusive).
   * @param lastMaxCompletionTime     Last commit max completion time synced
   * @return Hudi timeline.
   */
  public static HoodieTimeline getCommitsTimelineAfter(
      HoodieTableMetaClient metaClient, String exclusiveStartInstantTime, Option<String> lastMaxCompletionTime) {
    HoodieDefaultTimeline writeTimeline = metaClient.getActiveTimeline().getWriteTimeline();

    HoodieDefaultTimeline timeline = writeTimeline.isBeforeTimelineStarts(exclusiveStartInstantTime)
        ? metaClient.getArchivedTimeline(exclusiveStartInstantTime).mergeTimeline(writeTimeline)
        : writeTimeline;

    HoodieDefaultTimeline timelineSinceLastSync = (HoodieDefaultTimeline) timeline.getCommitsTimeline()
        .findInstantsAfter(exclusiveStartInstantTime, Integer.MAX_VALUE);

    if (lastMaxCompletionTime.isPresent()) {
      // Get 'hollow' instants that have less instant time than exclusiveStartInstantTime but with greater commit completion time
      HoodieDefaultTimeline hollowInstantsTimeline = (HoodieDefaultTimeline) timeline.getCommitsTimeline()
          .filter(s -> compareTimestamps(s.getTimestamp(), LESSER_THAN, exclusiveStartInstantTime))
          .filter(s -> compareTimestamps(s.getCompletionTime(), GREATER_THAN, lastMaxCompletionTime.get()));
      if (!hollowInstantsTimeline.empty()) {
        return timelineSinceLastSync.mergeTimeline(hollowInstantsTimeline);
      }
    }

    return timelineSinceLastSync;
  }

  /**
   * Returns the commit metadata of the given instant.
   *
   * @param instant  The hoodie instant
   * @param timeline The timeline
   * @return the commit metadata
   */
  public static HoodieCommitMetadata getCommitMetadata(
      HoodieInstant instant,
      HoodieTimeline timeline) throws IOException {
    byte[] data = timeline.getInstantDetails(instant).get();
    if (instant.getAction().equals(REPLACE_COMMIT_ACTION)) {
      return HoodieReplaceCommitMetadata.fromBytes(data, HoodieReplaceCommitMetadata.class);
    } else {
      return HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class);
    }
  }
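
  // Illustrative sketch: building an incremental commits timeline that also surfaces 'hollow'
  // commits, i.e. instants with an earlier instant time than the last synced instant but a later
  // completion time. Both timestamps below are placeholders.
  //
  //   HoodieTimeline incremental = TimelineUtils.getCommitsTimelineAfter(
  //       metaClient, "20240101000000", Option.of("20240102030405000"));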

  /**
   * Gets the qualified earliest instant from the active timeline of the data table
   * for the archival in metadata table.
   *
   * the qualified earliest instant is chosen as the earlier one between the earliest
   * commit (COMMIT, DELTA_COMMIT, and REPLACE_COMMIT only, considering non-savepoint
   * commit only if enabling archive beyond savepoint) and the earliest inflight
   * instant (all actions).
   *
   * @param dataTableActiveTimeline      the active timeline of the data table.
   * @param shouldArchiveBeyondSavepoint whether to archive beyond savepoint.
   * @return the instant meeting the requirement.
   */
  public static Option<HoodieInstant> getEarliestInstantForMetadataArchival(
      HoodieActiveTimeline dataTableActiveTimeline, boolean shouldArchiveBeyondSavepoint) {
    // This is for commits only, not including CLEAN, ROLLBACK, etc.
    // When archive beyond savepoint is enabled, there are chances that there could be holes
    // in the timeline due to archival and savepoint interplay. So, the first non-savepoint
    // commit in the data timeline is considered as beginning of the active timeline.
    Option<HoodieInstant> earliestCommit = shouldArchiveBeyondSavepoint
        ? dataTableActiveTimeline.getTimelineOfActions(
            CollectionUtils.createSet(
                COMMIT_ACTION, DELTA_COMMIT_ACTION, REPLACE_COMMIT_ACTION, SAVEPOINT_ACTION))
            .getFirstNonSavepointCommit()
        : dataTableActiveTimeline.getCommitsTimeline().firstInstant();
    // This is for all instants which are in-flight
    Option<HoodieInstant> earliestInflight = dataTableActiveTimeline.filterInflightsAndRequested().firstInstant();

    if (earliestCommit.isPresent() && earliestInflight.isPresent()) {
      if (earliestCommit.get().compareTo(earliestInflight.get()) < 0) {
        return earliestCommit;
      }
      return earliestInflight;
    } else if (earliestCommit.isPresent()) {
      return earliestCommit;
    } else if (earliestInflight.isPresent()) {
      return earliestInflight;
    } else {
      return Option.empty();
    }
  }

  /**
   * Validate user-specified timestamp of time travel query against incomplete commit's timestamp.
   *
   * @throws HoodieException when time travel query's timestamp >= incomplete commit's timestamp
   */
  public static void validateTimestampAsOf(HoodieTableMetaClient metaClient, String timestampAsOf) {
    Option<HoodieInstant> firstIncompleteCommit = metaClient.getCommitsTimeline()
        .filterInflightsAndRequested()
        .filter(instant ->
            !HoodieTimeline.REPLACE_COMMIT_ACTION.equals(instant.getAction())
                || !ClusteringUtils.getClusteringPlan(metaClient, instant).isPresent())
        .firstInstant();

    if (firstIncompleteCommit.isPresent()) {
      String incompleteCommitTime = firstIncompleteCommit.get().getTimestamp();
      if (compareTimestamps(timestampAsOf, GREATER_THAN_OR_EQUALS, incompleteCommitTime)) {
        throw new HoodieTimeTravelException(String.format(
            "Time travel's timestamp '%s' must be earlier than the first incomplete commit timestamp '%s'.",
            timestampAsOf, incompleteCommitTime));
      }
    }

    // also timestamp as of cannot query cleaned up data.
    Option<HoodieInstant> latestCleanOpt = metaClient.getActiveTimeline().getCleanerTimeline().filterCompletedInstants().lastInstant();
    if (latestCleanOpt.isPresent()) {
      // Ensure timestamp as of is > than the earliest commit to retain and
      try {
        HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, latestCleanOpt.get());
        String earliestCommitToRetain = cleanMetadata.getEarliestCommitToRetain();
        if (!StringUtils.isNullOrEmpty(earliestCommitToRetain)) {
          ValidationUtils.checkArgument(HoodieTimeline.compareTimestamps(earliestCommitToRetain,
                  LESSER_THAN_OR_EQUALS, timestampAsOf),
              "Cleaner cleaned up the timestamp of interest. Please ensure sufficient commits are retained with cleaner "
                  + "for Timestamp as of query to work");
        } else {
          // when cleaner is based on file versions, we may not find value for earliestCommitToRetain.
          // so, lets check if timestamp of interest is archived based on first entry in active timeline
          Option<HoodieInstant> firstCompletedInstant = metaClient.getActiveTimeline().getWriteTimeline().filterCompletedInstants().firstInstant();
          if (firstCompletedInstant.isPresent()) {
            ValidationUtils.checkArgument(HoodieTimeline.compareTimestamps(firstCompletedInstant.get().getTimestamp(),
                    LESSER_THAN_OR_EQUALS, timestampAsOf),
                "Please ensure sufficient commits are retained (uncleaned and un-archived) for timestamp as of query to work.");
          }
        }
      } catch (IOException e) {
        throw new HoodieTimeTravelException("Cleaner cleaned up the timestamp of interest. "
            + "Please ensure sufficient commits are retained with cleaner for Timestamp as of query to work ");
      }
    }
  }
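
  // Illustrative sketch: guarding a time-travel read. The query timestamp is a placeholder;
  // validateTimestampAsOf throws if the timestamp is at/after the first incomplete commit or
  // if the requested data may already have been cleaned or archived.
  //
  //   String timestampAsOf = "20240105000000";
  //   TimelineUtils.validateTimestampAsOf(metaClient, timestampAsOf);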

  /**
   * Handles hollow commit as per {@link HoodieCommonConfig#INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT}
   * and return filtered or non-filtered timeline for incremental query to run against.
   */
  public static HoodieTimeline handleHollowCommitIfNeeded(HoodieTimeline completedCommitTimeline,
                                                          HoodieTableMetaClient metaClient,
                                                          HollowCommitHandling handlingMode) {
    if (handlingMode == HollowCommitHandling.USE_TRANSITION_TIME) {
      return completedCommitTimeline;
    }

    Option<HoodieInstant> firstIncompleteCommit = metaClient.getCommitsTimeline()
        .filterInflightsAndRequested()
        .filter(instant ->
            !HoodieTimeline.REPLACE_COMMIT_ACTION.equals(instant.getAction())
                || !ClusteringUtils.getClusteringPlan(metaClient, instant).isPresent())
        .firstInstant();

    boolean noHollowCommit = firstIncompleteCommit
        .map(i -> completedCommitTimeline.findInstantsAfter(i.getTimestamp()).empty())
        .orElse(true);
    if (noHollowCommit) {
      return completedCommitTimeline;
    }

    String hollowCommitTimestamp = firstIncompleteCommit.get().getTimestamp();
    switch (handlingMode) {
      case FAIL:
        throw new HoodieException(String.format(
            "Found hollow commit: '%s'. Adjust config `%s` accordingly if to avoid throwing this exception.",
            hollowCommitTimestamp, INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key()));
      case BLOCK:
        LOG.warn(String.format(
            "Found hollow commit '%s'. Config `%s` was set to `%s`: no data will be returned beyond '%s' until it's completed.",
            hollowCommitTimestamp, INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT.key(), handlingMode, hollowCommitTimestamp));
        return completedCommitTimeline.findInstantsBefore(hollowCommitTimestamp);
      default:
        throw new HoodieException("Unexpected handling mode: " + handlingMode);
    }
  }

  public enum HollowCommitHandling {
    FAIL, BLOCK, USE_TRANSITION_TIME
  }

  /**
   * Concat two timelines timeline1 and timeline2 to build a new timeline.
   */
  public static HoodieTimeline concatTimeline(HoodieTimeline timeline1, HoodieTimeline timeline2,
                                              HoodieTableMetaClient metaClient) {
    return new HoodieDefaultTimeline(Stream.concat(timeline1.getInstantsAsStream(), timeline2.getInstantsAsStream()).sorted(),
        instant -> metaClient.getActiveTimeline().getInstantDetails(instant));
  }

  public static boolean isDeletePartition(WriteOperationType operation) {
    return operation == WriteOperationType.DELETE_PARTITION
        || operation == WriteOperationType.INSERT_OVERWRITE_TABLE
        || operation == WriteOperationType.INSERT_OVERWRITE;
  }
}




