All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hudi.sync.common.HoodieSyncClient Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.sync.common;

import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.ParquetTableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.sync.common.model.Partition;
import org.apache.hudi.sync.common.model.PartitionEvent;
import org.apache.hudi.sync.common.model.PartitionValueExtractor;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS;
import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA;

public abstract class HoodieSyncClient implements HoodieMetaSyncOperations, AutoCloseable {

  private static final Logger LOG = LoggerFactory.getLogger(HoodieSyncClient.class);

  protected final HoodieSyncConfig config;
  protected final PartitionValueExtractor partitionValueExtractor;
  protected final HoodieTableMetaClient metaClient;
  private static final String TEMP_SUFFIX = "_temp";

  public HoodieSyncClient(HoodieSyncConfig config, HoodieTableMetaClient metaClient) {
    this.config = config;
    this.partitionValueExtractor = ReflectionUtils.loadClass(config.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS));
    this.metaClient = Objects.requireNonNull(metaClient, "metaClient is null");
  }

  public HoodieTimeline getActiveTimeline() {
    return metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  }

  public HoodieTableType getTableType() {
    return metaClient.getTableType();
  }

  public String getBasePath() {
    return metaClient.getBasePath().toString();
  }

  public boolean isBootstrap() {
    return metaClient.getTableConfig().getBootstrapBasePath().isPresent();
  }

  public HoodieTableMetaClient getMetaClient() {
    return metaClient;
  }

  /**
   * Get the set of dropped partitions since the last synced commit.
   * If last sync time is not known then consider only active timeline.
   * Going through archive timeline is a costly operation, and it should be avoided unless some start time is given.
   */
  public Set getDroppedPartitionsSince(Option lastCommitTimeSynced, Option lastCommitCompletionTimeSynced) {
    return new HashSet<>(TimelineUtils.getDroppedPartitions(metaClient, lastCommitTimeSynced, lastCommitCompletionTimeSynced));
  }

  @Override
  public MessageType getStorageSchema() {
    try {
      return new ParquetTableSchemaResolver(metaClient).getTableParquetSchema();
    } catch (Exception e) {
      throw new HoodieSyncException("Failed to read schema from storage.", e);
    }
  }

  @Override
  public MessageType getStorageSchema(boolean includeMetadataField) {
    try {
      return new ParquetTableSchemaResolver(metaClient).getTableParquetSchema(includeMetadataField);
    } catch (Exception e) {
      throw new HoodieSyncException("Failed to read schema from storage.", e);
    }
  }

  /**
   * Gets all relative partitions paths in the Hudi table on storage.
   *
   * @return All relative partitions paths.
   */
  public List getAllPartitionPathsOnStorage() {
    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getStorageConf());
    return FSUtils.getAllPartitionPaths(engineContext,
        metaClient.getStorage(),
        config.getString(META_SYNC_BASE_PATH),
        config.getBoolean(META_SYNC_USE_FILE_LISTING_FROM_METADATA));
  }

  public List getWrittenPartitionsSince(Option lastCommitTimeSynced, Option lastCommitCompletionTimeSynced) {
    if (!lastCommitTimeSynced.isPresent()) {
      LOG.info("Last commit time synced is not known, listing all partitions in {} , FS: {}",
          config.getString(META_SYNC_BASE_PATH), config.getHadoopFileSystem());
      return getAllPartitionPathsOnStorage();
    } else {
      LOG.info("Last commit time synced is {}, Getting commits since then", lastCommitTimeSynced.get());
      return TimelineUtils.getWrittenPartitions(
          TimelineUtils.getCommitsTimelineAfter(metaClient, lastCommitTimeSynced.get(), lastCommitCompletionTimeSynced));
    }
  }

  /**
   * Gets the partition events for changed partitions.
   * 

* This compares the list of all partitions of a table stored in the metastore and * on the storage: * (1) Partitions exist in the metastore, but NOT the storage: drops them in the metastore; * (2) Partitions exist on the storage, but NOT the metastore: adds them to the metastore; * (3) Partitions exist in both, but the partition path is different: update them in the metastore. * * @param allPartitionsInMetastore All partitions of a table stored in the metastore. * @param allPartitionsOnStorage All partitions of a table stored on the storage. * @return partition events for changed partitions. */ public List getPartitionEvents(List allPartitionsInMetastore, List allPartitionsOnStorage) { Map paths = getPartitionValuesToPathMapping(allPartitionsInMetastore); Set partitionsToDrop = new HashSet<>(paths.keySet()); List events = new ArrayList<>(); for (String storagePartition : allPartitionsOnStorage) { Path storagePartitionPath = HadoopFSUtils.constructAbsolutePathInHadoopPath(config.getString(META_SYNC_BASE_PATH), storagePartition); String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath(); // Check if the partition values or if hdfs path is the same List storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition); if (!storagePartitionValues.isEmpty()) { String storageValue = String.join(", ", storagePartitionValues); // Remove partitions that exist on storage from the `partitionsToDrop` set, // so the remaining partitions that exist in the metastore should be dropped partitionsToDrop.remove(storageValue); if (!paths.containsKey(storageValue)) { events.add(PartitionEvent.newPartitionAddEvent(storagePartition)); } else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) { events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition)); } } } partitionsToDrop.forEach(storageValue -> { String storagePath = paths.get(storageValue); try { String relativePath = 
FSUtils.getRelativePartitionPath( metaClient.getBasePath(), new StoragePath(storagePath)); events.add(PartitionEvent.newPartitionDropEvent(relativePath)); } catch (IllegalArgumentException e) { LOG.error("Cannot parse the path stored in the metastore, ignoring it for generating DROP partition event: \"{}\".", storagePath, e); } }); return events; } /** * Iterate over the storage partitions and find if there are any new partitions that need to be added or updated. * Generate a list of PartitionEvent based on the changes required. */ public List getPartitionEvents(List partitionsInMetastore, List writtenPartitionsOnStorage, Set droppedPartitionsOnStorage) { Map paths = getPartitionValuesToPathMapping(partitionsInMetastore); List events = new ArrayList<>(); for (String storagePartition : writtenPartitionsOnStorage) { Path storagePartitionPath = HadoopFSUtils.constructAbsolutePathInHadoopPath(config.getString(META_SYNC_BASE_PATH), storagePartition); String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath(); // Check if the partition values or if hdfs path is the same List storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition); if (droppedPartitionsOnStorage.contains(storagePartition)) { events.add(PartitionEvent.newPartitionDropEvent(storagePartition)); } else { if (!storagePartitionValues.isEmpty()) { String storageValue = String.join(", ", storagePartitionValues); if (!paths.containsKey(storageValue)) { events.add(PartitionEvent.newPartitionAddEvent(storagePartition)); } else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) { events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition)); } } } } return events; } /** * Gets the partition values to the absolute path mapping based on the * partition information from the metastore. * * @param partitionsInMetastore Partitions in the metastore. * @return The partition values to the absolute path mapping. 
*/ private Map getPartitionValuesToPathMapping(List partitionsInMetastore) { Map paths = new HashMap<>(); for (Partition tablePartition : partitionsInMetastore) { List hivePartitionValues = tablePartition.getValues(); String fullTablePartitionPath = Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getStorageLocation())).toUri().getPath(); paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath); } return paths; } protected String generateTempTableName(String tableName) { return tableName + TEMP_SUFFIX + ZonedDateTime.now().toEpochSecond(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy