/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.metadata;

import org.apache.hudi.avro.model.HoodieMetadataBloomFilter;
import org.apache.hudi.avro.model.HoodieMetadataColumnStats;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.common.util.hash.ColumnIndexID;
import org.apache.hudi.common.util.hash.FileIndexID;
import org.apache.hudi.common.util.hash.PartitionIndexID;
import org.apache.hudi.config.metrics.HoodieMetricsConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieMetadataException;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Abstract class for implementing common table metadata operations.
 */
public abstract class BaseTableMetadata extends AbstractHoodieTableMetadata {

  private static final Logger LOG = LoggerFactory.getLogger(BaseTableMetadata.class);

  protected final HoodieTableMetaClient dataMetaClient;
  protected final Option<HoodieMetadataMetrics> metrics;
  protected final HoodieMetadataConfig metadataConfig;

  protected boolean isMetadataTableInitialized;
  protected final boolean hiveStylePartitioningEnabled;
  protected final boolean urlEncodePartitioningEnabled;

  protected BaseTableMetadata(HoodieEngineContext engineContext,
                              HoodieStorage storage,
                              HoodieMetadataConfig metadataConfig,
                              String dataBasePath) {
    super(engineContext, storage, dataBasePath);

    this.dataMetaClient = HoodieTableMetaClient.builder()
        .setStorage(storage)
        .setBasePath(dataBasePath)
        .build();

    this.hiveStylePartitioningEnabled = Boolean.parseBoolean(dataMetaClient.getTableConfig().getHiveStylePartitioningEnable());
    this.urlEncodePartitioningEnabled = Boolean.parseBoolean(dataMetaClient.getTableConfig().getUrlEncodePartitioning());
    this.metadataConfig = metadataConfig;
    this.isMetadataTableInitialized = dataMetaClient.getTableConfig().isMetadataTableAvailable();

    if (metadataConfig.isMetricsEnabled()) {
      this.metrics = Option.of(new HoodieMetadataMetrics(HoodieMetricsConfig.newBuilder()
          .fromProperties(metadataConfig.getProps()).build(), dataMetaClient.getStorage()));
    } else {
      this.metrics = Option.empty();
    }
  }

  protected HoodieEngineContext getEngineContext() {
    if (engineContext == null) {
      engineContext = new HoodieLocalEngineContext(dataMetaClient.getStorageConf());
    }
    return engineContext;
  }

  /**
   * Return the list of partitions in the dataset.
   * <p>
   * If the Metadata Table is enabled, the listing is retrieved from the stored metadata. Otherwise, the list of
   * partitions is retrieved directly from the underlying {@code FileSystem}.
   * <p>
   * On any errors retrieving the listing from the metadata, defaults to using the file system listings.
   */
  @Override
  public List<String> getAllPartitionPaths() throws IOException {
    ValidationUtils.checkArgument(isMetadataTableInitialized);
    try {
      return fetchAllPartitionPaths();
    } catch (Exception e) {
      throw new HoodieMetadataException("Failed to retrieve list of partition from metadata", e);
    }
  }
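
  // Illustrative usage sketch (editor's note, not part of the Hudi source): a concrete
  // subclass instance, here called `tableMetadata` (a hypothetical name), surfaces the
  // metadata-table-backed partition listing like so:
  //
  //   List<String> partitions = tableMetadata.getAllPartitionPaths();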

  /**
   * Return the list of files in a partition.
   * <p>
   * If the Metadata Table is enabled, the listing is retrieved from the stored metadata. Otherwise, the list of
   * partitions is retrieved directly from the underlying {@code FileSystem}.
   * <p>
   * On any errors retrieving the listing from the metadata, defaults to using the file system listings.
   *
   * @param partitionPath The absolute path of the partition to list
   */
  @Override
  public List<StoragePathInfo> getAllFilesInPartition(StoragePath partitionPath) throws IOException {
    ValidationUtils.checkArgument(isMetadataTableInitialized);
    try {
      return fetchAllFilesInPartition(partitionPath);
    } catch (Exception e) {
      throw new HoodieMetadataException("Failed to retrieve files in partition " + partitionPath + " from metadata", e);
    }
  }

  @Override
  public Map<String, List<StoragePathInfo>> getAllFilesInPartitions(Collection<String> partitions) throws IOException {
    ValidationUtils.checkArgument(isMetadataTableInitialized);
    if (partitions.isEmpty()) {
      return Collections.emptyMap();
    }

    try {
      List<StoragePath> partitionPaths = partitions.stream().map(StoragePath::new).collect(Collectors.toList());
      return fetchAllFilesInPartitionPaths(partitionPaths);
    } catch (Exception e) {
      throw new HoodieMetadataException("Failed to retrieve files in partition from metadata", e);
    }
  }

  @Override
  public Option<BloomFilter> getBloomFilter(final String partitionName, final String fileName)
      throws HoodieMetadataException {
    if (!dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.BLOOM_FILTERS)) {
      LOG.error("Metadata bloom filter index is disabled!");
      return Option.empty();
    }

    final Pair<String, String> partitionFileName = Pair.of(partitionName, fileName);
    Map<Pair<String, String>, BloomFilter> bloomFilters = getBloomFilters(Collections.singletonList(partitionFileName));
    if (bloomFilters.isEmpty()) {
      LOG.error("Meta index: missing bloom filter for partition: {}, file: {}", partitionName, fileName);
      return Option.empty();
    }

    ValidationUtils.checkState(bloomFilters.containsKey(partitionFileName));
    return Option.of(bloomFilters.get(partitionFileName));
  }

  @Override
  public Map<Pair<String, String>, BloomFilter> getBloomFilters(final List<Pair<String, String>> partitionNameFileNameList)
      throws HoodieMetadataException {
    if (!dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.BLOOM_FILTERS)) {
      LOG.error("Metadata bloom filter index is disabled!");
      return Collections.emptyMap();
    }
    if (partitionNameFileNameList.isEmpty()) {
      return Collections.emptyMap();
    }

    HoodieTimer timer = HoodieTimer.start();
    Set<String> partitionIDFileIDStrings = new HashSet<>();
    Map<String, Pair<String, String>> fileToKeyMap = new HashMap<>();
    partitionNameFileNameList.forEach(partitionNameFileNamePair -> {
      final String bloomFilterIndexKey = HoodieMetadataPayload.getBloomFilterIndexKey(
          new PartitionIndexID(HoodieTableMetadataUtil.getBloomFilterIndexPartitionIdentifier(partitionNameFileNamePair.getLeft())),
          new FileIndexID(partitionNameFileNamePair.getRight()));
      partitionIDFileIDStrings.add(bloomFilterIndexKey);
      fileToKeyMap.put(bloomFilterIndexKey, partitionNameFileNamePair);
    });

    List<String> partitionIDFileIDStringsList = new ArrayList<>(partitionIDFileIDStrings);
    Map<String, HoodieRecord<HoodieMetadataPayload>> hoodieRecords =
        getRecordsByKeys(partitionIDFileIDStringsList, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_BLOOM_FILTERS_METADATA_STR, timer.endTimer()));
    metrics.ifPresent(m -> m.setMetric(HoodieMetadataMetrics.LOOKUP_BLOOM_FILTERS_FILE_COUNT_STR, partitionIDFileIDStringsList.size()));

    Map<Pair<String, String>, BloomFilter> partitionFileToBloomFilterMap = new HashMap<>(hoodieRecords.size());
    for (final Map.Entry<String, HoodieRecord<HoodieMetadataPayload>> entry : hoodieRecords.entrySet()) {
      final Option<HoodieMetadataBloomFilter> bloomFilterMetadata = entry.getValue().getData().getBloomFilterMetadata();
      if (bloomFilterMetadata.isPresent()) {
        if (!bloomFilterMetadata.get().getIsDeleted()) {
          ValidationUtils.checkState(fileToKeyMap.containsKey(entry.getKey()));
          // NOTE: We have to duplicate the [[ByteBuffer]] object here since:
          //        - Reading out [[ByteBuffer]] mutates its state
          //        - [[BloomFilterMetadata]] could be re-used, and hence have to stay immutable
          final ByteBuffer bloomFilterByteBuffer = bloomFilterMetadata.get().getBloomFilter().duplicate();
          final String bloomFilterType = bloomFilterMetadata.get().getType();
          final BloomFilter bloomFilter = BloomFilterFactory.fromString(
              StandardCharsets.UTF_8.decode(bloomFilterByteBuffer).toString(), bloomFilterType);
          partitionFileToBloomFilterMap.put(fileToKeyMap.get(entry.getKey()), bloomFilter);
        }
      } else {
        LOG.error("Meta index bloom filter missing for: {}", fileToKeyMap.get(entry.getKey()));
      }
    }
    return partitionFileToBloomFilterMap;
  }
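
  // Illustrative usage sketch (editor's note, not part of the Hudi source): bloom filter
  // lookups are batched and keyed by (partition name, file name) pairs; the instance and
  // literal values below are hypothetical:
  //
  //   Map<Pair<String, String>, BloomFilter> filters = tableMetadata.getBloomFilters(
  //       Collections.singletonList(Pair.of("2024/01/01", "some-file.parquet")));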

  @Override
  public Map<Pair<String, String>, HoodieMetadataColumnStats> getColumnStats(final List<Pair<String, String>> partitionNameFileNameList, final String columnName)
      throws HoodieMetadataException {
    if (!dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.COLUMN_STATS)) {
      LOG.error("Metadata column stats index is disabled!");
      return Collections.emptyMap();
    }

    Map<String, Pair<String, String>> columnStatKeyToFileNameMap = new HashMap<>();
    Set<String> columnStatKeyset = new HashSet<>();
    final ColumnIndexID columnIndexID = new ColumnIndexID(columnName);
    for (Pair<String, String> partitionNameFileNamePair : partitionNameFileNameList) {
      final String columnStatsIndexKey = HoodieMetadataPayload.getColumnStatsIndexKey(
          new PartitionIndexID(HoodieTableMetadataUtil.getColumnStatsIndexPartitionIdentifier(partitionNameFileNamePair.getLeft())),
          new FileIndexID(partitionNameFileNamePair.getRight()),
          columnIndexID);
      columnStatKeyset.add(columnStatsIndexKey);
      columnStatKeyToFileNameMap.put(columnStatsIndexKey, partitionNameFileNamePair);
    }

    List<String> columnStatKeylist = new ArrayList<>(columnStatKeyset);
    HoodieTimer timer = HoodieTimer.start();
    Map<String, HoodieRecord<HoodieMetadataPayload>> hoodieRecords =
        getRecordsByKeys(columnStatKeylist, MetadataPartitionType.COLUMN_STATS.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, timer.endTimer()));
    metrics.ifPresent(m -> m.setMetric(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_FILE_COUNT_STR, columnStatKeylist.size()));

    Map<Pair<String, String>, HoodieMetadataColumnStats> fileToColumnStatMap = new HashMap<>();
    for (final Map.Entry<String, HoodieRecord<HoodieMetadataPayload>> entry : hoodieRecords.entrySet()) {
      final Option<HoodieMetadataColumnStats> columnStatMetadata = entry.getValue().getData().getColumnStatMetadata();
      if (columnStatMetadata.isPresent()) {
        if (!columnStatMetadata.get().getIsDeleted()) {
          ValidationUtils.checkState(columnStatKeyToFileNameMap.containsKey(entry.getKey()));
          final Pair<String, String> partitionFileNamePair = columnStatKeyToFileNameMap.get(entry.getKey());
          ValidationUtils.checkState(!fileToColumnStatMap.containsKey(partitionFileNamePair));
          fileToColumnStatMap.put(partitionFileNamePair, columnStatMetadata.get());
        }
      } else {
        LOG.error("Meta index column stats missing for: {}", entry.getKey());
      }
    }
    return fileToColumnStatMap;
  }
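
  // Illustrative usage sketch (editor's note, not part of the Hudi source): column stats
  // are fetched per (partition name, file name) pair for a single column at a time; the
  // instance and literal values below are hypothetical:
  //
  //   Map<Pair<String, String>, HoodieMetadataColumnStats> stats = tableMetadata.getColumnStats(
  //       Collections.singletonList(Pair.of("2024/01/01", "some-file.parquet")), "price");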

  /**
   * Reads record keys from record-level index.
   * <p>
   * If the Metadata Table is not enabled, an exception is thrown to distinguish this from the absence of the key.
   *
   * @param recordKeys The list of record keys to read
   */
  @Override
  public Map<String, List<HoodieRecordGlobalLocation>> readRecordIndex(List<String> recordKeys) {
    // If record index is not initialized yet, we cannot return an empty result here unlike the code for reading from other
    // indexes. This is because results from this function are used for upserts and returning an empty result here would lead
    // to existing records being inserted again causing duplicates.
    // The caller is required to check for record index existence in MDT before calling this method.
    ValidationUtils.checkState(dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.RECORD_INDEX),
        "Record index is not initialized in MDT");

    HoodieTimer timer = HoodieTimer.start();
    Map<String, List<HoodieRecord<HoodieMetadataPayload>>> result =
        getAllRecordsByKeys(recordKeys, MetadataPartitionType.RECORD_INDEX.getPartitionPath());
    Map<String, List<HoodieRecordGlobalLocation>> recordKeyToLocation = new HashMap<>(result.size());
    result.forEach((key, records) -> records.forEach(record -> {
      if (!record.getData().isDeleted()) {
        List<HoodieRecordGlobalLocation> locations = recordKeyToLocation.getOrDefault(key, new ArrayList<>());
        locations.add(record.getData().getRecordGlobalLocation());
        recordKeyToLocation.put(key, locations);
      }
    }));

    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_RECORD_INDEX_TIME_STR, timer.endTimer()));
    metrics.ifPresent(m -> m.setMetric(HoodieMetadataMetrics.LOOKUP_RECORD_INDEX_KEYS_COUNT_STR, recordKeys.size()));
    metrics.ifPresent(m -> m.setMetric(HoodieMetadataMetrics.LOOKUP_RECORD_INDEX_KEYS_HITS_COUNT_STR, recordKeyToLocation.size()));

    return recordKeyToLocation;
  }
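
  // Illustrative usage sketch (editor's note, not part of the Hudi source): as the comment
  // above explains, callers must confirm the record index partition exists before calling,
  // since an empty result is indistinguishable from "key not present"; names are hypothetical:
  //
  //   if (metaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.RECORD_INDEX)) {
  //     Map<String, List<HoodieRecordGlobalLocation>> locations =
  //         tableMetadata.readRecordIndex(Arrays.asList("key1", "key2"));
  //   }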

  /**
   * Get record-location using secondary-index and record-index.
   * <p>
   * If the Metadata Table is not enabled, an exception is thrown to distinguish this from the absence of the key.
   *
   * @param secondaryKeys The list of secondary keys to read
   */
  @Override
  public Map<String, List<HoodieRecordGlobalLocation>> readSecondaryIndex(List<String> secondaryKeys, String partitionName) {
    ValidationUtils.checkState(dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.RECORD_INDEX),
        "Record index is not initialized in MDT");
    ValidationUtils.checkState(
        dataMetaClient.getTableConfig().getMetadataPartitions().contains(partitionName),
        "Secondary index is not initialized in MDT for: " + partitionName);

    // Fetch secondary-index records
    Map<String, List<HoodieRecord<HoodieMetadataPayload>>> secondaryKeyRecords = getSecondaryIndexRecords(secondaryKeys, partitionName);

    // Now collect the record-keys and fetch the RLI records
    List<String> recordKeys = new ArrayList<>();
    secondaryKeyRecords.forEach((key, records) -> records.forEach(record -> {
      if (!record.getData().isDeleted()) {
        recordKeys.add(record.getData().getRecordKeyFromSecondaryIndex());
      }
    }));
    return readRecordIndex(recordKeys);
  }

  /**
   * Returns a map of (record-key -> secondary-key) for the provided record keys.
   */
  public Map<String, String> getSecondaryKeys(List<String> recordKeys) {
    ValidationUtils.checkState(dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.RECORD_INDEX),
        "Record index is not initialized in MDT");
    ValidationUtils.checkState(dataMetaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.SECONDARY_INDEX),
        "Secondary index is not initialized in MDT");
    return getSecondaryKeysForRecordKeys(recordKeys, MetadataPartitionType.SECONDARY_INDEX.getPartitionPath());
  }
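
  // Illustrative flow sketch (editor's note, not part of the Hudi source): a secondary-index
  // lookup resolves in two steps, secondary key to record key(s) via the secondary index,
  // then record key to file location via readRecordIndex above; names and the partition
  // identifier below are hypothetical:
  //
  //   Map<String, List<HoodieRecordGlobalLocation>> locations =
  //       tableMetadata.readSecondaryIndex(Arrays.asList("city=sf"), "secondary_index_idx_city");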

  /**
   * Returns a list of all partitions.
   */
  protected List<String> fetchAllPartitionPaths() {
    HoodieTimer timer = HoodieTimer.start();
    Option<HoodieRecord<HoodieMetadataPayload>> recordOpt = getRecordByKey(RECORDKEY_PARTITION_LIST,
        MetadataPartitionType.FILES.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_PARTITIONS_STR, timer.endTimer()));

    List<String> partitions = recordOpt.map(record -> {
      HoodieMetadataPayload metadataPayload = record.getData();
      checkForSpuriousDeletes(metadataPayload, "\"all partitions\"");

      List<String> relativePaths = metadataPayload.getFilenames();
      // Non-partitioned tables have a single empty partition
      if (relativePaths.size() == 1 && relativePaths.get(0).equals(NON_PARTITIONED_NAME)) {
        return Collections.singletonList("");
      } else {
        return relativePaths;
      }
    })
        .orElse(Collections.emptyList());

    LOG.info("Listed partitions from metadata: #partitions={}", partitions.size());
    return partitions;
  }

  /**
   * Return all the files from the partition.
   *
   * @param partitionPath The absolute path of the partition
   */
  List<StoragePathInfo> fetchAllFilesInPartition(StoragePath partitionPath) {
    String relativePartitionPath = FSUtils.getRelativePartitionPath(dataBasePath, partitionPath);
    String recordKey = relativePartitionPath.isEmpty() ? NON_PARTITIONED_NAME : relativePartitionPath;

    HoodieTimer timer = HoodieTimer.start();
    Option<HoodieRecord<HoodieMetadataPayload>> recordOpt = getRecordByKey(recordKey,
        MetadataPartitionType.FILES.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer()));

    List<StoragePathInfo> pathInfoList = recordOpt
        .map(record -> {
          HoodieMetadataPayload metadataPayload = record.getData();
          checkForSpuriousDeletes(metadataPayload, recordKey);
          try {
            return metadataPayload.getFileList(dataMetaClient.getStorage(), partitionPath);
          } catch (Exception e) {
            throw new HoodieException("Failed to extract file-pathInfoList from the payload", e);
          }
        })
        .orElseGet(Collections::emptyList);

    LOG.debug("Listed file in partition from metadata: partition={}, #files={}", relativePartitionPath, pathInfoList.size());
    return pathInfoList;
  }

  Map<String, List<StoragePathInfo>> fetchAllFilesInPartitionPaths(List<StoragePath> partitionPaths) {
    Map<String, StoragePath> partitionIdToPathMap =
        partitionPaths.parallelStream()
            .collect(
                Collectors.toMap(partitionPath -> {
                  String partitionId = FSUtils.getRelativePartitionPath(dataBasePath, partitionPath);
                  return partitionId.isEmpty() ? NON_PARTITIONED_NAME : partitionId;
                }, Function.identity())
            );

    HoodieTimer timer = HoodieTimer.start();
    Map<String, HoodieRecord<HoodieMetadataPayload>> partitionIdRecordPairs =
        getRecordsByKeys(new ArrayList<>(partitionIdToPathMap.keySet()), MetadataPartitionType.FILES.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer()));

    Map<String, List<StoragePathInfo>> partitionPathToFilesMap = partitionIdRecordPairs.entrySet().stream()
        .map(e -> {
          final String partitionId = e.getKey();
          StoragePath partitionPath = partitionIdToPathMap.get(partitionId);

          HoodieMetadataPayload metadataPayload = e.getValue().getData();
          checkForSpuriousDeletes(metadataPayload, partitionId);

          List<StoragePathInfo> files = metadataPayload.getFileList(dataMetaClient.getStorage(), partitionPath);
          return Pair.of(partitionPath.toString(), files);
        })
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

    LOG.info("Listed files in {} partitions from metadata", partitionPaths.size());
    return partitionPathToFilesMap;
  }

  /**
   * Handle spurious deletes. Depending on config, throw an exception or log warn msg.
   */
  private void checkForSpuriousDeletes(HoodieMetadataPayload metadataPayload, String partitionName) {
    if (!metadataPayload.getDeletions().isEmpty()) {
      if (metadataConfig.shouldIgnoreSpuriousDeletes()) {
        LOG.warn("Metadata record for " + partitionName + " encountered some files to be deleted which was not added before. "
            + "Ignoring the spurious deletes as the `" + HoodieMetadataConfig.IGNORE_SPURIOUS_DELETES.key() + "` config is set to true");
      } else {
        throw new HoodieMetadataException("Metadata record for " + partitionName + " is inconsistent: " + metadataPayload);
      }
    }
  }
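
  // Illustrative configuration sketch (editor's note, not part of the Hudi source): whether
  // checkForSpuriousDeletes above warns or throws is driven by the
  // HoodieMetadataConfig.IGNORE_SPURIOUS_DELETES setting; one hedged way to wire it, assuming
  // the standard properties-based config builder:
  //
  //   Properties props = new Properties();
  //   props.put(HoodieMetadataConfig.IGNORE_SPURIOUS_DELETES.key(), "true");
  //   HoodieMetadataConfig cfg = HoodieMetadataConfig.newBuilder().fromProperties(props).build();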

  protected abstract Option<HoodieRecord<HoodieMetadataPayload>> getRecordByKey(String key, String partitionName);

  protected abstract Map<String, HoodieRecord<HoodieMetadataPayload>> getRecordsByKeys(List<String> keys, String partitionName);

  protected abstract Map<String, String> getSecondaryKeysForRecordKeys(List<String> recordKeys, String partitionName);

  /**
   * Returns a map of (record-key -> list-of-secondary-index-records) for the provided secondary keys.
   */
  protected abstract Map<String, List<HoodieRecord<HoodieMetadataPayload>>> getSecondaryIndexRecords(List<String> keys, String partitionName);

  public HoodieMetadataConfig getMetadataConfig() {
    return metadataConfig;
  }

  protected StorageConfiguration<?> getStorageConf() {
    return dataMetaClient.getStorageConf();
  }

  protected String getLatestDataInstantTime() {
    return dataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant()
        .map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
  }

  public boolean isMetadataTableInitialized() {
    return isMetadataTableInitialized;
  }
}