/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metadata;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieMetadataRecord;
import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.data.HoodieListData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecord.HoodieRecordType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Functions;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.SpillableMapUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ClosableIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.expression.BindVisitor;
import org.apache.hudi.expression.Expression;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.io.storage.HoodieIOFactory;
import org.apache.hudi.io.storage.HoodieSeekingFileReader;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.util.Transient;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FULL_SCAN_LOG_FILES;
import static org.apache.hudi.common.config.HoodieReaderConfig.USE_NATIVE_HFILE_READER;
import static org.apache.hudi.common.util.CollectionUtils.toStream;
import static org.apache.hudi.common.util.ConfigUtils.DEFAULT_HUDI_CONFIG_FOR_READER;
import static org.apache.hudi.common.util.ValidationUtils.checkState;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_FILES;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getFileSystemView;
/**
* Table metadata provided by an internal DFS backed Hudi metadata table.
*/
public class HoodieBackedTableMetadata extends BaseTableMetadata {
private static final Logger LOG = LoggerFactory.getLogger(HoodieBackedTableMetadata.class);
private final String metadataBasePath;
private HoodieTableMetaClient metadataMetaClient;
private HoodieTableConfig metadataTableConfig;
private HoodieTableFileSystemView metadataFileSystemView;
// Whether to reuse open file handles across calls
private final boolean reuse;
// Readers for the latest file slice corresponding to file groups in the metadata partition
private final Transient<Map<Pair<String, String>, Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader>>> partitionReaders =
Transient.lazy(ConcurrentHashMap::new);
// Latest file slices in the metadata partitions
private final Map<String, List<FileSlice>> partitionFileSliceMap = new ConcurrentHashMap<>();
public HoodieBackedTableMetadata(HoodieEngineContext engineContext,
HoodieStorage storage,
HoodieMetadataConfig metadataConfig,
String datasetBasePath) {
this(engineContext, storage, metadataConfig, datasetBasePath, false);
}
public HoodieBackedTableMetadata(HoodieEngineContext engineContext,
HoodieStorage storage,
HoodieMetadataConfig metadataConfig,
String datasetBasePath, boolean reuse) {
super(engineContext, storage, metadataConfig, datasetBasePath);
this.reuse = reuse;
this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(dataBasePath.toString());
initIfNeeded();
}
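/**
* Lazily instantiate the metadata table meta client, file system view and table config, if the
* metadata table has been initialized and they have not been created yet. Any failure while
* opening the metadata table marks it as not initialized.
*/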
private void initIfNeeded() {
if (!isMetadataTableInitialized) {
if (!HoodieTableMetadata.isMetadataTable(metadataBasePath)) {
LOG.info("Metadata table is disabled.");
}
} else if (this.metadataMetaClient == null) {
try {
this.metadataMetaClient = HoodieTableMetaClient.builder()
.setStorage(storage)
.setBasePath(metadataBasePath)
.build();
this.metadataFileSystemView = getFileSystemView(metadataMetaClient);
this.metadataTableConfig = metadataMetaClient.getTableConfig();
} catch (TableNotFoundException e) {
LOG.warn("Metadata table was not found at path " + metadataBasePath);
this.isMetadataTableInitialized = false;
this.metadataMetaClient = null;
this.metadataFileSystemView = null;
this.metadataTableConfig = null;
} catch (Exception e) {
LOG.error("Failed to initialize metadata table at path " + metadataBasePath, e);
this.isMetadataTableInitialized = false;
this.metadataMetaClient = null;
this.metadataFileSystemView = null;
this.metadataTableConfig = null;
}
}
}
@Override
protected Option<HoodieRecord<HoodieMetadataPayload>> getRecordByKey(String key, String partitionName) {
Map<String, HoodieRecord<HoodieMetadataPayload>> recordsByKeys = getRecordsByKeys(Collections.singletonList(key), partitionName);
return Option.ofNullable(recordsByKeys.get(key));
}
@Override
public List<String> getPartitionPathWithPathPrefixUsingFilterExpression(List<String> relativePathPrefixes,
Types.RecordType partitionFields,
Expression expression) throws IOException {
Expression boundedExpr = expression.accept(new BindVisitor(partitionFields, caseSensitive));
List selectedPartitionPaths = getPartitionPathWithPathPrefixes(relativePathPrefixes);
// Can only prune partitions if the number of partition levels matches partition fields
// Here we'll check the first selected partition to see whether the numbers match.
if (hiveStylePartitioningEnabled
&& getPathPartitionLevel(partitionFields, selectedPartitionPaths.get(0)) == partitionFields.fields().size()) {
return selectedPartitionPaths.stream()
.filter(p ->
(boolean) boundedExpr.eval(extractPartitionValues(partitionFields, p, urlEncodePartitioningEnabled)))
.collect(Collectors.toList());
}
return selectedPartitionPaths;
}
@Override
public List<String> getPartitionPathWithPathPrefixes(List<String> relativePathPrefixes) throws IOException {
// TODO: consider skipping this method for non-partitioned table and simplify the checks
return getAllPartitionPaths().stream()
.filter(p -> relativePathPrefixes.stream().anyMatch(relativePathPrefix ->
// Partition paths stored in metadata table do not have the slash at the end.
// If the relativePathPrefix is empty, return all partition paths;
// else if the relative path prefix is the same as the path, this is an exact match;
// else, we need to make sure the path is a subdirectory of relativePathPrefix, by
// checking if the path starts with relativePathPrefix appended by a slash ("/").
StringUtils.isNullOrEmpty(relativePathPrefix)
|| p.equals(relativePathPrefix) || p.startsWith(relativePathPrefix + "/")))
.collect(Collectors.toList());
}
@Override
public HoodieData<HoodieRecord<HoodieMetadataPayload>> getRecordsByKeyPrefixes(List<String> keyPrefixes,
String partitionName,
boolean shouldLoadInMemory) {
// Sort the prefixes so that keys are looked up in order
List<String> sortedKeyPrefixes = new ArrayList<>(keyPrefixes);
Collections.sort(sortedKeyPrefixes);
// NOTE: Since we partition records to a particular file-group by full key, we will have
// to scan all file-groups for all key-prefixes as each of these might contain some
// records matching the key-prefix
List<FileSlice> partitionFileSlices = partitionFileSliceMap.computeIfAbsent(partitionName,
k -> HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, metadataFileSystemView, partitionName));
checkState(!partitionFileSlices.isEmpty(), "Number of file slices for partition " + partitionName + " should be > 0");
return (shouldLoadInMemory ? HoodieListData.lazy(partitionFileSlices) :
engineContext.parallelize(partitionFileSlices))
.flatMap(
(SerializableFunction<FileSlice, Iterator<HoodieRecord<HoodieMetadataPayload>>>) fileSlice -> {
// NOTE: Since this will be executed by executors, we can't access previously cached
// readers, and therefore have to always open new ones
Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> readers =
openReaders(partitionName, fileSlice);
try {
List<Long> timings = new ArrayList<>();
HoodieSeekingFileReader<?> baseFileReader = readers.getKey();
HoodieMetadataLogRecordReader logRecordScanner = readers.getRight();
if (baseFileReader == null && logRecordScanner == null) {
// TODO: what do we do if both do not exist? Should we throw an exception and let the caller do the fallback?
return Collections.emptyIterator();
}
boolean fullKeys = false;
Map<String, HoodieRecord<HoodieMetadataPayload>> logRecords =
readLogRecords(logRecordScanner, sortedKeyPrefixes, fullKeys, timings);
Map<String, HoodieRecord<HoodieMetadataPayload>> mergedRecords =
readFromBaseAndMergeWithLogRecords(baseFileReader, sortedKeyPrefixes, fullKeys, logRecords, timings, partitionName);
LOG.debug(String.format("Metadata read for %s keys took [baseFileRead, logMerge] %s ms",
sortedKeyPrefixes.size(), timings));
return mergedRecords.values().iterator();
} catch (IOException ioe) {
throw new HoodieIOException("Error merging records from metadata table for " + sortedKeyPrefixes.size() + " key prefixes", ioe);
} finally {
closeReader(readers);
}
});
}
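/**
* Lookup the given keys in the given metadata partition. With a single file slice the lookup is done
* inline; with multiple file slices the keys are grouped by file group index and looked up in parallel
* through the engine context.
*/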
@Override
protected Map<String, HoodieRecord<HoodieMetadataPayload>> getRecordsByKeys(List<String> keys, String partitionName) {
if (keys.isEmpty()) {
return Collections.emptyMap();
}
Map<String, HoodieRecord<HoodieMetadataPayload>> result;
// Load the file slices for the partition. Each file slice is a shard which saves a portion of the keys.
List<FileSlice> partitionFileSlices = partitionFileSliceMap.computeIfAbsent(partitionName,
k -> HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, metadataFileSystemView, partitionName));
final int numFileSlices = partitionFileSlices.size();
checkState(numFileSlices > 0, "Number of file slices for partition " + partitionName + " should be > 0");
// Lookup keys from each file slice
if (numFileSlices == 1) {
// Optimization for a single slice for smaller metadata table partitions
result = lookupKeysFromFileSlice(partitionName, keys, partitionFileSlices.get(0));
} else {
// Parallel lookup for large sized partitions with many file slices
// Partition the keys by the file slice which contains them
ArrayList<ArrayList<String>> partitionedKeys = partitionKeysByFileSlices(keys, numFileSlices);
result = new HashMap<>(keys.size());
getEngineContext().setJobStatus(this.getClass().getSimpleName(), "Reading keys from metadata table partition " + partitionName);
getEngineContext().map(partitionedKeys, keysList -> {
if (keysList.isEmpty()) {
return Collections.<String, HoodieRecord<HoodieMetadataPayload>>emptyMap();
}
int shardIndex = HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(keysList.get(0), numFileSlices);
return lookupKeysFromFileSlice(partitionName, keysList, partitionFileSlices.get(shardIndex));
}, partitionedKeys.size()).forEach(result::putAll);
}
return result;
}
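/**
* Split the given keys into one list per file slice, based on
* {@code HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex}.
*/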
private static ArrayList<ArrayList<String>> partitionKeysByFileSlices(List<String> keys, int numFileSlices) {
ArrayList> partitionedKeys = new ArrayList<>(numFileSlices);
for (int i = 0; i < numFileSlices; ++i) {
partitionedKeys.add(new ArrayList<>());
}
keys.forEach(key -> {
int shardIndex = HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(key, numFileSlices);
partitionedKeys.get(shardIndex).add(key);
});
return partitionedKeys;
}
@Override
public Map<String, List<HoodieRecord<HoodieMetadataPayload>>> getAllRecordsByKeys(List<String> keys, String partitionName) {
if (keys.isEmpty()) {
return Collections.emptyMap();
}
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> result;
// Load the file slices for the partition. Each file slice is a shard which saves a portion of the keys.
List<FileSlice> partitionFileSlices = partitionFileSliceMap.computeIfAbsent(partitionName,
k -> HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, metadataFileSystemView, partitionName));
final int numFileSlices = partitionFileSlices.size();
checkState(numFileSlices > 0, "Number of file slices for partition " + partitionName + " should be > 0");
// Lookup keys from each file slice
if (numFileSlices == 1) {
// Optimization for a single slice for smaller metadata table partitions
result = lookupAllKeysFromFileSlice(partitionName, keys, partitionFileSlices.get(0));
} else {
// Parallel lookup for large sized partitions with many file slices
// Partition the keys by the file slice which contains them
ArrayList<ArrayList<String>> partitionedKeys = partitionKeysByFileSlices(keys, numFileSlices);
result = new HashMap<>(keys.size());
getEngineContext().setJobStatus(this.getClass().getSimpleName(), "Reading keys from metadata table partition " + partitionName);
getEngineContext().map(partitionedKeys, keysList -> {
if (keysList.isEmpty()) {
return Collections.<String, List<HoodieRecord<HoodieMetadataPayload>>>emptyMap();
}
int shardIndex = HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(keysList.get(0), numFileSlices);
return lookupAllKeysFromFileSlice(partitionName, keysList, partitionFileSlices.get(shardIndex));
}, partitionedKeys.size()).forEach(map -> result.putAll((Map<String, List<HoodieRecord<HoodieMetadataPayload>>>) map));
}
return result;
}
/**
* Lookup list of keys from a single file slice.
*
* @param partitionName Name of the partition
* @param keys The list of keys to lookup
* @param fileSlice The file slice to read
* @return A {@code Map} of key name to {@code HoodieRecord} for the keys which were found in the file slice
*/
private Map<String, HoodieRecord<HoodieMetadataPayload>> lookupKeysFromFileSlice(String partitionName, List<String> keys, FileSlice fileSlice) {
Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> readers = getOrCreateReaders(partitionName, fileSlice);
try {
HoodieSeekingFileReader<?> baseFileReader = readers.getKey();
HoodieMetadataLogRecordReader logRecordScanner = readers.getRight();
if (baseFileReader == null && logRecordScanner == null) {
return Collections.emptyMap();
}
// Sort it here once so that we don't need to sort individually for the base file and for each log file.
List<String> sortedKeys = new ArrayList<>(keys);
Collections.sort(sortedKeys);
boolean fullKeys = true;
List<Long> timings = new ArrayList<>(1);
Map<String, HoodieRecord<HoodieMetadataPayload>> logRecords = readLogRecords(logRecordScanner, sortedKeys, fullKeys, timings);
return readFromBaseAndMergeWithLogRecords(baseFileReader, sortedKeys, fullKeys, logRecords, timings, partitionName);
} catch (IOException ioe) {
throw new HoodieIOException("Error merging records from metadata table for " + keys.size() + " keys", ioe);
} finally {
if (!reuse) {
closeReader(readers);
}
}
}
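/**
* Read records for the given sorted keys (or key prefixes, when {@code fullKey} is false) from the
* log record reader. The elapsed time is appended to {@code timings}.
*/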
private Map<String, HoodieRecord<HoodieMetadataPayload>> readLogRecords(HoodieMetadataLogRecordReader logRecordReader,
List<String> sortedKeys,
boolean fullKey,
List<Long> timings) {
HoodieTimer timer = HoodieTimer.start();
if (logRecordReader == null) {
timings.add(timer.endTimer());
return Collections.emptyMap();
}
try {
return fullKey ? logRecordReader.getRecordsByKeys(sortedKeys) : logRecordReader.getRecordsByKeyPrefixes(sortedKeys);
} finally {
timings.add(timer.endTimer());
}
}
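/**
* Read the given keys from the base file and merge them with the provided log records.
* The log record is combined with the base file record via {@code preCombine}, and records whose
* merged payload is a delete are dropped from the result.
*/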
private Map<String, HoodieRecord<HoodieMetadataPayload>> readFromBaseAndMergeWithLogRecords(HoodieSeekingFileReader<?> reader,
List<String> sortedKeys,
boolean fullKeys,
Map<String, HoodieRecord<HoodieMetadataPayload>> logRecords,
List<Long> timings,
String partitionName) throws IOException {
HoodieTimer timer = HoodieTimer.start();
if (reader == null) {
// No base file at all
timings.add(timer.endTimer());
return logRecords;
}
HoodieTimer readTimer = HoodieTimer.start();
Map<String, HoodieRecord<HoodieMetadataPayload>> records =
fetchBaseFileRecordsByKeys(reader, sortedKeys, fullKeys, partitionName);
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer()));
// Iterate over all provided log-records, merging them into existing records
logRecords.values().forEach(logRecord ->
records.merge(
logRecord.getRecordKey(),
logRecord,
(oldRecord, newRecord) -> {
HoodieMetadataPayload mergedPayload = newRecord.getData().preCombine(oldRecord.getData());
return mergedPayload.isDeleted() ? null : new HoodieAvroRecord<>(oldRecord.getKey(), mergedPayload);
}
));
timings.add(timer.endTimer());
return records;
}
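/**
* Fetch records for the given sorted keys (or key prefixes) from the base file reader, keyed by the
* key field of the metadata payload.
*/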
@SuppressWarnings("unchecked")
private Map<String, HoodieRecord<HoodieMetadataPayload>> fetchBaseFileRecordsByKeys(HoodieSeekingFileReader reader,
List<String> sortedKeys,
boolean fullKeys,
String partitionName) throws IOException {
Map<String, HoodieRecord<HoodieMetadataPayload>> result;
try (ClosableIterator<HoodieRecord<?>> records = fullKeys
? reader.getRecordsByKeysIterator(sortedKeys)
: reader.getRecordsByKeyPrefixIterator(sortedKeys)) {
result = toStream(records)
.map(record -> {
GenericRecord data = (GenericRecord) record.getData();
return Pair.of(
(String) (data).get(HoodieMetadataPayload.KEY_FIELD_NAME),
composeRecord(data, partitionName));
})
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
}
return result;
}
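/**
* Lookup the given keys in a single file slice, returning all matching records per key after merging
* the base file records with the log records.
*/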
private Map<String, List<HoodieRecord<HoodieMetadataPayload>>> lookupAllKeysFromFileSlice(String partitionName, List<String> keys, FileSlice fileSlice) {
Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> readers = getOrCreateReaders(partitionName, fileSlice);
try {
List<Long> timings = new ArrayList<>();
HoodieSeekingFileReader<?> baseFileReader = readers.getKey();
HoodieMetadataLogRecordReader logRecordScanner = readers.getRight();
if (baseFileReader == null && logRecordScanner == null) {
return Collections.emptyMap();
}
// Sort it here once so that we don't need to sort individually for the base file and for each log file.
List<String> sortedKeys = new ArrayList<>(keys);
Collections.sort(sortedKeys);
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> logRecords = readAllLogRecords(logRecordScanner, sortedKeys, timings);
return readFromBaseAndMergeWithAllLogRecords(baseFileReader, sortedKeys, true, logRecords, timings, partitionName);
} catch (IOException ioe) {
throw new HoodieIOException("Error merging records from metadata table for " + keys.size() + " keys", ioe);
}
}
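/**
* Read all records for the given sorted keys from the log record reader. The elapsed time is
* appended to {@code timings}.
*/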
private Map<String, List<HoodieRecord<HoodieMetadataPayload>>> readAllLogRecords(HoodieMetadataLogRecordReader logRecordReader,
List<String> sortedKeys,
List<Long> timings) {
HoodieTimer timer = HoodieTimer.start();
if (logRecordReader == null) {
timings.add(timer.endTimer());
return Collections.emptyMap();
}
try {
return logRecordReader.getAllRecordsByKeys(sortedKeys);
} finally {
timings.add(timer.endTimer());
}
}
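/**
* Read the given keys from the base file and merge each key's record list with the provided log
* records. Only the first record of each list is combined via {@code preCombine}; merge results that
* are deletes are dropped.
*/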
private Map<String, List<HoodieRecord<HoodieMetadataPayload>>> readFromBaseAndMergeWithAllLogRecords(HoodieSeekingFileReader<?> reader,
List<String> sortedKeys,
boolean fullKeys,
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> logRecords,
List<Long> timings,
String partitionName) throws IOException {
HoodieTimer timer = HoodieTimer.start();
if (reader == null) {
// No base file at all
timings.add(timer.endTimer());
return logRecords;
}
HoodieTimer readTimer = HoodieTimer.start();
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> records =
fetchBaseFileAllRecordsByKeys(reader, sortedKeys, fullKeys, partitionName);
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer()));
// Iterate over all provided log-records, merging them into existing records
logRecords.entrySet().forEach(kv -> {
records.merge(
kv.getKey(),
kv.getValue(),
(oldRecordList, newRecordList) -> {
List<HoodieRecord<HoodieMetadataPayload>> mergedRecordList = new ArrayList<>();
HoodieMetadataPayload mergedPayload = null;
HoodieKey key = null;
if (!oldRecordList.isEmpty() && !newRecordList.isEmpty()) {
mergedPayload = newRecordList.get(0).getData().preCombine(oldRecordList.get(0).getData());
key = newRecordList.get(0).getKey();
} else if (!oldRecordList.isEmpty()) {
mergedPayload = oldRecordList.get(0).getData();
key = oldRecordList.get(0).getKey();
} else if (!newRecordList.isEmpty()) {
mergedPayload = newRecordList.get(0).getData();
key = newRecordList.get(0).getKey();
}
if (mergedPayload != null && !mergedPayload.isDeleted()) {
mergedRecordList.add(new HoodieAvroRecord<>(key, mergedPayload));
}
return mergedRecordList;
}
);
});
timings.add(timer.endTimer());
return records;
}
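/**
* Fetch records for the given sorted keys (or key prefixes) from the base file reader, grouped into a
* list per key so that multiple records per key are preserved.
*/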
private Map<String, List<HoodieRecord<HoodieMetadataPayload>>> fetchBaseFileAllRecordsByKeys(HoodieSeekingFileReader reader,
List<String> sortedKeys,
boolean fullKeys,
String partitionName) throws IOException {
ClosableIterator<HoodieRecord<?>> records = fullKeys
? reader.getRecordsByKeysIterator(sortedKeys)
: reader.getRecordsByKeyPrefixIterator(sortedKeys);
return toStream(records)
.map(record -> {
GenericRecord data = (GenericRecord) record.getData();
return Pair.of(
(String) (data).get(HoodieMetadataPayload.KEY_FIELD_NAME),
composeRecord(data, partitionName));
})
.collect(Collectors.groupingBy(Pair::getKey, Collectors.mapping(Pair::getValue, Collectors.toList())));
}
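/**
* Convert an Avro record read from the metadata table into a {@code HoodieRecord} with a
* {@code HoodieMetadataPayload}, using the metadata table's payload class and honoring whether
* meta fields are populated.
*/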
private HoodieRecord<HoodieMetadataPayload> composeRecord(GenericRecord avroRecord, String partitionName) {
if (metadataTableConfig.populateMetaFields()) {
return SpillableMapUtils.convertToHoodieRecordPayload(avroRecord,
metadataTableConfig.getPayloadClass(), metadataTableConfig.getPreCombineField(), false);
}
return SpillableMapUtils.convertToHoodieRecordPayload(avroRecord,
metadataTableConfig.getPayloadClass(), metadataTableConfig.getPreCombineField(),
Pair.of(metadataTableConfig.getRecordKeyFieldProp(), metadataTableConfig.getPartitionFieldProp()),
false, Option.of(partitionName), Option.empty());
}
/**
* Create a file reader and the record scanner for a given partition and file slice
* if readers are not already available.
*
* @param partitionName - Partition name
* @param slice - The file slice to open readers for
* @return File reader and the record scanner pair for the requested file slice
*/
private Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> getOrCreateReaders(String partitionName, FileSlice slice) {
if (reuse) {
Pair<String, String> key = Pair.of(partitionName, slice.getFileId());
return partitionReaders.get().computeIfAbsent(key, ignored -> openReaders(partitionName, slice));
} else {
return openReaders(partitionName, slice);
}
}
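/**
* Open a base file reader and a log record scanner for the given file slice of a metadata partition,
* and publish the combined open time to the metrics.
*/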
private Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> openReaders(String partitionName, FileSlice slice) {
try {
HoodieTimer timer = HoodieTimer.start();
// Open base file reader
// If the partition is a secondary index partition, use the HBase HFile reader instead of native HFile reader.
// TODO (HUDI-7831): Support reading secondary index records using native HFile reader.
boolean shouldUseNativeHFileReader = !partitionName.startsWith(HoodieTableMetadataUtil.PARTITION_NAME_SECONDARY_INDEX_PREFIX);
Pair<HoodieSeekingFileReader<?>, Long> baseFileReaderOpenTimePair = getBaseFileReader(slice, timer, shouldUseNativeHFileReader);
HoodieSeekingFileReader<?> baseFileReader = baseFileReaderOpenTimePair.getKey();
final long baseFileOpenMs = baseFileReaderOpenTimePair.getValue();
// Open the log record scanner using the log files from the latest file slice
List<HoodieLogFile> logFiles = slice.getLogFiles().collect(Collectors.toList());
Pair<HoodieMetadataLogRecordReader, Long> logRecordScannerOpenTimePair =
getLogRecordScanner(logFiles, partitionName, Option.empty());
HoodieMetadataLogRecordReader logRecordScanner = logRecordScannerOpenTimePair.getKey();
final long logScannerOpenMs = logRecordScannerOpenTimePair.getValue();
metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR,
baseFileOpenMs + logScannerOpenMs));
return Pair.of(baseFileReader, logRecordScanner);
} catch (IOException e) {
throw new HoodieIOException("Error opening readers for metadata table partition " + partitionName, e);
}
}
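/**
* Open a reader for the base file of the given file slice, if one exists.
* Returns the reader (or null) along with the time taken to open it.
*/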
private Pair<HoodieSeekingFileReader<?>, Long> getBaseFileReader(FileSlice slice, HoodieTimer timer, boolean shouldUseNativeHFileReader) throws IOException {
HoodieSeekingFileReader<?> baseFileReader;
long baseFileOpenMs;
// If the base file is present then create a reader
Option<HoodieBaseFile> baseFile = slice.getBaseFile();
if (baseFile.isPresent()) {
StoragePath baseFilePath = baseFile.get().getStoragePath();
HoodieConfig readerConfig = DEFAULT_HUDI_CONFIG_FOR_READER;
if (!shouldUseNativeHFileReader) {
readerConfig.setValue(USE_NATIVE_HFILE_READER, "false");
}
baseFileReader = (HoodieSeekingFileReader<?>) HoodieIOFactory.getIOFactory(metadataMetaClient.getStorage())
.getReaderFactory(HoodieRecordType.AVRO)
.getFileReader(readerConfig, baseFilePath);
baseFileOpenMs = timer.endTimer();
LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", baseFilePath,
baseFile.get().getCommitTime(), baseFileOpenMs));
} else {
baseFileReader = null;
baseFileOpenMs = 0L;
timer.endTimer();
}
return Pair.of(baseFileReader, baseFileOpenMs);
}
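/**
* Open a log record scanner over the given log files of a metadata partition. Only log blocks
* belonging to valid (completed) instants are read. Returns the scanner along with the time taken
* to open it.
*/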
public Pair<HoodieMetadataLogRecordReader, Long> getLogRecordScanner(List<HoodieLogFile> logFiles,
String partitionName,
Option<Boolean> allowFullScanOverride) {
HoodieTimer timer = HoodieTimer.start();
List<String> sortedLogFilePaths = logFiles.stream()
.sorted(HoodieLogFile.getLogFileComparator())
.map(o -> o.getPath().toString())
.collect(Collectors.toList());
// Only those log files which have a corresponding completed instant on the dataset should be read
// This is because the metadata table is updated before the dataset instants are committed.
Set<String> validInstantTimestamps = HoodieTableMetadataUtil
.getValidInstantTimestamps(dataMetaClient, metadataMetaClient);
Option<HoodieInstant> latestMetadataInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
String latestMetadataInstantTime = latestMetadataInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
boolean allowFullScan = allowFullScanOverride.orElseGet(() -> isFullScanAllowedForPartition(partitionName));
// Load the schema
Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
HoodieMetadataLogRecordReader logRecordScanner = HoodieMetadataLogRecordReader.newBuilder(partitionName)
.withStorage(metadataMetaClient.getStorage())
.withBasePath(metadataBasePath)
.withLogFilePaths(sortedLogFilePaths)
.withReaderSchema(schema)
.withLatestInstantTime(latestMetadataInstantTime)
.withMaxMemorySizeInBytes(metadataConfig.getMaxReaderMemory())
.withBufferSize(metadataConfig.getMaxReaderBufferSize())
.withSpillableMapBasePath(metadataConfig.getSplliableMapDir())
.withDiskMapType(commonConfig.getSpillableDiskMapType())
.withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
.withLogBlockTimestamps(validInstantTimestamps)
.enableFullScan(allowFullScan)
.withPartition(partitionName)
.withEnableOptimizedLogBlocksScan(metadataConfig.isOptimizedLogBlocksScanEnabled())
.withTableMetaClient(metadataMetaClient)
.build();
Long logScannerOpenMs = timer.endTimer();
LOG.info(String.format("Opened %d metadata log files (dataset instant=%s, metadata instant=%s) in %d ms",
sortedLogFilePaths.size(), getLatestDataInstantTime(), latestMetadataInstantTime, logScannerOpenMs));
return Pair.of(logRecordScanner, logScannerOpenMs);
}
// NOTE: We're allowing eager full-scan of the log-files only for the "files" partition.
// Other partitions (like "column_stats", "bloom_filters") will have to be fetched
// through point lookups.
private boolean isFullScanAllowedForPartition(String partitionName) {
switch (partitionName) {
case PARTITION_NAME_FILES:
return DEFAULT_METADATA_ENABLE_FULL_SCAN_LOG_FILES;
case PARTITION_NAME_COLUMN_STATS:
case PARTITION_NAME_BLOOM_FILTERS:
default:
return false;
}
}
@Override
public void close() {
closePartitionReaders();
partitionFileSliceMap.clear();
}
/**
* Close the file reader and the record scanner for the given file slice.
*
* @param partitionFileSlicePair - Partition and FileSlice
*/
private synchronized void close(Pair<String, String> partitionFileSlicePair) {
Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> readers =
partitionReaders.get().remove(partitionFileSlicePair);
closeReader(readers);
}
/**
* Close and clear all the partitions readers.
*/
private void closePartitionReaders() {
for (Pair<String, String> partitionFileSlicePair : partitionReaders.get().keySet()) {
close(partitionFileSlicePair);
}
partitionReaders.get().clear();
}
private void closeReader(Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> readers) {
if (readers != null) {
try {
if (readers.getKey() != null) {
readers.getKey().close();
}
if (readers.getValue() != null) {
readers.getValue().close();
}
} catch (Exception e) {
throw new HoodieException("Error closing resources during metadata table merge", e);
}
}
}
public boolean enabled() {
return isMetadataTableInitialized;
}
public HoodieTableMetaClient getMetadataMetaClient() {
return metadataMetaClient;
}
public HoodieTableFileSystemView getMetadataFileSystemView() {
return metadataFileSystemView;
}
public Map<String, String> stats() {
Set<String> allMetadataPartitionPaths = Arrays.stream(MetadataPartitionType.values()).map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet());
return metrics.map(m -> m.getStats(true, metadataMetaClient, this, allMetadataPartitionPaths)).orElseGet(HashMap::new);
}
@Override
public Option<String> getSyncedInstantTime() {
if (metadataMetaClient != null) {
Option<HoodieInstant> latestInstant = metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant();
if (latestInstant.isPresent()) {
return Option.of(latestInstant.get().getTimestamp());
}
}
return Option.empty();
}
@Override
public Option<String> getLatestCompactionTime() {
if (metadataMetaClient != null) {
Option<HoodieInstant> latestCompaction = metadataMetaClient.getActiveTimeline().getCommitAndReplaceTimeline().filterCompletedInstants().lastInstant();
if (latestCompaction.isPresent()) {
return Option.of(latestCompaction.get().getTimestamp());
}
}
return Option.empty();
}
@Override
public void reset() {
initIfNeeded();
dataMetaClient.reloadActiveTimeline();
if (metadataMetaClient != null) {
metadataMetaClient.reloadActiveTimeline();
metadataFileSystemView.close();
metadataFileSystemView = getFileSystemView(metadataMetaClient);
}
// the cached reader has max instant time restriction, they should be cleared
// because the metadata timeline may have changed.
closePartitionReaders();
partitionFileSliceMap.clear();
}
@Override
public int getNumFileGroupsForPartition(MetadataPartitionType partition) {
partitionFileSliceMap.computeIfAbsent(partition.getPartitionPath(),
k -> HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient,
metadataFileSystemView, partition.getPartitionPath()));
return partitionFileSliceMap.get(partition.getPartitionPath()).size();
}
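/**
* Reverse lookup of secondary keys for the given record keys: every file slice of the secondary index
* partition is scanned in parallel and the per-slice results are combined.
*/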
@Override
protected Map<String, String> getSecondaryKeysForRecordKeys(List<String> recordKeys, String partitionName) {
if (recordKeys.isEmpty()) {
return Collections.emptyMap();
}
// Load the file slices for the partition. Each file slice is a shard which saves a portion of the keys.
List<FileSlice> partitionFileSlices =
partitionFileSliceMap.computeIfAbsent(partitionName, k -> HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, metadataFileSystemView, partitionName));
if (partitionFileSlices.isEmpty()) {
return Collections.emptyMap();
}
// Parallel lookup keys from each file slice
Map<String, String> reverseSecondaryKeyMap = new HashMap<>();
partitionFileSlices.parallelStream().forEach(partition -> {
Map<String, String> partialResult = reverseLookupSecondaryKeys(partitionName, recordKeys, partition);
synchronized (reverseSecondaryKeyMap) {
reverseSecondaryKeyMap.putAll(partialResult);
}
});
return reverseSecondaryKeyMap;
}
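/**
* Scan a single secondary index file slice and, for each requested record key found, map it to the key
* of its secondary index record, merging base file records with log records.
*/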
private Map<String, String> reverseLookupSecondaryKeys(String partitionName, List<String> recordKeys, FileSlice fileSlice) {
Map<String, String> recordKeyMap = new HashMap<>();
Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> readers = getOrCreateReaders(partitionName, fileSlice);
try {
HoodieSeekingFileReader<?> baseFileReader = readers.getKey();
HoodieMetadataLogRecordReader logRecordScanner = readers.getRight();
if (baseFileReader == null && logRecordScanner == null) {
return Collections.emptyMap();
}
Set<String> keySet = new TreeSet<>(recordKeys);
Map<String, HoodieRecord<HoodieMetadataPayload>> logRecordsMap = new HashMap<>();
logRecordScanner.getRecords().forEach(record -> {
HoodieMetadataPayload payload = record.getData();
String recordKey = payload.getRecordKeyFromSecondaryIndex();
if (keySet.contains(recordKey)) {
logRecordsMap.put(recordKey, record);
}
});
// Map of (record-key, secondary-index-record)
Map<String, HoodieRecord<HoodieMetadataPayload>> baseFileRecords = fetchBaseFileAllRecordsByPayload(baseFileReader, keySet, partitionName);
// Iterate over all provided log-records, merging them into existing records
logRecordsMap.forEach((key1, value1) -> baseFileRecords.merge(key1, value1, (oldRecord, newRecord) -> {
Option<HoodieRecord<HoodieMetadataPayload>> mergedRecord = HoodieMetadataPayload.combineSecondaryIndexRecord(oldRecord, newRecord);
return mergedRecord.orElse(null);
}));
baseFileRecords.forEach((key, value) -> recordKeyMap.put(key, value.getRecordKey()));
} catch (IOException ioe) {
throw new HoodieIOException("Error merging records from metadata table for " + recordKeys.size() + " record keys", ioe);
} finally {
if (!reuse) {
closeReader(readers);
}
}
return recordKeyMap;
}
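/**
* Lookup records in the secondary index partition for the given keys. Every file slice is scanned and
* the per-slice results are merged via the engine context's aggregate.
*/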
@Override
protected Map<String, List<HoodieRecord<HoodieMetadataPayload>>> getSecondaryIndexRecords(List<String> keys, String partitionName) {
if (keys.isEmpty()) {
return Collections.emptyMap();
}
// Load the file slices for the partition. Each file slice is a shard which saves a portion of the keys.
List<FileSlice> partitionFileSlices = partitionFileSliceMap.computeIfAbsent(partitionName,
k -> HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, metadataFileSystemView, partitionName));
final int numFileSlices = partitionFileSlices.size();
checkState(numFileSlices > 0, "Number of file slices for partition " + partitionName + " should be > 0");
engineContext.setJobStatus(this.getClass().getSimpleName(), "Lookup keys from each file slice");
HoodieData<FileSlice> partitionRDD = engineContext.parallelize(partitionFileSlices);
// Define the seqOp function (merges elements within a partition)
Functions.Function2<Map<String, List<HoodieRecord<HoodieMetadataPayload>>>, FileSlice, Map<String, List<HoodieRecord<HoodieMetadataPayload>>>> seqOp =
(accumulator, partition) -> {
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> currentFileSliceResult = lookupSecondaryKeysFromFileSlice(partitionName, keys, partition);
currentFileSliceResult.forEach((secondaryKey, secondaryRecords) -> accumulator.merge(secondaryKey, secondaryRecords, (oldRecords, newRecords) -> {
newRecords.addAll(oldRecords);
return newRecords;
}));
return accumulator;
};
// Define the combOp function (merges elements across partitions)
Functions.Function2<Map<String, List<HoodieRecord<HoodieMetadataPayload>>>, Map<String, List<HoodieRecord<HoodieMetadataPayload>>>, Map<String, List<HoodieRecord<HoodieMetadataPayload>>>> combOp =
(map1, map2) -> {
map2.forEach((secondaryKey, secondaryRecords) -> map1.merge(secondaryKey, secondaryRecords, (oldRecords, newRecords) -> {
newRecords.addAll(oldRecords);
return newRecords;
}));
return map1;
};
// Use aggregate to merge results within and across partitions
// Define the zero value (initial value)
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> zeroValue = new HashMap<>();
return engineContext.aggregate(partitionRDD, zeroValue, seqOp, combOp);
}
/**
* Lookup list of keys from a single file slice.
*
* @param partitionName Name of the partition
* @param secondaryKeys The list of secondary keys to lookup
* @param fileSlice The file slice to read
* @return A {@code Map} of secondary-key to list of {@code HoodieRecord} for the secondary-keys which were found in the file slice
*/
private Map<String, List<HoodieRecord<HoodieMetadataPayload>>> lookupSecondaryKeysFromFileSlice(String partitionName, List<String> secondaryKeys, FileSlice fileSlice) {
Map<String, Map<String, HoodieRecord<HoodieMetadataPayload>>> logRecordsMap = new HashMap<>();
Pair<HoodieSeekingFileReader<?>, HoodieMetadataLogRecordReader> readers = getOrCreateReaders(partitionName, fileSlice);
try {
List<Long> timings = new ArrayList<>(1);
HoodieSeekingFileReader<?> baseFileReader = readers.getKey();
HoodieMetadataLogRecordReader logRecordScanner = readers.getRight();
if (baseFileReader == null && logRecordScanner == null) {
return Collections.emptyMap();
}
// Sort it here once so that we don't need to sort individually for the base file and for each log file.
Set<String> secondaryKeySet = new HashSet<>(secondaryKeys.size());
List<String> sortedSecondaryKeys = new ArrayList<>(secondaryKeys);
Collections.sort(sortedSecondaryKeys);
secondaryKeySet.addAll(sortedSecondaryKeys);
logRecordScanner.getRecords().forEach(record -> {
HoodieMetadataPayload payload = record.getData();
String recordKey = payload.getRecordKeyFromSecondaryIndex();
if (secondaryKeySet.contains(recordKey)) {
String secondaryKey = payload.getRecordKeyFromSecondaryIndex();
logRecordsMap.computeIfAbsent(secondaryKey, k -> new HashMap<>()).put(recordKey, record);
}
});
return readNonUniqueRecordsAndMergeWithLogRecords(baseFileReader, sortedSecondaryKeys, logRecordsMap, timings, partitionName);
} catch (IOException ioe) {
throw new HoodieIOException("Error merging records from metadata table for " + secondaryKeys.size() + " secondary keys", ioe);
} finally {
if (!reuse) {
closeReader(readers);
}
}
}
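/**
* Merge secondary index records read from the base file with the buffered log records.
* A base file record is replaced by the matching log record unless that log record marks the
* secondary index entry as deleted, in which case both are dropped.
*/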
private Map<String, List<HoodieRecord<HoodieMetadataPayload>>> readNonUniqueRecordsAndMergeWithLogRecords(HoodieSeekingFileReader<?> reader,
List<String> sortedKeys,
Map<String, Map<String, HoodieRecord<HoodieMetadataPayload>>> logRecordsMap,
List<Long> timings,
String partitionName) throws IOException {
HoodieTimer timer = HoodieTimer.start();
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> resultMap = new HashMap<>();
if (reader == null) {
// No base file at all
logRecordsMap.forEach((secondaryKey, logRecords) -> {
List<HoodieRecord<HoodieMetadataPayload>> recordList = new ArrayList<>();
logRecords.values().forEach(record -> {
recordList.add((HoodieRecord) record);
});
resultMap.put(secondaryKey, recordList);
});
timings.add(timer.endTimer());
return resultMap;
}
HoodieTimer readTimer = HoodieTimer.start();
Map<String, List<HoodieRecord<HoodieMetadataPayload>>> baseFileRecordsMap =
fetchBaseFileAllRecordsByKeys(reader, sortedKeys, true, partitionName);
if (logRecordsMap.isEmpty() && !baseFileRecordsMap.isEmpty()) {
// file slice has only base file
timings.add(timer.endTimer());
return baseFileRecordsMap;
}
logRecordsMap.forEach((secondaryKey, logRecords) -> {
if (!baseFileRecordsMap.containsKey(secondaryKey)) {
List<HoodieRecord<HoodieMetadataPayload>> recordList = logRecords
.values()
.stream()
.map(record -> (HoodieRecord<HoodieMetadataPayload>) record)
.collect(Collectors.toList());
resultMap.put(secondaryKey, recordList);
} else {
List<HoodieRecord<HoodieMetadataPayload>> baseFileRecords = baseFileRecordsMap.get(secondaryKey);
List<HoodieRecord<HoodieMetadataPayload>> resultRecords = new ArrayList<>();
baseFileRecords.forEach(prevRecord -> {
HoodieMetadataPayload prevPayload = prevRecord.getData();
String recordKey = prevPayload.getRecordKeyFromSecondaryIndex();
if (!logRecords.containsKey(recordKey)) {
resultRecords.add(prevRecord);
} else {
// Merge the records
HoodieRecord<HoodieMetadataPayload> newRecord = logRecords.get(recordKey);
HoodieMetadataPayload newPayload = newRecord.getData();
checkState(recordKey.equals(newPayload.getRecordKeyFromSecondaryIndex()), "Record key mismatch between log record and secondary index record");
// The rules for merging the prevRecord and the latestRecord is noted below. Note that this only applies for SecondaryIndex
// records in the metadata table (which is the only user of this API as of this implementation)
// 1. Iff latestRecord is deleted (i.e it is a tombstone) AND prevRecord is null (i.e not buffered), then discard latestRecord
// basefile never had a matching record?
// 2. Iff latestRecord is deleted AND prevRecord is non-null, then remove prevRecord from the buffer AND discard the latestRecord
// 3. Iff latestRecord is not deleted AND prevRecord is non-null, then remove the prevRecord from the buffer AND retain the latestRecord
// The rationale is that the most recent record is always retained (based on arrival time). TODO: verify this logic
// 4. Iff latestRecord is not deleted AND prevRecord is null, then retain the latestRecord (same rationale as #1)
if (!newPayload.isSecondaryIndexDeleted()) {
// All the four cases boils down to just "Retain newRecord iff it is not deleted"
resultRecords.add(newRecord);
}
}
});
resultMap.put(secondaryKey, resultRecords);
}
});
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer()));
timings.add(timer.endTimer());
return resultMap;
}
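/**
* Read all records from the base file and keep only those whose payload references one of the
* requested record keys, keyed by that record key.
*/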
private Map<String, HoodieRecord<HoodieMetadataPayload>> fetchBaseFileAllRecordsByPayload(HoodieSeekingFileReader reader, Set<String> keySet, String partitionName) throws IOException {
if (reader == null) {
// No base file at all
return Collections.emptyMap();
}
ClosableIterator<HoodieRecord<?>> records = reader.getRecordIterator();
return toStream(records).map(record -> {
GenericRecord data = (GenericRecord) record.getData();
return composeRecord(data, partitionName);
}).filter(record -> {
HoodieMetadataPayload payload = (HoodieMetadataPayload) record.getData();
return keySet.contains(payload.getRecordKeyFromSecondaryIndex());
}).collect(Collectors.toMap(record -> {
HoodieMetadataPayload payload = (HoodieMetadataPayload) record.getData();
return payload.getRecordKeyFromSecondaryIndex();
}, record -> record));
}
}