All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.index.bloom;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.data.HoodiePairData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.common.util.hash.FileIndexID;
import org.apache.hudi.common.util.hash.PartitionIndexID;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaPairRDD;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.HoodieKeyLookupResult;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.storage.StoragePathInfo;
import org.apache.hudi.table.HoodieTable;

import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import scala.Tuple2;

import static org.apache.hudi.metadata.HoodieMetadataPayload.getBloomFilterIndexKey;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex;
import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS;

/**
 * Helper for {@link HoodieBloomIndex} containing Spark-specific logic.
 */
public class SparkHoodieBloomIndexHelper extends BaseHoodieBloomIndexHelper {

  // Class-wide SLF4J logger.
  private static final Logger LOG = LoggerFactory.getLogger(SparkHoodieBloomIndexHelper.class);

  // The one instance handed out by getInstance(); created when the class is initialized.
  private static final SparkHoodieBloomIndexHelper SINGLETON_INSTANCE =
      new SparkHoodieBloomIndexHelper();

  // Private constructor: callers outside this class obtain the helper via getInstance().
  private SparkHoodieBloomIndexHelper() {}

  /**
   * Returns the shared helper instance.
   *
   * @return the {@link SparkHoodieBloomIndexHelper} held in {@code SINGLETON_INSTANCE}
   */
  public static SparkHoodieBloomIndexHelper getInstance() {
    return SINGLETON_INSTANCE;
  }

  @Override
  public HoodiePairData findMatchingFilesForRecordKeys(
      HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable,
      HoodiePairData partitionRecordKeyPairs,
      HoodiePairData fileComparisonPairs,
      Map> partitionToFileInfo,
      Map recordsPerPartition) {

    int inputParallelism = partitionRecordKeyPairs.deduceNumPartitions();
    int configuredBloomIndexParallelism = config.getBloomIndexParallelism();

    // NOTE: Target parallelism could be overridden by the config
    int targetParallelism =
        configuredBloomIndexParallelism > 0 ? configuredBloomIndexParallelism : inputParallelism;

    LOG.info(String.format("Input parallelism: %d, Index parallelism: %d", inputParallelism, targetParallelism));

    JavaPairRDD fileComparisonsRDD = HoodieJavaRDD.getJavaRDD(fileComparisonPairs);
    JavaRDD> keyLookupResultRDD;

    if (config.getBloomIndexUseMetadata()
        && hoodieTable.getMetaClient().getTableConfig().getMetadataPartitions()
        .contains(BLOOM_FILTERS.getPartitionPath())) {
      StorageConfiguration storageConf = hoodieTable.getStorageConf();

      HoodieTableFileSystemView baseFileOnlyView =
          getBaseFileOnlyView(hoodieTable, partitionToFileInfo.keySet());

      Broadcast baseFileOnlyViewBroadcast =
          ((HoodieSparkEngineContext) context).getJavaSparkContext().broadcast(baseFileOnlyView);

      // When leveraging MT we're aiming for following goals:
      //    - (G1) All requests to MT are made in batch (ie we're trying to fetch all the values
      //      for corresponding keys at once)
      //    - (G2) Each task reads no more than just _one_ file-group from the MT Bloom Filters
      //    partition
      //
      // Ta achieve G2, following invariant have to be maintained: Spark partitions have to be
      // affine w/ Metadata Table's file-groups, meaning that each Spark partition holds records
      // belonging to one and only file-group in MT Bloom Filters partition. To provide for that
      // we need to make sure
      //    - Spark's used [[Partitioner]] employs same hashing function as Metadata Table (as well
      //      as being applied to the same keys as the MT one)
      //    - Make sure that # of partitions is congruent to the # of file-groups (ie number of Spark
      //    partitions is a multiple of the # of the file-groups).
      //
      //    Last provision is necessary, so that for every key it's the case that
      //
      //        (hash(key) % N) % M = hash(key) % M, iff N % M = 0
      //
      //    Let's take an example of N = 8 and M = 4 (default # of file-groups in Bloom Filter
      //    partition). In that case Spark partitions for which `hash(key) % N` will be either 0
      //    or 4, will map to the same (first) file-group in MT
      int bloomFilterPartitionFileGroupCount =
          config.getMetadataConfig().getBloomFilterIndexFileGroupCount();
      int adjustedTargetParallelism =
          targetParallelism % bloomFilterPartitionFileGroupCount == 0
              ? targetParallelism
              // NOTE: We add 1 to make sure parallelism a) value always stays positive and b)
              //       {@code targetParallelism <= adjustedTargetParallelism}
              : (targetParallelism / bloomFilterPartitionFileGroupCount + 1) * bloomFilterPartitionFileGroupCount;

      AffineBloomIndexFileGroupPartitioner partitioner =
          new AffineBloomIndexFileGroupPartitioner(baseFileOnlyViewBroadcast, adjustedTargetParallelism);

      // First, we need to repartition and sort records using [[AffineBloomIndexFileGroupPartitioner]]
      // to make sure every Spark task accesses no more than just a single file-group in MT (allows
      // us to achieve G2).
      //
      // NOTE: Sorting records w/in individual partitions is required to make sure that we cluster
      //       together keys co-located w/in the MT files (sorted by keys)
      keyLookupResultRDD = fileComparisonsRDD.repartitionAndSortWithinPartitions(partitioner)
          .mapPartitionsToPair(new HoodieMetadataBloomFilterProbingFunction(baseFileOnlyViewBroadcast, hoodieTable))
          // Second, we use [[HoodieFileProbingFunction]] to open actual file and check whether it
          // contains the records with candidate keys that were filtered in by the Bloom Filter
          .mapPartitions(new HoodieFileProbingFunction(baseFileOnlyViewBroadcast, storageConf), true);

    } else if (config.useBloomIndexBucketizedChecking()) {
      Map comparisonsPerFileGroup = computeComparisonsPerFileGroup(
          config, recordsPerPartition, partitionToFileInfo, fileComparisonsRDD, context);
      Partitioner partitioner = new BucketizedBloomCheckPartitioner(targetParallelism, comparisonsPerFileGroup,
          config.getBloomIndexKeysPerBucket());

      keyLookupResultRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2), t))
          .repartitionAndSortWithinPartitions(partitioner)
          .map(Tuple2::_2)
          .mapPartitions(new HoodieSparkBloomIndexCheckFunction(hoodieTable, config), true);
    } else {
      keyLookupResultRDD = fileComparisonsRDD.sortByKey(true, targetParallelism)
          .mapPartitions(new HoodieSparkBloomIndexCheckFunction(hoodieTable, config), true);
    }

    return HoodieJavaPairRDD.of(keyLookupResultRDD.flatMap(List::iterator)
        .filter(lr -> lr.getMatchingRecordKeysAndPositions().size() > 0)
        .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeysAndPositions().stream()
            .map(recordKeyAndPosition -> new Tuple2<>(
                new HoodieKey(recordKeyAndPosition.getLeft(), lookupResult.getPartitionPath()),
                new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId(),
                    recordKeyAndPosition.getRight())))
            .collect(Collectors.toList()).iterator()));
  }

  /**
   * Compute the estimated number of bloom filter comparisons to be performed on each file group.
   */
  private Map computeComparisonsPerFileGroup(
      final HoodieWriteConfig config,
      final Map recordsPerPartition,
      final Map> partitionToFileInfo,
      final JavaPairRDD fileComparisonsRDD,
      final HoodieEngineContext context) {
    Map fileToComparisons;
    if (config.getBloomIndexPruneByRanges()) {
      // we will just try exploding the input and then count to determine comparisons
      // FIX(vc): Only do sampling here and extrapolate?
      context.setJobStatus(this.getClass().getSimpleName(), "Compute all comparisons needed between records and files: " + config.getTableName());
      fileToComparisons = fileComparisonsRDD.countByKey();
    } else {
      fileToComparisons = new HashMap<>();
      partitionToFileInfo.forEach((partitionPath, fileInfos) -> {
        for (BloomIndexFileInfo fileInfo : fileInfos) {
          // each file needs to be compared against all the records coming into the partition
          fileToComparisons.put(
              new HoodieFileGroupId(partitionPath, fileInfo.getFileId()), recordsPerPartition.get(partitionPath));
        }
      });
    }
    return fileToComparisons;
  }

  /**
   * Builds a {@link HoodieTableFileSystemView} over the files of the given partitions, as listed
   * by the table's metadata.
   *
   * @param hoodieTable    table whose meta client, active timeline and metadata are used
   * @param partitionPaths partition paths (relative to the table base path) to include
   * @return a file-system view constructed from all files listed for those partitions
   * @throws HoodieIOException if listing the files fails with an {@link IOException}
   */
  private static HoodieTableFileSystemView getBaseFileOnlyView(HoodieTable hoodieTable, Collection<String> partitionPaths) {
    try {
      // The listing API is fed full paths, so prefix every partition path with the base path
      List<String> fullPartitionPaths = partitionPaths.stream()
          .map(partitionPath ->
              String.format("%s/%s", hoodieTable.getMetaClient().getBasePath(), partitionPath))
          .collect(Collectors.toList());

      // Flatten the per-partition listings into a single list of files
      List<StoragePathInfo> allFiles =
          hoodieTable.getMetadataTable().getAllFilesInPartitions(fullPartitionPaths).values()
              .stream()
              .flatMap(e -> e.stream())
              .collect(Collectors.toList());

      return new HoodieTableFileSystemView(hoodieTable.getMetaClient(), hoodieTable.getActiveTimeline(), allFiles);
    } catch (IOException e) {
      LOG.error(String.format("Failed to fetch all files for partitions (%s)", partitionPaths));
      throw new HoodieIOException("Failed to fetch all files for partitions", e);
    }
  }

  static class AffineBloomIndexFileGroupPartitioner extends Partitioner {

    private final Broadcast latestBaseFilesBroadcast;

    // TODO(HUDI-5619) remove when addressed
    private final Map> cachedLatestBaseFileNames =
        new HashMap<>(16);

    private final int targetPartitions;

    AffineBloomIndexFileGroupPartitioner(Broadcast baseFileOnlyViewBroadcast,
                                         int targetPartitions) {
      this.targetPartitions = targetPartitions;
      this.latestBaseFilesBroadcast = baseFileOnlyViewBroadcast;
    }

    @Override
    public int numPartitions() {
      return targetPartitions;
    }

    @Override
    public int getPartition(Object key) {
      HoodieFileGroupId partitionFileGroupId = (HoodieFileGroupId) key;
      String partitionPath = partitionFileGroupId.getPartitionPath();
      String fileGroupId = partitionFileGroupId.getFileId();

      /*
      // TODO(HUDI-5619) uncomment when addressed
      String baseFileName =
          latestBaseFilesBroadcast.getValue()
              .getLatestBaseFile(partitionPath, fileGroupId)
              .orElseThrow(() -> new HoodieException(
                  String.format("File from file-group (%s) not found in partition path (%s)", fileGroupId, partitionPath)))
              .getFileName();
       */

      // NOTE: This is a workaround to alleviate performance impact of needing to process whole
      //       partition for every file-group being looked up.
      //       See HUDI-5619 for more details
      String baseFileName = cachedLatestBaseFileNames.computeIfAbsent(partitionPath, ignored ->
              latestBaseFilesBroadcast.getValue()
                  .getLatestBaseFiles(partitionPath)
                  .collect(
                      Collectors.toMap(HoodieBaseFile::getFileId, BaseFile::getFileName)
                  )
          )
          .get(fileGroupId);

      if (baseFileName == null) {
        throw new HoodieException(
            String.format("File from file-group (%s) not found in partition path (%s)", fileGroupId, partitionPath));
      }

      String bloomIndexEncodedKey =
          getBloomFilterIndexKey(new PartitionIndexID(HoodieTableMetadataUtil.getBloomFilterIndexPartitionIdentifier(partitionPath)), new FileIndexID(baseFileName));

      // NOTE: It's crucial that [[targetPartitions]] be congruent w/ the number of
      //       actual file-groups in the Bloom Index in MT
      return mapRecordKeyToFileGroupIndex(bloomIndexEncodedKey, targetPartitions);
    }
  }

  public static class HoodieSparkBloomIndexCheckFunction extends HoodieBloomIndexCheckFunction>
      implements FlatMapFunction>, List> {

    public HoodieSparkBloomIndexCheckFunction(HoodieTable hoodieTable,
                                              HoodieWriteConfig config) {
      super(hoodieTable, config, t -> t._1, t -> t._2);
    }

    @Override
    public Iterator> call(Iterator> fileGroupIdRecordKeyPairIterator) {
      return new LazyKeyCheckIterator(fileGroupIdRecordKeyPairIterator);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy