/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index.bloom;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toList;
import com.google.common.annotations.VisibleForTesting;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hudi.WriteStatus;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.MetadataNotFoundException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.io.HoodieRangeInfoHandle;
import org.apache.hudi.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
/**
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata.
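*
* A rough usage sketch from the write-client side (the {@code HoodieIndex.createIndex} factory call is an
* assumption about the surrounding client code, shown only for illustration):
*
* <pre>{@code
* HoodieIndex<T> index = HoodieIndex.createIndex(config, jsc); // resolves to HoodieBloomIndex for the BLOOM type
* JavaRDD<HoodieRecord<T>> taggedRecords = index.tagLocation(inputRecords, jsc, hoodieTable);
* }</pre>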
*/
public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
// we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476)
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
private static final Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
private static final int MAX_ITEMS_PER_SHUFFLE_PARTITION =
SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
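// With the defaults above, MAX_ITEMS_PER_SHUFFLE_PARTITION works out to 1500 * 1024 * 1024 / 300, i.e. roughly
// 5.2 million (partitionPath, fileId, recordKey) triplets per shuffle partition.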
public HoodieBloomIndex(HoodieWriteConfig config) {
super(config);
}
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
HoodieTable<T> hoodieTable) {
// Step 0: cache the input record RDD
if (config.getBloomIndexUseCaching()) {
recordRDD.persist(config.getBloomIndexInputStorageLevel());
}
// Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
// Cache the result, for subsequent stages.
if (config.getBloomIndexUseCaching()) {
keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
}
if (logger.isDebugEnabled()) {
long totalTaggedRecords = keyFilenamePairRDD.count();
logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
}
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
// Cost: 4 sec.
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);
if (config.getBloomIndexUseCaching()) {
recordRDD.unpersist(); // unpersist the input Record RDD
keyFilenamePairRDD.unpersist();
}
return taggedRecordRDD;
}
/**
* Returns an RDD mapping each HoodieKey to the partitionPath/fileID pair that contains it, or Option.empty() if the
* key is not found.
*
* @param hoodieKeys keys to lookup
* @param jsc spark context
* @param hoodieTable hoodie table object
*/
@Override
public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<HoodieKey, HoodieRecordLocation> recordKeyLocationRDD =
lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);
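// Pair each incoming key with a placeholder value, so the left outer join below also retains keys that were not
// found in the index.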
JavaPairRDD<HoodieKey, String> keyHoodieKeyPairRDD = hoodieKeys.mapToPair(key -> new Tuple2<>(key, null));
return keyHoodieKeyPairRDD.leftOuterJoin(recordKeyLocationRDD).mapToPair(keyLoc -> {
Option<Pair<String, String>> partitionPathFileidPair;
if (keyLoc._2._2.isPresent()) {
partitionPathFileidPair = Option.of(Pair.of(keyLoc._1().getPartitionPath(), keyLoc._2._2.get().getFileId()));
} else {
partitionPathFileidPair = Option.empty();
}
return new Tuple2<>(keyLoc._1, partitionPathFileidPair);
});
}
/**
* Lookup the location for each record key and return the (recordKey, location) pair for all record keys already
* present; record keys that are not present are dropped.
*/
private JavaPairRDD<HoodieKey, HoodieRecordLocation> lookupIndex(
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc,
final HoodieTable hoodieTable) {
// Obtain records per partition, in the incoming records
Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
// Step 2: Load all involved files as pairs
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList =
loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable);
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo =
fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
// that contains it.
Map<String, Long> comparisonsPerFileGroup =
computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
int safeParallelism = computeSafeParallelism(recordsPerPartition, comparisonsPerFileGroup);
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), safeParallelism);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism, hoodieTable,
comparisonsPerFileGroup);
}
/**
* Compute the estimated number of bloom filter comparisons to be performed on each file group
*/
private Map<String, Long> computeComparisonsPerFileGroup(final Map<String, Long> recordsPerPartition,
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
Map<String, Long> fileToComparisons;
if (config.getBloomIndexPruneByRanges()) {
// we will just try exploding the input and then count to determine comparisons
// FIX(vc): Only do sampling here and extrapolate?
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD)
.mapToPair(t -> t).countByKey();
} else {
fileToComparisons = new HashMap<>();
partitionToFileInfo.entrySet().stream().forEach(e -> {
for (BloomIndexFileInfo fileInfo : e.getValue()) {
// each file needs to be compared against all the records coming into the partition
fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(e.getKey()));
}
});
}
return fileToComparisons;
}
/**
* Compute the minimum parallelism needed to play well with Spark's 2GB partition size limitation. The index lookup
* can be skewed in three dimensions: #files, #partitions, #records.
*
* To handle such skews smoothly, we compute how to split each partition into sub-partitions, in a way that keeps the
* size of each Spark join partition under 2GB.
*
* If {@link HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified as a non-zero number, then that value is
* used explicitly.
*/
int computeSafeParallelism(Map<String, Long> recordsPerPartition, Map<String, Long> comparisonsPerFileGroup) {
long totalComparisons = comparisonsPerFileGroup.values().stream().mapToLong(Long::longValue).sum();
long totalFiles = comparisonsPerFileGroup.size();
long totalRecords = recordsPerPartition.values().stream().mapToLong(Long::longValue).sum();
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
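// For example (hypothetical workload): 100 million total comparisons / ~5.24 million items per shuffle partition
// => 19 + 1 = 20 partitions.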
logger.info(String.format(
"TotalRecords %d, TotalFiles %d, TotalAffectedPartitions %d, TotalComparisons %d, " + "SafeParallelism %d",
totalRecords, totalFiles, recordsPerPartition.size(), totalComparisons, parallelism));
return parallelism;
}
/**
* It is crucial to pick the right parallelism.
*
* totalSubPartitions: the limit deemed safe to stay within Spark's constraints. inputParallelism: typically the
* number of input file splits.
*
* We pick the max of the two, so that we are always safe, but go higher if there are, say, a lot of input files
* (otherwise, we would fall back to the number of input partitions and end up with slow performance).
*/
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to check against the input parallelism and
// take the max
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
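// e.g. (hypothetical numbers) inputParallelism = 150, bloomIndexParallelism = 0, totalSubPartitions = 20
// => indexParallelism = 150, joinParallelism = 150.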
logger.info("InputParallelism: ${" + inputParallelism + "}, " + "IndexParallelism: ${"
+ config.getBloomIndexParallelism() + "}, " + "TotalSubParts: ${" + totalSubPartitions + "}, "
+ "Join Parallelism set to : " + joinParallelism);
return joinParallelism;
}
/**
* Load all involved files as pair RDD.
*/
@VisibleForTesting
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final JavaSparkContext jsc,
final HoodieTable hoodieTable) {
// Obtain the latest data files from all the partitions.
List<Pair<String, String>> partitionPathFileIDList =
jsc.parallelize(partitions, Math.max(partitions.size(), 1)).flatMap(partitionPath -> {
Option<HoodieInstant> latestCommitTime =
hoodieTable.getMetaClient().getCommitsTimeline().filterCompletedInstants().lastInstant();
List<Pair<String, String>> filteredFiles = new ArrayList<>();
if (latestCommitTime.isPresent()) {
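// Only consider the latest data files committed on or before the last completed instant, so files from
// in-flight or failed writes are never consulted for index lookups.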
filteredFiles = hoodieTable.getROFileSystemView()
.getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp())
.map(f -> Pair.of(partitionPath, f.getFileId())).collect(toList());
}
return filteredFiles.iterator();
}).collect();
if (config.getBloomIndexPruneByRanges()) {
// also obtain file ranges, if range pruning is enabled
return jsc.parallelize(partitionPathFileIDList, Math.max(partitionPathFileIDList.size(), 1)).mapToPair(pf -> {
try {
HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf);
String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys();
return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1]));
} catch (MetadataNotFoundException me) {
logger.warn("Unable to find range metadata in file :" + pf);
return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()));
}
}).collect();
} else {
return partitionPathFileIDList.stream()
.map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList());
}
}
@Override
public boolean rollbackCommit(String commitTime) {
// Nope, don't need to do anything.
return true;
}
/**
* This is not a global index, since we depend on the partitionPath to do the lookup.
*/
@Override
public boolean isGlobal() {
return false;
}
/**
* No indexes into log files yet.
*/
@Override
public boolean canIndexLogFiles() {
return false;
}
/**
* Bloom filters are stored in the data files themselves, so this index is implicit with storage.
*/
@Override
public boolean isImplicitWithStorage() {
return true;
}
/**
* For each incoming record, produce N output records, one for each file against which the record's key needs to be
* checked. For datasets where the keys have a definite insert order (e.g. timestamp as prefix), range pruning cuts
* down the number of files to be compared significantly.
*
* Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on
* recordKey ranges in the index info.
*/
@VisibleForTesting
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
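// Both filters prune (file, recordKey) comparisons using the min/max record-key ranges in BloomIndexFileInfo
// when available; the tree-based variant indexes those ranges in an interval tree, while the list-based variant
// scans them linearly.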
IndexFileFilter indexFileFilter =
config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
String recordKey = partitionRecordKeyPair._2();
String partitionPath = partitionRecordKeyPair._1();
return indexFileFilter.getMatchingFiles(partitionPath, recordKey).stream()
.map(matchingFile -> new Tuple2<>(matchingFile, new HoodieKey(recordKey, partitionPath)))
.collect(Collectors.toList());
}).flatMap(List::iterator);
}
/**
* Find out the (recordKey, fileId) pairs. All workload is grouped at the file level.
*
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD
* partition is a file, then for each file, we do (1) load the bloom filter, (2) load the recordKeys, (3) tag each
* recordKey.
*
* Make sure the parallelism is at least the group-by parallelism for tagging location.
*/
@VisibleForTesting
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable,
Map<String, Long> fileGroupToComparisons) {
JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
if (config.useBloomIndexBucketizedChecking()) {
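// Bucketized checking bounds the work per task: each file group's expected comparisons are split into buckets of
// roughly getBloomIndexKeysPerBucket() keys, and the custom partitioner spreads those buckets across the shuffle
// partitions to even out skew from hot files.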
Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
config.getBloomIndexKeysPerBucket());
fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
.repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
} else {
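// Otherwise, simply sort by fileId so that comparisons against the same file are contiguous within a partition
// and its bloom filter is not reloaded for every record.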
fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
}
return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
.flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
.collect(Collectors.toList()).iterator());
}
HoodieRecord<T> getTaggedRecord(HoodieRecord<T> inputRecord, Option<HoodieRecordLocation> location) {
HoodieRecord<T> record = inputRecord;
if (location.isPresent()) {
// When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD
// will have 2 entries with the same exact in memory copy of the HoodieRecord and the 2
// separate filenames that the record is found in. This will result in setting
// currentLocation 2 times and it will fail the second time. So creating a new in memory
// copy of the hoodie record.
record = new HoodieRecord<>(inputRecord);
record.unseal();
record.setCurrentLocation(location.get());
record.seal();
}
return record;
}
/**
* Tag the (HoodieKey, HoodieRecordLocation) pairs back onto the original HoodieRecord RDD.
*/
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD =
recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record));
// The recordRDD may contain more records than keyFilenamePairRDD (records with no existing file, i.e. new
// inserts), so we do a left outer join.
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values()
.map(v1 -> getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
}
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
HoodieTable<T> hoodieTable) {
return writeStatusRDD;
}
}