/*
 *  Copyright (c) 2017 Uber Technologies, Inc. ([email protected])
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 */

package com.uber.hoodie.index.bloom;

import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toList;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.MetadataNotFoundException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

/**
 * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
 * its metadata.
 */
public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {

  // we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476)
  private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
  // this is how much a triplet of (partitionPath, fileId, recordKey) costs.
  private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
  private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
  private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
      SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
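  // With the constants above, MAX_ITEMS_PER_SHUFFLE_PARTITION works out to
  // 1500 * 1024 * 1024 / 300 = 5,242,880, i.e. roughly 5.24 million
  // (partitionPath, fileId, recordKey) triplets per shuffle partition.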

  public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
    super(config, jsc);
  }

  @Override
  public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
      final HoodieTable<T> hoodieTable) {

    // Step 0: cache the input record RDD
    if (config.getBloomIndexUseCaching()) {
      recordRDD.persist(config.getBloomIndexInputStorageLevel());
    }

    // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
    JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
        .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
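    // For illustration (values hypothetical): a record in partition "2017/03/15" with record key
    // "uuid-0001" becomes the pair ("2017/03/15", "uuid-0001"); only these two strings, not the
    // full record payload, flow through the index lookup below.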

    // Lookup indexes for all the partition/recordkey pair
    JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
        hoodieTable);

    // Cache the result, for subsequent stages.
    if (config.getBloomIndexUseCaching()) {
      rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
    }
    if (logger.isDebugEnabled()) {
      long totalTaggedRecords = rowKeyFilenamePairRDD.count();
      logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
    }

    // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
    // Cost: 4 sec.
    JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD,
        recordRDD);

    if (config.getBloomIndexUseCaching()) {
      recordRDD.unpersist(); // unpersist the input Record RDD
      rowKeyFilenamePairRDD.unpersist();
    }

    return taggedRecordRDD;
  }

  public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
      final HoodieTable<T> table) {
    JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
        .mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));

    // Lookup indexes for all the partition/recordkey pair
    JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
        table);

    JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD = hoodieKeys
        .mapToPair(key -> new Tuple2<>(key.getRecordKey(), key));

    return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair(keyPathTuple -> {
      Optional<String> recordLocationPath;
      if (keyPathTuple._2._2.isPresent()) {
        String fileName = keyPathTuple._2._2.get();
        String partitionPath = keyPathTuple._2._1.getPartitionPath();
        recordLocationPath = Optional
            .of(new Path(new Path(table.getMetaClient().getBasePath(), partitionPath), fileName)
                .toUri().getPath());
      } else {
        recordLocationPath = Optional.absent();
      }
      return new Tuple2<>(keyPathTuple._2._1, recordLocationPath);
    });
  }
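  // Note on fetchRecordLocation's result: keys found in the index map to the full data file path
  // (basePath/partitionPath/fileName), while keys with no match map to Optional.absent(), so
  // callers can tell existing records apart from new ones.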

  /**
   * Lookup the location for each record key and return the (recordKey, fileName) pair for all
   * record keys already present, dropping the record keys that are not present.
   */
  private JavaPairRDD<String, String> lookupIndex(
      JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable hoodieTable) {
    // Obtain records per partition, in the incoming records
    Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
    List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());

    // Step 2: Load all involved files as (partition, file info) pairs
    List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(
        affectedPartitionPathList, hoodieTable);
    final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
        .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
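    // For example (file names hypothetical): [("2017/03/15", f1), ("2017/03/15", f2),
    // ("2017/03/16", f3)] is grouped into {"2017/03/15" -> [f1, f2], "2017/03/16" -> [f3]},
    // i.e. each affected partition path maps to the BloomIndexFileInfo of its candidate files.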

    // Step 3: Obtain an RDD that, for each incoming record that already exists, carries the id
    // of the file containing it.
    int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
        partitionRecordKeyPairRDD);
    return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo,
        partitionRecordKeyPairRDD, parallelism);
  }

  /**
   * The index lookup can be skewed in three dimensions : #files, #partitions, #records
   *
   * <p>To be able to smoothly handle skews, we need to compute how to split each partition into
   * subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to
   * < 2GB.
   *
   * <p>If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is
   * specified as a NON-zero number, then that is used explicitly.
   */
  private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
      final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
      JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
    long totalComparisons = 0;
    if (config.getBloomIndexPruneByRanges()) {
      // we will just try exploding the input and then count to determine comparisons
      totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
          partitionRecordKeyPairRDD).count();
    } else {
      // if not pruning by ranges, then each file in a partition needs to be compared against all
      // records for that partition.
      Map<String, Long> filesPerPartition = partitionToFileInfo.entrySet().stream()
          .collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size())));
      long totalFiles = 0;
      long totalRecords = 0;
      for (String partitionPath : recordsPerPartition.keySet()) {
        long numRecords = recordsPerPartition.get(partitionPath);
        long numFiles = filesPerPartition.containsKey(partitionPath)
            ? filesPerPartition.get(partitionPath) : 1L;
        totalComparisons += numFiles * numRecords;
        totalFiles += filesPerPartition.containsKey(partitionPath)
            ? filesPerPartition.get(partitionPath) : 0L;
        totalRecords += numRecords;
      }
      logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles
          + ", TotalAffectedPartitions:" + recordsPerPartition.size());
    }

    // each partition will have an item per comparison.
    int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
    logger.info(
        "Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
    return parallelism;
  }
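  // Worked example (numbers illustrative only): with ~50 million candidate (file, recordKey)
  // comparisons and MAX_ITEMS_PER_SHUFFLE_PARTITION of 5,242,880, the auto-computed parallelism
  // is 50,000,000 / 5,242,880 + 1 = 10 join partitions.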

  /**
   * Its crucial to pick the right parallelism.
   *
   * <p>totalSubPartitions : this is deemed a safe limit, to be nice with Spark. inputParallelism :
   * typically the number of input file splits.
   *
   * <p>We pick the max such that we are always safe, but go higher if, say, there are a lot of
   * input files (otherwise, we will fall back to the number of partitions in the input and end up
   * with slow performance).
   */
  private int determineParallelism(int inputParallelism, int totalSubPartitions) {
    // If bloom index parallelism is set, use it to check against the input parallelism and
    // take the max
    int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
    int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
    logger.info("InputParallelism: ${" + inputParallelism + "}, "
        + "IndexParallelism: ${" + config.getBloomIndexParallelism() + "}, "
        + "TotalSubParts: ${" + totalSubPartitions + "}, "
        + "Join Parallelism set to : " + joinParallelism);
    return joinParallelism;
  }

  /**
   * Load all involved files as a (partition, file info) pair RDD.
   */
  @VisibleForTesting
  List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions,
      final HoodieTable hoodieTable) {
    // Obtain the latest data files from all the partitions.
    List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
        .parallelize(partitions, Math.max(partitions.size(), 1)).flatMapToPair(partitionPath -> {
          java.util.Optional<HoodieInstant> latestCommitTime = hoodieTable.getCommitsTimeline()
              .filterCompletedInstants().lastInstant();
          List<Tuple2<String, HoodieDataFile>> filteredFiles = new ArrayList<>();
          if (latestCommitTime.isPresent()) {
            filteredFiles = hoodieTable.getROFileSystemView()
                .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp())
                .map(f -> new Tuple2<>(partitionPath, f)).collect(toList());
          }
          return filteredFiles.iterator();
        }).collect();

    if (config.getBloomIndexPruneByRanges()) {
      // also obtain file ranges, if range pruning is enabled
      return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)).mapToPair(ft -> {
        try {
          String[] minMaxKeys = ParquetUtils
              .readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath());
          return new Tuple2<>(ft._1(),
              new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
        } catch (MetadataNotFoundException me) {
          logger.warn("Unable to find range metadata in file :" + ft._2());
          return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
        }
      }).collect();
    } else {
      return dataFilesList.stream()
          .map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())))
          .collect(toList());
    }
  }

  @Override
  public boolean rollbackCommit(String commitTime) {
    // Nope, don't need to do anything.
    return true;
  }

  /**
   * This is not global, since we depend on the partitionPath to do the lookup.
   */
  @Override
  public boolean isGlobal() {
    return false;
  }

  /**
   * No indexes into log files yet.
   */
  @Override
  public boolean canIndexLogFiles() {
    return false;
  }

  /**
   * Bloom filters are stored in the same data files.
   */
  @Override
  public boolean isImplicitWithStorage() {
    return true;
  }

  /**
   * If we don't have key ranges, then we also need to compare against the file; there is no other
   * choice. If we do, then only compare the file if the record key falls in its range.
   */
  private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
    return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
  }
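  // Illustration of the range check above (keys hypothetical): if a file's index info records a
  // min/max record key of "0001"/"0999", an incoming key "1500" is outside the range and the file
  // is skipped without touching its bloom filter, while "0500" falls in range and proceeds to the
  // bloom filter / row-key check.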
  /**
   * For each incoming record, produce N output records, one for each file against which the
   * record's key needs to be checked. For datasets where the keys have a definite insert order
   * (e.g. timestamp as prefix), the number of files to be compared gets cut down a lot by range
   * pruning.
   */
  // sub-partition to ensure the records can be looked up against files & also prune
  // file<=>record comparisons based on recordKey ranges in the index info.
  @VisibleForTesting
  JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
      final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
      JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
    return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
      String recordKey = partitionRecordKeyPair._2();
      String partitionPath = partitionRecordKeyPair._1();

      List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
      List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
      if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
        // for each candidate file in the partition that needs to be compared.
        for (BloomIndexFileInfo indexInfo : indexInfos) {
          if (shouldCompareWithFile(indexInfo, recordKey)) {
            recordComparisons.add(
                new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
                    new Tuple2<>(indexInfo.getFileName(),
                        new HoodieKey(recordKey, partitionPath))));
          }
        }
      }
      return recordComparisons;
    }).flatMapToPair(t -> t.iterator());
  }
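  // Note on the composite key built above: "fileName#recordKey" sorts primarily by file name, so
  // after sortByKey in findMatchingFilesForRecordKeys all candidate keys for a given file sit next
  // to each other and can be checked against that file within a single partition.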

  /**
   * Find out the (rowKey, fileName) pairs. All workload is grouped at the file level.
   *
   * <p>Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition
   * such that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2)
   * load rowKeys, (3) tag rowKey.
   *
   * <p>Make sure the parallelism is at least the groupby parallelism for tagging location.
   */
  @VisibleForTesting
  JavaPairRDD<String, String> findMatchingFilesForRecordKeys(HoodieTable hoodieTable,
      final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
      JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions) {

    int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
        totalSubpartitions);

    JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD =
        explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
            // sort further based on filename, such that all checking for the file can happen
            // within a single partition, on-the-fly
            .sortByKey(true, joinParallelism);

    return fileSortedTripletRDD.mapPartitionsWithIndex(
        new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true)
        .flatMap(indexLookupResults -> indexLookupResults.iterator())
        .filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
        .flatMapToPair(lookupResult -> {
          List<Tuple2<String, String>> vals = new ArrayList<>();
          for (String recordKey : lookupResult.getMatchingRecordKeys()) {
            vals.add(new Tuple2<>(recordKey, lookupResult.getFileName()));
          }
          return vals.iterator();
        });
  }

  /**
   * Tag the (rowKey, fileName) pairs back to the original HoodieRecord RDD.
   */
  private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
      JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
    JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
        .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));

    // Here, as the recordRDD might have more data than the rowKeyRDD (some rowKeys' fileId is
    // null), we do a left outer join.
    return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
      HoodieRecord<T> record = v1._1();
      if (v1._2().isPresent()) {
        String filename = v1._2().get();
        if (filename != null && !filename.isEmpty()) {
          // When a record exists in multiple files in the same partition, rowKeyRecordPairRDD will
          // have two entries with the exact same in-memory copy of the HoodieRecord and the two
          // separate filenames that the record is found in. This would result in setting
          // currentLocation twice and failing the second time, so this check creates a new
          // in-memory copy of the hoodie record.
          if (record.getCurrentLocation() != null) {
            record = new HoodieRecord<>(record.getKey(), record.getData());
          }
          record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename),
              FSUtils.getFileId(filename)));
        }
      }
      return record;
    });
  }

  @Override
  public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
      HoodieTable<T> hoodieTable) {
    return writeStatusRDD;
  }
}
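// A minimal usage sketch (illustrative; the payload type, record RDD, write config and table
// setup are hypothetical and assumed to be created elsewhere):
//
//   HoodieBloomIndex<MyPayload> index = new HoodieBloomIndex<>(writeConfig, jsc);
//   JavaRDD<HoodieRecord<MyPayload>> tagged = index.tagLocation(incomingRecords, hoodieTable);
//   // records whose currentLocation is set were found in existing files (updates);
//   // the rest are new inserts.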




