/*
 *  Copyright (c) 2017 Uber Technologies, Inc. ([email protected])
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 */

package com.uber.hoodie.index.bloom;

import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toList;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.MetadataNotFoundException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

/**
 * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
 * its metadata.
 */
public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {

  // we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476)
  private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
  // this is how much a triplet of (partitionPath, fileId, recordKey) costs.
  private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
  private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
  private static int MAX_ITEMS_PER_SHUFFLE_PARTITION =
      SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
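  // With the constants above, MAX_ITEMS_PER_SHUFFLE_PARTITION works out to
  // 1500 * 1024 * 1024 / 300 = 5,242,880, i.e. roughly 5.24 million
  // (partitionPath, fileId, recordKey) triplets per shuffle partition.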

  public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
    super(config, jsc);
  }

  @Override
  public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
      final HoodieTable<T> hoodieTable) {

    // Step 0: cache the input record RDD
    if (config.getBloomIndexUseCaching()) {
      recordRDD.persist(config.getBloomIndexInputStorageLevel());
    }

    // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
    JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
        .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
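    // For illustration (values hypothetical): a record in partition "2017/03/15" with record key
    // "uuid-0001" becomes the pair ("2017/03/15", "uuid-0001"); only these two strings, not the
    // full record payload, flow through the index lookup below.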

    // Lookup indexes for all the partition/recordkey pair
    JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
        hoodieTable);

    // Cache the result, for subsequent stages.
    if (config.getBloomIndexUseCaching()) {
      rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
    }
    if (logger.isDebugEnabled()) {
      long totalTaggedRecords = rowKeyFilenamePairRDD.count();
      logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
    }

    // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
    // Cost: 4 sec.
    JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD,
        recordRDD);

    if (config.getBloomIndexUseCaching()) {
      recordRDD.unpersist(); // unpersist the input Record RDD
      rowKeyFilenamePairRDD.unpersist();
    }

    return taggedRecordRDD;
  }

  public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
      final HoodieTable<T> table) {
    JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
        .mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));

    // Lookup indexes for all the partition/recordkey pair
    JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD,
        table);

    JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD = hoodieKeys
        .mapToPair(key -> new Tuple2<>(key.getRecordKey(), key));

    return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair(keyPathTuple -> {
      Optional<String> recordLocationPath;
      if (keyPathTuple._2._2.isPresent()) {
        String fileName = keyPathTuple._2._2.get();
        String partitionPath = keyPathTuple._2._1.getPartitionPath();
        recordLocationPath = Optional
            .of(new Path(new Path(table.getMetaClient().getBasePath(), partitionPath), fileName)
                .toUri().getPath());
      } else {
        recordLocationPath = Optional.absent();
      }
      return new Tuple2<>(keyPathTuple._2._1, recordLocationPath);
    });
  }
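  // Note on fetchRecordLocation's result: keys found in the index map to the full data file path
  // (basePath/partitionPath/fileName), while keys with no match map to Optional.absent(), so
  // callers can tell existing records apart from new ones.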

  /**
   * Lookup the location for each record key and return the (recordKey, fileName) pair for all
   * record keys already present, dropping the record keys that are not present.
   */
  private JavaPairRDD<String, String> lookupIndex(
      JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable hoodieTable) {
    // Obtain records per partition, in the incoming records
    Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
    List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());

    // Step 2: Load all involved files as (partition, file info) pairs
    List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(
        affectedPartitionPathList, hoodieTable);
    final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
        .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
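    // For example (file names hypothetical): [("2017/03/15", f1), ("2017/03/15", f2),
    // ("2017/03/16", f3)] is grouped into {"2017/03/15" -> [f1, f2], "2017/03/16" -> [f3]},
    // i.e. each affected partition path maps to the BloomIndexFileInfo of its candidate files.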

    // Step 3: Obtain an RDD that, for each incoming record that already exists, carries the id
    // of the file containing it.
    int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
        partitionRecordKeyPairRDD);
    return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo,
        partitionRecordKeyPairRDD, parallelism);
  }

  /**
   * The index lookup can be skewed in three dimensions : #files, #partitions, #records
   *
   * <p>To be able to smoothly handle skews, we need to compute how to split each partition into
   * subpartitions. We do it here, in a way that keeps the amount of each Spark join partition to
   * < 2GB.
   *
   * <p>If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is
   * specified as a NON-zero number, then that is used explicitly.
   */
  private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
      final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
      JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
    long totalComparisons = 0;
    if (config.getBloomIndexPruneByRanges()) {
      // we will just try exploding the input and then count to determine comparisons
      totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
          partitionRecordKeyPairRDD).count();
    } else {
      // if not pruning by ranges, then each file in a partition needs to be compared against all
      // records for that partition.
      Map<String, Long> filesPerPartition = partitionToFileInfo.entrySet().stream()
          .collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size())));
      long totalFiles = 0;
      long totalRecords = 0;
      for (String partitionPath : recordsPerPartition.keySet()) {
        long numRecords = recordsPerPartition.get(partitionPath);
        long numFiles = filesPerPartition.containsKey(partitionPath)
            ? filesPerPartition.get(partitionPath) : 1L;
        totalComparisons += numFiles * numRecords;
        totalFiles += filesPerPartition.containsKey(partitionPath)
            ? filesPerPartition.get(partitionPath) : 0L;
        totalRecords += numRecords;
      }
      logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles
          + ", TotalAffectedPartitions:" + recordsPerPartition.size());
    }

    // each partition will have an item per comparison.
    int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
    logger.info(
        "Auto computed parallelism :" + parallelism + ", totalComparisons: " + totalComparisons);
    return parallelism;
  }
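  // Worked example (numbers illustrative only): with ~50 million candidate (file, recordKey)
  // comparisons and MAX_ITEMS_PER_SHUFFLE_PARTITION of 5,242,880, the auto-computed parallelism
  // is 50,000,000 / 5,242,880 + 1 = 10 join partitions.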

  /**
   * Its crucial to pick the right parallelism.
   *
   * <p>totalSubPartitions : this is deemed a safe limit, to be nice with Spark. inputParallelism :
   * typically the number of input file splits.
   *
   * <p>We pick the max such that we are always safe, but go higher if, say, there are a lot of
   * input files (otherwise, we will fall back to the number of partitions in the input and end up
   * with slow performance).
   */
  private int determineParallelism(int inputParallelism, int totalSubPartitions) {
    // If bloom index parallelism is set, use it to check against the input parallelism and
    // take the max
    int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
    int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
    logger.info("InputParallelism: ${" + inputParallelism + "}, "
        + "IndexParallelism: ${" + config.getBloomIndexParallelism() + "}, "
        + "TotalSubParts: ${" + totalSubPartitions + "}, "
        + "Join Parallelism set to : " + joinParallelism);
    return joinParallelism;
  }

  /**
   * Load all involved files as a (partition, file info) pair RDD.
   */
  @VisibleForTesting
  List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions,
      final HoodieTable hoodieTable) {
    // Obtain the latest data files from all the partitions.
    List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc
        .parallelize(partitions, Math.max(partitions.size(), 1)).flatMapToPair(partitionPath -> {
          java.util.Optional<HoodieInstant> latestCommitTime = hoodieTable.getCommitsTimeline()
              .filterCompletedInstants().lastInstant();
          List<Tuple2<String, HoodieDataFile>> filteredFiles = new ArrayList<>();
          if (latestCommitTime.isPresent()) {
            filteredFiles = hoodieTable.getROFileSystemView()
                .getLatestDataFilesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp())
                .map(f -> new Tuple2<>(partitionPath, f)).collect(toList());
          }
          return filteredFiles.iterator();
        }).collect();

    if (config.getBloomIndexPruneByRanges()) {
      // also obtain file ranges, if range pruning is enabled
      return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1)).mapToPair(ft -> {
        try {
          String[] minMaxKeys = ParquetUtils
              .readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath());
          return new Tuple2<>(ft._1(),
              new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
        } catch (MetadataNotFoundException me) {
          logger.warn("Unable to find range metadata in file :" + ft._2());
          return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
        }
      }).collect();
    } else {
      return dataFilesList.stream()
          .map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())))
          .collect(toList());
    }
  }

  @Override
  public boolean rollbackCommit(String commitTime) {
    // Nope, don't need to do anything.
    return true;
  }

  /**
   * This is not global, since we depend on the partitionPath to do the lookup.
   */
  @Override
  public boolean isGlobal() {
    return false;
  }

  /**
   * No indexes into log files yet.
   */
  @Override
  public boolean canIndexLogFiles() {
    return false;
  }

  /**
   * Bloom filters are stored in the same data files.
   */
  @Override
  public boolean isImplicitWithStorage() {
    return true;
  }

  /**
   * If we don't have key ranges, then we also need to compare against the file; there is no other
   * choice. If we do, then only compare the file if the record key falls in its range.
   */
  private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
    return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
  }
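  // Illustration of the range check above (keys hypothetical): if a file's index info records a
  // min/max record key of "0001"/"0999", an incoming key "1500" is outside the range and the file
  // is skipped without touching its bloom filter, while "0500" falls in range and proceeds to the
  // bloom filter / row-key check.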
  /**
   * For each incoming record, produce N output records, one for each file against which the
   * record's key needs to be checked. For datasets where the keys have a definite insert order
   * (e.g. timestamp as prefix), the number of files to be compared gets cut down a lot by range
   * pruning.
   */
  // sub-partition to ensure the records can be looked up against files & also prune
  // file<=>record comparisons based on recordKey ranges in the index info.
  @VisibleForTesting
  JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
      final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
      JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
    return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
      String recordKey = partitionRecordKeyPair._2();
      String partitionPath = partitionRecordKeyPair._1();

      List<BloomIndexFileInfo> indexInfos = partitionToFileIndexInfo.get(partitionPath);
      List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
      if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
        // for each candidate file in the partition that needs to be compared.
        for (BloomIndexFileInfo indexInfo : indexInfos) {
          if (shouldCompareWithFile(indexInfo, recordKey)) {
            recordComparisons.add(
                new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
                    new Tuple2<>(indexInfo.getFileName(),
                        new HoodieKey(recordKey, partitionPath))));
          }
        }
      }
      return recordComparisons;
    }).flatMapToPair(t -> t.iterator());
  }
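  // Note on the composite key built above: "fileName#recordKey" sorts primarily by file name, so
  // after sortByKey in findMatchingFilesForRecordKeys all candidate keys for a given file sit next
  // to each other and can be checked against that file within a single partition.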

  /**
   * Find out the (rowKey, fileName) pairs. All workload is grouped at the file level.
   *
   * <p>Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition
   * such that each RDD partition is a file, then for each file, we do (1) load bloom filter, (2)
   * load rowKeys, (3) tag rowKey.
   *
   * <p>Make sure the parallelism is at least the groupby parallelism for tagging location.
   */
  @VisibleForTesting
  JavaPairRDD<String, String> findMatchingFilesForRecordKeys(HoodieTable hoodieTable,
      final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
      JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions) {

    int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
        totalSubpartitions);

    JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD =
        explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
            // sort further based on filename, such that all checking for the file can happen
            // within a single partition, on-the-fly
            .sortByKey(true, joinParallelism);

    return fileSortedTripletRDD.mapPartitionsWithIndex(
        new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true)
        .flatMap(indexLookupResults -> indexLookupResults.iterator())
        .filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
        .flatMapToPair(lookupResult -> {
          List<Tuple2<String, String>> vals = new ArrayList<>();
          for (String recordKey : lookupResult.getMatchingRecordKeys()) {
            vals.add(new Tuple2<>(recordKey, lookupResult.getFileName()));
          }
          return vals.iterator();
        });
  }

  /**
   * Tag the (rowKey, fileName) pairs back to the original HoodieRecord RDD.
   */
  private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
      JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
    JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
        .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));

    // Here, as the recordRDD might have more data than the rowKeyRDD (some rowKeys' fileId is
    // null), we do a left outer join.
    return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
      HoodieRecord<T> record = v1._1();
      if (v1._2().isPresent()) {
        String filename = v1._2().get();
        if (filename != null && !filename.isEmpty()) {
          // When a record exists in multiple files in the same partition, rowKeyRecordPairRDD will
          // have two entries with the exact same in-memory copy of the HoodieRecord and the two
          // separate filenames that the record is found in. This would result in setting
          // currentLocation twice and failing the second time, so this check creates a new
          // in-memory copy of the hoodie record.
          if (record.getCurrentLocation() != null) {
            record = new HoodieRecord<>(record.getKey(), record.getData());
          }
          record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename),
              FSUtils.getFileId(filename)));
        }
      }
      return record;
    });
  }

  @Override
  public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
      HoodieTable<T> hoodieTable) {
    return writeStatusRDD;
  }
}
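// A minimal usage sketch (illustrative; the payload type, record RDD, write config and table
// setup are hypothetical and assumed to be created elsewhere):
//
//   HoodieBloomIndex<MyPayload> index = new HoodieBloomIndex<>(writeConfig, jsc);
//   JavaRDD<HoodieRecord<MyPayload>> tagged = index.tagLocation(incomingRecords, hoodieTable);
//   // records whose currentLocation is set were found in existing files (updates);
//   // the rest are new inserts.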




