/*
* Copyright (c) 2017 Uber Technologies, Inc. ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package com.uber.hoodie.index.bloom;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordLocation;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.MetadataNotFoundException;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.table.HoodieTable;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.mapping;
import static java.util.stream.Collectors.toList;
/**
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in
* its metadata.
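*
* A minimal usage sketch (hypothetical driver code; {@code MyPayload} stands in for any
* {@link HoodieRecordPayload} implementation, and {@code config}, {@code jsc}, {@code records}
* and {@code hoodieTable} are assumed to be built elsewhere):
* <pre>{@code
*   HoodieBloomIndex<MyPayload> index = new HoodieBloomIndex<>(config, jsc);
*   JavaRDD<HoodieRecord<MyPayload>> tagged = index.tagLocation(records, hoodieTable);
*   // records found in existing files now carry a location (updates); the rest are inserts
* }</pre>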
*/
public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex<T> {
private static final Logger logger = LogManager.getLogger(HoodieBloomIndex.class);
// we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476)
private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024;
// this is how much a triplet of (partitionPath, fileId, recordKey) costs.
private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300;
private static final int MAX_ITEMS_PER_SHUFFLE_PARTITION = SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET;
public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) {
super(config, jsc);
}
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, final HoodieTable<T> hoodieTable) {
// Step 0: cache the input record RDD
if (config.getBloomIndexUseCaching()) {
recordRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
}
// Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, hoodieTable);
// Cache the result, for subsequent stages.
if (config.getBloomIndexUseCaching()) {
rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
}
if (logger.isDebugEnabled()) {
long totalTaggedRecords = rowKeyFilenamePairRDD.count();
logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
}
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
// Cost: 4 sec.
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD);
if (config.getBloomIndexUseCaching()) {
recordRDD.unpersist(); // unpersist the input Record RDD
rowKeyFilenamePairRDD.unpersist();
}
return taggedRecordRDD;
}
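/**
 * For each given {@link HoodieKey}, returns the full path of the latest data file containing
 * that key, or {@link Optional#absent()} if the key is not present in any data file.
 */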
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(
JavaRDD<HoodieKey> hoodieKeys, final HoodieTable<T> table) {
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey()));
// Lookup indexes for all the partition/recordkey pair
JavaPairRDD<String, String> rowKeyFilenamePairRDD =
lookupIndex(partitionRecordKeyPairRDD, table);
JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD =
hoodieKeys.mapToPair(key -> new Tuple2<>(key.getRecordKey(), key));
return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD)
.mapToPair(keyPathTuple -> {
Optional<String> recordLocationPath;
if (keyPathTuple._2._2.isPresent()) {
String fileName = keyPathTuple._2._2.get();
String partitionPath = keyPathTuple._2._1.getPartitionPath();
recordLocationPath = Optional.of(new Path(
new Path(table.getMetaClient().getBasePath(), partitionPath),
fileName).toUri().getPath());
} else {
recordLocationPath = Optional.absent();
}
return new Tuple2<>(keyPathTuple._2._1, recordLocationPath);
});
}
/**
 * Lookup the location for each record key and return the <recordKey, fileName> pair for all
 * record keys already present; record keys not present in any file are dropped.
 */
private JavaPairRDD<String, String> lookupIndex(
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieTable hoodieTable) {
// Obtain records per partition, in the incoming records
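// (countByKey is a Spark action; it triggers computation of the upstream RDD here)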
Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
// Step 2: Load all involved files as pairs
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, hoodieTable);
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream()
.collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
// Step 3: Obtain an RDD that maps each incoming record which already exists to the file id that contains it.
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism);
}
/**
 * The index lookup can be skewed in three dimensions : #files, #partitions, #records
 *
 * To be able to smoothly handle skews, we need to compute how to split each partition into
 * subpartitions. We do it here, in a way that keeps the amount of data in each Spark join
 * partition under 2GB.
 *
 * If {@link com.uber.hoodie.config.HoodieIndexConfig#BLOOM_INDEX_PARALLELISM_PROP} is specified
 * as a non-zero number, then that value is used explicitly.
 */
private int autoComputeParallelism(final Map<String, Long> recordsPerPartition,
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
long totalComparisons = 0;
if (config.getBloomIndexPruneByRanges()) {
// we will just try exploding the input and then count to determine comparisons
totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count();
} else {
// if not pruning by ranges, then each file in a partition needs to be compared against all
// records for that partition.
Map<String, Long> filesPerPartition = partitionToFileInfo.entrySet().stream()
.collect(Collectors.toMap(Map.Entry::getKey, e -> Long.valueOf(e.getValue().size())));
long totalFiles = 0, totalRecords = 0;
for (String partitionPath : recordsPerPartition.keySet()) {
long numRecords = recordsPerPartition.get(partitionPath);
long numFiles = filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 1L;
totalComparisons += numFiles * numRecords;
totalFiles += filesPerPartition.containsKey(partitionPath) ? filesPerPartition.get(partitionPath) : 0L;
totalRecords += numRecords;
}
logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + ", TotalAffectedPartitions:" + recordsPerPartition.size());
}
// each partition will have an item per comparison.
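// e.g. (hypothetical numbers) 100 million comparisons / ~5.24 million items per partition + 1 => 20 partitions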
int parallelism = (int) (totalComparisons / MAX_ITEMS_PER_SHUFFLE_PARTITION + 1);
logger.info("Auto computed parallelism: " + parallelism + ", totalComparisons: " + totalComparisons);
return parallelism;
}
/**
 * It's crucial to pick the right parallelism.
 *
 * totalSubPartitions : this is deemed a safe limit, to be nice to Spark.
 * inputParallelism : typically the number of input file splits
 *
 * We pick the max of the two, so that we are always safe, but go higher if, say, there are a
 * lot of input files (otherwise, we would fall back to the number of partitions in the input
 * and end up with slow performance).
 */
private int determineParallelism(int inputParallelism, int totalSubPartitions) {
// If bloom index parallelism is set, use it to check against the input parallelism and take the max
int indexParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
int joinParallelism = Math.max(totalSubPartitions, indexParallelism);
logger.info("InputParallelism: ${" + inputParallelism + "}, " +
"IndexParallelism: ${" + config.getBloomIndexParallelism() + "}, " +
"TotalSubParts: ${" + totalSubPartitions + "}, " +
"Join Parallelism set to : " + joinParallelism);
return joinParallelism;
}
/**
* Load all involved files as pair RDD.
*/
@VisibleForTesting
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final HoodieTable hoodieTable) {
// Obtain the latest data files from all the partitions.
List<Tuple2<String, HoodieDataFile>> dataFilesList = jsc.parallelize(partitions, Math.max(partitions.size(), 1))
.flatMapToPair(partitionPath -> {
java.util.Optional<HoodieInstant> latestCommitTime =
hoodieTable.getCommitTimeline().filterCompletedInstants().lastInstant();
List> filteredFiles = new ArrayList<>();
if (latestCommitTime.isPresent()) {
filteredFiles =
hoodieTable.getROFileSystemView().getLatestDataFilesBeforeOrOn(partitionPath,
latestCommitTime.get().getTimestamp())
.map(f -> new Tuple2<>(partitionPath, f))
.collect(toList());
}
return filteredFiles.iterator();
}).collect();
if (config.getBloomIndexPruneByRanges()) {
// also obtain file ranges, if range pruning is enabled
return jsc.parallelize(dataFilesList, Math.max(dataFilesList.size(), 1))
.mapToPair(ft -> {
try {
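// min/max record keys are stored in the parquet footer metadata of each data file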
String[] minMaxKeys = ParquetUtils.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
} catch (MetadataNotFoundException me) {
logger.warn("Unable to find range metadata in file :" + ft._2());
return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()));
}
}).collect();
} else {
return dataFilesList.stream()
.map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())))
.collect(toList());
}
}
@Override
public boolean rollbackCommit(String commitTime) {
// Nope, don't need to do anything.
return true;
}
/**
 * This index is not global, since we depend on the partitionPath to do the lookup.
 */
@Override
public boolean isGlobal() {
return false;
}
/**
 * No indexes into log files yet.
 */
@Override
public boolean canIndexLogFiles() {
return false;
}
/**
 * Bloom filters are stored within the same data files.
 */
@Override
public boolean isImplicitWithStorage() {
return true;
}
/**
 * If we don't have key ranges, we have no choice but to compare the record against the file.
 * If we do, then only compare against the file if the record key falls within its range.
 */
private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
return !indexInfo.hasKeyRanges() || indexInfo.isKeyInRange(recordKey);
}
/**
 * For each incoming record, produce N output records, one for each file against which the
 * record's key needs to be checked. For datasets where the keys have a definite insert order
 * (e.g: timestamp as prefix), range pruning cuts the number of files to be compared down
 * considerably.
 *
 * We sub-partition to ensure the records can be looked up against files, and also prune
 * file<=>record comparisons based on the recordKey ranges in the index info.
 */
@VisibleForTesting
JavaPairRDD<String, Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
return partitionRecordKeyPairRDD
.map(partitionRecordKeyPair -> {
String recordKey = partitionRecordKeyPair._2();
String partitionPath = partitionRecordKeyPair._1();
List indexInfos = partitionToFileIndexInfo.get(partitionPath);
List<Tuple2<String, Tuple2<String, HoodieKey>>> recordComparisons = new ArrayList<>();
if (indexInfos != null) { // could be null, if there are no files in a given partition yet.
// for each candidate file in partition, that needs to be compared.
for (BloomIndexFileInfo indexInfo : indexInfos) {
if (shouldCompareWithFile(indexInfo, recordKey)) {
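// key as "<fileName>#<recordKey>", so a later sortByKey clusters all checks for one file together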
recordComparisons.add(
new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath))));
}
}
}
return recordComparisons;
})
.flatMapToPair(t -> t.iterator());
}
/**
 * Find out the <rowKey, fileName> pairs. All workload is grouped at the file level.
 *
 * Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition
 * such that each RDD partition is a file, and then, for each file, we do
 * (1) load bloom filter,
 * (2) load rowKeys,
 * (3) tag rowKey
 *
 * Make sure the parallelism is at least the group-by parallelism for tagging location.
 */
@VisibleForTesting
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD,
int totalSubpartitions) {
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions);
JavaPairRDD<String, Tuple2<String, HoodieKey>> fileSortedTripletRDD = explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
// sort further based on filename, such that all checking for the file can happen within a single partition, on-the-fly
.sortByKey(true, joinParallelism);
return fileSortedTripletRDD
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true)
.flatMap(indexLookupResults -> indexLookupResults.iterator())
.filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
.flatMapToPair(lookupResult -> {
List> vals = new ArrayList<>();
for (String recordKey : lookupResult.getMatchingRecordKeys()) {
vals.add(new Tuple2<>(recordKey, lookupResult.getFileName()));
}
return vals.iterator();
});
}
/**
 * Tag the <rowKey, fileName> pairs back to the original HoodieRecord RDD.
 */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(JavaPairRDD<String, String> rowKeyFilenamePairRDD,
JavaRDD<HoodieRecord<T>> recordRDD) {
JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
// As the recordRDD may have more records than the rowKeyRDD (some rowKeys' fileId is null), we do a left outer join.
return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(
v1 -> {
HoodieRecord<T> record = v1._1();
if (v1._2().isPresent()) {
String filename = v1._2().get();
if (filename != null && !filename.isEmpty()) {
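// data file names encode the commit time and fileId, which FSUtils parses back out here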
record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename),
FSUtils.getFileId(filename)));
}
}
return record;
}
);
}
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> hoodieTable) {
return writeStatusRDD;
}
}