All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.uber.hoodie.index.HoodieBloomIndexCheckFunction Maven / Gradle / Ivy

There is a newer version: 0.4.7
Show newest version
/*
 * Copyright (c) 2016 Uber Technologies, Inc. ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.index;

import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.func.LazyIterableIterator;

import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import scala.Tuple2;

/**
 * Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the
 * actual files
 */
public class HoodieBloomIndexCheckFunction implements Function2>>, Iterator>> {

    private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);

    private final String basePath;

    public HoodieBloomIndexCheckFunction(String basePath) {
        this.basePath = basePath;
    }

    /**
     * Given a list of row keys and one file, return only row keys existing in that file.
     */
    public static List checkCandidatesAgainstFile(List candidateRecordKeys, Path filePath) throws HoodieIndexException {
        List foundRecordKeys = new ArrayList<>();
        try {
            // Load all rowKeys from the file, to double-confirm
            if (!candidateRecordKeys.isEmpty()) {
                Set fileRowKeys = ParquetUtils.readRowKeysFromParquet(filePath);
                logger.info("Loading " + fileRowKeys.size() + " row keys from " + filePath);
                if (logger.isDebugEnabled()) {
                    logger.debug("Keys from " + filePath + " => " + fileRowKeys);
                }
                for (String rowKey : candidateRecordKeys) {
                    if (fileRowKeys.contains(rowKey)) {
                        foundRecordKeys.add(rowKey);
                    }
                }
                logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys);
                if (logger.isDebugEnabled()) {
                    logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
                }
            }
        } catch (Exception e){
            throw new HoodieIndexException("Error checking candidate keys against file.", e);
        }
        return foundRecordKeys;
    }

    class LazyKeyCheckIterator extends LazyIterableIterator>, List> {

        private List candidateRecordKeys;

        private BloomFilter bloomFilter;

        private String currentFile;

        private String currentParitionPath;

        LazyKeyCheckIterator(Iterator>> fileParitionRecordKeyTripletItr) {
            super(fileParitionRecordKeyTripletItr);
            currentFile = null;
            candidateRecordKeys = new ArrayList<>();
            bloomFilter = null;
            currentParitionPath = null;
        }

        @Override
        protected void start() {
        }

        private void initState(String fileName, String partitionPath) throws HoodieIndexException {
            try {
                Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName);
                bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(filePath);
                candidateRecordKeys = new ArrayList<>();
                currentFile = fileName;
                currentParitionPath = partitionPath;
            } catch (Exception e) {
                throw new HoodieIndexException("Error checking candidate keys against file.", e);
            }
        }

        @Override
        protected List computeNext() {

            List ret = new ArrayList<>();
            try {
                // process one file in each go.
                while (inputItr.hasNext()) {

                    Tuple2> currentTuple = inputItr.next();
                    String fileName = currentTuple._2._1;
                    String partitionPath = currentTuple._2._2.getPartitionPath();
                    String recordKey = currentTuple._2._2.getRecordKey();

                    // lazily init state
                    if (currentFile == null) {
                        initState(fileName, partitionPath);
                    }

                    // if continue on current file)
                    if (fileName.equals(currentFile)) {
                        // check record key against bloom filter of current file & add to possible keys if needed
                        if (bloomFilter.mightContain(recordKey)) {
                            if (logger.isDebugEnabled()) {
                                logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName);
                            }
                            candidateRecordKeys.add(recordKey);
                        }
                    } else {
                        // do the actual checking of file & break out
                        Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
                        logger.info("#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
                        if (logger.isDebugEnabled()) {
                            logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
                        }
                        ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));

                        initState(fileName, partitionPath);
                        if (bloomFilter.mightContain(recordKey)) {
                            if (logger.isDebugEnabled()) {
                                logger.debug("#2 Adding " + recordKey + " as candidate for file " + fileName);
                            }
                            candidateRecordKeys.add(recordKey);
                        }
                        break;
                    }
                }

                // handle case, where we ran out of input, finish pending work, update return val
                if (!inputItr.hasNext()) {
                    Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
                    logger.info("#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath);
                    if (logger.isDebugEnabled()) {
                        logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
                    }
                    ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
                }

            } catch (Throwable e) {
                if (e instanceof HoodieException) {
                    throw e;
                }
                throw new HoodieIndexException("Error checking bloom filter index. ", e);
            }

            return ret;
        }

        @Override
        protected void end() {
        }
    }


    @Override
    public Iterator> call(Integer partition,
                                                  Iterator>> fileParitionRecordKeyTripletItr) throws Exception {
        return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy