/*
* Copyright (c) 2017 Uber Technologies, Inc. ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package com.uber.hoodie.index.bloom;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.func.LazyIterableIterator;
import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;
/**
 * Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the
 * actual files
 */
public class HoodieBloomIndexCheckFunction implements
    Function2<Integer, Iterator<Tuple2<String, Tuple2<String, HoodieKey>>>,
        Iterator<List<IndexLookupResult>>> {
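  // Illustrative usage (a sketch, not part of the original source): this function is meant to be
  // passed to JavaRDD.mapPartitionsWithIndex over an RDD whose tuples are grouped so that all
  // keys belonging to one file arrive consecutively. The variable names below
  // ("fileSortedKeysRDD", "hoodieTable", "basePath") are assumptions for the sketch:
  //
  //   JavaRDD<List<IndexLookupResult>> results = fileSortedKeysRDD
  //       .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, basePath), true);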
private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class);
private final String basePath;
private final HoodieTable table;
public HoodieBloomIndexCheckFunction(HoodieTable table, String basePath) {
this.table = table;
this.basePath = basePath;
}
/**
* Given a list of row keys and one file, return only row keys existing in that file.
*/
  public static List<String> checkCandidatesAgainstFile(Configuration configuration,
      List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
    List<String> foundRecordKeys = new ArrayList<>();
try {
// Load all rowKeys from the file, to double-confirm
if (!candidateRecordKeys.isEmpty()) {
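        // filterParquetRowKeys scans the record keys stored in the parquet file and returns only
        // those present in the candidate set, eliminating bloom filter false positives.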
        Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath,
            new HashSet<>(candidateRecordKeys));
foundRecordKeys.addAll(fileRowKeys);
logger.info("After checking with row keys, we have " + foundRecordKeys.size()
+ " results, for file " + filePath + " => " + foundRecordKeys);
if (logger.isDebugEnabled()) {
logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
}
}
} catch (Exception e) {
throw new HoodieIndexException("Error checking candidate keys against file.", e);
}
return foundRecordKeys;
}
@Override
  public Iterator<List<IndexLookupResult>> call(Integer partition,
      Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr)
throws Exception {
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
}
  class LazyKeyCheckIterator extends
      LazyIterableIterator<Tuple2<String, Tuple2<String, HoodieKey>>, List<IndexLookupResult>> {
    private List<String> candidateRecordKeys;
private BloomFilter bloomFilter;
private String currentFile;
private String currentParitionPath;
    LazyKeyCheckIterator(
        Iterator<Tuple2<String, Tuple2<String, HoodieKey>>> fileParitionRecordKeyTripletItr) {
super(fileParitionRecordKeyTripletItr);
currentFile = null;
candidateRecordKeys = new ArrayList<>();
bloomFilter = null;
currentParitionPath = null;
}
@Override
protected void start() {
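      // nothing to do here; per-file state is initialized lazily in computeNext()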
}
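    // Points the iterator at a new file: loads that file's bloom filter from the parquet footer
    // metadata and resets the candidate key buffer.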
private void initState(String fileName, String partitionPath) throws HoodieIndexException {
try {
Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName);
bloomFilter = ParquetUtils
.readBloomFilterFromParquetMetadata(table.getHadoopConf(), filePath);
candidateRecordKeys = new ArrayList<>();
currentFile = fileName;
currentParitionPath = partitionPath;
} catch (Exception e) {
throw new HoodieIndexException("Error checking candidate keys against file.", e);
}
}
@Override
    protected List<IndexLookupResult> computeNext() {
      List<IndexLookupResult> ret = new ArrayList<>();
try {
// process one file in each go.
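        // Tuples are expected to arrive grouped by file: keys for the current file are first
        // screened against its bloom filter, and once the file changes (or input ends) the
        // accumulated candidates are verified against the actual parquet file.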
while (inputItr.hasNext()) {
          Tuple2<String, Tuple2<String, HoodieKey>> currentTuple = inputItr.next();
String fileName = currentTuple._2._1;
String partitionPath = currentTuple._2._2.getPartitionPath();
String recordKey = currentTuple._2._2.getRecordKey();
// lazily init state
if (currentFile == null) {
initState(fileName, partitionPath);
}
          // if continuing on the current file
if (fileName.equals(currentFile)) {
// check record key against bloom filter of current file & add to possible keys if
// needed
if (bloomFilter.mightContain(recordKey)) {
if (logger.isDebugEnabled()) {
logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName);
}
candidateRecordKeys.add(recordKey);
}
} else {
// do the actual checking of file & break out
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info(
"#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) {
logger
.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(table.getHadoopConf(), candidateRecordKeys, filePath)));
initState(fileName, partitionPath);
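            // the record that triggered the file switch must itself be screened against the new
            // file's bloom filter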
if (bloomFilter.mightContain(recordKey)) {
if (logger.isDebugEnabled()) {
logger.debug("#2 Adding " + recordKey + " as candidate for file " + fileName);
}
candidateRecordKeys.add(recordKey);
}
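            // emit at most one file's result per computeNext() call to keep the iterator lazy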
break;
}
}
        // handle the case where we ran out of input: finish pending work for the last file and
        // update the return value
if (!inputItr.hasNext()) {
Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile);
logger.info(
"#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys
.size() + " for " + filePath);
if (logger.isDebugEnabled()) {
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(table.getHadoopConf(), candidateRecordKeys, filePath)));
}
} catch (Throwable e) {
if (e instanceof HoodieException) {
throw e;
}
throw new HoodieIndexException("Error checking bloom filter index. ", e);
}
return ret;
}
@Override
protected void end() {
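      // nothing to clean up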
}
}
}