org.apache.phoenix.iterate.TableSamplerPredicate Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phoenix-client
Phoenix Client
There is a newer version: 4.15.0-HBase-1.5
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.iterate;

import com.google.common.base.Predicate;

/**
 * TableSampler.
 * 
 * Rolls a die for every targeted row to decide whether that row is going
 * to be picked or not.
 * One application is table sampling: based on the boolean result, the row is
 * then picked (or rejected) as part of the sample set.
 * 
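 * Currently described as FNV1a with the lazy mod mapping method, to ensure
 * an even distribution of the hashed results, so that the size of the final
 * sample will be close to the expected size.
 * NOTE(review): the hashing helper in this class calls
 * {@code java.util.Arrays.hashCode}, not an FNV1a routine; confirm which is intended.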
 * 
 */
public class TableSamplerPredicate implements Predicate{
	/** Sampling rate this predicate was created with; the factory expects it on a 0 to 100 scale. */
	private final double tableSamplingRate;

	/**
	 * Stores the sampling rate as given, without any validation of its own.
	 * The constructor is private; the visible code creates instances through {@code of(Double)}.
	 *
	 * @param tableSamplingRate sampling rate to remember
	 */
	private TableSamplerPredicate(double tableSamplingRate) {
		this.tableSamplingRate = tableSamplingRate;
	}
		
	/**
	 * Creates a sampler predicate for the given sampling rate.
	 *
	 * The argument is validated with explicit checks rather than {@code assert}
	 * statements: assertions are skipped unless the JVM is started with
	 * {@code -ea}, which would let a null rate fail with an unexplained
	 * NullPointerException during unboxing and let out-of-range rates through
	 * unnoticed.
	 *
	 * @param tableSamplingRateRaw sampling rate, between 0 and 100 (both inclusive)
	 * @return a new predicate holding that rate
	 * @throws NullPointerException if {@code tableSamplingRateRaw} is null
	 * @throws IllegalArgumentException if the rate is NaN or outside 0 to 100
	 */
	public static TableSamplerPredicate of(final Double tableSamplingRateRaw) {
		if (tableSamplingRateRaw == null) {
			throw new NullPointerException("tableSamplingRate can not be null");
		}
		final double rate = tableSamplingRateRaw;
		// Negated range test on purpose: every comparison with NaN is false, so NaN is rejected too.
		if (!(rate >= 0d && rate <= 100d)) {
			throw new IllegalArgumentException("tableSamplingRate input has to be a rational number between 0 and 100");
		}
		return new TableSamplerPredicate(rate);
	}
	
	/**
	 * Decides whether the given bytes are picked for the sample.
	 * The bytes are hashed into a bucket by {@code FNV1LazyImpl}, and that bucket
	 * is passed to {@code evaluateWithChance}, whose verdict is returned unchanged.
	 *
	 * @param bytes content to hash
	 * @return the result of {@code evaluateWithChance} for the computed bucket
	 */
	@Override
	public boolean apply(byte[] bytes) {
		return evaluateWithChance(FNV1LazyImpl(bytes));
	}
	
	/**
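	 * Takes the built-in content hash of the bytes and then applies the lazy mod mapping method
	 * so that the hash is evenly distributed between 0 (inclusive) and 100 (exclusive).
	 * NOTE(review): despite the FNV1 name, the hash actually used is
	 * {@code java.util.Arrays.hashCode}; confirm whether FNV1a was intended.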
	 * 
	 * Quoted from http://isthe.com/chongo/tech/comp/fnv/, 
	 * The FNV hash is designed for hash sizes that are a power of 2. 
	 * If you need a hash size that is not a power of two, then you have two choices. 
	 * One method is called the lazy mod mapping method and the other is called the retry method. 
	 * Both involve mapping a range that is a power of 2 onto an arbitrary range.
	 * 
	 * Lazy mod mapping method: The lazy mod mapping method uses a simple mod on an n-bit hash 
	 * to yield an arbitrary range. 
	 * To produce a hash range between 0 and X use a n-bit FNV hash where n is smallest FNV hash 
	 * that will produce values larger than X without the need for xor-folding.
	 * 
	 * For example, to produce a value between 0 and 2142779559 using the lazy mod mapping method,
	 * we select a 32-bit FNV hash because: 2 power 32 > 2142779559.
	 * By contrast, the retry method would, before the final mod is performed,
	 * check whether the 32-bit FNV hash value is one of the upper biased values and,
	 * if it is, perform additional loop cycles until it is below the bias level;
	 * the lazy mod mapping method skips that check.
	 * 
	 * An advantage of the lazy mod mapping method is that it requires only 1 more operation: 
	 * only an additional mod is performed at the end.
	 * The disadvantage of the lazy mod mapping method is that there is a bias against 
	 * the larger values.
	 * 
	 * @param bytes
	 * @return
	 */
	private static final int FNV1LazyImpl(final byte[] bytes){
		// Hash the array contents (not the array reference), then fold the
		// 32-bit result into a bucket via the lazy mod mapping helper.
		return lazyRedistribute(java.util.Arrays.hashCode(bytes));
	}
	
	
	/**
	 * Lazy mod mapping method Implementation
	 * 
	 * The output should follow the same distribution as the input hashcode,
	 * but re-mapped into the range 0 (inclusive) to 100 (exclusive).
	 * 
	 * @param hashcode
	 * @return 
	 */
	private static final int lazyRedistribute(final int hashcode){
		// Java's % keeps the sign of the dividend, so the remainder lies in -99..99.
		final int signedRemainder = hashcode % 100;
		// Flip the sign after the mod; the remainder is small, so negation cannot overflow.
		return signedRemainder < 0 ? -signedRemainder : signedRemainder;
	}
	
	/**
	 * 
	 * @param hashcode
	 * @return
	 */
    final private boolean evaluateWithChance(final int hashcode){
    	assert((hashcode>=0)&&(hashcode<=100)):"hashcode should be re-distribute into 0 to 100";
    	return (hashcode