// com.yahoo.sketches.theta.DirectQuickSelectSketch Maven / Gradle / Ivy
/*
* Copyright 2015-16, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/
package com.yahoo.sketches.theta;
import static com.yahoo.sketches.Util.MIN_LG_ARR_LONGS;
import static com.yahoo.sketches.Util.MIN_LG_NOM_LONGS;
import static com.yahoo.sketches.Util.REBUILD_THRESHOLD;
import static com.yahoo.sketches.theta.PreambleUtil.BIG_ENDIAN_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.FLAGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.LG_RESIZE_FACTOR_BIT;
import static com.yahoo.sketches.theta.PreambleUtil.MAX_THETA_LONG_AS_DOUBLE;
import static com.yahoo.sketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.P_FLOAT;
import static com.yahoo.sketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.RETAINED_ENTRIES_INT;
import static com.yahoo.sketches.theta.PreambleUtil.SEED_HASH_SHORT;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG;
import static com.yahoo.sketches.theta.PreambleUtil.extractFamilyID;
import static com.yahoo.sketches.theta.PreambleUtil.extractFlags;
import static com.yahoo.sketches.theta.PreambleUtil.extractLgArrLongs;
import static com.yahoo.sketches.theta.PreambleUtil.extractLgNomLongs;
import static com.yahoo.sketches.theta.PreambleUtil.extractP;
import static com.yahoo.sketches.theta.PreambleUtil.extractPreLongs;
import static com.yahoo.sketches.theta.PreambleUtil.extractSeedHash;
import static com.yahoo.sketches.theta.PreambleUtil.extractSerVer;
import static com.yahoo.sketches.theta.PreambleUtil.extractThetaLong;
import static com.yahoo.sketches.theta.PreambleUtil.getMemBytes;
import static com.yahoo.sketches.theta.PreambleUtil.insertCurCount;
import static com.yahoo.sketches.theta.PreambleUtil.insertFamilyID;
import static com.yahoo.sketches.theta.PreambleUtil.insertFlags;
import static com.yahoo.sketches.theta.PreambleUtil.insertLgArrLongs;
import static com.yahoo.sketches.theta.PreambleUtil.insertLgNomLongs;
import static com.yahoo.sketches.theta.PreambleUtil.insertLgResizeFactor;
import static com.yahoo.sketches.theta.PreambleUtil.insertP;
import static com.yahoo.sketches.theta.PreambleUtil.insertPreLongs;
import static com.yahoo.sketches.theta.PreambleUtil.insertSeedHash;
import static com.yahoo.sketches.theta.PreambleUtil.insertSerVer;
import static com.yahoo.sketches.theta.PreambleUtil.insertThetaLong;
import static com.yahoo.sketches.theta.Rebuilder.actLgResizeFactor;
import static com.yahoo.sketches.theta.Rebuilder.moveAndResize;
import static com.yahoo.sketches.theta.Rebuilder.quickSelectAndRebuild;
import static com.yahoo.sketches.theta.Rebuilder.resize;
import static com.yahoo.sketches.theta.UpdateReturnState.InsertedCountIncremented;
import static com.yahoo.sketches.theta.UpdateReturnState.RejectedDuplicate;
import static com.yahoo.sketches.theta.UpdateReturnState.RejectedOverTheta;
import com.yahoo.memory.Memory;
import com.yahoo.memory.MemoryUtil;
import com.yahoo.memory.NativeMemory;
import com.yahoo.sketches.Family;
import com.yahoo.sketches.HashOperations;
import com.yahoo.sketches.ResizeFactor;
import com.yahoo.sketches.SketchesArgumentException;
import com.yahoo.sketches.Util;
/**
* The default Theta Sketch using the QuickSelect algorithm.
*
* This implementation uses data in a given Memory that is owned and managed by the caller.
* This Memory can be off-heap, which if managed properly will greatly reduce the need for
* the JVM to perform garbage collection.
*
* @author Lee Rhodes
* @author Kevin Lang
*/
final class DirectQuickSelectSketch extends UpdateSketch {
//Fill fraction passed to the threshold computation while the hash table is not yet larger
// than the nominal size (lgArrLongs <= lgNomLongs); see setHashTableThreshold().
private static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space
//These values are also in Memory and are also kept on-heap for speed.
private final int lgNomLongs_;
private final int preambleLongs_;
private final long seed_; //provided, kept only on heap, never serialized.
private final short seedHash_; //computed from seed_
//Retained-entry count above which hashUpdate() resizes or rebuilds the hash table.
private int hashTableThreshold_; //computed, kept only on heap, never serialized.
//Backing store. Not final: hashUpdate() replaces it when it obtains a larger Memory.
private Memory mem_;
/**
 * Initializes the on-heap configuration of the sketch. The backing Memory and the hash table
 * threshold are not set here; the static factory methods assign them after construction.
 *
 * @param lgNomLongs the requested lgNomLongs; values below MIN_LG_NOM_LONGS are raised to it.
 * @param seed the update hash seed, from which the seed hash is computed.
 * @param preambleLongs the number of preamble longs that precede the hash table in Memory.
 */
private DirectQuickSelectSketch(final int lgNomLongs, final long seed, final int preambleLongs) {
  preambleLongs_ = preambleLongs;
  seed_ = seed;
  seedHash_ = Util.computeSeedHash(seed);
  lgNomLongs_ = (lgNomLongs < MIN_LG_NOM_LONGS) ? MIN_LG_NOM_LONGS : lgNomLongs;
}
/**
 * Creates a new, empty sketch and initializes the given Memory as its backing store.
 *
 * <p>The preamble fields in bytes 0 through 23 are overwritten and the hash table region that
 * follows the preamble is cleared. This method does not itself write any preamble bytes beyond
 * those fields.</p>
 *
 * <p>A <i>lgNomLongs</i> smaller than MIN_LG_NOM_LONGS is raised to MIN_LG_NOM_LONGS, and that
 * effective value is the one stored in the preamble, used to size the hash table and used to
 * compute the threshold, so that the Memory image and the on-heap state always agree.</p>
 *
 * @param lgNomLongs the requested lgNomLongs.
 * @param seed the update hash seed. Only its seed hash is written to Memory.
 * @param p the sampling probability. The initial theta is
 * <code>(long)(p * MAX_THETA_LONG_AS_DOUBLE)</code>.
 * @param rf the resize factor. If <code>rf.lg()</code> is zero the hash table is created at its
 * full size (lgNomLongs + 1), otherwise it starts at MIN_LG_ARR_LONGS.
 * @param dstMem the given Memory object destination. It cannot be null and its capacity must be
 * at least <code>getMemBytes(lgArrLongs, preambleLongs)</code> for the initial table size.
 * @param unionGadget true if this sketch is implementing the Union gadget function, which
 * selects Family.UNION and its preamble size. Otherwise Family.QUICKSELECT is used.
 * @return instance of this sketch
 * @throws SketchesArgumentException if the capacity of dstMem is too small.
 */
static DirectQuickSelectSketch initNewDirectInstance(final int lgNomLongs, final long seed,
    final float p, final ResizeFactor rf, final Memory dstMem, final boolean unionGadget) {
  //The constructor clamps lgNomLongs for its on-heap copy. Clamp once here as well so that the
  // preamble, the initial table size and the threshold are all derived from the same value.
  final int effLgNomLongs = Math.max(lgNomLongs, MIN_LG_NOM_LONGS);

  //Choose family, preambleLongs
  final Family family = unionGadget ? Family.UNION : Family.QUICKSELECT;
  final int preambleLongs = family.getMinPreLongs();

  //Choose RF, minReqBytes, lgArrLongs.
  final int lgRF = rf.lg();
  final int lgArrLongs = (lgRF == 0) ? effLgNomLongs + 1 : MIN_LG_ARR_LONGS;
  final int minReqBytes = getMemBytes(lgArrLongs, preambleLongs);

  //Make sure Memory is large enough
  final long curMemCapBytes = dstMem.getCapacity();
  if (curMemCapBytes < minReqBytes) {
    throw new SketchesArgumentException(
        "Memory capacity is too small: " + curMemCapBytes + " < " + minReqBytes);
  }

  //@formatter:off
  //Build preamble
  final Object memObj = dstMem.array(); //may be null
  final long memAdd = dstMem.getCumulativeOffset(0L);

  insertPreLongs(memObj, memAdd, preambleLongs);                 //byte 0
  insertLgResizeFactor(memObj, memAdd, lgRF);                    //byte 0
  insertSerVer(memObj, memAdd, SER_VER);                         //byte 1
  insertFamilyID(memObj, memAdd, family.getID());                //byte 2
  insertLgNomLongs(memObj, memAdd, effLgNomLongs);               //byte 3
  insertLgArrLongs(memObj, memAdd, lgArrLongs);                  //byte 4
  //flags: bigEndian = readOnly = compact = ordered = false; empty = true : 00100 = 4
  insertFlags(memObj, memAdd, EMPTY_FLAG_MASK);                  //byte 5
  insertSeedHash(memObj, memAdd, Util.computeSeedHash(seed));    //bytes 6,7
  insertCurCount(memObj, memAdd, 0);                             //bytes 8-11
  insertP(memObj, memAdd, p);                                    //bytes 12-15
  final long thetaLong = (long)(p * MAX_THETA_LONG_AS_DOUBLE);
  insertThetaLong(memObj, memAdd, thetaLong);                    //bytes 16-23
  //@formatter:on

  //clear hash table area
  dstMem.clear(preambleLongs << 3, 8 << lgArrLongs);

  final DirectQuickSelectSketch dqss =
      new DirectQuickSelectSketch(effLgNomLongs, seed, preambleLongs);
  dqss.hashTableThreshold_ = setHashTableThreshold(effLgNomLongs, lgArrLongs);
  dqss.mem_ = dstMem;
  return dqss;
}
/**
 * Wraps a sketch around the given source Memory after validating its preamble.
 *
 * <p>The checks, in order: serialization version, family together with its preamble size,
 * lgNomLongs lower bound, flags, seed hash, Memory capacity, and the relation between theta, p
 * and the hash table size. The Memory is not copied; the returned sketch operates on it
 * directly.</p>
 *
 * @param srcMem the source Memory holding a sketch image in hash table form. Its flags must not
 * mark it as big-endian, compact, ordered or read-only.
 * @param seed the update hash seed. Its seed hash must match the one stored in srcMem.
 * @return instance of this sketch
 * @throws SketchesArgumentException if any of the preamble checks fails.
 */
static DirectQuickSelectSketch wrapInstance(final Memory srcMem, final long seed) {
  final int preLongs;
  final int serVer;
  final int famID;
  final int lgNom;
  final int lgArr;
  final int flags;
  final short seedHash;
  final float p;
  final long thetaLong;

  final boolean readOnlyHeap = srcMem.isReadOnly() && !srcMem.isDirect();
  if (!readOnlyHeap) {
    //read the preamble through the backing object and cumulative offset
    final Object base = srcMem.array(); //may be null
    final long add = srcMem.getCumulativeOffset(0L);
    preLongs = extractPreLongs(base, add);           //byte 0
    serVer = extractSerVer(base, add);               //byte 1
    famID = extractFamilyID(base, add);              //byte 2
    lgNom = extractLgNomLongs(base, add);            //byte 3
    lgArr = extractLgArrLongs(base, add);            //byte 4
    flags = extractFlags(base, add);                 //byte 5
    seedHash = (short)extractSeedHash(base, add);    //byte 6,7
    p = extractP(base, add);                         //bytes 12-15
    thetaLong = extractThetaLong(base, add);         //bytes 16-23
  } else {
    //read-only heap: read the preamble through the Memory accessors
    preLongs = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F;
    serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF;
    famID = srcMem.getByte(FAMILY_BYTE) & 0XFF;
    lgNom = srcMem.getByte(LG_NOM_LONGS_BYTE) & 0XFF;
    lgArr = srcMem.getByte(LG_ARR_LONGS_BYTE) & 0XFF;
    flags = srcMem.getByte(FLAGS_BYTE) & 0XFF;
    seedHash = srcMem.getShort(SEED_HASH_SHORT);
    p = srcMem.getFloat(P_FLOAT);
    thetaLong = srcMem.getLong(THETA_LONG);
  }

  if (serVer != SER_VER) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid Serialization Version: " + serVer);
  }

  //Only UNION and QUICKSELECT images are accepted, each with its own preamble size.
  final Family family = Family.idToFamily(famID);
  final boolean isUnion = family.equals(Family.UNION);
  if (!isUnion && !family.equals(Family.QUICKSELECT)) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid Family: " + family.toString());
  }
  if (isUnion && (preLongs != Family.UNION.getMinPreLongs())) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid PreambleLongs value for UNION: " + preLongs);
  }
  if (!isUnion && (preLongs != Family.QUICKSELECT.getMinPreLongs())) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preLongs);
  }

  if (lgNom < MIN_LG_NOM_LONGS) {
    throw new SketchesArgumentException(
        "Possible corruption: Current Memory lgNomLongs < min required size: "
            + lgNom + " < " + MIN_LG_NOM_LONGS);
  }

  final int disallowedFlags =
      ORDERED_FLAG_MASK | COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK | BIG_ENDIAN_FLAG_MASK;
  if ((flags & disallowedFlags) > 0) {
    throw new SketchesArgumentException(
        "Possible corruption: Input srcMem cannot be: big-endian, compact, ordered, or read-only");
  }

  Util.checkSeedHashes(seedHash, Util.computeSeedHash(seed));

  final int minReqBytes = getMemBytes(lgArr, preLongs);
  final long capBytes = srcMem.getCapacity();
  if (capBytes < minReqBytes) {
    throw new SketchesArgumentException(
        "Possible corruption: Current Memory size < min required size: "
            + capBytes + " < " + minReqBytes);
  }

  final double theta = thetaLong / MAX_THETA_LONG_AS_DOUBLE;
  if ((theta < p) && (lgArr <= lgNom)) {
    throw new SketchesArgumentException(
        "Possible corruption: Theta cannot be < p and lgArrLongs <= lgNomLongs. "
            + lgArr + " <= " + lgNom + ", Theta: " + theta + ", p: " + p);
  }

  final DirectQuickSelectSketch sketch = new DirectQuickSelectSketch(lgNom, seed, preLongs);
  sketch.mem_ = srcMem;
  sketch.hashTableThreshold_ = setHashTableThreshold(lgNom, lgArr);
  return sketch;
}
/**
 * Wraps a sketch around the given source Memory without any validation.
 *
 * <p>Only the preamble size, lgNomLongs and lgArrLongs are read from the Memory; nothing is
 * checked, including the seed. The Memory is not copied.</p>
 *
 * @param srcMem the source Memory holding a sketch image in hash table form.
 * @param seed the update hash seed to keep on-heap for this sketch.
 * @return instance of this sketch
 */
static DirectQuickSelectSketch fastWrap(final Memory srcMem, final long seed) {
  final int preLongs;
  final int lgNom;
  final int lgArr;

  final boolean readOnlyHeap = srcMem.isReadOnly() && !srcMem.isDirect();
  if (!readOnlyHeap) {
    final Object base = srcMem.array(); //may be null
    final long add = srcMem.getCumulativeOffset(0L);
    preLongs = extractPreLongs(base, add);   //byte 0
    lgNom = extractLgNomLongs(base, add);    //byte 3
    lgArr = extractLgArrLongs(base, add);    //byte 4
  } else { //Read-Only Heap
    preLongs = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F;
    lgNom = srcMem.getByte(LG_NOM_LONGS_BYTE) & 0XFF;
    lgArr = srcMem.getByte(LG_ARR_LONGS_BYTE) & 0XFF;
  }

  final DirectQuickSelectSketch sketch = new DirectQuickSelectSketch(lgNom, seed, preLongs);
  sketch.mem_ = srcMem;
  sketch.hashTableThreshold_ = setHashTableThreshold(lgNom, lgArr);
  return sketch;
}
//Sketch
/**
 * Returns the Family identified by the family ID byte stored in the backing Memory.
 */
@Override
public Family getFamily() {
  return Family.idToFamily(mem_.getByte(FAMILY_BYTE) & 0XFF);
}
/**
 * Returns the ResizeFactor corresponding to the lg resize factor stored in the backing Memory.
 */
@Override
public ResizeFactor getResizeFactor() {
  final int lgRF = getLgRF();
  return ResizeFactor.getRF(lgRF);
}
/**
 * Returns the retained entry count stored in the backing Memory.
 *
 * @param valid not used by this implementation; the stored count is returned either way.
 */
@Override
public int getRetainedEntries(final boolean valid) {
  final int curCount = mem_.getInt(RETAINED_ENTRIES_INT);
  return curCount;
}
/**
 * Always returns true for this sketch, whose state is kept in a caller-supplied Memory.
 */
@Override
public boolean isDirect() {
return true;
}
/**
 * Returns true if the empty flag is set in the flags byte of the backing Memory.
 */
@Override
public boolean isEmpty() {
  final int flags = mem_.getByte(FLAGS_BYTE);
  return (flags & EMPTY_FLAG_MASK) > 0;
}
/**
 * Returns a new byte array holding a copy of the preamble followed by the whole hash table,
 * taken from the start of the backing Memory.
 */
@Override
public byte[] toByteArray() { //the family ID is part of the copied preamble
  final int hashTableLongs = 1 << getLgArrLongs();
  final int totalBytes = (preambleLongs_ + hashTableLongs) << 3;
  final byte[] out = new byte[totalBytes];
  final Memory outMem = new NativeMemory(out);
  mem_.copy(0, outMem, 0, totalBytes);
  return out;
}
//UpdateSketch
/**
 * Runs quick-select-and-rebuild on the backing Memory when the retained entry count exceeds
 * <code>1 &lt;&lt; lgNomLongs</code>; otherwise does nothing.
 *
 * @return this sketch
 */
@Override
public UpdateSketch rebuild() {
  final int nominalEntries = 1 << lgNomLongs_;
  if (getRetainedEntries(true) > nominalEntries) {
    quickSelectAndRebuild(mem_, preambleLongs_, lgNomLongs_);
  }
  return this;
}
/**
 * Returns the sketch to its empty state in place.
 *
 * <p>The hash table is zeroed, the flags byte is set to "empty" only, the retained entry count
 * is set to zero and theta is set back to <code>(long)(p * MAX_THETA_LONG_AS_DOUBLE)</code>.
 * The hash table size (lgArrLongs) and the hash table threshold are left unchanged.</p>
 */
@Override
public final void reset() {
  //clear data array; its size in bytes is 8 * 2^lgArrLongs
  final int tableOffsetBytes = preambleLongs_ << 3;
  final int tableBytes = 8 << getLgArrLongs();
  mem_.clear(tableOffsetBytes, tableBytes);

  //flags: bigEndian = readOnly = compact = ordered = false; empty = true.
  mem_.putByte(FLAGS_BYTE, (byte) EMPTY_FLAG_MASK);
  mem_.putInt(RETAINED_ENTRIES_INT, 0);

  //theta returns to p
  final long resetThetaLong = (long) (getP() * MAX_THETA_LONG_AS_DOUBLE);
  mem_.putLong(THETA_LONG, resetThetaLong);
}
//restricted methods
/**
 * Returns the on-heap copy of the number of preamble longs given at construction.
 */
@Override
int getPreambleLongs() {
return preambleLongs_;
}
/**
 * Returns a new on-heap array holding a copy of the whole hash table, that is, the
 * <code>2^lgArrLongs</code> longs that follow the preamble in the backing Memory.
 */
@Override
long[] getCache() {
  final int lgArrLongs = getLgArrLongs();
  final long[] table = new long[1 << lgArrLongs];
  final Memory tableMem = new NativeMemory(table);
  mem_.copy(preambleLongs_ << 3, tableMem, 0, 8 << lgArrLongs);
  return table;
}
/**
 * Returns the on-heap lgNomLongs, which the constructor never lets fall below
 * MIN_LG_NOM_LONGS.
 */
@Override
int getLgNomLongs() {
return lgNomLongs_;
}
/**
 * Returns the Memory currently backing this sketch. This can be a different object from the
 * one originally supplied, because hashUpdate() replaces it when it obtains a larger Memory.
 */
@Override
Memory getMemory() {
return mem_;
}
/**
 * Returns the float stored at P_FLOAT in the backing Memory.
 */
@Override
float getP() {
return mem_.getFloat(P_FLOAT);
}
/**
 * Returns the seed given at construction. It is held on-heap only.
 */
@Override
long getSeed() {
return seed_;
}
/**
 * Returns the on-heap seed hash that the constructor computed from the seed.
 */
@Override
short getSeedHash() {
return seedHash_;
}
/**
 * Returns the long stored at THETA_LONG in the backing Memory.
 */
@Override
long getThetaLong() {
return mem_.getLong(THETA_LONG);
}
/**
 * Returns false unconditionally.
 */
@Override
boolean isDirty() {
return false; //Always false for QuickSelectSketch
}
/**
 * Returns the lgArrLongs byte stored in the backing Memory, read as an unsigned value.
 */
@Override
int getLgArrLongs() {
  final byte raw = mem_.getByte(LG_ARR_LONGS_BYTE);
  return raw & 0XFF;
}
/**
 * Offers one hash value to the sketch.
 *
 * <p>The empty flag is cleared as soon as the hash passes the corruption check, even if the
 * hash is subsequently rejected. After a successful insertion the retained entry count is
 * incremented and, if it exceeds the hash table threshold, the table is either rebuilt (when it
 * is already larger than the nominal size) or enlarged. Enlarging happens inside the current
 * Memory when it has room; otherwise a larger Memory is requested, the contents are moved to
 * it, the old Memory is freed through its MemoryRequest and the new one becomes the backing
 * store.</p>
 *
 * @param hash the hash value to insert.
 * @return RejectedOverTheta, RejectedDuplicate or InsertedCountIncremented.
 */
@Override
UpdateReturnState hashUpdate(final long hash) {
  HashOperations.checkHashCorruption(hash);

  //no longer empty, regardless of the outcome below
  final byte flags = mem_.getByte(FLAGS_BYTE);
  mem_.putByte(FLAGS_BYTE, (byte) (flags & ~EMPTY_FLAG_MASK));

  //The over-theta test
  final long thetaLong = getThetaLong();
  if (HashOperations.continueCondition(thetaLong, hash)) {
    return RejectedOverTheta; //signal that hash was rejected due to theta.
  }

  //The duplicate test
  final int lgArrLongs = getLgArrLongs();
  final int preBytes = preambleLongs_ << 3;
  final boolean readOnlyHeap = mem_.isReadOnly() && !mem_.isDirect();
  final int index = readOnlyHeap
      ? HashOperations.hashSearchOrInsert(mem_, lgArrLongs, hash, preBytes)
      : HashOperations.fastHashSearchOrInsert(
          mem_.array(), mem_.getCumulativeOffset(0L), lgArrLongs, hash, preBytes);
  if (index >= 0) {
    return RejectedDuplicate; //Duplicate, not inserted
  }

  //insertion occurred, increment curCount
  final int curCount = getRetainedEntries() + 1;
  mem_.putInt(RETAINED_ENTRIES_INT, curCount); //update curCount

  if (curCount <= hashTableThreshold_) { //still room, nothing more to do
    return InsertedCountIncremented;
  }

  //we need to do something, we are out of space
  if (lgArrLongs > lgNomLongs_) { //at full size, rebuild
    //Assumes no dirty values, changes thetaLong, curCount_
    assert (lgArrLongs == lgNomLongs_ + 1)
        : "lgArr: " + lgArrLongs + ", lgNom: " + lgNomLongs_;
    //rebuild, refresh curCount based on # values in the hashtable.
    quickSelectAndRebuild(mem_, preambleLongs_, lgNomLongs_);
    return InsertedCountIncremented;
  }

  //Not at full size, resize. Should not get here if lgRF = 0 and memCap is too small.
  final int maxLgArrLongs = lgNomLongs_ + 1;
  final int lgRF = getLgRF();
  final int actLgRF = actLgResizeFactor(mem_.getCapacity(), lgArrLongs, preambleLongs_, lgRF);
  final int tgtLgArrLongs;
  if (actLgRF > 0) { //Expand in current Memory
    //lgArrLongs will change; thetaLong, curCount will not
    tgtLgArrLongs = Math.min(lgArrLongs + actLgRF, maxLgArrLongs);
    resize(mem_, preambleLongs_, lgArrLongs, tgtLgArrLongs);
  }
  else {
    //Request more memory, then resize. lgArrLongs will change; thetaLong, curCount will not
    tgtLgArrLongs = Math.min(lgArrLongs + lgRF, maxLgArrLongs);
    final int reqBytes = (8 << tgtLgArrLongs) + preBytes;
    final Memory newDstMem = MemoryUtil.memoryRequestHandler(mem_, reqBytes, false);
    moveAndResize(mem_, preambleLongs_, lgArrLongs, newDstMem, tgtLgArrLongs, thetaLong);
    mem_.getMemoryRequest().free(mem_, newDstMem); //normal free mechanism via MemoryRequest
    mem_ = newDstMem;
  }
  //either way the table has a new size, so the threshold must follow
  hashTableThreshold_ = setHashTableThreshold(lgNomLongs_, tgtLgArrLongs);
  return InsertedCountIncremented;
}
//private methods
/**
 * Returns the 2-bit lg resize factor, which is stored in the bits of the preamble-longs byte
 * starting at LG_RESIZE_FACTOR_BIT.
 */
private int getLgRF() {
  final int byte0 = mem_.getByte(PREAMBLE_LONGS_BYTE);
  return (byte0 >>> LG_RESIZE_FACTOR_BIT) & 0X3;
}
/**
 * Computes the retained-entry limit for a hash table of the given size.
 *
 * <p>The limit is <code>floor(fraction * 2^lgArrLongs)</code>, where the fraction is
 * REBUILD_THRESHOLD when the table is larger than the nominal size and DQS_RESIZE_THRESHOLD
 * otherwise. Despite its name, this method only returns the value; it does not store it.</p>
 *
 * @param lgNomLongs the lgNomLongs of the sketch.
 * @param lgArrLongs the lgArrLongs of the hash table.
 * @return the hash table threshold
 */
private static final int setHashTableThreshold(final int lgNomLongs, final int lgArrLongs) {
  //FindBugs may complain if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, but this allows us
  // to tune these constants for different sketches.
  final double fraction;
  if (lgArrLongs > lgNomLongs) {
    fraction = REBUILD_THRESHOLD;
  } else {
    fraction = DQS_RESIZE_THRESHOLD;
  }
  final int arrLongs = 1 << lgArrLongs;
  return (int) Math.floor(fraction * arrLongs);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy