All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.theta.IntersectionImpl Maven / Gradle / Ivy

/*
 * Copyright 2016, Yahoo! Inc. Licensed under the terms of the
 * Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.theta;

import static com.yahoo.sketches.Family.objectToFamily;
import static com.yahoo.sketches.Util.MIN_LG_ARR_LONGS;
import static com.yahoo.sketches.Util.floorPowerOf2;
import static com.yahoo.sketches.theta.CompactSketch.compactCachePart;
import static com.yahoo.sketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.FLAGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.P_FLOAT;
import static com.yahoo.sketches.theta.PreambleUtil.RETAINED_ENTRIES_INT;
import static com.yahoo.sketches.theta.PreambleUtil.SEED_HASH_SHORT;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG;
import static com.yahoo.sketches.theta.PreambleUtil.clearEmpty;
import static com.yahoo.sketches.theta.PreambleUtil.extractCurCount;
import static com.yahoo.sketches.theta.PreambleUtil.extractFamilyID;
import static com.yahoo.sketches.theta.PreambleUtil.extractFlags;
import static com.yahoo.sketches.theta.PreambleUtil.extractLgArrLongs;
import static com.yahoo.sketches.theta.PreambleUtil.extractPreLongs;
import static com.yahoo.sketches.theta.PreambleUtil.extractSerVer;
import static com.yahoo.sketches.theta.PreambleUtil.extractThetaLong;
import static com.yahoo.sketches.theta.PreambleUtil.insertCurCount;
import static com.yahoo.sketches.theta.PreambleUtil.insertFamilyID;
import static com.yahoo.sketches.theta.PreambleUtil.insertFlags;
import static com.yahoo.sketches.theta.PreambleUtil.insertLgArrLongs;
import static com.yahoo.sketches.theta.PreambleUtil.insertP;
import static com.yahoo.sketches.theta.PreambleUtil.insertPreLongs;
import static com.yahoo.sketches.theta.PreambleUtil.insertSerVer;
import static com.yahoo.sketches.theta.PreambleUtil.insertThetaLong;
import static java.lang.Math.min;

import java.util.Arrays;

import com.yahoo.memory.Memory;
import com.yahoo.memory.NativeMemory;
import com.yahoo.sketches.Family;
import com.yahoo.sketches.HashOperations;
import com.yahoo.sketches.SketchesArgumentException;
import com.yahoo.sketches.SketchesStateException;
import com.yahoo.sketches.Util;

/**
 * Intersection operation for Theta Sketches.
 *
 * 

This implementation uses data either on-heap or off-heap in a given Memory * that is owned and managed by the caller. * The off-heap Memory, which if managed properly will greatly reduce the need for * the JVM to perform garbage collection.

* * @author Lee Rhodes * @author Kevin Lang */ final class IntersectionImpl extends SetOperation implements Intersection { private final short seedHash_; private final Memory mem_; //Note: Intersection does not use lgNomLongs or k, per se. private int lgArrLongs_; //current size of hash table private int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true private long thetaLong_; private boolean empty_; private long[] hashTable_ = null; //HT => Data. Only used On Heap private int maxLgArrLongs_ = 0; //max size of hash table. Only used Off Heap private IntersectionImpl(final Memory mem, final long seed, final boolean newMem) { mem_ = mem; if (mem != null) { if (newMem) { seedHash_ = computeSeedHash(seed); mem_.putShort(SEED_HASH_SHORT, seedHash_); } else { seedHash_ = mem_.getShort(SEED_HASH_SHORT); Util.checkSeedHashes(seedHash_, computeSeedHash(seed)); //check for seed hash conflict } } else { seedHash_ = computeSeedHash(seed); } } /** * Construct a new Intersection target on the java heap. * * @param seed See Seed */ static IntersectionImpl initNewHeapInstance(final long seed) { final IntersectionImpl impl = new IntersectionImpl(null, seed, false); impl.lgArrLongs_ = 0; impl.curCount_ = -1; //Universal Set is true impl.thetaLong_ = Long.MAX_VALUE; impl.empty_ = false; //A virgin intersection represents the Universal Set so empty is FALSE! impl.hashTable_ = null; return impl; } /** * Heapify an intersection target from a Memory image containing data. * @param srcMem The source Memory object. * See Memory * @param seed See seed */ static IntersectionImpl heapifyInstance(final Memory srcMem, final long seed) { final IntersectionImpl impl = new IntersectionImpl(null, seed, false); //Get Preamble //Note: Intersection does not use lgNomLongs (or k), per se. //seedHash loaded and checked in private constructor final int preLongsMem; final int serVer; final int famID; final int lgArrLongs; //current hash table size final int flags; final int curCount; final long thetaLong; if (srcMem.isReadOnly() && !srcMem.isDirect()) { preLongsMem = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF; famID = srcMem.getByte(FAMILY_BYTE) & 0XFF; lgArrLongs = srcMem.getByte(LG_ARR_LONGS_BYTE) & 0XFF; flags = srcMem.getByte(FLAGS_BYTE) & 0XFF; curCount = srcMem.getInt(RETAINED_ENTRIES_INT); thetaLong = srcMem.getLong(THETA_LONG); } else { final Object memObj = srcMem.array(); //may be null final long memAdd = srcMem.getCumulativeOffset(0L); preLongsMem = extractPreLongs(memObj, memAdd); serVer = extractSerVer(memObj, memAdd); famID = extractFamilyID(memObj, memAdd); lgArrLongs = extractLgArrLongs(memObj, memAdd); flags = extractFlags(memObj, memAdd); curCount = extractCurCount(memObj, memAdd); thetaLong = extractThetaLong(memObj, memAdd); } final boolean empty = (flags & EMPTY_FLAG_MASK) > 0; //Checks if (preLongsMem != CONST_PREAMBLE_LONGS) { throw new SketchesArgumentException( "Memory PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongsMem); } if (serVer != SER_VER) { throw new SketchesArgumentException("Serialization Version must equal " + SER_VER); } Family.INTERSECTION.checkFamilyID(famID); if (empty) { if (curCount != 0) { throw new SketchesArgumentException( "srcMem empty state inconsistent with curCount: " + empty + "," + curCount); } //empty = true AND curCount_ = 0: OK } //Initialize impl.lgArrLongs_ = lgArrLongs; impl.curCount_ = curCount; impl.thetaLong_ = thetaLong; impl.empty_ = empty; if (!empty) { if (curCount > 0) { //can't be virgin, empty, or curCount == 0 impl.hashTable_ = new long[1 << lgArrLongs]; srcMem.getLongArray(CONST_PREAMBLE_LONGS << 3, impl.hashTable_, 0, 1 << lgArrLongs); } } return impl; } /** * Construct a new Intersection target direct to the given destination Memory. * Called by SetOperation.Builder. * * @param seed See Seed * @param dstMem destination Memory. * See Memory */ static IntersectionImpl initNewDirectInstance(final long seed, final Memory dstMem) { final IntersectionImpl impl = new IntersectionImpl(dstMem, seed, true); final Object memObj = dstMem.array(); final long memAdd = dstMem.getCumulativeOffset(0L); //Load Preamble insertPreLongs(memObj, memAdd, CONST_PREAMBLE_LONGS); //RF not used = 0 insertSerVer(memObj, memAdd, SER_VER); insertFamilyID(memObj, memAdd, Family.INTERSECTION.getID()); //Note: Intersection does not use lgNomLongs or k, per se. //set lgArrLongs initially to minimum. Don't clear cache in mem insertLgArrLongs(memObj, memAdd, MIN_LG_ARR_LONGS); insertFlags(memObj, memAdd, 0); //bigEndian = readOnly = compact = ordered = empty = false; //seedHash loaded and checked in private constructor insertCurCount(memObj, memAdd, -1); insertP(memObj, memAdd, (float) 1.0); insertThetaLong(memObj, memAdd, Long.MAX_VALUE); //Initialize impl.lgArrLongs_ = MIN_LG_ARR_LONGS; impl.curCount_ = -1; //set in mem below impl.thetaLong_ = Long.MAX_VALUE; impl.empty_ = false; impl.maxLgArrLongs_ = checkMaxLgArrLongs(dstMem); //Only Off Heap return impl; } /** * Wrap an Intersection target around the given source Memory containing intersection data. * @param srcMem The source Memory image. * See Memory * @param seed See seed */ static IntersectionImpl wrapInstance(final Memory srcMem, final long seed) { final IntersectionImpl impl = new IntersectionImpl(srcMem, seed, false); //Get Preamble //Note: Intersection does not use lgNomLongs (or k), per se. //seedHash loaded and checked in private constructor final int preLongsMem; final int serVer; final int famID; final int lgArrLongs; //current hash table size final int flags; final int curCount; final long thetaLong; if (srcMem.isReadOnly() && !srcMem.isDirect()) { preLongsMem = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF; famID = srcMem.getByte(FAMILY_BYTE) & 0XFF; lgArrLongs = srcMem.getByte(LG_ARR_LONGS_BYTE) & 0XFF; flags = srcMem.getByte(FLAGS_BYTE) & 0XFF; curCount = srcMem.getInt(RETAINED_ENTRIES_INT); thetaLong = srcMem.getLong(THETA_LONG); } else { final Object memObj = srcMem.array(); //may be null final long memAdd = srcMem.getCumulativeOffset(0L); preLongsMem = extractPreLongs(memObj, memAdd); serVer = extractSerVer(memObj, memAdd); famID = extractFamilyID(memObj, memAdd); lgArrLongs = extractLgArrLongs(memObj, memAdd); flags = extractFlags(memObj, memAdd); curCount = extractCurCount(memObj, memAdd); thetaLong = extractThetaLong(memObj, memAdd); } final boolean empty = (flags & EMPTY_FLAG_MASK) > 0; //Checks if (preLongsMem != CONST_PREAMBLE_LONGS) { throw new SketchesArgumentException( "Memory PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongsMem); } if (serVer != SER_VER) { throw new SketchesArgumentException("Serialization Version must equal " + SER_VER); } Family.INTERSECTION.checkFamilyID(famID); if (empty) { if (curCount != 0) { throw new SketchesArgumentException( "srcMem empty state inconsistent with curCount: " + empty + "," + curCount); } //empty = true AND curCount_ = 0: OK } //else empty = false, curCount could be anything //Initialize impl.lgArrLongs_ = lgArrLongs; impl.curCount_ = curCount; impl.thetaLong_ = thetaLong; impl.empty_ = empty; impl.maxLgArrLongs_ = checkMaxLgArrLongs(srcMem); //Only Off Heap, check for min size return impl; } @Override public void update(final Sketch sketchIn) { final boolean firstCall = curCount_ < 0; final Object memObj = mem_ != null ? mem_.array() : null; final long memAdd = mem_ != null ? mem_.getCumulativeOffset(0) : 0; //Corner cases if (sketchIn == null) { //null -> Th = 1.0, count = 0, empty = true //No seedHash to check //Beause of the def of null above and the Empty Rule (which is OR) empty_ must be null. empty_ = true; thetaLong_ = firstCall ? Long.MAX_VALUE : thetaLong_; //if Nth call, stays the same curCount_ = 0; if (mem_ != null) { PreambleUtil.setEmpty(memObj, memAdd); insertThetaLong(memObj, memAdd, thetaLong_); insertCurCount(memObj, memAdd, 0); } return; } //Checks Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule empty_ = empty_ || sketchIn.isEmpty(); //Empty rule if (mem_ != null) { insertThetaLong(memObj, memAdd, thetaLong_); if (empty_) { PreambleUtil.setEmpty(memObj, memAdd); } else { clearEmpty(memObj, memAdd); } } final int sketchInEntries = sketchIn.getRetainedEntries(true); // The truth table for the following state machine for corner cases: // Case CurCount SketchInEntries | Actions // 1 <0 0 | CurCount = 0; HT = null; exit // 2 0 0 | CurCount = 0; HT = null; exit // 3 >0 0 | CurCount = 0; HT = null; exit // 4 <0 >0 | Clone SketchIn; exit // 5 0 >0 | CurCount = 0; HT = null; exit // 6 >0 >0 | Perform full intersect if ((curCount_ == 0) || (sketchInEntries == 0)) { //Cases 1,2,3,5 //All future intersections result in zero data, but theta can still be reduced. curCount_ = 0; if (mem_ != null) { insertCurCount(memObj, memAdd, 0); } hashTable_ = null; //No need for a HT. Don't bother clearing mem if valid } else if (firstCall) { //Case 4: Clone the incoming sketch curCount_ = sketchIn.getRetainedEntries(true); final int requiredLgArrLongs = computeMinLgArrLongsFromCount(curCount_); final int priorLgArrLongs = lgArrLongs_; //prior only used in error message lgArrLongs_ = requiredLgArrLongs; if (mem_ != null) { //Off heap, check if current dstMem is large enough insertCurCount(memObj, memAdd, curCount_); insertLgArrLongs(memObj, memAdd, lgArrLongs_); if (requiredLgArrLongs <= maxLgArrLongs_) { //OK mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear only what required } else { //not enough space in dstMem //TODO move to request model? throw new SketchesArgumentException( "Insufficient dstMem hash table space: " + (1 << requiredLgArrLongs) + " > " + (1 << priorLgArrLongs)); } } else { //On the heap, allocate a HT hashTable_ = new long[1 << lgArrLongs_]; } moveDataToTgt(sketchIn.getCache(), curCount_); } else { //Case 6: Perform full intersect //Sets resulting hashTable, curCount and adjusts lgArrLongs performIntersect(sketchIn); } } @Override public CompactSketch getResult(final boolean dstOrdered, final Memory dstMem) { if (curCount_ < 0) { throw new SketchesStateException( "Calling getResult() with no intervening intersections is not a legal result."); } long[] compactCacheR; if (curCount_ == 0) { compactCacheR = new long[0]; return CompactSketch.createCompactSketch( compactCacheR, empty_, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); } //else curCount > 0 final long[] hashTable; if (mem_ != null) { final int htLen = 1 << lgArrLongs_; hashTable = new long[htLen]; mem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen); } else { hashTable = hashTable_; } compactCacheR = compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered); //Create the CompactSketch return CompactSketch.createCompactSketch( compactCacheR, empty_, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); } @Override public CompactSketch getResult() { return getResult(true, null); } @Override public boolean hasResult() { return (mem_ != null) ? mem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0; } @Override public byte[] toByteArray() { final int preBytes = CONST_PREAMBLE_LONGS << 3; final int dataBytes = (curCount_ > 0) ? 8 << lgArrLongs_ : 0; final byte[] byteArrOut = new byte[preBytes + dataBytes]; if (mem_ != null) { mem_.getByteArray(0, byteArrOut, 0, preBytes + dataBytes); } else { final NativeMemory memOut = new NativeMemory(byteArrOut); //preamble memOut.putByte(PREAMBLE_LONGS_BYTE, (byte) CONST_PREAMBLE_LONGS); //RF not used = 0 memOut.putByte(SER_VER_BYTE, (byte) SER_VER); memOut.putByte(FAMILY_BYTE, (byte) objectToFamily(this).getID()); memOut.putByte(LG_NOM_LONGS_BYTE, (byte) 0); //not used memOut.putByte(LG_ARR_LONGS_BYTE, (byte) lgArrLongs_); if (empty_) { memOut.setBits(FLAGS_BYTE, (byte) EMPTY_FLAG_MASK); } else { memOut.clearBits(FLAGS_BYTE, (byte) EMPTY_FLAG_MASK); } memOut.putShort(SEED_HASH_SHORT, seedHash_); memOut.putInt(RETAINED_ENTRIES_INT, curCount_); memOut.putFloat(P_FLOAT, (float) 1.0); memOut.putLong(THETA_LONG, thetaLong_); //data if (curCount_ > 0) { memOut.putLongArray(preBytes, hashTable_, 0, 1 << lgArrLongs_); } } return byteArrOut; } @Override public void reset() { curCount_ = -1; thetaLong_ = Long.MAX_VALUE; empty_ = false; hashTable_ = null; if (mem_ != null) { final Object memObj = mem_.array(); //may be null final long memAdd = mem_.getCumulativeOffset(0); insertLgArrLongs(memObj, memAdd, lgArrLongs_); //make sure insertCurCount(memObj, memAdd, -1); insertThetaLong(memObj, memAdd, Long.MAX_VALUE); clearEmpty(memObj, memAdd); } } @Override public Family getFamily() { return Family.INTERSECTION; } //restricted private void performIntersect(final Sketch sketchIn) { // curCount and input data are nonzero, match against HT assert ((curCount_ > 0) && (!empty_)); final long[] cacheIn = sketchIn.getCache(); final int arrLongsIn = cacheIn.length; final long[] hashTable; if (mem_ != null) { final int htLen = 1 << lgArrLongs_; hashTable = new long[htLen]; mem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen); } else { hashTable = hashTable_; } //allocate space for matching final long[] matchSet = new long[ min(curCount_, sketchIn.getRetainedEntries(true)) ]; int matchSetCount = 0; if (sketchIn.isOrdered()) { //ordered compact, which enables early stop for (int i = 0; i < arrLongsIn; i++ ) { final long hashIn = cacheIn[i]; //if (hashIn <= 0L) continue; //<= 0 should not happen if (hashIn >= thetaLong_) { break; //early stop assumes that hashes in input sketch are ordered! } final int foundIdx = HashOperations.hashSearch(hashTable, lgArrLongs_, hashIn); if (foundIdx == -1) { continue; } matchSet[matchSetCount++] = hashIn; } } else { //either unordered compact or hash table for (int i = 0; i < arrLongsIn; i++ ) { final long hashIn = cacheIn[i]; if ((hashIn <= 0L) || (hashIn >= thetaLong_)) { continue; } final int foundIdx = HashOperations.hashSearch(hashTable, lgArrLongs_, hashIn); if (foundIdx == -1) { continue; } matchSet[matchSetCount++] = hashIn; } } //reduce effective array size to minimum curCount_ = matchSetCount; lgArrLongs_ = computeMinLgArrLongsFromCount(matchSetCount); if (mem_ != null) { final Object memObj = mem_.array(); //may be null final long memAdd = mem_.getCumulativeOffset(0); insertCurCount(memObj, memAdd, matchSetCount); insertLgArrLongs(memObj, memAdd, lgArrLongs_); mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear for rebuild } else { Arrays.fill(hashTable_, 0, 1 << lgArrLongs_, 0L); //clear for rebuild } //move matchSet to target moveDataToTgt(matchSet, matchSetCount); } private void moveDataToTgt(final long[] arr, final int count) { final int arrLongsIn = arr.length; int tmpCnt = 0; if (mem_ != null) { //Off Heap puts directly into mem final Object memObj = mem_.array(); //may be null final long memAdd = mem_.getCumulativeOffset(0); final int preBytes = CONST_PREAMBLE_LONGS << 3; final int lgArrLongs = lgArrLongs_; final long thetaLong = thetaLong_; for (int i = 0; i < arrLongsIn; i++ ) { final long hashIn = arr[i]; if (HashOperations.continueCondition(thetaLong, hashIn)) { continue; } HashOperations.fastHashInsertOnly(memObj, memAdd, lgArrLongs, hashIn, preBytes); tmpCnt++; } } else { //On Heap. Assumes HT exists and is large enough for (int i = 0; i < arrLongsIn; i++ ) { final long hashIn = arr[i]; if (HashOperations.continueCondition(thetaLong_, hashIn)) { continue; } HashOperations.hashInsertOnly(hashTable_, lgArrLongs_, hashIn); tmpCnt++; } } assert (tmpCnt == count) : "Intersection Count Check: got: " + tmpCnt + ", expected: " + count; } //special handlers for Off Heap /** * Returns the correct maximum lgArrLongs given the capacity of the Memory. Checks that the * capacity is large enough for the minimum sized hash table. * @param dstMem the given Memory * @return the correct maximum lgArrLongs given the capacity of the Memory */ private static final int checkMaxLgArrLongs(final Memory dstMem) { final int preBytes = CONST_PREAMBLE_LONGS << 3; final long cap = dstMem.getCapacity(); final int maxLgArrLongs = Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3); if (maxLgArrLongs < MIN_LG_ARR_LONGS) { throw new SketchesArgumentException( "dstMem not large enough for minimum sized hash table: " + cap); } return maxLgArrLongs; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy