All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.theta.IntersectionImpl Maven / Gradle / Ivy

There is a newer version: 0.13.4
Show newest version
/*
 * Copyright 2016, Yahoo! Inc. Licensed under the terms of the
 * Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.theta;

import static com.yahoo.sketches.Family.objectToFamily;
import static com.yahoo.sketches.Util.MIN_LG_ARR_LONGS;
import static com.yahoo.sketches.Util.floorPowerOf2;
import static com.yahoo.sketches.theta.CompactSketch.compactCachePart;
import static com.yahoo.sketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.FLAGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.P_FLOAT;
import static com.yahoo.sketches.theta.PreambleUtil.RETAINED_ENTRIES_INT;
import static com.yahoo.sketches.theta.PreambleUtil.SEED_HASH_SHORT;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.THETA_LONG;
import static com.yahoo.sketches.theta.PreambleUtil.extractCurCount;
import static com.yahoo.sketches.theta.PreambleUtil.extractFamilyID;
import static com.yahoo.sketches.theta.PreambleUtil.extractFlags;
import static com.yahoo.sketches.theta.PreambleUtil.extractLgArrLongs;
import static com.yahoo.sketches.theta.PreambleUtil.extractPreLongs;
import static com.yahoo.sketches.theta.PreambleUtil.extractSeedHash;
import static com.yahoo.sketches.theta.PreambleUtil.extractSerVer;
import static com.yahoo.sketches.theta.PreambleUtil.insertCurCount;
import static com.yahoo.sketches.theta.PreambleUtil.insertFamilyID;
import static com.yahoo.sketches.theta.PreambleUtil.insertFlags;
import static com.yahoo.sketches.theta.PreambleUtil.insertLgArrLongs;
import static com.yahoo.sketches.theta.PreambleUtil.insertP;
import static com.yahoo.sketches.theta.PreambleUtil.insertPreLongs;
import static com.yahoo.sketches.theta.PreambleUtil.insertSeedHash;
import static com.yahoo.sketches.theta.PreambleUtil.insertSerVer;
import static java.lang.Math.min;

import java.util.Arrays;

import com.yahoo.memory.Memory;
import com.yahoo.memory.NativeMemory;
import com.yahoo.sketches.Family;
import com.yahoo.sketches.HashOperations;
import com.yahoo.sketches.SketchesArgumentException;
import com.yahoo.sketches.SketchesStateException;
import com.yahoo.sketches.Util;

/**
 * Intersection operation for Theta Sketches.
 *
 * 

This implementation uses data either on-heap or off-heap in a given Memory * that is owned and managed by the caller. * The off-heap Memory, which if managed properly will greatly reduce the need for * the JVM to perform garbage collection.

* * @author Lee Rhodes * @author Kevin Lang */ final class IntersectionImpl extends SetOperation implements Intersection { private final short seedHash_; //Note: Intersection does not use lgNomLongs or k, per se. private int lgArrLongs_; //current size of hash table private int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true private long thetaLong_; private boolean empty_; private long[] hashTable_ = null; //HT => Data. Only used On Heap private int maxLgArrLongs_ = 0; //max size of hash table. Only used Off Heap private Memory mem_ = null; //must be set by one of the factory methods. Only used Off Heap. private IntersectionImpl(short seedHash) { seedHash_ = seedHash; } /** * Construct a new Intersection target on the java heap. * * @param seed See Seed */ static IntersectionImpl initNewHeapInstance(long seed) { IntersectionImpl impl = new IntersectionImpl(computeSeedHash(seed)); impl.empty_ = false; //A virgin intersection represents the Universal Set so empty is FALSE! impl.curCount_ = -1; //Universal Set is true impl.thetaLong_ = Long.MAX_VALUE; impl.lgArrLongs_ = 0; impl.hashTable_ = null; impl.mem_ = null; //On the Heap return impl; } /** * Heapify an intersection target from a Memory image containing data. * @param srcMem The source Memory object. * See Memory * @param seed See seed */ static IntersectionImpl heapifyInstance(Memory srcMem, long seed) { int preLongs = CONST_PREAMBLE_LONGS; long[] preArr = new long[preLongs]; srcMem.getLongArray(0, preArr, 0, preLongs); long pre0 = preArr[0]; int preambleLongs = extractPreLongs(pre0); if (preambleLongs != CONST_PREAMBLE_LONGS) { throw new SketchesArgumentException("PreambleLongs must equal " + CONST_PREAMBLE_LONGS); } int serVer = extractSerVer(pre0); if (serVer != SER_VER) { throw new SketchesArgumentException("Ser Version must equal " + SER_VER); } int famID = extractFamilyID(pre0); Family.INTERSECTION.checkFamilyID(famID); short seedHash = computeSeedHash(seed); short seedHashMem = (short) extractSeedHash(pre0); Util.checkSeedHashes(seedHashMem, seedHash); //check for seed hash conflict IntersectionImpl impl = new IntersectionImpl(seedHash); //Note: Intersection does not use lgNomLongs or k, per se. impl.lgArrLongs_ = extractLgArrLongs(pre0); //current hash table size int flags = extractFlags(pre0); impl.empty_ = (flags & EMPTY_FLAG_MASK) > 0; impl.curCount_ = srcMem.getInt(RETAINED_ENTRIES_INT); impl.thetaLong_ = srcMem.getLong(THETA_LONG); if (impl.empty_) { if (impl.curCount_ != 0) { throw new SketchesArgumentException( "srcMem empty state inconsistent with curCount: " + impl.empty_ + "," + impl.curCount_); } //empty = true AND curCount_ = 0: OK } else { //empty = false, curCount could be anything if (impl.curCount_ > 0) { //can't be virgin, empty, or curCount == 0 impl.hashTable_ = new long[1 << impl.lgArrLongs_]; srcMem.getLongArray(CONST_PREAMBLE_LONGS << 3, impl.hashTable_, 0, 1 << impl.lgArrLongs_); } } impl.mem_ = null; //On the Heap return impl; } /** * Construct a new Intersection target direct to the given destination Memory. * Called by SetOperation.Builder. * * @param seed See Seed * @param dstMem destination Memory. * See Memory */ static IntersectionImpl initNewDirectInstance(long seed, Memory dstMem) { short seedHash = computeSeedHash(seed); IntersectionImpl impl = new IntersectionImpl(seedHash); int preLongs = CONST_PREAMBLE_LONGS; impl.maxLgArrLongs_ = checkMaxLgArrLongs(dstMem); //Only Off Heap //build preamble and cache together in single Memory, insert fields into memory in one step long[] preArr = new long[preLongs]; //becomes the preamble long pre0 = 0; pre0 = insertPreLongs(preLongs, pre0); //RF not used = 0 pre0 = insertSerVer(SER_VER, pre0); pre0 = insertFamilyID(Family.INTERSECTION.getID(), pre0); //Note: Intersection does not use lgNomLongs or k, per se. impl.lgArrLongs_ = MIN_LG_ARR_LONGS; //set initially to minimum, but don't clear cache in mem pre0 = insertLgArrLongs(MIN_LG_ARR_LONGS, pre0); //flags: bigEndian = readOnly = compact = ordered = false; impl.empty_ = false; pre0 = insertFlags(0, pre0); pre0 = insertSeedHash(seedHash, pre0); preArr[0] = pre0; long pre1 = 0; impl.curCount_ = -1; //set in mem below pre1 = insertCurCount(-1, pre1); pre1 = insertP((float) 1.0, pre1); preArr[1] = pre1; impl.thetaLong_ = Long.MAX_VALUE; preArr[2] = impl.thetaLong_; dstMem.putLongArray(0, preArr, 0, preLongs); //put into mem impl.mem_ = dstMem; //Off Heap return impl; } /** * Wrap an Intersection target around the given source Memory containing intersection data. * @param srcMem The source Memory image. * See Memory * @param seed See seed */ static IntersectionImpl wrapInstance(Memory srcMem, long seed) { int preLongs = CONST_PREAMBLE_LONGS; long[] preArr = new long[preLongs]; srcMem.getLongArray(0, preArr, 0, preLongs); long pre0 = preArr[0]; int preLongsMem = extractPreLongs(pre0); if (preLongsMem != CONST_PREAMBLE_LONGS) { throw new SketchesArgumentException("PreambleLongs must = 3."); } int serVer = extractSerVer(pre0); if (serVer != 3) { throw new SketchesArgumentException("Ser Version must = 3"); } int famID = extractFamilyID(pre0); Family.INTERSECTION.checkFamilyID(famID); short seedHash = computeSeedHash(seed); short seedHashMem = (short) extractSeedHash(pre0); Util.checkSeedHashes(seedHashMem, seedHash); //check for seed hash conflict IntersectionImpl impl = new IntersectionImpl(seedHash); //Note: Intersection does not use lgNomLongs or k, per se. impl.lgArrLongs_ = extractLgArrLongs(pre0); //current hash table size impl.maxLgArrLongs_ = checkMaxLgArrLongs(srcMem); //Only Off Heap, check for min size int flags = extractFlags(pre0); impl.empty_ = (flags & EMPTY_FLAG_MASK) > 0; impl.curCount_ = extractCurCount(preArr[1]); impl.thetaLong_ = preArr[2]; if (impl.empty_) { if (impl.curCount_ != 0) { throw new SketchesArgumentException( "srcMem empty state inconsistent with curCount: " + impl.empty_ + "," + impl.curCount_); } //empty = true AND curCount_ = 0: OK } //else empty = false, curCount could be anything impl.mem_ = srcMem; //Off Heap return impl; } @Override public void update(Sketch sketchIn) { if (sketchIn == null) { //null := Th = 1.0, count = 0, empty = true //Can't check the seedHash empty_ = setEmpty(true, mem_); //The Empty rule is OR if (curCount_ < 0) { //1st Call thetaLong_ = setThetaLong(Long.MAX_VALUE, mem_); } //else it is the Nth Call and theta stays the same curCount_ = setCurCount(0, mem_); return; } //The Intersection State Machine int sketchInEntries = sketchIn.getRetainedEntries(true); Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash()); thetaLong_ = setThetaLong(min(thetaLong_, sketchIn.getThetaLong()), mem_); //Theta rule empty_ = setEmpty(empty_ || sketchIn.isEmpty(), mem_); //Empty rule if ((curCount_ == 0) || (sketchInEntries == 0)) { //The 1st Call (curCount < 0) and sketchInEntries == 0. //The Nth Call (curCount == 0) and sketchInEntries == 0. //The Nth Call (curCount == 0) and sketchInEntries > 0. //The Nth Call (curCount > 0) and sketchInEntries == 0. //All future intersections result in zero data, but theta can still be reduced. curCount_ = setCurCount(0, mem_); hashTable_ = null; //No need for a HT. } else if (curCount_ < 0) { //virgin //The 1st Call (curCount < 0) and sketchInEntries > 0. Clone the incoming sketch curCount_ = setCurCount(sketchIn.getRetainedEntries(true), mem_); int requiredLgArrLongs = computeMinLgArrLongsFromCount(curCount_); int priorLgArrLongs = lgArrLongs_; //only used in error message lgArrLongs_ = setLgArrLongs(requiredLgArrLongs, mem_); if (mem_ != null) { //Off heap, check if current dstMem is large enough if (requiredLgArrLongs <= maxLgArrLongs_) { //OK mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); } else { //not enough space in dstMem //TODO move to request model? throw new SketchesArgumentException( "Insufficient dstMem hash table space: " + (1 << requiredLgArrLongs) + " > " + (1 << priorLgArrLongs)); } } else { //On the heap, allocate a HT hashTable_ = new long[1 << lgArrLongs_]; } moveDataToTgt(sketchIn.getCache(), curCount_); } else { //curCount > 0 //The Nth Call (curCount > 0) and sketchInEntries > 0. //Must perform full intersect //Sets resulting hashTable, curCount and adjusts lgArrLongs performIntersect(sketchIn); } } @Override public CompactSketch getResult(boolean dstOrdered, Memory dstMem) { if (curCount_ < 0) { throw new SketchesStateException( "Calling getResult() with no intervening intersections is not a legal result."); } long[] compactCacheR; if (curCount_ == 0) { compactCacheR = new long[0]; return CompactSketch.createCompactSketch( compactCacheR, empty_, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); } //else curCount > 0 long[] hashTable; if (mem_ != null) { int htLen = 1 << lgArrLongs_; hashTable = new long[htLen]; mem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen); } else { hashTable = hashTable_; } compactCacheR = compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered); //Create the CompactSketch return CompactSketch.createCompactSketch( compactCacheR, empty_, seedHash_, curCount_, thetaLong_, dstOrdered, dstMem); } @Override public CompactSketch getResult() { return getResult(true, null); } @Override public boolean hasResult() { return (mem_ != null) ? mem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0; } @Override public byte[] toByteArray() { int preBytes = CONST_PREAMBLE_LONGS << 3; int dataBytes = (curCount_ > 0) ? 8 << lgArrLongs_ : 0; byte[] byteArrOut = new byte[preBytes + dataBytes]; if (mem_ != null) { mem_.getByteArray(0, byteArrOut, 0, preBytes + dataBytes); } else { NativeMemory memOut = new NativeMemory(byteArrOut); //preamble memOut.putByte(PREAMBLE_LONGS_BYTE, (byte) CONST_PREAMBLE_LONGS); //RF not used = 0 memOut.putByte(SER_VER_BYTE, (byte) SER_VER); memOut.putByte(FAMILY_BYTE, (byte) objectToFamily(this).getID()); memOut.putByte(LG_NOM_LONGS_BYTE, (byte) 0); //not used memOut.putByte(LG_ARR_LONGS_BYTE, (byte) lgArrLongs_); if (empty_) { memOut.setBits(FLAGS_BYTE, (byte) EMPTY_FLAG_MASK); } else { memOut.clearBits(FLAGS_BYTE, (byte) EMPTY_FLAG_MASK); } memOut.putShort(SEED_HASH_SHORT, seedHash_); memOut.putInt(RETAINED_ENTRIES_INT, curCount_); memOut.putFloat(P_FLOAT, (float) 1.0); memOut.putLong(THETA_LONG, thetaLong_); //data if (curCount_ > 0) { memOut.putLongArray(preBytes, hashTable_, 0, 1 << lgArrLongs_); } } return byteArrOut; } @Override public void reset() { lgArrLongs_ = setLgArrLongs(lgArrLongs_, mem_); curCount_ = setCurCount(-1, mem_); //Universal Set is true thetaLong_ = setThetaLong(Long.MAX_VALUE, mem_); empty_ = setEmpty(false, mem_); hashTable_ = null; } @Override public Family getFamily() { return Family.INTERSECTION; } //restricted private void performIntersect(Sketch sketchIn) { // curCount and input data are nonzero, match against HT assert ((curCount_ > 0) && (!empty_)); long[] cacheIn = sketchIn.getCache(); int arrLongsIn = cacheIn.length; long[] hashTable; if (mem_ != null) { int htLen = 1 << lgArrLongs_; hashTable = new long[htLen]; mem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen); } else { hashTable = hashTable_; } //allocate space for matching long[] matchSet = new long[ min(curCount_, sketchIn.getRetainedEntries(true)) ]; int matchSetCount = 0; if (sketchIn.isOrdered()) { //ordered compact, which enables early stop for (int i = 0; i < arrLongsIn; i++ ) { long hashIn = cacheIn[i]; //if (hashIn <= 0L) continue; //<=0 should not happen if (hashIn >= thetaLong_) { break; //early stop assumes that hashes in input sketch are ordered! } int foundIdx = HashOperations.hashSearch(hashTable, lgArrLongs_, hashIn); if (foundIdx == -1) { continue; } matchSet[matchSetCount++] = hashIn; } } else { //either unordered compact or hash table for (int i = 0; i < arrLongsIn; i++ ) { long hashIn = cacheIn[i]; if ((hashIn <= 0L) || (hashIn >= thetaLong_)) { continue; } int foundIdx = HashOperations.hashSearch(hashTable, lgArrLongs_, hashIn); if (foundIdx == -1) { continue; } matchSet[matchSetCount++] = hashIn; } } //reduce effective array size to minimum lgArrLongs_ = setLgArrLongs(computeMinLgArrLongsFromCount(curCount_), mem_); curCount_ = setCurCount(matchSetCount, mem_); if (mem_ != null) { mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear for rebuild } else { Arrays.fill(hashTable_, 0, 1 << lgArrLongs_, 0L); //clear for rebuild } //move matchSet to target moveDataToTgt(matchSet, matchSetCount); } private void moveDataToTgt(long[] arr, int count) { int arrLongsIn = arr.length; int tmpCnt = 0; if (mem_ != null) { //Off Heap puts directly into mem int preBytes = CONST_PREAMBLE_LONGS << 3; for (int i = 0; i < arrLongsIn; i++ ) { long hashIn = arr[i]; if (HashOperations.continueCondition(thetaLong_, hashIn)) { continue; } HashOperations.hashInsertOnly(mem_, lgArrLongs_, hashIn, preBytes); tmpCnt++; } } else { //On Heap. Assumes HT exists and is large enough for (int i = 0; i < arrLongsIn; i++ ) { long hashIn = arr[i]; if (HashOperations.continueCondition(thetaLong_, hashIn)) { continue; } HashOperations.hashInsertOnly(hashTable_, lgArrLongs_, hashIn); tmpCnt++; } } assert (tmpCnt == count) : "Intersection Count Check: got: " + tmpCnt + ", expected: " + count; } //special handlers for Off Heap /** * Returns the correct maximum lgArrLongs given the capacity of the Memory. Checks that the * capacity is large enough for the minimum sized hash table. * @param dstMem the given Memory * @return the correct maximum lgArrLongs given the capacity of the Memory */ private static final int checkMaxLgArrLongs(Memory dstMem) { int preBytes = CONST_PREAMBLE_LONGS << 3; long cap = dstMem.getCapacity(); int maxLgArrLongs = Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3); if (maxLgArrLongs < MIN_LG_ARR_LONGS) { throw new SketchesArgumentException( "dstMem not large enough for minimum sized hash table: " + cap); } return maxLgArrLongs; } private static final boolean setEmpty(boolean empty, Memory mem) { if (mem != null) { if (empty) { mem.setBits(FLAGS_BYTE, (byte) EMPTY_FLAG_MASK); } else { mem.clearBits(FLAGS_BYTE, (byte)EMPTY_FLAG_MASK); } } return empty; } private static final int setLgArrLongs(int lgArrLongs, Memory mem) { if (mem != null) { mem.putByte(LG_ARR_LONGS_BYTE, (byte) lgArrLongs); } return lgArrLongs; } private static final long setThetaLong(long thetaLong, Memory mem) { if (mem != null) { mem.putLong(THETA_LONG, thetaLong); } return thetaLong; } private static final int setCurCount(int curCount, Memory mem) { if (mem != null) { mem.putInt(RETAINED_ENTRIES_INT, curCount); } return curCount; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy