// org.apache.datasketches.theta.UpdateSketch Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.theta;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.apache.datasketches.theta.CompactSketch.compactCache;
import static org.apache.datasketches.theta.CompactSketch.loadCompactMemory;
import static org.apache.datasketches.theta.PreambleUtil.BIG_ENDIAN_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.MAX_THETA_LONG_AS_DOUBLE;
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
import static org.apache.datasketches.theta.PreambleUtil.extractP;
import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.PreambleUtil.getMemBytes;
import static org.apache.datasketches.theta.UpdateReturnState.RejectedNullOrEmpty;
import org.apache.datasketches.Family;
import org.apache.datasketches.ResizeFactor;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.Util;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;
/**
* The parent class for the Update Sketch families, such as QuickSelect and Alpha.
* The primary task of an Update Sketch is to consider datums presented via the update() methods
* for inclusion in its internal cache. This is the sketch building process.
*
* @author Lee Rhodes
*/
public abstract class UpdateSketch extends Sketch {
/**
* No-argument constructor with an empty body. It carries no access modifier, so it is
* package-private.
*/
UpdateSketch() {}
/**
 * Wraps the given sketch image in Memory and refers to it directly, without copying the data
 * onto the Java heap. Only "Direct" Serialization Version 3 (i.e., OpenSource) sketches that
 * have been explicitly stored as direct objects can be wrapped.
 *
 * This overload supplies the {@link Util#DEFAULT_UPDATE_SEED} as the seed and delegates to
 * {@link #wrap(WritableMemory, long)}.
 *
 * @param srcMem an image of a Sketch where the image seed hash matches the default seed hash.
 * @return an UpdateSketch backed by the given Memory
 */
public static UpdateSketch wrap(final WritableMemory srcMem) {
  final long defaultSeed = DEFAULT_UPDATE_SEED;
  return wrap(srcMem, defaultSeed);
}
/**
 * Wraps the given sketch image in Memory and refers to it directly, without copying the data
 * onto the Java heap. Only QUICKSELECT family images with Serialization Version 3 and
 * three preamble longs are accepted; anything else is rejected with an exception rather than
 * being converted.
 *
 * @param srcMem an image of a Sketch where the image seed hash matches the given seed hash.
 * @param seed the update hash seed, passed through to
 * {@code DirectQuickSelectSketch.writableWrap}.
 * @return an UpdateSketch backed by the given Memory
 * @throws SketchesArgumentException if the family of the image is not QUICKSELECT, or if the
 * image does not have SerVer = 3 and preLongs = 3.
 */
public static UpdateSketch wrap(final WritableMemory srcMem, final long seed) {
  // Read the three preamble bytes up front, in the same order as they are validated below.
  final int preambleLongs = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F; // low 6 bits only
  final int serialVersion = srcMem.getByte(SER_VER_BYTE) & 0XFF;
  final int familyId = srcMem.getByte(FAMILY_BYTE) & 0XFF;

  final Family family = Family.idToFamily(familyId);
  if (family != Family.QUICKSELECT) {
    throw new SketchesArgumentException(
        "A " + family + " sketch cannot be wrapped as an UpdateSketch.");
  }

  final boolean wrappable = (serialVersion == 3) && (preambleLongs == 3);
  if (!wrappable) {
    throw new SketchesArgumentException(
        "Corrupted: An UpdateSketch image: must have SerVer = 3 and preLongs = 3");
  }
  return DirectQuickSelectSketch.writableWrap(srcMem, seed);
}
/**
 * Instantiates an on-heap UpdateSketch from Memory using the
 * {@link Util#DEFAULT_UPDATE_SEED}. Delegates to {@link #heapify(Memory, long)}.
 *
 * @param srcMem the Memory holding the sketch image
 * @return an UpdateSketch
 */
public static UpdateSketch heapify(final Memory srcMem) {
  final long defaultSeed = DEFAULT_UPDATE_SEED;
  return heapify(srcMem, defaultSeed);
}
/**
 * Instantiates an on-heap UpdateSketch from Memory. The family byte of the image selects the
 * implementation: an ALPHA image is handed to {@code HeapAlphaSketch.heapifyInstance}, every
 * other family is handed to {@code HeapQuickSelectSketch.heapifyInstance}.
 *
 * @param srcMem the Memory holding the sketch image
 * @param seed the update hash seed, passed through to the selected heapifyInstance method
 * @return an UpdateSketch
 */
public static UpdateSketch heapify(final Memory srcMem, final long seed) {
  final byte familyByte = srcMem.getByte(FAMILY_BYTE);
  final Family family = Family.idToFamily(familyByte);
  if (!family.equals(Family.ALPHA)) {
    return HeapQuickSelectSketch.heapifyInstance(srcMem, seed);
  }
  return HeapAlphaSketch.heapifyInstance(srcMem, seed);
}
//Sketch interface
/**
 * Converts this sketch to a CompactSketch by delegating to
 * {@link #compact(boolean, WritableMemory)} with ordering requested and no destination Memory.
 *
 * @return the CompactSketch produced by the two-argument overload
 */
@Override
public CompactSketch compact() {
  final boolean ordered = true;
  final WritableMemory noDestination = null;
  return compact(ordered, noDestination);
}
/**
 * Converts this sketch to a CompactSketch, optionally writing the compact image into the
 * given destination Memory. Three cases are handled in order:
 *
 * 1. Empty (as decided by {@code Sketch.emptyFromCountAndTheta}): the instance obtained from
 *    {@code EmptyCompactSketch.getInstance()} is returned. If dstMem is not null, the first
 *    8 bytes of its serialized form are written to dstMem at offset 0.
 * 2. Exactly one retained entry and theta equal to {@code Long.MAX_VALUE}: a SingleItemSketch
 *    is returned. If dstMem is not null, the first 16 bytes of its serialized form are written
 *    to dstMem at offset 0.
 * 3. Everything else: the result comes from compactHeap when dstMem is null, otherwise from
 *    compactDirect.
 *
 * @param dstOrdered true if the resulting compact sketch is to be ordered.
 * @param dstMem the destination Memory, or null to produce an on-heap result.
 * @return the resulting CompactSketch
 */
@Override
public CompactSketch compact(final boolean dstOrdered, final WritableMemory dstMem) {
  final int curCount = this.getRetainedEntries(true);
  final long rawThetaLong = getThetaLong();
  // thetaOnCompact may return a different theta than the raw one; use its result from here on
  final long thetaLong = Sketch.thetaOnCompact(isEmpty(), curCount, rawThetaLong);
  final boolean empty = Sketch.emptyFromCountAndTheta(curCount, thetaLong);

  if (empty) {
    final EmptyCompactSketch sk = EmptyCompactSketch.getInstance();
    if (dstMem != null) {
      dstMem.putByteArray(0, sk.toByteArray(), 0, 8);
    }
    return sk;
  }

  //not empty
  if ((thetaLong == Long.MAX_VALUE) && (curCount == 1)) {
    final long[] cache = getCache();
    final long[] cacheOut = compactCache(cache, curCount, thetaLong, dstOrdered);
    final long hash = cacheOut[0];
    final SingleItemSketch sis = new SingleItemSketch(hash, getSeedHash());
    if (dstMem != null) {
      dstMem.putByteArray(0, sis.toByteArray(), 0, 16);
    }
    // Return the instance already built above instead of allocating an identical second one.
    return sis;
  }

  if (dstMem == null) {
    return compactHeap(this, dstOrdered, curCount, thetaLong);
  }
  return compactDirect(this, dstMem, dstOrdered, curCount, thetaLong);
}
/**
 * Builds an on-heap compact form of the given UpdateSketch.
 * The caller has already handled the EmptyCompactSketch and SingleItemSketch cases.
 *
 * @param sketch the given UpdateSketch, read for its seed hash and cache.
 * @param ordered true if the destination is to be ordered.
 * @param curCount the number of retained entries.
 * @param thetaLong the value of theta.
 * @return a HeapCompactOrderedSketch if ordered is true, otherwise a
 * HeapCompactUnorderedSketch
 */
private static CompactSketch compactHeap(final UpdateSketch sketch, final boolean ordered,
    final int curCount, final long thetaLong) {
  final short seedHash = sketch.getSeedHash();
  final long[] compacted =
      CompactSketch.compactCache(sketch.getCache(), curCount, thetaLong, ordered);
  if (!ordered) {
    return new HeapCompactUnorderedSketch(compacted, false, seedHash, curCount, thetaLong);
  }
  return new HeapCompactOrderedSketch(compacted, false, seedHash, curCount, thetaLong);
}
/**
 * Builds a compact form of the given UpdateSketch inside the given destination Memory.
 * The caller has already handled the EmptyCompactSketch and SingleItemSketch cases.
 *
 * The flags byte always carries the read-only and compact bits; the ordered bit is added
 * when ordered is true. The compacted cache and preamble values are written through
 * {@code loadCompactMemory}.
 *
 * @param sketch the given UpdateSketch, read for its seed hash and cache.
 * @param dstMem the given destination Memory that receives the compact image.
 * @param ordered true if the destination is to be ordered.
 * @param curCount the number of retained entries.
 * @param thetaLong the value of theta.
 * @return a DirectCompactOrderedSketch if ordered is true, otherwise a
 * DirectCompactUnorderedSketch, in both cases constructed on dstMem.
 */
static CompactSketch compactDirect(final UpdateSketch sketch,
    final WritableMemory dstMem, final boolean ordered, final int curCount,
    final long thetaLong) {
  final int preLongs = computeCompactPreLongs(thetaLong, false, curCount);
  final short seedHash = sketch.getSeedHash();
  final long[] compacted =
      CompactSketch.compactCache(sketch.getCache(), curCount, thetaLong, ordered);

  final byte flags = ordered
      ? (byte)(READ_ONLY_FLAG_MASK | COMPACT_FLAG_MASK | ORDERED_FLAG_MASK)
      : (byte)(READ_ONLY_FLAG_MASK | COMPACT_FLAG_MASK);
  loadCompactMemory(compacted, seedHash, curCount, thetaLong, dstMem, flags, preLongs);

  if (ordered) {
    return new DirectCompactOrderedSketch(dstMem);
  }
  return new DirectCompactUnorderedSketch(dstMem);
}
/**
* Reports whether this sketch is in compact form. This implementation unconditionally
* returns false.
*
* @return false
*/
@Override
public boolean isCompact() {
return false;
}
/**
* Reports whether this sketch is ordered. This implementation unconditionally returns false.
*
* @return false
*/
@Override
public boolean isOrdered() {
return false;
}
//UpdateSketch interface
/**
* Returns a new UpdateSketchBuilder created with its no-argument constructor.
* Each call creates a separate builder instance.
*
* @return a new builder
*/
public static final UpdateSketchBuilder builder() {
return new UpdateSketchBuilder();
}
/**
* Resets this sketch back to a virgin empty state.
* Abstract: the behavior is supplied by the concrete subclass.
*/
public abstract void reset();
/**
* Rebuilds the hash table to remove dirty values or to reduce the size
* to nominal entries.
* Abstract: the behavior is supplied by the concrete subclass.
*
* @return this sketch
*/
public abstract UpdateSketch rebuild();
/**
* Returns the configured ResizeFactor.
* Abstract: the value is supplied by the concrete subclass.
*
* @return the configured ResizeFactor
*/
public abstract ResizeFactor getResizeFactor();
/**
 * Present this sketch with a long.
 * The datum is placed in a one-element long array and hashed with this sketch's seed; the
 * first hash word, shifted right by one bit (unsigned), is passed to {@code hashUpdate}.
 *
 * @param datum The given long datum.
 * @return the UpdateReturnState reported by {@code hashUpdate}
 */
public UpdateReturnState update(final long datum) {
  final long[] wrapped = new long[] { datum };
  final long[] hashWords = hash(wrapped, getSeed());
  return hashUpdate(hashWords[0] >>> 1);
}
/**
 * Present this sketch with the given double (or float) datum.
 * Minus zero is first replaced by plus zero. The value is then converted to a long with
 * Double.doubleToLongBits(), which collapses all NaN values to a single NaN representation.
 * NaN and +/- Infinity are treated as distinct values.
 *
 * @param datum The given double datum.
 * @return the UpdateReturnState reported by {@code hashUpdate}
 */
public UpdateReturnState update(final double datum) {
  double canonical = datum;
  if (datum == 0.0) {
    canonical = 0.0; // canonicalize -0.0, 0.0
  }
  final long bits = Double.doubleToLongBits(canonical); // canonicalize all NaN forms
  final long[] hashWords = hash(new long[] { bits }, getSeed());
  return hashUpdate(hashWords[0] >>> 1);
}
/**
 * Present this sketch with the given String.
 * The string is converted to a byte array using UTF8 encoding before it is hashed.
 * If the string is null or empty no update attempt is made and the method returns
 * RejectedNullOrEmpty.
 *
 * Note: this will not produce the same output hash values as the {@link #update(char[])}
 * method and will generally be a little slower depending on the complexity of the UTF8
 * encoding.
 *
 * @param datum The given String.
 * @return RejectedNullOrEmpty for a null or empty string, otherwise the UpdateReturnState
 * reported by {@code hashUpdate}
 */
public UpdateReturnState update(final String datum) {
  final boolean nothingToHash = (datum == null) || (datum.length() == 0);
  if (nothingToHash) {
    return RejectedNullOrEmpty;
  }
  final byte[] utf8Bytes = datum.getBytes(UTF_8);
  final long[] hashWords = hash(utf8Bytes, getSeed());
  return hashUpdate(hashWords[0] >>> 1);
}
/**
 * Present this sketch with the given byte array.
 * If the byte array is null or empty no update attempt is made and the method returns
 * RejectedNullOrEmpty.
 *
 * @param data The given byte array.
 * @return RejectedNullOrEmpty for a null or empty array, otherwise the UpdateReturnState
 * reported by {@code hashUpdate}
 */
public UpdateReturnState update(final byte[] data) {
  final int length = (data == null) ? 0 : data.length;
  if (length == 0) {
    return RejectedNullOrEmpty;
  }
  final long[] hashWords = hash(data, getSeed());
  return hashUpdate(hashWords[0] >>> 1);
}
/**
 * Present this sketch with the given char array.
 * If the char array is null or empty no update attempt is made and the method returns
 * RejectedNullOrEmpty.
 *
 * Note: this will not produce the same output hash values as the {@link #update(String)}
 * method but will be a little faster as it avoids the complexity of the UTF8 encoding.
 *
 * @param data The given char array.
 * @return RejectedNullOrEmpty for a null or empty array, otherwise the UpdateReturnState
 * reported by {@code hashUpdate}
 */
public UpdateReturnState update(final char[] data) {
  final int length = (data == null) ? 0 : data.length;
  if (length == 0) {
    return RejectedNullOrEmpty;
  }
  final long[] hashWords = hash(data, getSeed());
  return hashUpdate(hashWords[0] >>> 1);
}
/**
 * Present this sketch with the given integer array.
 * If the integer array is null or empty no update attempt is made and the method returns
 * RejectedNullOrEmpty.
 *
 * @param data The given int array.
 * @return RejectedNullOrEmpty for a null or empty array, otherwise the UpdateReturnState
 * reported by {@code hashUpdate}
 */
public UpdateReturnState update(final int[] data) {
  final int length = (data == null) ? 0 : data.length;
  if (length == 0) {
    return RejectedNullOrEmpty;
  }
  final long[] hashWords = hash(data, getSeed());
  return hashUpdate(hashWords[0] >>> 1);
}
/**
 * Present this sketch with the given long array.
 * If the long array is null or empty no update attempt is made and the method returns
 * RejectedNullOrEmpty.
 *
 * @param data The given long array.
 * @return RejectedNullOrEmpty for a null or empty array, otherwise the UpdateReturnState
 * reported by {@code hashUpdate}
 */
public UpdateReturnState update(final long[] data) {
  final int length = (data == null) ? 0 : data.length;
  if (length == 0) {
    return RejectedNullOrEmpty;
  }
  final long[] hashWords = hash(data, getSeed());
  return hashUpdate(hashWords[0] >>> 1);
}
//restricted methods
/**
* All potential updates converge here.
* Don't ever call this unless you really know what you are doing!
*
* The public update methods of this class call this with the first word of the computed hash
* shifted right by one bit (unsigned), so the value they pass is never negative.
*
* @param hash the given input hash value. A hash of zero or Long.MAX_VALUE is ignored.
* A negative hash value will throw an exception.
* @return the resulting UpdateReturnState
*/
abstract UpdateReturnState hashUpdate(long hash);
/**
* Gets the Log base 2 of the current size of the internal cache.
* Abstract and package-private: the value is supplied by the concrete subclass.
*
* @return the Log base 2 of the current size of the internal cache
*/
abstract int getLgArrLongs();
/**
* Gets the Log base 2 of the configured nominal entries.
* Abstract: the value is supplied by the concrete subclass.
*
* @return the Log base 2 of the configured nominal entries
*/
public abstract int getLgNomLongs();
/**
* Gets the configured sampling probability, p.
* Abstract and package-private: the value is supplied by the concrete subclass.
*
* @return the sampling probability, p
*/
abstract float getP();
/**
* Gets the configured seed. The update methods of this class pass this value to the hash
* function together with the datum.
* Abstract and package-private: the value is supplied by the concrete subclass.
*
* @return the configured seed
*/
abstract long getSeed();
/**
* Returns true if the internal cache contains "dirty" values that are greater than or equal
* to thetaLong.
* Abstract and package-private: the answer is supplied by the concrete subclass.
*
* @return true if the internal cache is dirty.
*/
abstract boolean isDirty();
/**
* Returns true if numEntries (curCount) is greater than the hashTableThreshold.
* Abstract and package-private: the threshold comparison is supplied by the concrete subclass.
*
* @param numEntries the given number of entries (or current count).
* @return true if numEntries (curCount) is greater than the hashTableThreshold.
*/
abstract boolean isOutOfSpace(int numEntries);
/**
 * Validates the family, preamble-longs and lgNomLongs values of a sketch image.
 * The family read from the image must be UNION or QUICKSELECT, the given preambleLongs must
 * equal that family's minimum preamble longs, and lgNomLongs must not be below
 * MIN_LG_NOM_LONGS.
 *
 * @param mem the Memory image from which the family ID is extracted.
 * @param preambleLongs the preamble longs value to validate against the family.
 * @param lgNomLongs the Log base 2 of the nominal entries to validate.
 * @throws SketchesArgumentException if any of the checks fail.
 */
static void checkUnionQuickSelectFamily(final Memory mem, final int preambleLongs,
    final int lgNomLongs) {
  //Check Family
  final Family family = Family.idToFamily(extractFamilyID(mem)); //byte 2
  final boolean isUnion = family.equals(Family.UNION);
  final boolean isQuickSelect = !isUnion && family.equals(Family.QUICKSELECT);

  if (!isUnion && !isQuickSelect) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid Family: " + family.toString());
  }
  if (isUnion && (preambleLongs != Family.UNION.getMinPreLongs())) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid PreambleLongs value for UNION: " + preambleLongs);
  }
  if (isQuickSelect && (preambleLongs != Family.QUICKSELECT.getMinPreLongs())) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preambleLongs);
  }

  //Check lgNomLongs
  if (lgNomLongs >= MIN_LG_NOM_LONGS) {
    return;
  }
  throw new SketchesArgumentException(
      "Possible corruption: Current Memory lgNomLongs < min required size: "
      + lgNomLongs + " < " + MIN_LG_NOM_LONGS);
}
/**
 * Validates a sketch image held in Memory. The checks, in order, are:
 * serialization version equals SER_VER; none of the big-endian, compact, ordered or
 * read-only flags are set; the seed hash stored in the image is checked against the hash
 * computed from the given seed (via Util.checkSeedHashes); the Memory capacity is at least
 * the size required for lgArrLongs and preambleLongs; and theta is not less than p while
 * lgArrLongs is less than or equal to lgNomLongs.
 *
 * @param srcMem the Memory image to validate.
 * @param seed the update hash seed whose computed hash is compared with the stored seed hash.
 * @param preambleLongs the number of preamble longs, used to compute the required size.
 * @param lgNomLongs the Log base 2 of the nominal entries.
 * @param lgArrLongs the Log base 2 of the cache array size.
 * @throws SketchesArgumentException if the serialization version, flags, capacity or
 * theta/p checks fail.
 */
static void checkMemIntegrity(final Memory srcMem, final long seed, final int preambleLongs,
    final int lgNomLongs, final int lgArrLongs) {
  //Check SerVer
  final int serVer = extractSerVer(srcMem); //byte 1
  if (serVer != SER_VER) {
    throw new SketchesArgumentException(
        "Possible corruption: Invalid Serialization Version: " + serVer);
  }

  //Check flags
  final int flags = extractFlags(srcMem); //byte 5
  final int flagsMask =
      ORDERED_FLAG_MASK | COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK | BIG_ENDIAN_FLAG_MASK;
  // Test for "any masked bit set" with != 0; a > 0 test depends on the sign of the result.
  if ((flags & flagsMask) != 0) {
    throw new SketchesArgumentException(
        "Possible corruption: Input srcMem cannot be: big-endian, compact, ordered, or read-only");
  }

  //Check seed hashes
  final short seedHash = (short)extractSeedHash(srcMem); //byte 6,7
  Util.checkSeedHashes(seedHash, Util.computeSeedHash(seed));

  //Check mem capacity, lgArrLongs
  final long curCapBytes = srcMem.getCapacity();
  final int minReqBytes = getMemBytes(lgArrLongs, preambleLongs);
  if (curCapBytes < minReqBytes) {
    throw new SketchesArgumentException(
        "Possible corruption: Current Memory size < min required size: "
        + curCapBytes + " < " + minReqBytes);
  }

  //check Theta, p
  final float p = extractP(srcMem); //bytes 12-15
  final long thetaLong = extractThetaLong(srcMem); //bytes 16-23
  final double theta = thetaLong / MAX_THETA_LONG_AS_DOUBLE;
  if ((lgArrLongs <= lgNomLongs) && (theta < p) ) {
    throw new SketchesArgumentException(
        "Possible corruption: Theta cannot be < p and lgArrLongs <= lgNomLongs. "
        + lgArrLongs + " <= " + lgNomLongs + ", Theta: " + theta + ", p: " + p);
  }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy