// org.apache.datasketches.theta.PreambleUtil Maven / Gradle / Ivy
// Show all versions of datasketches-java Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.theta;
import static org.apache.datasketches.common.Util.LS;
import static org.apache.datasketches.common.Util.zeroPad;
import java.nio.ByteOrder;
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.ResizeFactor;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.Util;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;
import org.apache.datasketches.thetacommon.ThetaUtil;
//@formatter:off
/**
* This class defines the preamble data structure and provides basic utilities for some of the key
* fields.
* The intent of the design of this class was to isolate the detailed knowledge of the bit and
* byte layout of the serialized form of the sketches derived from the Sketch class into one place.
* This allows the possibility of the introduction of different serialization
* schemes with minimal impact on the rest of the library.
*
*
* MAP: Low significance bytes of this long data structure are on the right. However, the
* multi-byte integers (int and long) are stored in native byte order. The
* byte values are treated as unsigned.
*
* An empty CompactSketch only requires 8 bytes.
* Flags: notSI, Ordered*, Compact, Empty*, ReadOnly, LE.
* (*) Earlier versions did not set these.
*
*
* <pre>
* Long || Start Byte Adr:
* Adr:
*      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
*  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 1   |
* </pre>
*
*
* A SingleItemSketch (extends CompactSketch) requires an 8 byte preamble plus a single
* hash item of 8 bytes. Flags: SingleItem*, Ordered, Compact, notEmpty, ReadOnly, LE.
* (*) Earlier versions did not set these.
*
*
* <pre>
* Long || Start Byte Adr:
* Adr:
*      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
*  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 1   |
*
*      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
*  1   ||---------------------------Single long hash----------------------------------------|
* </pre>
*
*
* An exact (non-estimating) CompactSketch requires 16 bytes of preamble plus a compact array of
* longs.
*
*
* <pre>
* Long || Start Byte Adr:
* Adr:
*      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
*  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 2   |
*
*      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
*  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
*
*      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
*  2   ||----------------------Start of Compact Long Array----------------------------------|
* </pre>
*
*
* An estimating CompactSketch requires 24 bytes of preamble plus a compact array of longs.
*
*
* <pre>
* Long || Start Byte Adr:
* Adr:
*      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
*  0   ||    Seed Hash    | Flags  |        |        | FamID  | SerVer |     PreLongs = 3   |
*
*      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
*  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
*
*      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
*  2   ||------------------------------THETA_LONG-------------------------------------------|
*
*      ||   31   |   30   |   29   |   28   |   27   |   26   |   25   |    24              |
*  3   ||----------------------Start of Compact Long Array----------------------------------|
* </pre>
*
*
* The UpdateSketch and AlphaSketch require 24 bytes of preamble followed by a non-compact
* array of longs representing a hash table.
*
* The following table applies to both the Theta UpdateSketch and the Alpha Sketch
*
* <pre>
* Long || Start Byte Adr:
* Adr:
*      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
*  0   ||    Seed Hash    | Flags  | LgArr  | lgNom  | FamID  | SerVer |  RF, PreLongs = 3  |
*
*      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
*  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
*
*      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
*  2   ||------------------------------THETA_LONG-------------------------------------------|
*
*      ||   31   |   30   |   29   |   28   |   27   |   26   |   25   |    24              |
*  3   ||----------------------Start of Hash Table of longs---------------------------------|
* </pre>
*
*
* Union objects require 32 bytes of preamble plus a non-compact array of longs representing a
* hash table.
*
*
* <pre>
* Long || Start Byte Adr:
* Adr:
*      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
*  0   ||    Seed Hash    | Flags  | LgArr  | lgNom  | FamID  | SerVer |  RF, PreLongs = 4  |
*
*      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
*  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
*
*      ||   23   |   22   |   21   |   20   |   19   |   18   |   17   |    16              |
*  2   ||------------------------------THETA_LONG-------------------------------------------|
*
*      ||   31   |   30   |   29   |   28   |   27   |   26   |   25   |    24              |
*  3   ||---------------------------UNION THETA LONG----------------------------------------|
*
*      ||   39   |   38   |   37   |   36   |   35   |   34   |   33   |    32              |
*  4   ||----------------------Start of Hash Table of longs---------------------------------|
* </pre>
*
*
*
* @author Lee Rhodes
*/
final class PreambleUtil {

  /** Not instantiable: this class only holds static constants and static helpers. */
  private PreambleUtil() {}

  // ###### DO NOT MESS WITH THIS FROM HERE ...
  // Preamble byte Addresses
  static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte.
  static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte. Not used by compact, direct
  static final int SER_VER_BYTE = 1;
  static final int FAMILY_BYTE = 2; //SerVer1,2 was SKETCH_TYPE_BYTE
  static final int LG_NOM_LONGS_BYTE = 3; //not used by compact
  static final int LG_ARR_LONGS_BYTE = 4; //not used by compact
  static final int FLAGS_BYTE = 5;
  static final int SEED_HASH_SHORT = 6; //byte 6,7
  static final int RETAINED_ENTRIES_INT = 8; //8 byte aligned
  static final int P_FLOAT = 12; //4 byte aligned, not used by compact
  static final int THETA_LONG = 16; //8-byte aligned
  static final int UNION_THETA_LONG = 24; //8-byte aligned, only used by Union

  // flag bit masks
  static final int BIG_ENDIAN_FLAG_MASK = 1; //SerVer 1, 2, 3
  static final int READ_ONLY_FLAG_MASK = 2; //Set but not read. Reserved. SerVer 1, 2, 3
  static final int EMPTY_FLAG_MASK = 4; //SerVer 2, 3
  static final int COMPACT_FLAG_MASK = 8; //SerVer 2 was NO_REBUILD_FLAG_MASK, 3
  static final int ORDERED_FLAG_MASK = 16;//SerVer 2 was UNORDERED_FLAG_MASK, 3
  static final int SINGLEITEM_FLAG_MASK = 32;//SerVer 3
  //The last 2 bits of the flags byte are reserved and assumed to be zero, for now.

  //Backward compatibility: SerVer1 preamble always 3 longs, SerVer2 preamble: 1, 2, 3 longs
  // SKETCH_TYPE_BYTE 2 //SerVer1, SerVer2
  // V1, V2 types: Alpha = 1, QuickSelect = 2, SetSketch = 3; V3 only: Buffered QS = 4
  static final int LG_RESIZE_RATIO_BYTE_V1 = 5; //used by SerVer 1
  static final int FLAGS_BYTE_V1 = 6; //used by SerVer 1

  //Other constants
  static final int SER_VER = 3;

  // serial version 4 compressed ordered sketch, not empty, not single item
  static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes
  static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries
  static final int THETA_LONG_V4 = 8; //8-byte aligned

  static final boolean NATIVE_ORDER_IS_BIG_ENDIAN =
      (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN);

  /**
   * Computes the number of bytes required for a non-full sized sketch in hash-table form.
   * This can be used to compute current storage size for heap sketches, or current off-heap memory
   * required for off-heap (direct) sketches. This does not apply for compact sketches.
   *
   * <p>The value is computed with int arithmetic as
   * <i>8 * 2^lgArrLongs + 8 * preambleLongs</i>, so it wraps for <i>lgArrLongs &ge; 28</i>.</p>
   *
   * @param lgArrLongs log2(current hash-table size)
   * @param preambleLongs current preamble size
   * @return the size in bytes
   */
  static final int getMemBytes(final int lgArrLongs, final int preambleLongs) {
    return (8 << lgArrLongs) + (preambleLongs << 3);
  }

  // STRINGS

  /**
   * Returns a human readable string summary of the preamble state of the given byte array.
   * Used primarily in testing.
   *
   * @param byteArr the given byte array.
   * @return the summary preamble string.
   */
  static String preambleToString(final byte[] byteArr) {
    final Memory mem = Memory.wrap(byteArr);
    return preambleToString(mem);
  }

  /**
   * Returns a human readable string summary of the preamble state of the given Memory.
   * Note: other than making sure that the given Memory size is large
   * enough for just the preamble, this does not do much value checking of the contents of the
   * preamble as this is primarily a tool for debugging the preamble visually.
   *
   * <p>Fields that are not physically present for the extracted preLongs value are reported
   * with assumed defaults (count 0 or 1, p = 1.0, theta = Long.MAX_VALUE). Any preLongs value
   * other than 1, 2 or 3 is printed using the 4-long (Union) layout.</p>
   *
   * @param mem the given Memory.
   * @return the summary preamble string.
   */
  static String preambleToString(final Memory mem) {
    final int preLongs = getAndCheckPreLongs(mem); //validates capacity before any other read
    final int rfId = extractLgResizeFactor(mem);
    final ResizeFactor rf = ResizeFactor.getRF(rfId);
    final int serVer = extractSerVer(mem);
    final int familyId = extractFamilyID(mem);
    final Family family = Family.idToFamily(familyId);
    final int lgNomLongs = extractLgNomLongs(mem);
    final int lgArrLongs = extractLgArrLongs(mem);

    //Flags: shown as decimal, hex and 8-digit binary
    final int flags = extractFlags(mem);
    final String flagsStr = (flags) + ", 0x" + (Integer.toHexString(flags)) + ", "
        + zeroPad(Integer.toBinaryString(flags), 8);
    final String nativeOrder = ByteOrder.nativeOrder().toString();
    final boolean bigEndian = (flags & BIG_ENDIAN_FLAG_MASK) != 0;
    final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) != 0;
    final boolean empty = (flags & EMPTY_FLAG_MASK) != 0;
    final boolean compact = (flags & COMPACT_FLAG_MASK) != 0;
    final boolean ordered = (flags & ORDERED_FLAG_MASK) != 0;
    final boolean singleItem = (flags & SINGLEITEM_FLAG_MASK) != 0; //!empty && (preLongs == 1);

    final int seedHash = extractSeedHash(mem);

    //Defaults assume preLongs == 1; empty or singleItem
    int curCount = singleItem ? 1 : 0;
    float p = (float) 1.0;            //assumed when preLongs == 1
    long thetaLong = Long.MAX_VALUE;  //assumed when preLongs is 1 or 2
    long thetaULong = thetaLong;      //assumed when preLongs is 1, 2 or 3

    if (preLongs == 2) { //exact (non-estimating) CompactSketch
      curCount = extractCurCount(mem);
      p = extractP(mem);
    }
    else if (preLongs == 3) { //Update Sketch
      curCount = extractCurCount(mem);
      p = extractP(mem);
      thetaLong = extractThetaLong(mem);
      thetaULong = thetaLong;
    }
    else if (preLongs == 4) { //Union
      curCount = extractCurCount(mem);
      p = extractP(mem);
      thetaLong = extractThetaLong(mem);
      thetaULong = extractUnionThetaLong(mem);
    }
    //else the same as an empty sketch or singleItem

    final StringBuilder sb = new StringBuilder();
    sb.append(LS);
    sb.append("### SKETCH PREAMBLE SUMMARY:").append(LS);
    sb.append("Native Byte Order : ").append(nativeOrder).append(LS);
    sb.append("Byte 0: Preamble Longs : ").append(preLongs).append(LS);
    sb.append("Byte 0: ResizeFactor : ").append(rfId).append(", ").append(rf.toString()).append(LS);
    sb.append("Byte 1: Serialization Version: ").append(serVer).append(LS);
    sb.append("Byte 2: Family : ").append(familyId).append(", ").append(family.toString())
        .append(LS);
    sb.append("Byte 3: LgNomLongs : ").append(lgNomLongs).append(LS);
    sb.append("Byte 4: LgArrLongs : ").append(lgArrLongs).append(LS);
    sb.append("Byte 5: Flags Field : ").append(flagsStr).append(LS);
    sb.append(" Bit Flag Name : State:").append(LS);
    sb.append(" 0 BIG_ENDIAN_STORAGE : ").append(bigEndian).append(LS);
    sb.append(" 1 READ_ONLY : ").append(readOnly).append(LS);
    sb.append(" 2 EMPTY : ").append(empty).append(LS);
    sb.append(" 3 COMPACT : ").append(compact).append(LS);
    sb.append(" 4 ORDERED : ").append(ordered).append(LS);
    sb.append(" 5 SINGLE_ITEM : ").append(singleItem).append(LS);
    sb.append("Bytes 6-7 : Seed Hash Hex : ").append(Integer.toHexString(seedHash)).append(LS);

    if (preLongs == 1) {
      //nothing beyond the first long is present
      sb.append(" --ABSENT FIELDS, ASSUMED:").append(LS);
      appendCountAndP(sb, curCount, p);
      appendTheta(sb, thetaLong);
    }
    else if (preLongs == 2) {
      //count and p are present, theta is not
      appendCountAndP(sb, curCount, p);
      sb.append(" --ABSENT, ASSUMED:").append(LS);
      appendTheta(sb, thetaLong);
    }
    else if (preLongs == 3) {
      appendCountAndP(sb, curCount, p);
      appendTheta(sb, thetaLong);
    }
    else { //preLongs == 4
      appendCountAndP(sb, curCount, p);
      appendTheta(sb, thetaLong);
      appendUnionTheta(sb, thetaULong);
    }

    //Byte totals use long arithmetic so that a large (or corrupt) count cannot wrap an int.
    sb.append( "Preamble Bytes : ").append(preLongs * 8).append(LS);
    sb.append( "Data Bytes : ").append(8L * curCount).append(LS);
    sb.append( "TOTAL Sketch Bytes : ").append(8L * ((long) preLongs + curCount)).append(LS);
    sb.append( "TOTAL Capacity Bytes : ").append(mem.getCapacity()).append(LS);
    sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS);
    return sb.toString();
  }

  /** Appends the retained-entries count line and the P line of the preamble summary. */
  private static void appendCountAndP(final StringBuilder sb, final int curCount, final float p) {
    sb.append("Bytes 8-11 : CurrentCount : ").append(curCount).append(LS);
    sb.append("Bytes 12-15: P : ").append(p).append(LS);
  }

  /** Appends theta as a fraction of Long.MAX_VALUE, as a long, and as 16-digit hex. */
  private static void appendTheta(final StringBuilder sb, final long thetaLong) {
    final double thetaDbl = thetaLong / Util.LONG_MAX_VALUE_AS_DOUBLE;
    final String thetaHex = zeroPad(Long.toHexString(thetaLong), 16);
    sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS);
    sb.append(" Theta (long) : ").append(thetaLong).append(LS);
    sb.append(" Theta (long,hex) : ").append(thetaHex).append(LS);
  }

  /**
   * Appends the Union theta as a fraction of Long.MAX_VALUE, as a long, and as 16-digit hex.
   * The field starts at UNION_THETA_LONG (byte 24) and is 8 bytes wide, hence bytes 24-31.
   */
  private static void appendUnionTheta(final StringBuilder sb, final long thetaULong) {
    final double thetaUDbl = thetaULong / Util.LONG_MAX_VALUE_AS_DOUBLE;
    final String thetaUHex = zeroPad(Long.toHexString(thetaULong), 16);
    sb.append("Bytes 24-31: ThetaU (double) : ").append(thetaUDbl).append(LS);
    sb.append(" ThetaU (long) : ").append(thetaULong).append(LS);
    sb.append(" ThetaU (long,hex): ").append(thetaUHex).append(LS);
  }

  //@formatter:on

  /**
   * Extracts the preamble longs value from the low 6 bits of byte 0.
   * @param mem the given Memory
   * @return the preamble longs value, 0 to 63
   */
  static int extractPreLongs(final Memory mem) {
    return mem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F;
  }

  /**
   * Extracts the lg resize factor from the upper 2 bits of byte 0.
   * @param mem the given Memory
   * @return the lg resize factor, 0 to 3
   */
  static int extractLgResizeFactor(final Memory mem) {
    return (mem.getByte(PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT) & 0X3;
  }

  /**
   * Extracts the low 2 bits of the SerVer 1 resize-ratio byte (byte 5).
   * @param mem the given Memory
   * @return the lg resize ratio, 0 to 3
   */
  static int extractLgResizeRatioV1(final Memory mem) {
    return mem.getByte(LG_RESIZE_RATIO_BYTE_V1) & 0X3;
  }

  /**
   * Extracts the serialization version byte (byte 1) as an unsigned value.
   * @param mem the given Memory
   * @return the serialization version, 0 to 255
   */
  static int extractSerVer(final Memory mem) {
    return mem.getByte(SER_VER_BYTE) & 0XFF;
  }

  /**
   * Extracts the family ID byte (byte 2) as an unsigned value.
   * @param mem the given Memory
   * @return the family ID, 0 to 255
   */
  static int extractFamilyID(final Memory mem) {
    return mem.getByte(FAMILY_BYTE) & 0XFF;
  }

  /**
   * Extracts the lgNomLongs byte (byte 3) as an unsigned value.
   * @param mem the given Memory
   * @return lgNomLongs, 0 to 255
   */
  static int extractLgNomLongs(final Memory mem) {
    return mem.getByte(LG_NOM_LONGS_BYTE) & 0XFF;
  }

  /**
   * Extracts the lgArrLongs byte (byte 4) as an unsigned value.
   * @param mem the given Memory
   * @return lgArrLongs, 0 to 255
   */
  static int extractLgArrLongs(final Memory mem) {
    return mem.getByte(LG_ARR_LONGS_BYTE) & 0XFF;
  }

  /**
   * Extracts the flags byte (byte 5) as an unsigned value.
   * @param mem the given Memory
   * @return the flags, 0 to 255
   */
  static int extractFlags(final Memory mem) {
    return mem.getByte(FLAGS_BYTE) & 0XFF;
  }

  /**
   * Extracts the SerVer 1 flags byte (byte 6) as an unsigned value.
   * @param mem the given Memory
   * @return the SerVer 1 flags, 0 to 255
   */
  static int extractFlagsV1(final Memory mem) {
    return mem.getByte(FLAGS_BYTE_V1) & 0XFF;
  }

  /**
   * Extracts the seed hash short (bytes 6,7) as an unsigned value.
   * @param mem the given Memory
   * @return the seed hash, 0 to 65535
   */
  static int extractSeedHash(final Memory mem) {
    return mem.getShort(SEED_HASH_SHORT) & 0XFFFF;
  }

  /**
   * Extracts the retained entries count int starting at byte 8.
   * @param mem the given Memory
   * @return the retained entries count
   */
  static int extractCurCount(final Memory mem) {
    return mem.getInt(RETAINED_ENTRIES_INT);
  }

  /**
   * Extracts the float p starting at byte 12.
   * @param mem the given Memory
   * @return the value of p
   */
  static float extractP(final Memory mem) {
    return mem.getFloat(P_FLOAT);
  }

  /**
   * Extracts the theta long starting at byte 16.
   * @param mem the given Memory
   * @return theta as a long
   */
  static long extractThetaLong(final Memory mem) {
    return mem.getLong(THETA_LONG);
  }

  /**
   * Extracts the Union theta long starting at byte 24.
   * @param mem the given Memory
   * @return the Union theta as a long
   */
  static long extractUnionThetaLong(final Memory mem) {
    return mem.getLong(UNION_THETA_LONG);
  }

  /**
   * Extracts the SerVer 4 entry-bits byte (byte 3) as an unsigned value.
   * @param mem the given Memory
   * @return the entry bits, 0 to 255
   */
  static int extractEntryBitsV4(final Memory mem) {
    return mem.getByte(ENTRY_BITS_BYTE_V4) & 0XFF;
  }

  /**
   * Extracts the SerVer 4 number-of-entries-bytes byte (byte 4) as an unsigned value.
   * @param mem the given Memory
   * @return the number of bytes used for the number of entries, 0 to 255
   */
  static int extractNumEntriesBytesV4(final Memory mem) {
    return mem.getByte(NUM_ENTRIES_BYTES_BYTE_V4) & 0XFF;
  }

  /**
   * Extracts the SerVer 4 theta long starting at byte 8.
   * @param mem the given Memory
   * @return theta as a long
   */
  static long extractThetaLongV4(final Memory mem) {
    return mem.getLong(THETA_LONG_V4);
  }

  /**
   * Sets PreLongs in the low 6 bits and sets LgRF in the upper 2 bits = 0.
   * @param wmem the target WritableMemory
   * @param preLongs the given number of preamble longs
   */
  static void insertPreLongs(final WritableMemory wmem, final int preLongs) {
    wmem.putByte(PREAMBLE_LONGS_BYTE, (byte) (preLongs & 0X3F));
  }

  /**
   * Sets the top 2 lgRF bits and does not affect the lower 6 bits (PreLongs).
   * To work properly, this should be called after insertPreLongs().
   * @param wmem the target WritableMemory
   * @param rf the given lgRF bits
   */
  static void insertLgResizeFactor(final WritableMemory wmem, final int rf) {
    final int curByte = wmem.getByte(PREAMBLE_LONGS_BYTE) & 0xFF;
    final int shift = LG_RESIZE_FACTOR_BIT; // shift in bits
    final int mask = 3;
    //clear the two lgRF bits of the current byte, then OR in the new ones
    final byte newByte = (byte) (((rf & mask) << shift) | (~(mask << shift) & curByte));
    wmem.putByte(PREAMBLE_LONGS_BYTE, newByte);
  }

  /**
   * Writes the low 8 bits of the given serialization version into byte 1.
   * @param wmem the target WritableMemory
   * @param serVer the given serialization version
   */
  static void insertSerVer(final WritableMemory wmem, final int serVer) {
    wmem.putByte(SER_VER_BYTE, (byte) serVer);
  }

  /**
   * Writes the low 8 bits of the given family ID into byte 2.
   * @param wmem the target WritableMemory
   * @param famId the given family ID
   */
  static void insertFamilyID(final WritableMemory wmem, final int famId) {
    wmem.putByte(FAMILY_BYTE, (byte) famId);
  }

  /**
   * Writes the low 8 bits of the given lgNomLongs into byte 3.
   * @param wmem the target WritableMemory
   * @param lgNomLongs the given lgNomLongs
   */
  static void insertLgNomLongs(final WritableMemory wmem, final int lgNomLongs) {
    wmem.putByte(LG_NOM_LONGS_BYTE, (byte) lgNomLongs);
  }

  /**
   * Writes the low 8 bits of the given lgArrLongs into byte 4.
   * @param wmem the target WritableMemory
   * @param lgArrLongs the given lgArrLongs
   */
  static void insertLgArrLongs(final WritableMemory wmem, final int lgArrLongs) {
    wmem.putByte(LG_ARR_LONGS_BYTE, (byte) lgArrLongs);
  }

  /**
   * Writes the low 8 bits of the given flags into byte 5, replacing all existing flags.
   * @param wmem the target WritableMemory
   * @param flags the given flags
   */
  static void insertFlags(final WritableMemory wmem, final int flags) {
    wmem.putByte(FLAGS_BYTE, (byte) flags);
  }

  /**
   * Writes the low 16 bits of the given seed hash into bytes 6,7.
   * @param wmem the target WritableMemory
   * @param seedHash the given seed hash
   */
  static void insertSeedHash(final WritableMemory wmem, final int seedHash) {
    wmem.putShort(SEED_HASH_SHORT, (short) seedHash);
  }

  /**
   * Writes the given retained entries count as an int starting at byte 8.
   * @param wmem the target WritableMemory
   * @param curCount the given retained entries count
   */
  static void insertCurCount(final WritableMemory wmem, final int curCount) {
    wmem.putInt(RETAINED_ENTRIES_INT, curCount);
  }

  /**
   * Writes the given p as a float starting at byte 12.
   * @param wmem the target WritableMemory
   * @param p the given value of p
   */
  static void insertP(final WritableMemory wmem, final float p) {
    wmem.putFloat(P_FLOAT, p);
  }

  /**
   * Writes the given theta as a long starting at byte 16.
   * @param wmem the target WritableMemory
   * @param thetaLong the given theta
   */
  static void insertThetaLong(final WritableMemory wmem, final long thetaLong) {
    wmem.putLong(THETA_LONG, thetaLong);
  }

  /**
   * Writes the given Union theta as a long starting at byte 24.
   * @param wmem the target WritableMemory
   * @param unionThetaLong the given Union theta
   */
  static void insertUnionThetaLong(final WritableMemory wmem, final long unionThetaLong) {
    wmem.putLong(UNION_THETA_LONG, unionThetaLong);
  }

  /**
   * Sets the EMPTY flag bit in the flags byte, leaving the other flag bits unchanged.
   * @param wmem the target WritableMemory
   */
  static void setEmpty(final WritableMemory wmem) {
    final int flags = wmem.getByte(FLAGS_BYTE) & 0XFF;
    wmem.putByte(FLAGS_BYTE, (byte) (flags | EMPTY_FLAG_MASK));
  }

  /**
   * Clears the EMPTY flag bit in the flags byte, leaving the other flag bits unchanged.
   * @param wmem the target WritableMemory
   */
  static void clearEmpty(final WritableMemory wmem) {
    final int flags = wmem.getByte(FLAGS_BYTE) & 0XFF;
    wmem.putByte(FLAGS_BYTE, (byte) (flags & ~EMPTY_FLAG_MASK));
  }

  /**
   * Returns true if the EMPTY flag bit is set in the flags byte.
   * @param mem the given Memory
   * @return true if the EMPTY flag bit is set
   */
  static boolean isEmptyFlag(final Memory mem) {
    return (extractFlags(mem) & EMPTY_FLAG_MASK) != 0;
  }

  /**
   * Checks Memory for capacity to hold the preamble and returns the extracted preLongs.
   * The capacity must be at least 8 bytes, and at least 8 * preLongs bytes.
   * @param mem the given Memory
   * @return the extracted prelongs value.
   * @throws SketchesArgumentException if the capacity is too small
   */
  static int getAndCheckPreLongs(final Memory mem) {
    final long cap = mem.getCapacity();
    if (cap < 8) { //the first long must exist before preLongs can be read
      throwNotBigEnough(cap, 8);
    }
    final int preLongs = extractPreLongs(mem);
    final int required = Math.max(preLongs << 3, 8);
    if (cap < required) {
      throwNotBigEnough(cap, required);
    }
    return preLongs;
  }

  /**
   * Extracts the seed hash from the given Memory and checks it against the seed hash computed
   * from the given seed.
   * @param mem the given Memory
   * @param seed the seed whose computed hash is compared with the one stored in Memory
   * @return the seed hash extracted from the Memory
   */
  static final short checkMemorySeedHash(final Memory mem, final long seed) {
    final short seedHashMem = (short) extractSeedHash(mem);
    ThetaUtil.checkSeedHashes(seedHashMem, ThetaUtil.computeSeedHash(seed)); //throws if bad seedHash
    return seedHashMem;
  }

  /**
   * Throws the exception reporting that the byte array or Memory is too small.
   * @param cap the actual capacity in bytes
   * @param required the required capacity in bytes
   */
  private static void throwNotBigEnough(final long cap, final int required) {
    throw new SketchesArgumentException(
        "Possible Corruption: Size of byte array or Memory not large enough: Size: " + cap
        + ", Required: " + required);
  }
}