All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.frequencies.ItemsSketch Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.frequencies;

import static org.apache.datasketches.Util.LS;
import static org.apache.datasketches.Util.isPowerOf2;
import static org.apache.datasketches.Util.toLog2;
import static org.apache.datasketches.frequencies.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.frequencies.PreambleUtil.SER_VER;
import static org.apache.datasketches.frequencies.PreambleUtil.extractActiveItems;
import static org.apache.datasketches.frequencies.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.frequencies.PreambleUtil.extractFlags;
import static org.apache.datasketches.frequencies.PreambleUtil.extractLgCurMapSize;
import static org.apache.datasketches.frequencies.PreambleUtil.extractLgMaxMapSize;
import static org.apache.datasketches.frequencies.PreambleUtil.extractPreLongs;
import static org.apache.datasketches.frequencies.PreambleUtil.extractSerVer;
import static org.apache.datasketches.frequencies.PreambleUtil.insertActiveItems;
import static org.apache.datasketches.frequencies.PreambleUtil.insertFamilyID;
import static org.apache.datasketches.frequencies.PreambleUtil.insertFlags;
import static org.apache.datasketches.frequencies.PreambleUtil.insertLgCurMapSize;
import static org.apache.datasketches.frequencies.PreambleUtil.insertLgMaxMapSize;
import static org.apache.datasketches.frequencies.PreambleUtil.insertPreLongs;
import static org.apache.datasketches.frequencies.PreambleUtil.insertSerVer;
import static org.apache.datasketches.frequencies.Util.LG_MIN_MAP_SIZE;
import static org.apache.datasketches.frequencies.Util.SAMPLE_SIZE;

import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Comparator;

import org.apache.datasketches.ArrayOfItemsSerDe;
import org.apache.datasketches.Family;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.SketchesStateException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;

/**
 * 

This sketch is useful for tracking approximate frequencies of items of type <T> * with optional associated counts (<T> item, long count) that are members of a * multiset of such items. The true frequency of an item is defined to be the sum of associated * counts.

* *

This implementation provides the following capabilities:

*
    *
  • Estimate the frequency of an item.
  • *
  • Return upper and lower bounds of any item, such that the true frequency is always * between the upper and lower bounds.
  • *
  • Return a global maximum error that holds for all items in the stream.
  • *
  • Return an array of frequent items that qualify either a NO_FALSE_POSITIVES or a * NO_FALSE_NEGATIVES error type.
  • *
  • Merge itself with another sketch object created from this class.
  • *
  • Serialize/Deserialize to/from a byte array.
  • *
* *

Space Usage

* *

The sketch is initialized with a maxMapSize that specifies the maximum physical * length of the internal hash map of the form (<T> item, long count). * The maxMapSize must be a power of 2.

* *

The hash map starts at a very small size (8 entries), and grows as needed up to the * specified maxMapSize.

* *

Excluding external space required for the item objects, the internal memory space usage of * this sketch is 18 * mapSize bytes (assuming 8 bytes for each Java reference), plus a small * constant number of additional bytes. The internal memory space usage of this sketch will never * exceed 18 * maxMapSize bytes, plus a small constant number of additional bytes.

* *

Maximum Capacity of the Sketch

* *

The LOAD_FACTOR for the hash map is internally set at 75%, * which means at any time the map capacity of (item, count) pairs is mapCap = 0.75 * * mapSize. * The maximum capacity of (item, count) pairs of the sketch is maxMapCap = 0.75 * * maxMapSize.

* *

Updating the sketch with (item, count) pairs

* *

If the item is found in the hash map, the mapped count field (the "counter") is * incremented by the incoming count, otherwise, a new counter "(item, count) pair" is * created. If the number of tracked counters reaches the maximum capacity of the hash map * the sketch decrements all of the counters (by an approximately computed median), and * removes any non-positive counters.

* *

Accuracy

* *

If fewer than 0.75 * maxMapSize different items are inserted into the sketch the * estimated frequencies returned by the sketch will be exact.

* *

The logic of the frequent items sketch is such that the stored counts and true counts are * never too different. * More specifically, for any item, the sketch can return an estimate of the * true frequency of item, along with upper and lower bounds on the frequency * (that hold deterministically).

* *

For this implementation and for a specific active item, it is guaranteed that * the true frequency will be between the Upper Bound (UB) and the Lower Bound (LB) computed for * that item. Specifically, (UB- LB) ≤ W * epsilon, where W denotes the * sum of all item counts, and epsilon = 3.5/M, where M is the maxMapSize.

* *

This is a worst case guarantee that applies to arbitrary inputs.1 * For inputs typically seen in practice (UB-LB) is usually much smaller. *

* *

Background

* *

This code implements a variant of what is commonly known as the "Misra-Gries * algorithm". Variants of it were discovered and rediscovered and redesigned several times * over the years:

*
  • "Finding repeated elements", Misra, Gries, 1982
  • *
  • "Frequency estimation of Internet packet streams with limited space" Demaine, * Lopez-Ortiz, Munro, 2002
  • *
  • "A simple algorithm for finding frequent elements in streams and bags" Karp, Shenker, * Papadimitriou, 2003
  • *
  • "Efficient Computation of Frequent and Top-k Elements in Data Streams" Metwally, * Agrawal, Abbadi, 2006
  • *
* * 1 For speed we do employ some randomization that introduces a small probability that * our proof of the worst-case bound might not apply to a given run. However, we have ensured * that this probability is extremely small. For example, if the stream causes one table purge * (rebuild), our proof of the worst case bound applies with probability at least 1 - 1E-14. * If the stream causes 1E9 purges, our proof applies with probability at least 1 - 1E-5. * * @param The type of item to be tracked by this sketch * * @author Justin Thaler * @author Alexander Saydakov */ public class ItemsSketch { /** * Log2 Maximum length of the arrays internal to the hash map supported by the data * structure. */ private int lgMaxMapSize; /** * The current number of counters supported by the hash map. */ private int curMapCap; //the threshold to purge /** * Tracks the total of decremented counts. */ private long offset; /** * The sum of all frequencies of the stream so far. */ private long streamWeight = 0; /** * The maximum number of samples used to compute approximate median of counters when doing * decrement */ private int sampleSize; /** * Hash map mapping stored items to approximate counts */ private ReversePurgeItemHashMap hashMap; /** * Construct this sketch with the parameter maxMapSize and the default initialMapSize (8). * * @param maxMapSize Determines the physical size of the internal hash map managed by this * sketch and must be a power of 2. The maximum capacity of this internal hash map is * 0.75 times * maxMapSize. Both the ultimate accuracy and size of this sketch are a * function of maxMapSize. */ public ItemsSketch(final int maxMapSize) { this(toLog2(maxMapSize, "maxMapSize"), LG_MIN_MAP_SIZE); } /** * Construct this sketch with parameter lgMapMapSize and lgCurMapSize. This internal * constructor is used when deserializing the sketch. * * @param lgMaxMapSize Log2 of the physical size of the internal hash map managed by this * sketch. The maximum capacity of this internal hash map is 0.75 times 2^lgMaxMapSize. * Both the ultimate accuracy and size of this sketch are a function of lgMaxMapSize. * * @param lgCurMapSize Log2 of the starting (current) physical size of the internal hash * map managed by this sketch. */ ItemsSketch(final int lgMaxMapSize, final int lgCurMapSize) { //set initial size of hash map this.lgMaxMapSize = Math.max(lgMaxMapSize, LG_MIN_MAP_SIZE); final int lgCurMapSz = Math.max(lgCurMapSize, LG_MIN_MAP_SIZE); hashMap = new ReversePurgeItemHashMap<>(1 << lgCurMapSz); this.curMapCap = hashMap.getCapacity(); final int maxMapCap = (int) ((1 << lgMaxMapSize) * ReversePurgeItemHashMap.getLoadFactor()); offset = 0; sampleSize = Math.min(SAMPLE_SIZE, maxMapCap); } /** * Returns a sketch instance of this class from the given srcMem, * which must be a Memory representation of this sketch class. * * @param The type of item that this sketch will track * @param srcMem a Memory representation of a sketch of this class. * See Memory * @param serDe an instance of ArrayOfItemsSerDe * @return a sketch instance of this class. */ public static ItemsSketch getInstance(final Memory srcMem, final ArrayOfItemsSerDe serDe) { final long pre0 = PreambleUtil.checkPreambleSize(srcMem); //make sure preamble will fit final int maxPreLongs = Family.FREQUENCY.getMaxPreLongs(); final int preLongs = extractPreLongs(pre0); //Byte 0 final int serVer = extractSerVer(pre0); //Byte 1 final int familyID = extractFamilyID(pre0); //Byte 2 final int lgMaxMapSize = extractLgMaxMapSize(pre0); //Byte 3 final int lgCurMapSize = extractLgCurMapSize(pre0); //Byte 4 final boolean empty = (extractFlags(pre0) & EMPTY_FLAG_MASK) != 0; //Byte 5 // Checks final boolean preLongsEq1 = (preLongs == 1); //Byte 0 final boolean preLongsEqMax = (preLongs == maxPreLongs); if (!preLongsEq1 && !preLongsEqMax) { throw new SketchesArgumentException( "Possible Corruption: PreLongs must be 1 or " + maxPreLongs + ": " + preLongs); } if (serVer != SER_VER) { //Byte 1 throw new SketchesArgumentException( "Possible Corruption: Ser Ver must be " + SER_VER + ": " + serVer); } final int actFamID = Family.FREQUENCY.getID(); //Byte 2 if (familyID != actFamID) { throw new SketchesArgumentException( "Possible Corruption: FamilyID must be " + actFamID + ": " + familyID); } if (empty ^ preLongsEq1) { //Byte 5 and Byte 0 throw new SketchesArgumentException( "Possible Corruption: (PreLongs == 1) ^ Empty == True."); } if (empty) { return new ItemsSketch<>(lgMaxMapSize, LG_MIN_MAP_SIZE); } //get full preamble final long[] preArr = new long[preLongs]; srcMem.getLongArray(0, preArr, 0, preLongs); final ItemsSketch fis = new ItemsSketch<>(lgMaxMapSize, lgCurMapSize); fis.streamWeight = 0; //update after fis.offset = preArr[3]; final int preBytes = preLongs << 3; final int activeItems = extractActiveItems(preArr[1]); //Get countArray final long[] countArray = new long[activeItems]; srcMem.getLongArray(preBytes, countArray, 0, activeItems); //Get itemArray final int itemsOffset = preBytes + (8 * activeItems); final T[] itemArray = serDe.deserializeFromMemory( srcMem.region(itemsOffset, srcMem.getCapacity() - itemsOffset), activeItems); //update the sketch for (int i = 0; i < activeItems; i++) { fis.update(itemArray[i], countArray[i]); } fis.streamWeight = preArr[2]; //override streamWeight due to updating return fis; } /** * Returns the estimated a priori error given the maxMapSize for the sketch and the * estimatedTotalStreamWeight. * @param maxMapSize the planned map size to be used when constructing this sketch. * @param estimatedTotalStreamWeight the estimated total stream weight. * @return the estimated a priori error. */ public static double getAprioriError(final int maxMapSize, final long estimatedTotalStreamWeight) { return getEpsilon(maxMapSize) * estimatedTotalStreamWeight; } /** * Returns the current number of counters the sketch is configured to support. * * @return the current number of counters the sketch is configured to support. */ public int getCurrentMapCapacity() { return this.curMapCap; } /** * Returns epsilon used to compute a priori error. * This is just the value 3.5 / maxMapSize. * @param maxMapSize the planned map size to be used when constructing this sketch. * @return epsilon used to compute a priori error. */ public static double getEpsilon(final int maxMapSize) { if (!isPowerOf2(maxMapSize)) { throw new SketchesArgumentException("maxMapSize is not a power of 2."); } return 3.5 / maxMapSize; } /** * Gets the estimate of the frequency of the given item. * Note: The true frequency of a item would be the sum of the counts as a result of the * two update functions. * * @param item the given item * @return the estimate of the frequency of the given item */ public long getEstimate(final T item) { // If item is tracked: // Estimate = itemCount + offset; Otherwise it is 0. final long itemCount = hashMap.get(item); return (itemCount > 0) ? itemCount + offset : 0; } /** * Gets the guaranteed lower bound frequency of the given item, which can never be * negative. * * @param item the given item. * @return the guaranteed lower bound frequency of the given item. That is, a number which * is guaranteed to be no larger than the real frequency. */ public long getLowerBound(final T item) { //LB = itemCount or 0 return hashMap.get(item); } /** * Returns an array of Rows that include frequent items, estimates, upper and lower bounds * given a threshold and an ErrorCondition. If the threshold is lower than getMaximumError(), * then getMaximumError() will be used instead. * *

The method first examines all active items in the sketch (items that have a counter). * *

If ErrorType = NO_FALSE_NEGATIVES, this will include an item in the result * list if getUpperBound(item) > threshold. * There will be no false negatives, i.e., no Type II error. * There may be items in the set with true frequencies less than the threshold * (false positives).

* *

If ErrorType = NO_FALSE_POSITIVES, this will include an item in the result * list if getLowerBound(item) > threshold. * There will be no false positives, i.e., no Type I error. * There may be items omitted from the set with true frequencies greater than the * threshold (false negatives).

* * @param threshold to include items in the result list * @param errorType determines whether no false positives or no false negatives are * desired. * @return an array of frequent items */ public Row[] getFrequentItems(final long threshold, final ErrorType errorType) { return sortItems(threshold > getMaximumError() ? threshold : getMaximumError(), errorType); } /** * Returns an array of Rows that include frequent items, estimates, upper and lower bounds * given an ErrorCondition and the default threshold. * This is the same as getFrequentItems(getMaximumError(), errorType) * * @param errorType determines whether no false positives or no false negatives are * desired. * @return an array of frequent items */ public Row[] getFrequentItems(final ErrorType errorType) { return sortItems(getMaximumError(), errorType); } /** * @return An upper bound on the maximum error of getEstimate(item) for any item. * This is equivalent to the maximum distance between the upper bound and the lower bound * for any item. */ public long getMaximumError() { return offset; } /** * Returns the maximum number of counters the sketch is configured to support. * * @return the maximum number of counters the sketch is configured to support. */ public int getMaximumMapCapacity() { return (int) ((1 << lgMaxMapSize) * ReversePurgeLongHashMap.getLoadFactor()); } /** * @return the number of active items in the sketch. */ public int getNumActiveItems() { return hashMap.getNumActive(); } /** * Returns the sum of the frequencies in the stream seen so far by the sketch * * @return the sum of the frequencies in the stream seen so far by the sketch */ public long getStreamLength() { return this.streamWeight; } /** * Gets the guaranteed upper bound frequency of the given item. * * @param item the given item * @return the guaranteed upper bound frequency of the given item. That is, a number which * is guaranteed to be no smaller than the real frequency. */ public long getUpperBound(final T item) { // UB = itemCount + offset return hashMap.get(item) + offset; } /** * Returns true if this sketch is empty * * @return true if this sketch is empty */ public boolean isEmpty() { return getNumActiveItems() == 0; } /** * This function merges the other sketch into this one. * The other sketch may be of a different size. * * @param other sketch of this class * @return a sketch whose estimates are within the guarantees of the * largest error tolerance of the two merged sketches. */ public ItemsSketch merge(final ItemsSketch other) { if (other == null) { return this; } if (other.isEmpty()) { return this; } final long streamLen = this.streamWeight + other.streamWeight; //capture before merge final ReversePurgeItemHashMap.Iterator iter = other.hashMap.iterator(); while (iter.next()) { //this may add to offset during rebuilds this.update(iter.getKey(), iter.getValue()); } this.offset += other.offset; this.streamWeight = streamLen; //corrected streamWeight return this; } /** * Resets this sketch to a virgin state. */ public void reset() { hashMap = new ReversePurgeItemHashMap<>(1 << LG_MIN_MAP_SIZE); this.curMapCap = hashMap.getCapacity(); this.offset = 0; this.streamWeight = 0; } //Serialization /** * Returns a byte array representation of this sketch * @param serDe an instance of ArrayOfItemsSerDe * @return a byte array representation of this sketch */ @SuppressWarnings("null") public byte[] toByteArray(final ArrayOfItemsSerDe serDe) { final int preLongs; final int outBytes; final boolean empty = isEmpty(); final int activeItems = getNumActiveItems(); byte[] bytes = null; if (empty) { preLongs = 1; outBytes = 8; } else { preLongs = Family.FREQUENCY.getMaxPreLongs(); bytes = serDe.serializeToByteArray(hashMap.getActiveKeys()); outBytes = ((preLongs + activeItems) << 3) + bytes.length; } final byte[] outArr = new byte[outBytes]; final WritableMemory mem = WritableMemory.wrap(outArr); // build first preLong empty or not long pre0 = 0L; pre0 = insertPreLongs(preLongs, pre0); //Byte 0 pre0 = insertSerVer(SER_VER, pre0); //Byte 1 pre0 = insertFamilyID(Family.FREQUENCY.getID(), pre0); //Byte 2 pre0 = insertLgMaxMapSize(lgMaxMapSize, pre0); //Byte 3 pre0 = insertLgCurMapSize(hashMap.getLgLength(), pre0); //Byte 4 pre0 = empty ? insertFlags(EMPTY_FLAG_MASK, pre0) : insertFlags(0, pre0); //Byte 5 if (empty) { mem.putLong(0, pre0); } else { final long pre = 0; final long[] preArr = new long[preLongs]; preArr[0] = pre0; preArr[1] = insertActiveItems(activeItems, pre); preArr[2] = this.streamWeight; preArr[3] = this.offset; mem.putLongArray(0, preArr, 0, preLongs); final int preBytes = preLongs << 3; mem.putLongArray(preBytes, hashMap.getActiveValues(), 0, activeItems); mem.putByteArray(preBytes + (this.getNumActiveItems() << 3), bytes, 0, bytes.length); } return outArr; } /** * Returns a human readable summary of this sketch. * @return a human readable summary of this sketch. */ @Override public String toString() { final StringBuilder sb = new StringBuilder(); sb.append("FrequentItemsSketch:").append(LS); sb.append(" Stream Length : " + streamWeight).append(LS); sb.append(" Max Error Offset : " + offset).append(LS); sb.append(hashMap.toString()); return sb.toString(); } /** * Returns a human readable string of the preamble of a byte array image of a ItemsSketch. * @param byteArr the given byte array * @return a human readable string of the preamble of a byte array image of a ItemsSketch. */ public static String toString(final byte[] byteArr) { return toString(Memory.wrap(byteArr)); } /** * Returns a human readable string of the preamble of a Memory image of a ItemsSketch. * @param mem the given Memory object * @return a human readable string of the preamble of a Memory image of a ItemsSketch. */ public static String toString(final Memory mem) { return PreambleUtil.preambleToString(mem); } /** * Update this sketch with an item and a frequency count of one. * @param item for which the frequency should be increased. */ public void update(final T item) { update(item, 1); } /** * Update this sketch with a item and a positive frequency count. * @param item for which the frequency should be increased. The item can be any long value and is * only used by the sketch to determine uniqueness. * @param count the amount by which the frequency of the item should be increased. * An count of zero is a no-op, and a negative count will throw an exception. */ public void update(final T item, final long count) { if ((item == null) || (count == 0)) { return; } if (count < 0) { throw new SketchesArgumentException("Count may not be negative"); } this.streamWeight += count; hashMap.adjustOrPutValue(item, count); if (getNumActiveItems() > curMapCap) { //over the threshold, we need to do something if (hashMap.getLgLength() < lgMaxMapSize) { //below tgt size, we can grow hashMap.resize(2 * hashMap.getLength()); curMapCap = hashMap.getCapacity(); } else { //At tgt size, must purge offset += hashMap.purge(sampleSize); if (getNumActiveItems() > getMaximumMapCapacity()) { throw new SketchesStateException("Purge did not reduce active items."); } } } } /** * Row class that defines the return values from a getFrequentItems query. * @param type of item */ public static class Row implements Comparable> { final T item; final long est; final long ub; final long lb; private static final String FMT = " %12d%12d%12d %s"; private static final String HFMT = " %12s%12s%12s %s"; Row(final T item, final long estimate, final long ub, final long lb) { this.item = item; this.est = estimate; this.ub = ub; this.lb = lb; } /** * @return item of type T */ public T getItem() { return item; } /** * @return the estimate */ public long getEstimate() { return est; } /** * @return the upper bound */ public long getUpperBound() { return ub; } /** * @return return the lower bound */ public long getLowerBound() { return lb; } /** * @return the descriptive row header */ public static String getRowHeader() { return String.format(HFMT,"Est", "UB", "LB", "Item"); } @Override public String toString() { return String.format(FMT, est, ub, lb, item.toString()); } /** * This compareTo is strictly limited to the Row.getEstimate() value and does not imply any * ordering whatsoever to the other elements of the row: item and upper and lower bounds. * Defined this way, this compareTo will be consistent with hashCode() and equals(Object). * @param that the other row to compare to. * @return a negative integer, zero, or a positive integer as this.getEstimate() is less than, * equal to, or greater than that.getEstimate(). */ @Override public int compareTo(final Row that) { return (this.est < that.est) ? -1 : (this.est > that.est) ? 1 : 0; } /** * This hashCode is computed only from the Row.getEstimate() value. * Defined this way, this hashCode will be consistent with equals(Object):
* If (x.equals(y)) implies: x.hashCode() == y.hashCode().
* If (!x.equals(y)) does NOT imply: x.hashCode() != y.hashCode(). * @return the hashCode computed from getEstimate(). */ @Override public int hashCode() { final int prime = 31; int result = 1; result = (prime * result) + (int) (est ^ (est >>> 32)); return result; } /** * This equals is computed only from the Row.getEstimate() value and does not imply equality * of the other items within the row: item and upper and lower bounds. * Defined this way, this equals will be consistent with compareTo(Row). * @param obj the other row to determine equality with. * @return true if this.getEstimate() equals ((Row<T>)obj).getEstimate(). */ @SuppressWarnings("unchecked") @Override public boolean equals(final Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if ( !(obj instanceof Row)) { return false; } final Row that = (Row) obj; if (est != that.est) { return false; } return true; } } //End of class Row Row[] sortItems(final long threshold, final ErrorType errorType) { final ArrayList> rowList = new ArrayList<>(); final ReversePurgeItemHashMap.Iterator iter = hashMap.iterator(); if (errorType == ErrorType.NO_FALSE_NEGATIVES) { while (iter.next()) { final long est = getEstimate(iter.getKey()); final long ub = getUpperBound(iter.getKey()); final long lb = getLowerBound(iter.getKey()); if (ub >= threshold) { final Row row = new Row<>(iter.getKey(), est, ub, lb); rowList.add(row); } } } else { //NO_FALSE_POSITIVES while (iter.next()) { final long est = getEstimate(iter.getKey()); final long ub = getUpperBound(iter.getKey()); final long lb = getLowerBound(iter.getKey()); if (lb >= threshold) { final Row row = new Row<>(iter.getKey(), est, ub, lb); rowList.add(row); } } } // descending order rowList.sort(new Comparator>() { @Override public int compare(final Row r1, final Row r2) { return r2.compareTo(r1); } }); @SuppressWarnings("unchecked") final Row[] rowsArr = rowList.toArray((Row[]) Array.newInstance(Row.class, rowList.size())); return rowsArr; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy