
com.clearspring.analytics.stream.quantile.QDigest Maven / Gradle / Ivy


A library for summarizing data in streams for which it is infeasible to store all events
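
The QDigest class in the source below exposes a QDigest(double compressionFactor) constructor, offer(long), getQuantile(double), a static unionOf, and byte[] (de)serialization helpers. A minimal usage sketch follows; the compression factor of 100 and the inserted values are illustrative and not part of the library:

    import com.clearspring.analytics.stream.quantile.QDigest;

    public class QDigestUsage {
        public static void main(String[] args) {
            // Higher compression factor => tighter error bound, more nodes kept in memory.
            QDigest digest = new QDigest(100);
            for (long latencyMs = 0; latencyMs < 10000; latencyMs++) {
                digest.offer(latencyMs);          // accepts values in 0 .. Long.MAX_VALUE / 2
            }
            long p50 = digest.getQuantile(0.5);   // approximate median
            long p99 = digest.getQuantile(0.99);  // approximate 99th percentile
            System.out.println("p50 ~= " + p50 + ", p99 ~= " + p99);
        }
    }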

package com.clearspring.analytics.stream.quantile;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongArrayFIFOQueue;

/**
 * Q-Digest datastructure.
 * <p>
 * Answers approximate quantile queries: actual rank of the result of query(q)
 * is in q-eps .. q+eps, where eps = log(sigma)/compressionFactor
 * and log(sigma) is ceiling of binary log of the largest value inserted,
 * i.e. height of the tree.
 * <p>
 * Two Q-Digests can be joined (see {@link #unionOf(QDigest, QDigest)}).
 * <p>
 * Source:
 * N.Shrivastava, C.Buragohain, D.Agrawal
 * Medians and Beyond: New Aggregation Techniques for Sensor Networks
 * http://www.cs.virginia.edu/~son/cs851/papers/ucsb.sensys04.pdf
 * <p>
 * This is a slightly modified version.
 * There is a small problem with the compression algorithm in the paper,
 * see https://plus.google.com/u/0/109909935680879695595/posts/768ZZ9Euqz6
 * <p>
 * So we use a different algorithm here:
 * <ul>
 * <li>When an item is inserted, we compress along the path to root from the item's leaf</li>
 * <li>When the structure becomes too large (above the theoretical bound), or
 * at "too destructive" operations (e.g. union or rebuild) we compress fully</li>
 * </ul>
 * <p>
 * Note that the accuracy of the structure does NOT suffer if "property 2"
 * from the paper is violated (in fact, restoring property 2 at any node
 * decreases accuracy).
 * <p>
 * So we can say that we preserve the paper's accuracy and memory consumption claims.
 */
public class QDigest implements IQuantileEstimator {

    private static final Comparator<long[]> RANGES_COMPARATOR = new Comparator<long[]>() {
        @Override
        public int compare(long[] ra, long[] rb) {
            long rightA = ra[1], rightB = rb[1], sizeA = ra[1] - ra[0], sizeB = rb[1] - rb[0];
            if (rightA < rightB) {
                return -1;
            }
            if (rightA > rightB) {
                return 1;
            }
            if (sizeA < sizeB) {
                return -1;
            }
            if (sizeA > sizeB) {
                return 1;
            }
            return 0;
        }
    };

    private static final int MAP_INITIAL_SIZE = Hash.DEFAULT_INITIAL_SIZE;
    private static final float MAP_LOAD_FACTOR = Hash.VERY_FAST_LOAD_FACTOR;

    private long size;
    private long capacity = 1;
    private double compressionFactor;
    private Long2LongOpenHashMap node2count = new Long2LongOpenHashMap(MAP_INITIAL_SIZE, MAP_LOAD_FACTOR);

    public QDigest(double compressionFactor) {
        this.compressionFactor = compressionFactor;
    }

    private long value2leaf(long x) {
        return capacity + x;
    }

    private long leaf2value(long id) {
        return id - capacity;
    }

    private boolean isRoot(long id) {
        return id == 1;
    }

    private boolean isLeaf(long id) {
        return id >= capacity;
    }

    private long sibling(long id) {
        return (id % 2 == 0) ? (id + 1) : (id - 1);
    }

    private long parent(long id) {
        return id / 2;
    }

    private long leftChild(long id) {
        return 2 * id;
    }

    private long rightChild(long id) {
        return 2 * id + 1;
    }

    private long rangeLeft(long id) {
        while (!isLeaf(id)) {
            id = leftChild(id);
        }
        return leaf2value(id);
    }

    private long rangeRight(long id) {
        while (!isLeaf(id)) {
            id = rightChild(id);
        }
        return leaf2value(id);
    }

    @Override
    public void offer(long value) {
        if (value < 0 || value > Long.MAX_VALUE / 2) {
            throw new IllegalArgumentException(
                    "Can only accept values in the range 0.." + Long.MAX_VALUE / 2 + ", got " + value);
        }
        // Rebuild if the value is too large for the current tree height
        if (value >= capacity) {
            rebuildToCapacity(Long.highestOneBit(value) << 1);
        }

        long leaf = value2leaf(value);
        node2count.addTo(leaf, 1);
        size++;
        // Always compress at the inserted node, and recompress fully
        // if the tree becomes too large.
        // This is one sensible strategy which both is fast and keeps
        // the tree reasonably small (within the theoretical bound of 3k nodes)
        compressUpward(leaf);
        if (node2count.size() > 3 * compressionFactor) {
            compressFully();
        }
    }

    public static QDigest unionOf(QDigest a, QDigest b) {
        if (a.compressionFactor != b.compressionFactor) {
            throw new IllegalArgumentException(
                    "Compression factors must be the same: " +
                            "left is " + a.compressionFactor + ", " +
                            "right is " + b.compressionFactor);
        }
        if (a.capacity > b.capacity) {
            return unionOf(b, a);
        }

        QDigest res = new QDigest(a.compressionFactor);
        res.capacity = a.capacity;
        res.size = a.size + b.size;
        for (long k : a.node2count.keySet()) {
            res.node2count.put(k, a.node2count.get(k));
        }

        if (b.capacity > res.capacity) {
            res.rebuildToCapacity(b.capacity);
        }

        for (long k : b.node2count.keySet()) {
            res.node2count.put(k, b.get(k) + res.get(k));
        }

        res.compressFully();

        return res;
    }

    private void rebuildToCapacity(long newCapacity) {
        Long2LongOpenHashMap newNode2count = new Long2LongOpenHashMap(MAP_INITIAL_SIZE, MAP_LOAD_FACTOR);
        // rebuild to newLogCapacity.
        // This means that our current tree becomes a leftmost subtree
        // of the new tree.
        // E.g. when rebuilding a tree with logCapacity = 2
        // (i.e. storing values in 0..3) to logCapacity = 5 (i.e. 0..31):
        // node 1 => 8 (+= 7 = 2^0*(2^3-1))
        // nodes 2..3 => 16..17 (+= 14 = 2^1*(2^3-1))
        // nodes 4..7 => 32..35 (+= 28 = 2^2*(2^3-1))
        // This is easy to see if you draw it on paper.
        // Process the keys by "layers" in the original tree.
        long scaleR = newCapacity / capacity - 1;
        Long[] keys = node2count.keySet().toArray(new Long[node2count.size()]);
        Arrays.sort(keys);
        long scaleL = 1;
        for (long k : keys) {
            while (scaleL <= k / 2) {
                scaleL <<= 1;
            }
            newNode2count.put(k + scaleL * scaleR, node2count.get(k));
        }
        node2count = newNode2count;
        capacity = newCapacity;
        compressFully();
    }

    private void compressFully() {
        // Restore property 2 at each node.
        Long[] allNodes = node2count.keySet().toArray(new Long[node2count.size()]);
        for (long node : allNodes) {
            // The root node is not compressible: it has no parent and no sibling
            if (!isRoot(node)) {
                compressDownward(node);
            }
        }
    }

    /**
     * Restore P2 at node and upward the spine. Note that P2 can vanish
     * at some nodes sideways as a result of this. We'll fix that later
     * in compressFully when needed.
     */
    private void compressUpward(long node) {
        double threshold = Math.floor(size / compressionFactor);
        long atNode = get(node);
        while (!isRoot(node)) {
            if (atNode > threshold) {
                break;
            }
            long atSibling = get(sibling(node));
            if (atNode + atSibling > threshold) {
                break;
            }
            long atParent = get(parent(node));
            if (atNode + atSibling + atParent > threshold) {
                break;
            }

            node2count.addTo(parent(node), atNode + atSibling);
            node2count.remove(node);
            if (atSibling > 0) {
                node2count.remove(sibling(node));
            }
            node = parent(node);
            atNode = atParent + atNode + atSibling;
        }
    }

    /**
     * Restore P2 at seedNode and guarantee that no new violations of P2 appeared.
     */
    private void compressDownward(long seedNode) {
        double threshold = Math.floor(size / compressionFactor);
        // P2 check same as above but shorter and slower (and invoked rarely)
        LongArrayFIFOQueue q = new LongArrayFIFOQueue();
        q.enqueue(seedNode);
        while (!q.isEmpty()) {
            long node = q.dequeueLong();
            long atNode = get(node);
            long atSibling = get(sibling(node));
            if (atNode == 0 && atSibling == 0) {
                continue;
            }
            long atParent = get(parent(node));
            if (atParent + atNode + atSibling > threshold) {
                continue;
            }
            node2count.addTo(parent(node), atNode + atSibling);
            node2count.remove(node);
            node2count.remove(sibling(node));
            // Now P2 could have vanished at the node's and sibling's subtrees since they decreased.
            if (!isLeaf(node)) {
                q.enqueue(leftChild(node));
                q.enqueue(leftChild(sibling(node)));
            }
        }
    }

    private long get(long node) {
        return node2count.get(node);
    }

    @Override
    public long getQuantile(double q) {
        List<long[]> ranges = toAscRanges();
        long s = 0;
        for (long[] r : ranges) {
            s += r[2];
            if (s > q * size) {
                return r[1];
            }
        }
        return ranges.get(ranges.size() - 1)[1];
    }

    public List<long[]> toAscRanges() {
        List<long[]> ranges = new ArrayList<long[]>();
        for (long key : node2count.keySet()) {
            ranges.add(new long[]{rangeLeft(key), rangeRight(key), node2count.get(key)});
        }
        Collections.sort(ranges, RANGES_COMPARATOR);
        return ranges;
    }

    public String toString() {
        List<long[]> ranges = toAscRanges();
        StringBuilder res = new StringBuilder();
        for (long[] range : ranges) {
            if (res.length() > 0) {
                res.append(", ");
            }
            res.append(range[0]).append(" .. ").append(range[1]).append(": ").append(range[2]);
").append(range[1]).append(": ").append(range[2]); } return res.toString(); } public static byte[] serialize(QDigest d) { ByteArrayOutputStream bos = new ByteArrayOutputStream(); DataOutputStream s = new DataOutputStream(bos); try { s.writeLong(d.size); s.writeDouble(d.compressionFactor); s.writeLong(d.capacity); s.writeInt(d.node2count.size()); for (long k : d.node2count.keySet()) { s.writeLong(k); s.writeLong(d.node2count.get(k)); } return bos.toByteArray(); } catch (IOException e) { // Should never happen throw new RuntimeException(e); } } public static QDigest deserialize(byte[] b) { ByteArrayInputStream bis = new ByteArrayInputStream(b); DataInputStream s = new DataInputStream(bis); try { long size = s.readLong(); double compressionFactor = s.readDouble(); long capacity = s.readLong(); int count = s.readInt(); QDigest d = new QDigest(compressionFactor); d.size = size; d.capacity = capacity; for (int i = 0; i < count; ++i) { long k = s.readLong(); long n = s.readLong(); d.node2count.put(k, n); } return d; } catch (IOException e) { throw new RuntimeException(e); } } // For debugging purposes. public long computeActualSize() { long res = 0; for (long x : node2count.values()) res += x; return res; } }




