package com.clearspring.analytics.stream.quantile;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongArrayFIFOQueue;
/**
* Q-Digest data structure.
*
* Answers approximate quantile queries: the actual rank of the result of query(q)
* is within q-eps .. q+eps, where eps = log2(sigma)/compressionFactor
* and log2(sigma) is the ceiling of the binary logarithm of the largest value
* inserted, i.e. the height of the tree.
*
* Two Q-Digests can be joined (see {@link #unionOf(QDigest, QDigest)}).
*
* Source:
* N. Shrivastava, C. Buragohain, D. Agrawal, S. Suri
* Medians and Beyond: New Aggregation Techniques for Sensor Networks
* http://www.cs.virginia.edu/~son/cs851/papers/ucsb.sensys04.pdf
*
* This is a slightly modified version.
* There is a small problem with the compression algorithm in the paper,
* see https://plus.google.com/u/0/109909935680879695595/posts/768ZZ9Euqz6
*
* So we use a different algorithm here:
*
* - When an item is inserted, we compress along the path to root from the item's leaf
*
* - When the structure becomes too large (above the theoretical bound), or
*   on "too destructive" operations (e.g. union or rebuild), we compress fully
*
* Note that the accuracy of the structure does NOT suffer if "property 2"
* from the paper is violated (in fact, restoring property 2 at any node
* decreases accuracy).
*
* So we can say that we preserve the paper's accuracy and memory consumption claims.
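*
* <p>Illustrative usage (a sketch; the compression factor 100 and the sample
* values are arbitrary examples, not recommendations):
* <pre>{@code
* QDigest digest = new QDigest(100);
* for (long v : new long[]{12, 5, 7, 120, 3}) {
*     digest.offer(v);
* }
* long median = digest.getQuantile(0.5);
* QDigest other = new QDigest(100);
* other.offer(42);
* QDigest merged = QDigest.unionOf(digest, other);
* }</pre>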
*/
public class QDigest implements IQuantileEstimator {
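// Orders ranges by right endpoint, breaking ties by range length; this is
// the ascending order in which getQuantile() scans the ranges.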
private static final Comparator<long[]> RANGES_COMPARATOR = new Comparator<long[]>() {
@Override
public int compare(long[] ra, long[] rb) {
long rightA = ra[1], rightB = rb[1], sizeA = ra[1] - ra[0], sizeB = rb[1] - rb[0];
if (rightA < rightB) {
return -1;
}
if (rightA > rightB) {
return 1;
}
if (sizeA < sizeB) {
return -1;
}
if (sizeA > sizeB) {
return 1;
}
return 0;
}
};
private static final int MAP_INITIAL_SIZE = Hash.DEFAULT_INITIAL_SIZE;
private static final float MAP_LOAD_FACTOR = Hash.VERY_FAST_LOAD_FACTOR;
private long size;
private long capacity = 1;
private double compressionFactor;
private Long2LongOpenHashMap node2count = new Long2LongOpenHashMap(MAP_INITIAL_SIZE, MAP_LOAD_FACTOR);
public QDigest(double compressionFactor) {
this.compressionFactor = compressionFactor;
}
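// Node ids use implicit 1-based heap numbering: the root is 1, node id has
// children 2*id and 2*id+1, and the leaves for values 0..capacity-1 occupy
// ids capacity..2*capacity-1, hence leaf(x) = capacity + x.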
private long value2leaf(long x) {
return capacity + x;
}
private long leaf2value(long id) {
return id - capacity;
}
private boolean isRoot(long id) {
return id == 1;
}
private boolean isLeaf(long id) {
return id >= capacity;
}
private long sibling(long id) {
return (id % 2 == 0) ? (id + 1) : (id - 1);
}
private long parent(long id) {
return id / 2;
}
private long leftChild(long id) {
return 2 * id;
}
private long rightChild(long id) {
return 2 * id + 1;
}
private long rangeLeft(long id) {
while (!isLeaf(id)) {
id = leftChild(id);
}
return leaf2value(id);
}
private long rangeRight(long id) {
while (!isLeaf(id)) {
id = rightChild(id);
}
return leaf2value(id);
}
@Override
public void offer(long value) {
if (value < 0 || value > Long.MAX_VALUE / 2) {
throw new IllegalArgumentException("Can only accept values in the range 0.." + Long.MAX_VALUE / 2 + ", got " + value);
}
// Rebuild if the value is too large for the current tree height
if (value >= capacity) {
rebuildToCapacity(Long.highestOneBit(value) << 1);
}
long leaf = value2leaf(value);
node2count.addTo(leaf, 1);
size++;
// Always compress at the inserted node, and recompress fully
// if the tree becomes too large.
// This is one sensible strategy which both is fast and keeps
// the tree reasonably small (within the theoretical bound of 3k nodes)
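// E.g. with capacity = 8, value 5 maps to leaf 13 and the compressed
// path to the root is 13 -> 6 -> 3 -> 1.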
compressUpward(leaf);
if (node2count.size() > 3 * compressionFactor) {
compressFully();
}
}
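/**
* Returns the union of two digests built with the same compression factor:
* counts are copied from both trees (rebuilding the smaller-capacity tree
* first so that leaf ranges line up), and the result is compressed fully.
*/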
public static QDigest unionOf(QDigest a, QDigest b) {
if (a.compressionFactor != b.compressionFactor) {
throw new IllegalArgumentException(
"Compression factors must be the same: " +
"left is " + a.compressionFactor + ", " +
"right is " + b.compressionFactor);
}
if (a.capacity > b.capacity) {
return unionOf(b, a);
}
QDigest res = new QDigest(a.compressionFactor);
res.capacity = a.capacity;
res.size = a.size + b.size;
for (long k : a.node2count.keySet()) {
res.node2count.put(k, a.node2count.get(k));
}
if (b.capacity > res.capacity) {
res.rebuildToCapacity(b.capacity);
}
for (long k : b.node2count.keySet()) {
res.node2count.put(k, b.get(k) + res.get(k));
}
res.compressFully();
return res;
}
private void rebuildToCapacity(long newCapacity) {
Long2LongOpenHashMap newNode2count = new Long2LongOpenHashMap(MAP_INITIAL_SIZE, MAP_LOAD_FACTOR);
// Rebuild to newCapacity.
// This means that our current tree becomes the leftmost subtree
// of the new tree.
// E.g. when rebuilding a tree with logCapacity = 2
// (i.e. storing values in 0..3) to logCapacity = 5 (i.e. 0..31):
// node 1 => 8 (+= 7 = 2^0*(2^3-1))
// nodes 2..3 => 16..17 (+= 14 = 2^1*(2^3-1))
// nodes 4..7 => 32..35 (+= 28 = 2^2*(2^3-1))
// This is easy to see if you draw it on paper.
// Process the keys by "layers" in the original tree.
long scaleR = newCapacity / capacity - 1;
Long[] keys = node2count.keySet().toArray(new Long[node2count.size()]);
Arrays.sort(keys);
long scaleL = 1;
for (long k : keys) {
while (scaleL <= k / 2) {
scaleL <<= 1;
}
newNode2count.put(k + scaleL * scaleR, node2count.get(k));
}
node2count = newNode2count;
capacity = newCapacity;
compressFully();
}
private void compressFully() {
// Restore property 2 at each node.
Long[] allNodes = node2count.keySet().toArray(new Long[node2count.size()]);
for (long node : allNodes) {
// The root node is not compressible: it has no parent and no sibling
if (!isRoot(node)) {
compressDownward(node);
}
}
}
/**
* Restore P2 at the node and upward along the spine. Note that P2 can
* vanish at some nodes sideways as a result of this; we fix that later
* in compressFully when needed.
*/
private void compressUpward(long node) {
double threshold = Math.floor(size / compressionFactor);
long atNode = get(node);
while (!isRoot(node)) {
if (atNode > threshold) {
break;
}
long atSibling = get(sibling(node));
if (atNode + atSibling > threshold) {
break;
}
long atParent = get(parent(node));
if (atNode + atSibling + atParent > threshold) {
break;
}
node2count.addTo(parent(node), atNode + atSibling);
node2count.remove(node);
if (atSibling > 0) {
node2count.remove(sibling(node));
}
node = parent(node);
atNode = atParent + atNode + atSibling;
}
}
/**
* Restore P2 at seedNode while guaranteeing that no new violations of P2 appear.
*/
private void compressDownward(long seedNode) {
double threshold = Math.floor(size / compressionFactor);
// P2 check same as above but shorter and slower (and invoked rarely)
LongArrayFIFOQueue q = new LongArrayFIFOQueue();
q.enqueue(seedNode);
while (!q.isEmpty()) {
long node = q.dequeueLong();
long atNode = get(node);
long atSibling = get(sibling(node));
if (atNode == 0 && atSibling == 0) {
continue;
}
long atParent = get(parent(node));
if (atParent + atNode + atSibling > threshold) {
continue;
}
node2count.addTo(parent(node), atNode + atSibling);
node2count.remove(node);
node2count.remove(sibling(node));
// P2 may now be violated in the node's and sibling's subtrees, since their counts decreased.
if (!isLeaf(node)) {
q.enqueue(leftChild(node));
q.enqueue(leftChild(sibling(node)));
}
}
}
private long get(long node) {
return node2count.get(node);
}
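/**
* Returns a value whose rank is approximately q * size: ranges are scanned
* in ascending order, accumulating counts, until the target rank is
* exceeded; that range's right endpoint is returned.
*/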
@Override
public long getQuantile(double q) {
List<long[]> ranges = toAscRanges();
long s = 0;
for (long[] r : ranges) {
s += r[2];
if (s > q * size) {
return r[1];
}
}
return ranges.get(ranges.size() - 1)[1];
}
public List<long[]> toAscRanges() {
List<long[]> ranges = new ArrayList<long[]>();
for (long key : node2count.keySet()) {
ranges.add(new long[]{rangeLeft(key), rangeRight(key), node2count.get(key)});
}
Collections.sort(ranges, RANGES_COMPARATOR);
return ranges;
}
@Override
public String toString() {
List<long[]> ranges = toAscRanges();
StringBuilder res = new StringBuilder();
for (long[] range : ranges) {
if (res.length() > 0) {
res.append(", ");
}
res.append(range[0]).append(" .. ").append(range[1]).append(": ").append(range[2]);
}
return res.toString();
}
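/**
* Serializes the digest as: size (long), compressionFactor (double),
* capacity (long), number of nodes (int), then one (id, count) pair of
* longs per node.
*/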
public static byte[] serialize(QDigest d) {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
DataOutputStream s = new DataOutputStream(bos);
try {
s.writeLong(d.size);
s.writeDouble(d.compressionFactor);
s.writeLong(d.capacity);
s.writeInt(d.node2count.size());
for (long k : d.node2count.keySet()) {
s.writeLong(k);
s.writeLong(d.node2count.get(k));
}
return bos.toByteArray();
} catch (IOException e) {
// Should never happen
throw new RuntimeException(e);
}
}
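/**
* Reconstructs a digest written by {@link #serialize(QDigest)}, so that
* deserialize(serialize(d)) yields an equivalent digest.
*/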
public static QDigest deserialize(byte[] b) {
ByteArrayInputStream bis = new ByteArrayInputStream(b);
DataInputStream s = new DataInputStream(bis);
try {
long size = s.readLong();
double compressionFactor = s.readDouble();
long capacity = s.readLong();
int count = s.readInt();
QDigest d = new QDigest(compressionFactor);
d.size = size;
d.capacity = capacity;
for (int i = 0; i < count; ++i) {
long k = s.readLong();
long n = s.readLong();
d.node2count.put(k, n);
}
return d;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
// For debugging purposes.
public long computeActualSize() {
long res = 0;
for (long x : node2count.values()) res += x;
return res;
}
}