com.clearspring.analytics.stream.quantile.QDigest Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stream-lib Show documentation
A library for summarizing data in streams for which it is infeasible to store all events
The newest version!
package com.clearspring.analytics.stream.quantile;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongArrayFIFOQueue;

/**
 * Q-Digest datastructure.
 * 
 * Answers approximate quantile queries: actual rank of the result of query(q)
 * is in q-eps .. q+eps, where eps = log(sigma)/compressionFactor
 * and log(sigma) is ceiling of binary log of the largest value inserted,
 * i.e. height of the tree.
 * 

 * Two Q-Digests can be joined (see {@link #unionOf(QDigest, QDigest)}).
 * 

 * Source:
 * N.Shrivastava, C.Buragohain, D.Agrawal
 * Medians and Beyond: New Aggregation Techniques for Sensor Networks
 * http://www.cs.virginia.edu/~son/cs851/papers/ucsb.sensys04.pdf
 * 

 * This is a slightly modified version.
 * There is a small problem with the compression algorithm in the paper,
 * see https://plus.google.com/u/0/109909935680879695595/posts/768ZZ9Euqz6
 * 

 * So we use a different algorithm here:
 * 

 * When an item is inserted, we compress along the path to root from the item's leaf
 * 
When the structure becomes too large (above the theoretical bound), or
 * at "too destructive" operations (e.g. union or rebuild) we compress fully
 * 
 * 
 * Note that the accuracy of the structure does NOT suffer if "property 2"
 * from the paper is violated (in fact, restoring property 2 at any node
 * decreases accuracy).
 * 
 * So we can say that we preserve the paper's accuracy and memory consumption claims.
 */
public class QDigest implements IQuantileEstimator {

    private static final Comparator RANGES_COMPARATOR = new Comparator() {
        @Override
        public int compare(long[] ra, long[] rb) {
            long rightA = ra[1], rightB = rb[1], sizeA = ra[1] - ra[0], sizeB = rb[1] - rb[0];
            if (rightA < rightB) {
                return -1;
            }
            if (rightA > rightB) {
                return 1;
            }
            if (sizeA < sizeB) {
                return -1;
            }
            if (sizeA > sizeB) {
                return 1;
            }
            return 0;
        }
    };

    private static final int MAP_INITIAL_SIZE = Hash.DEFAULT_INITIAL_SIZE;
    private static final float MAP_LOAD_FACTOR = Hash.VERY_FAST_LOAD_FACTOR;

    private long size;
    private long capacity = 1;
    private double compressionFactor;
    private Long2LongOpenHashMap node2count = new Long2LongOpenHashMap(MAP_INITIAL_SIZE, MAP_LOAD_FACTOR);

    public QDigest(double compressionFactor) {
        this.compressionFactor = compressionFactor;
    }

    private long value2leaf(long x) {
        return capacity + x;
    }

    private long leaf2value(long id) {
        return id - capacity;
    }

    private boolean isRoot(long id) {
        return id == 1;
    }

    private boolean isLeaf(long id) {
        return id >= capacity;
    }

    private long sibling(long id) {
        return (id % 2 == 0) ? (id + 1) : (id - 1);
    }

    private long parent(long id) {
        return id / 2;
    }

    private long leftChild(long id) {
        return 2 * id;
    }

    private long rightChild(long id) {
        return 2 * id + 1;
    }

    private long rangeLeft(long id) {
        while (!isLeaf(id)) {
            id = leftChild(id);
        }
        return leaf2value(id);
    }

    private long rangeRight(long id) {
        while (!isLeaf(id)) {
            id = rightChild(id);
        }
        return leaf2value(id);
    }

    @Override
    public void offer(long value) {
        if (value < 0 || value > Long.MAX_VALUE / 2) {
            throw new IllegalArgumentException("Can only accept values in the range 0.." + Long.MAX_VALUE / 2 + ", got " + value);
        }
        // Rebuild if the value is too large for the current tree height
        if (value >= capacity) {
            rebuildToCapacity(Long.highestOneBit(value) << 1);
        }

        long leaf = value2leaf(value);
        node2count.addTo(leaf, 1);
        size++;
        // Always compress at the inserted node, and recompress fully
        // if the tree becomes too large.
        // This is one sensible strategy which both is fast and keeps
        // the tree reasonably small (within the theoretical bound of 3k nodes)
        compressUpward(leaf);
        if (node2count.size() > 3 * compressionFactor) {
            compressFully();
        }
    }

    public static QDigest unionOf(QDigest a, QDigest b) {
        if (a.compressionFactor != b.compressionFactor) {
            throw new IllegalArgumentException(
                    "Compression factors must be the same: " +
                    "left is " + a.compressionFactor + ", " +
                    "right is " + b.compressionFactor);
        }
        if (a.capacity > b.capacity) {
            return unionOf(b, a);
        }

        QDigest res = new QDigest(a.compressionFactor);
        res.capacity = a.capacity;
        res.size = a.size + b.size;
        for (long k : a.node2count.keySet()) {
            res.node2count.put(k, a.node2count.get(k));
        }

        if (b.capacity > res.capacity) {
            res.rebuildToCapacity(b.capacity);
        }

        for (long k : b.node2count.keySet()) {
            res.node2count.put(k, b.get(k) + res.get(k));
        }

        res.compressFully();

        return res;
    }

    private void rebuildToCapacity(long newCapacity) {
        Long2LongOpenHashMap newNode2count = new Long2LongOpenHashMap(MAP_INITIAL_SIZE, MAP_LOAD_FACTOR);
        // rebuild to newLogCapacity.
        // This means that our current tree becomes a leftmost subtree
        // of the new tree.
        // E.g. when rebuilding a tree with logCapacity = 2
        // (i.e. storing values in 0..3) to logCapacity = 5 (i.e. 0..31):
        // node 1 => 8 (+= 7 = 2^0*(2^3-1))
        // nodes 2..3 => 16..17 (+= 14 = 2^1*(2^3-1))
        // nodes 4..7 => 32..35 (+= 28 = 2^2*(2^3-1))
        // This is easy to see if you draw it on paper.
        // Process the keys by "layers" in the original tree.
        long scaleR = newCapacity / capacity - 1;
        Long[] keys = node2count.keySet().toArray(new Long[node2count.size()]);
        Arrays.sort(keys);
        long scaleL = 1;
        for (long k : keys) {
            while (scaleL <= k / 2) {
                scaleL <<= 1;
            }
            newNode2count.put(k + scaleL * scaleR, node2count.get(k));
        }
        node2count = newNode2count;
        capacity = newCapacity;
        compressFully();
    }

    private void compressFully() {
        // Restore property 2 at each node.
        Long[] allNodes = node2count.keySet().toArray(new Long[node2count.size()]);
        for (long node : allNodes) {
            // The root node is not compressible: it has no parent and no sibling
            if (!isRoot(node)) {
                compressDownward(node);
            }
        }
    }

    /**
     * Restore P2 at node and upward the spine. Note that P2 can vanish
     * at some nodes sideways as a result of this. We'll fix that later
     * in compressFully when needed.
     */
    private void compressUpward(long node) {
        double threshold = Math.floor(size / compressionFactor);
        long atNode = get(node);
        while (!isRoot(node)) {
            if (atNode > threshold) {
                break;
            }
            long atSibling = get(sibling(node));
            if (atNode + atSibling > threshold) {
                break;
            }
            long atParent = get(parent(node));
            if (atNode + atSibling + atParent > threshold) {
                break;
            }

            node2count.addTo(parent(node), atNode + atSibling);
            node2count.remove(node);
            if (atSibling > 0) {
                node2count.remove(sibling(node));
            }
            node = parent(node);
            atNode = atParent + atNode + atSibling;
        }
    }

    /**
     * Restore P2 at seedNode and guarantee that no new violations of P2 appeared.
     */
    private void compressDownward(long seedNode) {
        double threshold = Math.floor(size / compressionFactor);
        // P2 check same as above but shorter and slower (and invoked rarely)
        LongArrayFIFOQueue q = new LongArrayFIFOQueue();
        q.enqueue(seedNode);
        while (!q.isEmpty()) {
            long node = q.dequeueLong();
            long atNode = get(node);
            long atSibling = get(sibling(node));
            if (atNode == 0 && atSibling == 0) {
                continue;
            }
            long atParent = get(parent(node));
            if (atParent + atNode + atSibling > threshold) {
                continue;
            }
            node2count.addTo(parent(node), atNode + atSibling);
            node2count.remove(node);
            node2count.remove(sibling(node));
            // Now P2 could have vanished at the node's and sibling's subtrees since they decreased.
            if (!isLeaf(node)) {
                q.enqueue(leftChild(node));
                q.enqueue(leftChild(sibling(node)));
            }
        }
    }

    private long get(long node) {
        return node2count.get(node);
    }

    @Override
    public long getQuantile(double q) {
        List ranges = toAscRanges();
        long s = 0;
        for (long[] r : ranges) {
            s += r[2];
            if (s > q * size) {
                return r[1];
            }
        }
        return ranges.get(ranges.size() - 1)[1];
    }

    public List toAscRanges() {
        List ranges = new ArrayList();
        for (long key : node2count.keySet()) {
            ranges.add(new long[]{rangeLeft(key), rangeRight(key), node2count.get(key)});
        }

        Collections.sort(ranges, RANGES_COMPARATOR);
        return ranges;
    }

    public String toString() {
        List ranges = toAscRanges();
        StringBuilder res = new StringBuilder();
        for (long[] range : ranges) {
            if (res.length() > 0) {
                res.append(", ");
            }
            res.append(range[0]).append(" .. ").append(range[1]).append(": ").append(range[2]);
        }
        return res.toString();
    }

    public static byte[] serialize(QDigest d) {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        DataOutputStream s = new DataOutputStream(bos);
        try {
            s.writeLong(d.size);
            s.writeDouble(d.compressionFactor);
            s.writeLong(d.capacity);
            s.writeInt(d.node2count.size());
            for (long k : d.node2count.keySet()) {
                s.writeLong(k);
                s.writeLong(d.node2count.get(k));
            }
            return bos.toByteArray();
        } catch (IOException e) {
            // Should never happen
            throw new RuntimeException(e);
        }
    }

    public static QDigest deserialize(byte[] b) {
        ByteArrayInputStream bis = new ByteArrayInputStream(b);
        DataInputStream s = new DataInputStream(bis);
        try {
            long size = s.readLong();
            double compressionFactor = s.readDouble();
            long capacity = s.readLong();
            int count = s.readInt();
            QDigest d = new QDigest(compressionFactor);
            d.size = size;
            d.capacity = capacity;
            for (int i = 0; i < count; ++i) {
                long k = s.readLong();
                long n = s.readLong();
                d.node2count.put(k, n);
            }
            return d;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    // For debugging purposes.
    public long computeActualSize() {
        long res = 0;
        for (long x : node2count.values()) res += x;
        return res;
    }
}