
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.search.aggregations.metrics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.PackedInts;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.BitArray;
import org.elasticsearch.common.util.ByteArray;
import org.elasticsearch.common.util.ByteUtils;
import org.elasticsearch.common.util.IntArray;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
* Hyperloglog++ counter, implemented based on pseudo code from
* http://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf and its appendix
* https://docs.google.com/document/d/1gyjfMHy43U9OWBXxfaeG-3MjGzejW1dlpyMwEYAAWEI/view?fullscreen
*
* This implementation differs from the original in that it uses a hash table instead of a sorted list for linear
* counting. Although this requires more space and causes the switch to the (less accurate) HyperLogLog algorithm to happen sooner, it is also considerably faster.
*
* Trying to understand what this class does without having read the paper is considered adventurous.
*
* The HyperLogLogPlusPlus contains two algorithms: linear counting and HyperLogLog. Hashes added to the data structure are
* initially processed with linear counting; once a threshold defined by the precision is reached, the collected data is replayed
* into the HyperLogLog algorithm, which is used from then on.
*
* It supports storing several HyperLogLogPlusPlus structures which are identified by a bucket number.
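*
* A minimal usage sketch (illustrative only, not taken from the original sources). It assumes an available
* {@code BigArrays} instance named {@code bigArrays} and placeholder 64-bit hashes of the values being counted:
* <pre>{@code
* try (HyperLogLogPlusPlus counts = new HyperLogLogPlusPlus(HyperLogLogPlusPlus.DEFAULT_PRECISION, bigArrays, 1)) {
*     counts.collect(0, hashOfFirstValue);    // add a hash to bucket 0
*     counts.collect(0, hashOfSecondValue);   // add another hash to the same bucket
*     long approximateCount = counts.cardinality(0);
* }
* }</pre>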
*/
public final class HyperLogLogPlusPlus extends AbstractHyperLogLogPlusPlus {
private static final float MAX_LOAD_FACTOR = 0.75f;
public static final int DEFAULT_PRECISION = 14;
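// Per-bucket flag recording which of the two algorithms (linear counting or HyperLogLog) is currently in use.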
private final BitArray algorithm;
private final HyperLogLog hll;
private final LinearCounting lc;
/**
* Compute the required precision so that <code>count</code> distinct entries would be counted with linear counting.
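*
* For illustration: a threshold of 3000 gives ceil(3000 / 0.75) = 4000 hash table entries, i.e. 16000 bytes, and
* PackedInts.bitsRequired(16000) == 14; 14 lies within the allowed precision range, so this returns 14, which is also
* the default precision.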
*/
public static int precisionFromThreshold(long count) {
final long hashTableEntries = (long) Math.ceil(count / MAX_LOAD_FACTOR);
int precision = PackedInts.bitsRequired(hashTableEntries * Integer.BYTES);
precision = Math.max(precision, AbstractHyperLogLog.MIN_PRECISION);
precision = Math.min(precision, AbstractHyperLogLog.MAX_PRECISION);
return precision;
}
/**
* Return the expected per-bucket memory usage for the given precision.
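*
* For example, with the default precision of 14 this is {@code 1L << 14} = 16384 bytes (16 KiB) per bucket.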
*/
public static long memoryUsage(int precision) {
return 1L << precision;
}
public HyperLogLogPlusPlus(int precision, BigArrays bigArrays, long initialBucketCount) {
super(precision);
HyperLogLog hll = null;
LinearCounting lc = null;
BitArray algorithm = null;
boolean success = false;
try {
hll = new HyperLogLog(bigArrays, initialBucketCount, precision);
lc = new LinearCounting(bigArrays, initialBucketCount, precision, hll);
algorithm = new BitArray(1, bigArrays);
success = true;
} finally {
if (success == false) {
Releasables.close(hll, lc, algorithm);
}
}
this.hll = hll;
this.lc = lc;
this.algorithm = algorithm;
}
@Override
public long maxOrd() {
return hll.maxOrd();
}
@Override
public long cardinality(long bucketOrd) {
if (getAlgorithm(bucketOrd) == LINEAR_COUNTING) {
return lc.cardinality(bucketOrd);
} else {
return hll.cardinality(bucketOrd);
}
}
@Override
protected boolean getAlgorithm(long bucketOrd) {
return algorithm.get(bucketOrd);
}
@Override
protected AbstractLinearCounting.HashesIterator getLinearCounting(long bucketOrd) {
return lc.values(bucketOrd);
}
@Override
protected AbstractHyperLogLog.RunLenIterator getHyperLogLog(long bucketOrd) {
return hll.getRunLens(bucketOrd);
}
@Override
public void collect(long bucket, long hash) {
hll.ensureCapacity(bucket + 1);
if (algorithm.get(bucket) == LINEAR_COUNTING) {
final int newSize = lc.collect(bucket, hash);
if (newSize > lc.threshold) {
upgradeToHll(bucket);
}
} else {
hll.collect(bucket, hash);
}
}
@Override
public void close() {
Releasables.close(algorithm, hll, lc);
}
protected void addRunLen(long bucketOrd, int register, int runLen) {
if (algorithm.get(bucketOrd) == LINEAR_COUNTING) {
upgradeToHll(bucketOrd);
}
hll.addRunLen(0, register, runLen);
}
void upgradeToHll(long bucketOrd) {
// We need to copy the values into an array because we will override them in the buffer
hll.ensureCapacity(bucketOrd + 1);
// It's safe to reuse lc's readSpare because we're single threaded.
final AbstractLinearCounting.HashesIterator hashes = new LinearCountingIterator(lc, lc.readSpare, bucketOrd);
final IntArray values = lc.bigArrays.newIntArray(hashes.size());
try {
int i = 0;
while (hashes.next()) {
values.set(i++, hashes.value());
}
assert i == hashes.size();
hll.reset(bucketOrd);
for (long j = 0; j < values.size(); ++j) {
final int encoded = values.get(j);
hll.collectEncoded(bucketOrd, encoded);
}
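// from now on this bucket uses the HyperLogLog algorithm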
algorithm.set(bucketOrd);
} finally {
Releasables.close(values);
}
}
public void merge(long thisBucket, AbstractHyperLogLogPlusPlus other, long otherBucket) {
if (precision() != other.precision()) {
throw new IllegalArgumentException();
}
hll.ensureCapacity(thisBucket + 1);
if (other.getAlgorithm(otherBucket) == LINEAR_COUNTING) {
merge(thisBucket, other.getLinearCounting(otherBucket));
} else {
merge(thisBucket, other.getHyperLogLog(otherBucket));
}
}
private void merge(long thisBucket, AbstractLinearCounting.HashesIterator values) {
while (values.next()) {
final int encoded = values.value();
if (algorithm.get(thisBucket) == LINEAR_COUNTING) {
final int newSize = lc.addEncoded(thisBucket, encoded);
if (newSize > lc.threshold) {
upgradeToHll(thisBucket);
}
} else {
hll.collectEncoded(thisBucket, encoded);
}
}
}
private void merge(long thisBucket, AbstractHyperLogLog.RunLenIterator runLens) {
if (algorithm.get(thisBucket) != HYPERLOGLOG) {
upgradeToHll(thisBucket);
}
for (int i = 0; i < hll.m; ++i) {
runLens.next();
hll.addRunLen(thisBucket, i, runLens.value());
}
}
private static class HyperLogLog extends AbstractHyperLogLog implements Releasable {
private final BigArrays bigArrays;
private final int precision;
// array for holding the runlens.
private ByteArray runLens;
HyperLogLog(BigArrays bigArrays, long initialBucketCount, int precision) {
super(precision);
this.runLens = bigArrays.newByteArray(initialBucketCount << precision);
this.bigArrays = bigArrays;
this.precision = precision;
}
public long maxOrd() {
return runLens.size() >>> precision();
}
@Override
protected void addRunLen(long bucketOrd, int register, int encoded) {
final long bucketIndex = (bucketOrd << p) + register;
runLens.set(bucketIndex, (byte) Math.max(encoded, runLens.get(bucketIndex)));
}
@Override
protected RunLenIterator getRunLens(long bucketOrd) {
return new HyperLogLogIterator(this, bucketOrd);
}
protected void reset(long bucketOrd) {
runLens.fill(bucketOrd << p, (bucketOrd << p) + m, (byte) 0);
}
protected void ensureCapacity(long numBuckets) {
runLens = bigArrays.grow(runLens, numBuckets << p);
}
@Override
public void close() {
Releasables.close(runLens);
}
}
private static class HyperLogLogIterator implements AbstractHyperLogLog.RunLenIterator {
private final HyperLogLog hll;
int pos;
long start;
private byte value;
HyperLogLogIterator(HyperLogLog hll, long bucket) {
this.hll = hll;
start = bucket << hll.p;
}
@Override
public boolean next() {
if (pos < hll.m) {
value = hll.runLens.get(start + pos);
pos++;
return true;
}
return false;
}
@Override
public byte value() {
return value;
}
}
private static class LinearCounting extends AbstractLinearCounting implements Releasable {
protected final int threshold;
private final int mask;
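// Scratch buffers used to read and write 4-byte hash table entries from/into hll.runLens.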
private final BytesRef readSpare;
private final ByteBuffer writeSpare;
private final BigArrays bigArrays;
// We are actually using HyperLogLog's runLens array but interpreting it as a hash set for linear counting.
private final HyperLogLog hll;
private final int capacity;
// Number of elements stored in each bucket's hash set.
private IntArray sizes;
LinearCounting(BigArrays bigArrays, long initialBucketCount, int p, HyperLogLog hll) {
super(p);
this.bigArrays = bigArrays;
this.hll = hll;
this.capacity = (1 << p) / 4; // because ints take 4 bytes
threshold = (int) (capacity * MAX_LOAD_FACTOR);
mask = capacity - 1;
sizes = bigArrays.newIntArray(initialBucketCount);
readSpare = new BytesRef();
writeSpare = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
}
@Override
protected int addEncoded(long bucketOrd, int encoded) {
sizes = bigArrays.grow(sizes, bucketOrd + 1);
assert encoded != 0;
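// Linear probing over the open-addressed hash table stored in hll.runLens; each slot holds one little-endian int.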
for (int i = (encoded & mask);; i = (i + 1) & mask) {
hll.runLens.get(index(bucketOrd, i), 4, readSpare);
final int v = ByteUtils.readIntLE(readSpare.bytes, readSpare.offset);
if (v == 0) {
// means unused, take it!
set(bucketOrd, i, encoded);
return sizes.increment(bucketOrd, 1);
} else if (v == encoded) {
// the value is already in the set
return -1;
}
}
}
@Override
protected int size(long bucketOrd) {
if (bucketOrd >= sizes.size()) {
return 0;
}
final int size = sizes.get(bucketOrd);
assert size == recomputedSize(bucketOrd);
return size;
}
@Override
protected HashesIterator values(long bucketOrd) {
// Use a fresh BytesRef as read scratch space because this method can be called from many threads
return new LinearCountingIterator(this, new BytesRef(), bucketOrd);
}
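// Byte offset of the given hash table slot within the bucket's region of hll.runLens; each slot is 4 bytes wide.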
private long index(long bucketOrd, int index) {
return (bucketOrd << p) + (index << 2);
}
private void set(long bucketOrd, int index, int value) {
writeSpare.putInt(0, value);
hll.runLens.set(index(bucketOrd, index), writeSpare.array(), 0, 4);
}
private int recomputedSize(long bucketOrd) {
if (bucketOrd >= hll.maxOrd()) {
return 0;
}
int size = 0;
BytesRef spare = new BytesRef();
for (int i = 0; i <= mask; ++i) {
hll.runLens.get(index(bucketOrd, i), 4, spare);
final int v = ByteUtils.readIntLE(spare.bytes, spare.offset);
if (v != 0) {
++size;
}
}
return size;
}
@Override
public void close() {
Releasables.close(sizes);
}
}
private static class LinearCountingIterator implements AbstractLinearCounting.HashesIterator {
private final LinearCounting lc;
private final BytesRef spare;
private final long bucketOrd;
private final int size;
private int pos;
private int value;
LinearCountingIterator(LinearCounting lc, BytesRef spare, long bucketOrd) {
this.lc = lc;
this.spare = spare;
this.bucketOrd = bucketOrd;
this.size = lc.size(bucketOrd);
this.pos = size == 0 ? lc.capacity : 0;
}
@Override
public int size() {
return size;
}
@Override
public boolean next() {
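// Scan forward to the next non-empty slot of this bucket's hash table.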
if (pos < lc.capacity) {
for (; pos < lc.capacity; ++pos) {
lc.hll.runLens.get(lc.index(bucketOrd, pos), 4, spare);
int k = ByteUtils.readIntLE(spare.bytes, spare.offset);
if (k != 0) {
++pos;
value = k;
return true;
}
}
}
return false;
}
@Override
public int value() {
return value;
}
}
}