/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.util.hll;

import java.util.Arrays;

import com.carrotsearch.hppc.IntByteHashMap;
import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.cursors.IntByteCursor;
import com.carrotsearch.hppc.cursors.LongCursor;
import org.apache.solr.util.LongIterator;

/**
 * A probabilistic set of hashed long elements. Useful for computing
 * the approximate cardinality of a stream of data in very small storage.
 *
 * A modified version of the 
 * 'HyperLogLog' data structure and algorithm is used, which combines both
 * probabilistic and non-probabilistic techniques to improve the accuracy and
 * storage requirements of the original algorithm.
 *
 * More specifically, initializing and storing a new {@link HLL} will
 * allocate a sentinel value symbolizing the empty set ({@link HLLType#EMPTY}).
 * After adding the first few values, a sorted list of unique integers is
 * stored in a {@link HLLType#EXPLICIT} hash set. When configured, accuracy can
 * be sacrificed for memory footprint: the values in the sorted list are
 * "promoted" to a "{@link HLLType#SPARSE}" map-based HyperLogLog structure.
 * Finally, when enough registers are set, the map-based HLL will be converted
 * to a bit-packed "{@link HLLType#FULL}" HyperLogLog structure.
 *
 * This data structure is interoperable with the implementations found at
 * <a href="https://github.com/aggregateknowledge/postgresql-hll">postgresql-hll</a> and
 * <a href="https://github.com/aggregateknowledge/js-hll">js-hll</a>
 * when properly serialized.
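 *
 * Example usage (a sketch; {@code hash64} stands in for any strong 64-bit
 * hash of the caller's choosing):
 * <pre>
 *   HLL hll = new HLL(13, 5);        // log2m=13, regwidth=5
 *   hll.addRaw(hash64(value));       // values must be pre-hashed to longs
 *   long estimate = hll.cardinality();
 * </pre>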
 */
public class HLL implements Cloneable {
    // minimum and maximum values for the log-base-2 of the number of registers
    // in the HLL
    public static final int MINIMUM_LOG2M_PARAM = 4;
    public static final int MAXIMUM_LOG2M_PARAM = 30;

    // minimum and maximum values for the register width of the HLL
    public static final int MINIMUM_REGWIDTH_PARAM = 1;
    public static final int MAXIMUM_REGWIDTH_PARAM = 8;

    // minimum and maximum values for the 'expthresh' parameter of the
    // constructor that is meant to match the PostgreSQL implementation's
    // constructor and parameter names
    public static final int MINIMUM_EXPTHRESH_PARAM = -1;
    public static final int MAXIMUM_EXPTHRESH_PARAM = 18;
    public static final int MAXIMUM_EXPLICIT_THRESHOLD = (1 << (MAXIMUM_EXPTHRESH_PARAM - 1)/*per storage spec*/);
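    // NOTE:  the above evaluates to 2^17 = 131072, the cap on the EXPLICIT
    //        promotion threshold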

    // ************************************************************************
    // Storage
    // storage used when #type is EXPLICIT, null otherwise
    LongHashSet explicitStorage;
    // storage used when #type is SPARSE, null otherwise
    IntByteHashMap sparseProbabilisticStorage;
    // storage used when #type is FULL, null otherwise
    BitVector probabilisticStorage;

    // current type of this HLL instance, if this changes then so should the
    // storage used (see above)
    private HLLType type;

    // ------------------------------------------------------------------------
    // Characteristic parameters
    // NOTE:  These members are named to match the PostgreSQL implementation's
    //        parameters.
    // log2(the number of probabilistic HLL registers)
    private final int log2m;
    // the size (width) of each register, in bits
    private final int regwidth;

    // ------------------------------------------------------------------------
    // Computed constants
    // ........................................................................
    // EXPLICIT-specific constants
    // flag indicating if the EXPLICIT representation should NOT be used
    private final boolean explicitOff;
    // flag indicating that the promotion threshold from EXPLICIT should be
    // computed automatically
    // NOTE:  this only has meaning when 'explicitOff' is false
    private final boolean explicitAuto;
    // threshold (in element count) at which an EXPLICIT HLL is converted to a
    // SPARSE or FULL HLL, always greater than or equal to zero and always a
    // power of two OR simply zero
    // NOTE:  this only has meaning when 'explicitOff' is false
    private final int explicitThreshold;

    // ........................................................................
    // SPARSE-specific constants
    // the computed width of the short words
    private final int shortWordLength;
    // flag indicating if the SPARSE representation should not be used
    private final boolean sparseOff;
    // threshold (in register count) at which a SPARSE HLL is converted to a
    // FULL HLL, always greater than zero
    private final int sparseThreshold;

    // ........................................................................
    // Probabilistic algorithm constants
    // the number of registers, will always be a power of 2
    private final int m;
    // a mask of the log2m bits set to one and the rest to zero
    private final int mBitsMask;
    // a mask as wide as a register (see #fromBytes())
    private final int valueMask;
    // mask used to ensure that p(w) does not overflow register (see #Constructor() and #addRaw())
    private final long pwMaxMask;
    // alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
    private final double alphaMSquared;
    // the cutoff value of the estimator for using the "small" range cardinality
    // correction formula
    private final double smallEstimatorCutoff;
    // the cutoff value of the estimator for using the "large" range cardinality
    // correction formula
    private final double largeEstimatorCutoff;

    // ========================================================================
    /**
     * NOTE: Arguments here are named and structured identically to those in the
     *       PostgreSQL implementation, which can be found at
     *       https://github.com/aggregateknowledge/postgresql-hll.
     *
     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
     *        algorithm. Must be at least 4 and at most 30.
     * @param regwidth number of bits used per register in the HyperLogLog
     *        algorithm. Must be at least 1 and at most 8.
     * @param expthresh tunes when the {@link HLLType#EXPLICIT} to
     *        {@link HLLType#SPARSE} promotion occurs,
     *        based on the set's cardinality. Must be at least -1 and at most 18.
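     *        In summary (see the constructor body below): -1 auto-selects the
     *        threshold from log2m and regwidth, 0 disables the EXPLICIT
     *        representation entirely, and a positive value n promotes once
     *        more than 2^(n-1) elements have been added.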
     * @param sparseon Flag indicating if the {@link HLLType#SPARSE}
     *        representation should be used.
     * @param type the type in the promotion hierarchy which this instance should
     *        start at. This cannot be null.
     */
    public HLL(final int log2m, final int regwidth, final int expthresh, final boolean sparseon, final HLLType type) {
        this.log2m = log2m;
        if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
            throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
        }

        this.regwidth = regwidth;
        if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
            throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
        }

        this.m = (1 << log2m);
        this.mBitsMask = m - 1;
        this.valueMask = (1 << regwidth) - 1;
        this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
        this.alphaMSquared = HLLUtil.alphaMSquared(m);
        this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
        this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);

        if(expthresh == -1) {
            this.explicitAuto = true;
            this.explicitOff = false;

            // NOTE:  This math matches the size calculation in the PostgreSQL impl.
            final long fullRepresentationSize = (this.regwidth * (long)this.m + 7/*round up to next whole byte*/)/Byte.SIZE;
            final int numLongs = (int)(fullRepresentationSize / 8/*integer division to round down*/);
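            // e.g., log2m=11, regwidth=5: 2048 registers * 5 bits = 10240 bits
            //       = 1280 bytes = 160 longs, so the EXPLICIT set is kept until
            //       it would occupy more 8-byte words than the FULL representation.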

            if(numLongs > MAXIMUM_EXPLICIT_THRESHOLD) {
                this.explicitThreshold = MAXIMUM_EXPLICIT_THRESHOLD;
            } else {
                this.explicitThreshold = numLongs;
            }
        } else if(expthresh == 0) {
            this.explicitAuto = false;
            this.explicitOff = true;
            this.explicitThreshold = 0;
        } else if((expthresh > 0) && (expthresh <= MAXIMUM_EXPTHRESH_PARAM)){
            this.explicitAuto = false;
            this.explicitOff = false;
            this.explicitThreshold = (1 << (expthresh - 1));
        } else {
            throw new IllegalArgumentException("'expthresh' must be at least " + MINIMUM_EXPTHRESH_PARAM + " and at most " + MAXIMUM_EXPTHRESH_PARAM + " (was: " + expthresh + ")");
        }

        this.shortWordLength = (regwidth + log2m);
        this.sparseOff = !sparseon;
        if(this.sparseOff) {
            this.sparseThreshold = 0;
        } else {
            // TODO improve this cutoff to include the cost overhead of Java
            //      members/objects
            final int largestPow2LessThanCutoff =
                    (int)NumberUtil.log2((this.m * this.regwidth) / this.shortWordLength);
            this.sparseThreshold = (1 << largestPow2LessThanCutoff);
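            // e.g., log2m=11, regwidth=5: shortWordLength = 16, so
            //       (2048 * 5)/16 = 640 index/value pairs fit in the FULL
            //       footprint; the largest power of two not above that is 2^9 = 512.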
        }

        initializeStorage(type);
    }

    /**
     *  Construct an empty HLL with the given {@code log2m} and {@code regwidth}.
     *
     *  This is equivalent to calling HLL(log2m, regwidth, -1, true, HLLType.EMPTY).
     *
     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
     *        algorithm. Must be at least 4 and at most 30.
     * @param regwidth number of bits used per register in the HyperLogLog
     *        algorithm. Must be at least 1 and at most 8.
     *
     * @see #HLL(int, int, int, boolean, HLLType)
     */
    public HLL(final int log2m, final int regwidth) {
        this(log2m, regwidth, -1, true, HLLType.EMPTY);
    }

    // -------------------------------------------------------------------------
    /**
     * Convenience constructor for testing. Assumes that both {@link HLLType#EXPLICIT}
     * and {@link HLLType#SPARSE} representations should be enabled.
     *
     * @param log2m log-base-2 of the number of registers used in the HyperLogLog
     *        algorithm. Must be at least 4 and at most 30.
     * @param regwidth number of bits used per register in the HyperLogLog
     *        algorithm. Must be at least 1 and at most 8.
     * @param explicitThreshold cardinality threshold at which the {@link HLLType#EXPLICIT}
     *        representation should be promoted to {@link HLLType#SPARSE}.
     *        This must be greater than zero and less than or equal to {@value #MAXIMUM_EXPLICIT_THRESHOLD}.
     * @param sparseThreshold register count threshold at which the {@link HLLType#SPARSE}
     *        representation should be promoted to {@link HLLType#FULL}.
     *        This must be greater than zero.
     * @param type the type in the promotion hierarchy which this instance should
     *        start at. This cannot be null.
     */
    /*package, for testing*/ HLL(final int log2m, final int regwidth, final int explicitThreshold, final int sparseThreshold, final HLLType type) {
        this.log2m = log2m;
        if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
            throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
        }

        this.regwidth = regwidth;
        if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
            throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
        }

        this.m = (1 << log2m);
        this.mBitsMask = m - 1;
        this.valueMask = (1 << regwidth) - 1;
        this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
        this.alphaMSquared = HLLUtil.alphaMSquared(m);
        this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
        this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);

        this.explicitAuto = false;
        this.explicitOff = false;
        this.explicitThreshold = explicitThreshold;
        if((explicitThreshold < 1) || (explicitThreshold > MAXIMUM_EXPLICIT_THRESHOLD)) {
            throw new IllegalArgumentException("'explicitThreshold' must be at least 1 and at most " + MAXIMUM_EXPLICIT_THRESHOLD + " (was: " + explicitThreshold + ")");
        }

        this.shortWordLength = (regwidth + log2m);
        this.sparseOff = false;
        this.sparseThreshold = sparseThreshold;

        initializeStorage(type);
    }

    /**
     * @return the type in the promotion hierarchy of this instance. This will
     *         never be null.
     */
    public HLLType getType() { return type; }

    // ========================================================================
    // Add
    /**
     * Adds rawValue directly to the HLL.
     *
     * @param  rawValue the value to be added. It is very important that this
     *         value already be hashed with a strong (but not
     *         necessarily cryptographic) hash function. For instance, the
     *         Murmur3 implementation in Google's Guava library is an
     *         excellent hash function for this
     *         purpose and, for seeds greater than zero, matches the output
     *         of the hash provided in the PostgreSQL implementation.
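     *
     *         <pre>
     *           // a sketch, assuming Guava is on the classpath:
     *           long hashed = Hashing.murmur3_128(123).hashLong(value).asLong();
     *           hll.addRaw(hashed);
     *         </pre>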
     */
    public void addRaw(final long rawValue) {
        switch(type) {
            case EMPTY: {
                // NOTE:  EMPTY type is always promoted on #addRaw()
                if(explicitThreshold > 0) {
                    initializeStorage(HLLType.EXPLICIT);
                    explicitStorage.add(rawValue);
                } else if(!sparseOff) {
                    initializeStorage(HLLType.SPARSE);
                    addRawSparseProbabilistic(rawValue);
                } else {
                    initializeStorage(HLLType.FULL);
                    addRawProbabilistic(rawValue);
                }
                return;
            }
            case EXPLICIT: {
                explicitStorage.add(rawValue);

                // promotion, if necessary
                if(explicitStorage.size() > explicitThreshold) {
                    if(!sparseOff) {
                        initializeStorage(HLLType.SPARSE);
                        for (LongCursor c : explicitStorage) {
                            addRawSparseProbabilistic(c.value);
                        }
                    } else {
                        initializeStorage(HLLType.FULL);
                        for (LongCursor c : explicitStorage) {
                            addRawProbabilistic(c.value);
                        }
                    }
                    explicitStorage = null;
                }
                return;
            }
            case SPARSE: {
                addRawSparseProbabilistic(rawValue);

                // promotion, if necessary
                if(sparseProbabilisticStorage.size() > sparseThreshold) {
                    initializeStorage(HLLType.FULL);
                    for(IntByteCursor c : sparseProbabilisticStorage) {
                        final int registerIndex = c.key;
                        final byte registerValue = c.value;
                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                    }
                    sparseProbabilisticStorage = null;
                }
                return;
            }
            case FULL:
                addRawProbabilistic(rawValue);
                return;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ------------------------------------------------------------------------
    // #addRaw(..) helpers
    /**
     * Adds the raw value to the {@link #sparseProbabilisticStorage}.
     * {@link #type} must be {@link HLLType#SPARSE}.
     *
     * @param rawValue the raw value to add to the sparse storage.
     */
    private void addRawSparseProbabilistic(final long rawValue) {
        // p(w): position of the least significant set bit (one-indexed)
        // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
        //
        // By construction of pwMaxMask (see #Constructor()),
        //      lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
        // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
        // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
        final long substreamValue = (rawValue >>> log2m);
        final byte p_w;

        if(substreamValue == 0L) {
            // The paper does not cover p(0x0), so the special value 0 is used.
            // 0 is the original initialization value of the registers, so by
            // doing this the multiset simply ignores it. This is acceptable
            // because the probability is 1/(2^(2^registerSizeInBits)).
            p_w = 0;
        } else {
            p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue | pwMaxMask));
        }

        // Short-circuit if the register is being set to zero, since algorithmically
        // this corresponds to an "unset" register, and "unset" registers aren't
        // stored to save memory. (The very reason this sparse implementation
        // exists.) If a register is set to zero it will break the #algorithmCardinality
        // code.
        if(p_w == 0) {
            return;
        }

        // NOTE:  no +1 as in paper since 0-based indexing
        final int j = (int)(rawValue & mBitsMask);
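        // e.g., with log2m=13 the low 13 bits of the hash select the register
        //       and the remaining upper 51 bits (the substream) determined p_w above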

        final byte currentValue;
        final int index = sparseProbabilisticStorage.indexOf(j);
        if (index >= 0) {
          currentValue = sparseProbabilisticStorage.indexGet(index);
        } else {
          currentValue = 0;
        }

        if(p_w > currentValue) {
            sparseProbabilisticStorage.put(j, p_w);
        }
    }

    /**
     * Adds the raw value to the {@link #probabilisticStorage}.
     * {@link #type} must be {@link HLLType#FULL}.
     *
     * @param rawValue the raw value to add to the full probabilistic storage.
     */
    private void addRawProbabilistic(final long rawValue) {
        // p(w): position of the least significant set bit (one-indexed)
        // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
        //
        // By construction of pwMaxMask (see #Constructor()),
        //      lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
        // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
        // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
        final long substreamValue = (rawValue >>> log2m);
        final byte p_w;

        if (substreamValue == 0L) {
            // The paper does not cover p(0x0), so the special value 0 is used.
            // 0 is the original initialization value of the registers, so by
            // doing this the multiset simply ignores it. This is acceptable
            // because the probability is 1/(2^(2^registerSizeInBits)).
            p_w = 0;
        } else {
            p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue | pwMaxMask));
        }

        // Short-circuit if the register is being set to zero, since algorithmically
        // this corresponds to an "unset" register, and "unset" registers aren't
        // stored to save memory. (The very reason this sparse implementation
        // exists.) If a register is set to zero it will break the #algorithmCardinality
        // code.
        if(p_w == 0) {
            return;
        }

        // NOTE:  no +1 as in paper since 0-based indexing
        final int j = (int)(rawValue & mBitsMask);

        probabilisticStorage.setMaxRegister(j, p_w);
    }

    // ------------------------------------------------------------------------
    // Storage helper
    /**
     * Initializes storage for the specified {@link HLLType} and changes the
     * instance's {@link #type}.
     *
     * @param type the {@link HLLType} to initialize storage for. This cannot be
     *        null and must be an instantiable type.
     */
    private void initializeStorage(final HLLType type) {
        this.type = type;
        switch(type) {
            case EMPTY:
                // nothing to be done
                break;
            case EXPLICIT:
                this.explicitStorage = new LongHashSet();
                break;
            case SPARSE:
                this.sparseProbabilisticStorage = new IntByteHashMap();
                break;
            case FULL:
                this.probabilisticStorage = new BitVector(regwidth, m);
                break;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ========================================================================
    // Cardinality
    /**
     * Computes the cardinality of the HLL.
     *
     * @return the cardinality of HLL. This will never be negative.
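     *         For the probabilistic types this is the "raw" HyperLogLog
     *         estimate, alphaMSquared / sum(2^-M[j]), with the small- and
     *         large-range corrections applied, rounded up to a long.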
     */
    public long cardinality() {
        switch(type) {
            case EMPTY:
                return 0/*by definition*/;
            case EXPLICIT:
                return explicitStorage.size();
            case SPARSE:
                return (long)Math.ceil(sparseProbabilisticAlgorithmCardinality());
            case FULL:
                return (long)Math.ceil(fullProbabilisticAlgorithmCardinality());
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ------------------------------------------------------------------------
    // Cardinality helpers
    /**
     * Computes the exact cardinality value returned by the HLL algorithm when
     * represented as a {@link HLLType#SPARSE} HLL. Kept
     * separate from {@link #cardinality()} for testing purposes. {@link #type}
     * must be {@link HLLType#SPARSE}.
     *
     * @return the exact, unrounded cardinality given by the HLL algorithm
     */
    /*package, for testing*/ double sparseProbabilisticAlgorithmCardinality() {
        final int m = this.m/*for performance*/;

        // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        // 'j'th register value
        double sum = 0;
        int numberOfZeroes = 0/*"V" in the paper*/;
        for(int j=0; j<m; j++) {
            final long register = sparseProbabilisticStorage.getOrDefault(j, (byte)0);

            sum += 1.0 / (1L << register);
            if(register == 0L) numberOfZeroes++;
        }

        // apply the estimate and correction to the indicator function
        final double estimator = alphaMSquared / sum;
        if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
            return HLLUtil.smallEstimator(m, numberOfZeroes);
        } else if(estimator <= largeEstimatorCutoff) {
            return estimator;
        } else {
            return HLLUtil.largeEstimator(log2m, regwidth, estimator);
        }
    }

    /**
     * Computes the exact cardinality value returned by the HLL algorithm when
     * represented as a {@link HLLType#FULL} HLL. Kept
     * separate from {@link #cardinality()} for testing purposes. {@link #type}
     * must be {@link HLLType#FULL}.
     *
     * @return the exact, unrounded cardinality given by the HLL algorithm
     */
    /*package, for testing*/ double fullProbabilisticAlgorithmCardinality() {
        final int m = this.m/*for performance*/;

        // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        // 'j'th register value
        double sum = 0;
        int numberOfZeroes = 0/*"V" in the paper*/;
        final LongIterator iterator = probabilisticStorage.registerIterator();
        while(iterator.hasNext()) {
            final long register = iterator.next();

            sum += 1.0 / (1L << register);
            if(register == 0L) numberOfZeroes++;
        }

        // apply the estimate and correction to the indicator function
        final double estimator = alphaMSquared / sum;
        if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
            return HLLUtil.smallEstimator(m, numberOfZeroes);
        } else if(estimator <= largeEstimatorCutoff) {
            return estimator;
        } else {
            return HLLUtil.largeEstimator(log2m, regwidth, estimator);
        }
    }

    // ========================================================================
    // Clear
    /**
     * Clears the HLL. The HLL will have cardinality zero and will act as if no
     * elements have been added.
     *
     * NOTE: Unlike {@link #addRaw(long)}, clear does NOT handle
     * transitions between {@link HLLType}s - a probabilistic type will remain
     * probabilistic after being cleared.
     */
    public void clear() {
        switch(type) {
            case EMPTY:
                return /*do nothing*/;
            case EXPLICIT:
                explicitStorage.clear();
                return;
            case SPARSE:
                sparseProbabilisticStorage.clear();
                return;
            case FULL:
                probabilisticStorage.fill(0);
                return;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ========================================================================
    // Union
    /**
     * Computes the union of HLLs and stores the result in this instance.
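     *
     * <pre>
     *   // a sketch: both HLLs should share log2m and regwidth (compatibility
     *   // is not yet verified here, per the TODO below)
     *   HLL a = new HLL(13, 5);
     *   HLL b = new HLL(13, 5);
     *   a.union(b); // 'a' now estimates the combined cardinality; 'b' is unchanged
     * </pre>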
     *
     * @param other the other {@link HLL} instance to union into this one. This
     *        cannot be null.
     */
    public void union(final HLL other) {
        // TODO: verify HLLs are compatible
        final HLLType otherType = other.getType();

        if(type.equals(otherType)) {
            homogeneousUnion(other);
            return;
        } else {
            heterogenousUnion(other);
            return;
        }
    }

    // ------------------------------------------------------------------------
    // Union helpers
    /**
     * Computes the union of two HLLs, of different types, and stores the
     * result in this instance.
     *
     * @param other the other {@link HLL} instance to union into this one. This
     *        cannot be null.
     */
    /*package, for testing*/ void heterogenousUnion(final HLL other) {
        /*
         * The logic here is divided into two sections: unions with an EMPTY
         * HLL, and unions between EXPLICIT/SPARSE/FULL
         * HLL.
         *
         * Between those two sections, all possible heterogeneous unions are
         * covered. Should another type be added to HLLType whose unions
         * are not easily reduced (say, as EMPTY's are below) this may be more
         * easily implemented as Strategies. However, that is unnecessary as it
         * stands.
         */

        // ....................................................................
        // Union with an EMPTY
        if(HLLType.EMPTY.equals(type)) {
            // NOTE:  The union of empty with non-empty HLL is just a
            //        clone of the non-empty.

            switch(other.getType()) {
                case EXPLICIT: {
                    // src:  EXPLICIT
                    // dest: EMPTY

                    if(other.explicitStorage.size() <= explicitThreshold) {
                        type = HLLType.EXPLICIT;
                        explicitStorage = other.explicitStorage.clone();
                    } else {
                        if(!sparseOff) {
                            initializeStorage(HLLType.SPARSE);
                        } else {
                            initializeStorage(HLLType.FULL);
                        }
                        for(LongCursor c : other.explicitStorage) {
                            addRaw(c.value);
                        }
                    }
                    return;
                }
                case SPARSE: {
                    // src:  SPARSE
                    // dest: EMPTY

                    if(!sparseOff) {
                        type = HLLType.SPARSE;
                        sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
                    } else {
                        initializeStorage(HLLType.FULL);
                        for(IntByteCursor c : other.sparseProbabilisticStorage) {
                          final int registerIndex = c.key;
                          final byte registerValue = c.value;
                          probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                        }
                    }
                    return;
                }
                default/*case FULL*/: {
                    // src:  FULL
                    // dest: EMPTY

                    type = HLLType.FULL;
                    probabilisticStorage = other.probabilisticStorage.clone();
                    return;
                }
            }
        } else if (HLLType.EMPTY.equals(other.getType())) {
            // source is empty, so just return destination since it is unchanged
            return;
        } /* else -- both of the sets are not empty */

        // ....................................................................
        // NOTE: Since EMPTY is handled above, the HLLs are non-EMPTY below
        switch(type) {
            case EXPLICIT: {
                // src:  FULL/SPARSE
                // dest: EXPLICIT
                // "Storing into destination" cannot be done (since destination
                // is by definition of smaller capacity than source), so a clone
                // of source is made and values from destination are inserted
                // into that.

                // Determine source and destination storage.
                // NOTE:  destination storage may change through promotion if
                //        source is SPARSE.
                if(HLLType.SPARSE.equals(other.getType())) {
                    if(!sparseOff) {
                        type = HLLType.SPARSE;
                        sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
                    } else {
                        initializeStorage(HLLType.FULL);
                        for(IntByteCursor c : other.sparseProbabilisticStorage) {
                          final int registerIndex = c.key;
                          final byte registerValue = c.value;
                          probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                        }
                    }
                } else /*source is HLLType.FULL*/ {
                    type = HLLType.FULL;
                    probabilisticStorage = other.probabilisticStorage.clone();
                }
                for(LongCursor c : explicitStorage) {
                    addRaw(c.value);
                }
                explicitStorage = null;
                return;
            }
            case SPARSE: {
                if(HLLType.EXPLICIT.equals(other.getType())) {
                    // src:  EXPLICIT
                    // dest: SPARSE
                    // Add the raw values from the source to the destination.

                    for(LongCursor c : other.explicitStorage) {
                        addRaw(c.value);
                    }
                    // NOTE:  addRaw will handle promotion cleanup
                } else /*source is HLLType.FULL*/ {
                    // src:  FULL
                    // dest: SPARSE
                    // "Storing into destination" cannot be done (since destination
                    // is by definition of smaller capacity than source), so a
                    // clone of source is made and registers from the destination
                    // are merged into the clone.

                    type = HLLType.FULL;
                    probabilisticStorage = other.probabilisticStorage.clone();
                    for(IntByteCursor c : sparseProbabilisticStorage) {
                      final int registerIndex = c.key;
                      final byte registerValue = c.value;
                      probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                    }
                    sparseProbabilisticStorage = null;
                }
                return;
            }
            default/*destination is HLLType.FULL*/: {
                if(HLLType.EXPLICIT.equals(other.getType())) {
                    // src:  EXPLICIT
                    // dest: FULL
                    // Add the raw values from the source to the destination.
                    // Promotion is not possible, so don't bother checking.

                    for(LongCursor c : other.explicitStorage) {
                        addRaw(c.value);
                    }
                } else /*source is HLLType.SPARSE*/ {
                    // src:  SPARSE
                    // dest: FULL
                    // Merge the registers from the source into the destination.
                    // Promotion is not possible, so don't bother checking.

                    for(IntByteCursor c : other.sparseProbabilisticStorage) {
                      final int registerIndex = c.key;
                      final byte registerValue = c.value;
                      probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                    }
                }
            }
        }
    }

    /**
     * Computes the union of two HLLs of the same type, and stores the
     * result in this instance.
     *
     * @param other the other {@link HLL} instance to union into this one. This
     *        cannot be null.
     */
    private void homogeneousUnion(final HLL other) {
        switch(type) {
        case EMPTY:
            // union of empty and empty is empty
            return;
        case EXPLICIT:
            for(LongCursor c : other.explicitStorage) {
                addRaw(c.value);
            }
            // NOTE:  #addRaw() will handle promotion, if necessary
            return;
        case SPARSE:
            for(IntByteCursor c : other.sparseProbabilisticStorage) {
              final int registerIndex = c.key;
              final byte registerValue = c.value;
              final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex);
              if(registerValue > currentRegisterValue) {
                sparseProbabilisticStorage.put(registerIndex, registerValue);
              }
            }

            // promotion, if necessary
            if(sparseProbabilisticStorage.size() > sparseThreshold) {
                initializeStorage(HLLType.FULL);
                for(IntByteCursor c : sparseProbabilisticStorage) {
                  final int registerIndex = c.key;
                  final byte registerValue = c.value;
                  probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                }
                sparseProbabilisticStorage = null;
            }
            return;
        case FULL:
            for(int i=0; i<m; i++) {
                final long registerValue = other.probabilisticStorage.getRegister(i);
                probabilisticStorage.setMaxRegister(i, registerValue);
            }
            return;
        default:
            throw new RuntimeException("Unsupported HLL type " + type);
        }
    }

    // ========================================================================
    // Serialization
    /**
     * Serializes the HLL to an array of bytes in correspondence with the
     * format of the default schema version, {@link SerializationUtil#DEFAULT_SCHEMA_VERSION}.
     *
     * @return the array of bytes representing the HLL. This will never be
     *         null or empty.
     */
    public byte[] toBytes() {
        return toBytes(SerializationUtil.DEFAULT_SCHEMA_VERSION);
    }

    /**
     * Serializes the HLL to an array of bytes in correspondence with the format
     * of the specified schema version.
     *
     * @param  schemaVersion the schema version dictating the serialization format
     * @return the array of bytes representing the HLL. This will never be
     *         null or empty.
     */
    public byte[] toBytes(final ISchemaVersion schemaVersion) {
        final byte[] bytes;
        switch(type) {
            case EMPTY:
                bytes = new byte[schemaVersion.paddingBytes(type)];
                break;
            case EXPLICIT: {
                final IWordSerializer serializer =
                    schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size());

                final long[] values = explicitStorage.toArray();
                Arrays.sort(values);
                for(final long value : values) {
                    serializer.writeWord(value);
                }

                bytes = serializer.getBytes();
                break;
            }
            case SPARSE: {
                final IWordSerializer serializer =
                        schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size());

                final int[] indices = sparseProbabilisticStorage.keys().toArray();
                Arrays.sort(indices);
                for(final int registerIndex : indices) {
                    assert sparseProbabilisticStorage.containsKey(registerIndex);
                    final long registerValue = sparseProbabilisticStorage.get(registerIndex);
                    // pack index and value into "short word"
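                    // e.g., log2m=11, regwidth=5 gives a 16-bit short word:
                    //       [11-bit register index][5-bit register value]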
                    final long shortWord = ((registerIndex << regwidth) | registerValue);
                    serializer.writeWord(shortWord);
                }

                bytes = serializer.getBytes();
                break;
            }
            case FULL: {
                final IWordSerializer serializer = schemaVersion.getSerializer(type, regwidth, m);
                probabilisticStorage.getRegisterContents(serializer);

                bytes = serializer.getBytes();
                break;
            }
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }

        final IHLLMetadata metadata = new HLLMetadata(schemaVersion.schemaVersionNumber(),
                                                      type,
                                                      log2m,
                                                      regwidth,
                                                      (int)NumberUtil.log2(explicitThreshold),
                                                      explicitOff,
                                                      explicitAuto,
                                                      !sparseOff);
        schemaVersion.writeMetadata(bytes, metadata);

        return bytes;
    }

    /**
     * Deserializes the HLL (in {@link #toBytes(ISchemaVersion)} format) serialized
     * into bytes.
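     *
     * <pre>
     *   // round-trip sketch:
     *   byte[] serialized = hll.toBytes();
     *   HLL restored = HLL.fromBytes(serialized);
     * </pre>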
     *
     * @param  bytes the serialized bytes of new HLL
     * @return the deserialized HLL. This will never be null.
     *
     * @see #toBytes(ISchemaVersion)
     */
    public static HLL fromBytes(final byte[] bytes) {
        final ISchemaVersion schemaVersion = SerializationUtil.getSchemaVersion(bytes);
        final IHLLMetadata metadata = schemaVersion.readMetadata(bytes);

        final HLLType type = metadata.HLLType();
        final int regwidth = metadata.registerWidth();
        final int log2m = metadata.registerCountLog2();
        final boolean sparseon = metadata.sparseEnabled();

        final int expthresh;
        if(metadata.explicitAuto()) {
            expthresh = -1;
        } else if(metadata.explicitOff()) {
            expthresh = 0;
        } else {
            // NOTE: take into account that the postgres-compatible constructor
            //       subtracts one before taking a power of two.
            expthresh = metadata.log2ExplicitCutoff() + 1;
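            // e.g., a stored log2ExplicitCutoff of 7 yields expthresh = 8, and
            //       the constructor recomputes (1 << (8 - 1)) = 2^7 = 128.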
        }

        final HLL hll = new HLL(log2m, regwidth, expthresh, sparseon, type);

        // Short-circuit on empty, which needs no other deserialization.
        if(HLLType.EMPTY.equals(type)) {
            return hll;
        }

        final int wordLength;
        switch(type) {
            case EXPLICIT:
                wordLength = Long.SIZE;
                break;
            case SPARSE:
                wordLength = hll.shortWordLength;
                break;
            case FULL:
                wordLength = hll.regwidth;
                break;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }

        final IWordDeserializer deserializer =
                schemaVersion.getDeserializer(type, wordLength, bytes);
        switch(type) {
            case EXPLICIT:
                // NOTE:  This should not exceed expthresh and this will always
                //        be exactly the number of words that were encoded,
                //        because the word length is at least a byte wide.
                // SEE:   IWordDeserializer#totalWordCount()
                for(int i=0; i<deserializer.totalWordCount(); i++) {
                    hll.explicitStorage.add(deserializer.readWord());
                }
                break;
            case SPARSE:
                // NOTE:  If the shortWordLength were smaller than 8 bits
                //        (1 byte) there would be a possibility (because of
                //        padding arithmetic) of having one or more extra
                //        registers read. However, this is not relevant as the
                //        extra registers will be all zeroes, which are ignored
                //        in the sparse representation.
                for(int i=0; i<deserializer.totalWordCount(); i++) {
                    final long shortWord = deserializer.readWord();
                    final byte registerValue = (byte)(shortWord & hll.valueMask);
                    // Only set non-zero registers (see javadocs on #sparseProbabilisticStorage)
                    if(registerValue != 0) {
                        hll.sparseProbabilisticStorage.put((int)(shortWord >>> hll.regwidth), registerValue);
                    }
                }
                break;
            case FULL:
                // NOTE:  Iteration is done using m (register count) and NOT
                //        deserializer#totalWordCount() because regwidth may be
                //        less than 8 and as such the padding on the 'last' byte
                //        may be larger than regwidth, causing an extra register
                //        to be read.
                // SEE: IWordDeserializer#totalWordCount()
                for(long i=0; i<hll.m; i++) {
                    hll.probabilisticStorage.setRegister(i, deserializer.readWord());
                }
                break;
            default:
                throw new RuntimeException("Unsupported HLL type " + type);
        }

        return hll;
    }

    /**
     * Create a deep copy of this HLL.
     *
     * @see java.lang.Object#clone()
     */
    @Override
    public HLL clone() throws CloneNotSupportedException {
        final HLL copy = (HLL)super.clone();
        // the storage members are mutable and must be deep-copied; the
        // remaining members are primitives carried over by Object#clone()
        if(explicitStorage != null) {
            copy.explicitStorage = explicitStorage.clone();
        }
        if(sparseProbabilisticStorage != null) {
            copy.sparseProbabilisticStorage = sparseProbabilisticStorage.clone();
        }
        if(probabilisticStorage != null) {
            copy.probabilisticStorage = probabilisticStorage.clone();
        }
        return copy;
    }
}