All downloads are free. The search and download functionality uses the official Maven repository.

com.olapdb.core.hll.HLLDistinct Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package com.olapdb.core.hll;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.io.IOException;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Map;

/**
 * HyperLogLog-style distinct counter.
 *
 * <p>Values are hashed to a 64-bit number; the low {@code p} bits pick one of
 * {@code m = 2^p} buckets and the position of the first one-bit in the remaining
 * bits is kept per bucket (maximum wins). The buckets live in a {@link Register}
 * which starts as a single-value register, is switched to a sparse register once a
 * second bucket is touched, and to a dense register once the number of used buckets
 * exceeds {@code OVERFLOW_FACTOR * m}.
 *
 * <p>{@link #compareTo(HLLDistinct)} orders counters by their count estimate only and
 * is therefore not consistent with {@link #equals(Object)}.
 */
@SuppressWarnings("serial")
public class HLLDistinct implements Serializable, Comparable<HLLDistinct> {

    /**
     * {@code harmonicMean[i] == 2^-i} for {@code i} in 1..255. Index 0 is left at 0.0;
     * empty buckets are added to the register sum separately by {@link HLLCSnapshot}.
     */
    static double[] harmonicMean;

    static {
        harmonicMean = new double[256];
        for (int i = 1; i < 256; i++) {
            // Math.scalb is exact for the whole range. "1.0 / (1L << i)" is not: a long
            // shift only uses the low 6 bits of the distance, so it turns negative at
            // i == 63 and wraps around from i == 64 on.
            harmonicMean[i] = Math.scalb(1.0, -i);
        }
    }

    // not final for test purpose
    static double OVERFLOW_FACTOR = 0.01;

    /** Precision: number of hash bits used as bucket index. */
    private int p;

    /** Number of buckets, always {@code 1 << p}. */
    private int m;

    private HashFunction hashFunc = Hashing.murmur3_128();

    private Register register;

    /** Creates a counter with precision 10, a single-value register and murmur3_128 hashing. */
    public HLLDistinct() {
        this(10, RegisterType.SINGLE_VALUE, Hashing.murmur3_128());
    }

    /**
     * Creates a counter with a single-value register and murmur3_128 hashing.
     *
     * @param p precision, i.e. the counter uses {@code 2^p} buckets
     */
    public HLLDistinct(int p) {
        this(p, RegisterType.SINGLE_VALUE, Hashing.murmur3_128());
    }

    /**
     * Creates a counter with a single-value register.
     *
     * @param p        precision, i.e. the counter uses {@code 2^p} buckets
     * @param hashFunc hash function applied by the {@code add(...)} overloads
     */
    public HLLDistinct(int p, HashFunction hashFunc) {
        this(p, RegisterType.SINGLE_VALUE, hashFunc);
    }

    /**
     * Copy constructor. The new counter gets the same precision, register type and hash
     * function as {@code another}; a dense register is copied directly, any other
     * register content is brought over through {@link #merge(HLLDistinct)}.
     *
     * @param another counter to copy
     */
    public HLLDistinct(HLLDistinct another) {
        this(another.p, another.getRegisterType(), another.hashFunc);
        if (another.getRegisterType() == RegisterType.DENSE) {
            ((DenseRegister) register).copyFrom((DenseRegister) another.register);
        } else {
            merge(another);
        }
    }

    /**
     * Creates a counter with murmur3_128 hashing.
     *
     * @param p    precision, i.e. the counter uses {@code 2^p} buckets
     * @param type initial register implementation
     */
    public HLLDistinct(int p, RegisterType type) {
        this(p, type, Hashing.murmur3_128());
    }

    /**
     * Designated constructor; every other constructor delegates here.
     *
     * @param p        precision, i.e. the counter uses {@code 2^p} buckets
     * @param type     initial register implementation; anything other than
     *                 {@code SINGLE_VALUE} or {@code SPARSE} yields a dense register
     * @param hashFunc hash function applied by the {@code add(...)} overloads
     */
    HLLDistinct(int p, RegisterType type, HashFunction hashFunc) {
        this.p = p;
        this.m = 1 << p;
        this.hashFunc = hashFunc;

        if (type == RegisterType.SINGLE_VALUE) {
            this.register = new SingleValueRegister();
        } else if (type == RegisterType.SPARSE) {
            this.register = new SparseRegister();
        } else {
            this.register = new DenseRegister(p);
        }
    }

    /**
     * Tells whether a register holding {@code size} used buckets should be dense.
     *
     * @param size number of used buckets
     * @return {@code true} when {@code size} exceeds {@code OVERFLOW_FACTOR * m} (truncated to int)
     */
    public boolean isDense(int size) {
        double over = OVERFLOW_FACTOR * m;
        return size > (int) over;
    }

    /** Adds an int value; it is hashed with {@code hashLong} after widening to long. */
    public void add(int value) {
        add(hashFunc.hashLong(value).asLong());
    }

    /**
     * Adds a string value.
     *
     * <p>NOTE(review): the string is encoded with the platform default charset, so the
     * same string can hash differently on JVMs with different defaults. Left as is
     * because changing the charset would change the hashes of already stored data.
     */
    public void add(String value) {
        add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
    }

    /** Adds the hash of the whole byte array. */
    public void add(byte[] value) {
        add(hashFunc.hashBytes(value).asLong());
    }

    /** Adds the hash of {@code length} bytes of {@code value} starting at {@code offset}. */
    public void add(byte[] value, int offset, int length) {
        add(hashFunc.hashBytes(value, offset, length).asLong());
    }

    /** Adds an already computed 64-bit hash without hashing it again. */
    public void addHashDirectly(long hash) {
        add(hash);
    }

    /**
     * Records one 64-bit hash: the low {@code p} bits select the bucket, and the bucket
     * keeps the largest "leading zeros + 1" seen so far.
     *
     * @param hash 64-bit hash of the value being counted
     */
    protected void add(long hash) {
        int bucketMask = m - 1;
        int bucket = (int) (hash & bucketMask);
        // OR-ing in the mask forces the bucket bits to one, which caps the result at 64 - p + 1
        int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
        if (register.getRegisterType() == RegisterType.SINGLE_VALUE) {
            SingleValueRegister sr = (SingleValueRegister) register;
            int pos = sr.getSingleValuePos();
            if (pos >= 0 && pos != bucket) {
                // a second distinct bucket is needed: leave the single-value representation
                this.register = sr.toSparse();
            }
            setIfBigger(register, bucket, (byte) firstOnePos);
        } else {
            setIfBigger(register, bucket, (byte) firstOnePos);
            toDenseIfNeeded();
        }
    }

    /** Stores {@code value} at {@code pos} only when it is larger than the current content. */
    private void setIfBigger(Register register, int pos, byte value) {
        byte b = register.get(pos);
        if (value > b) {
            register.set(pos, value);
        }
    }

    /** Replaces a sparse register by a dense one once {@link #isDense(int)} says so. */
    private void toDenseIfNeeded() {
        if (register.getRegisterType() == RegisterType.SPARSE) {
            if (isDense(register.getSize())) {
                register = ((SparseRegister) register).toDense(p);
            }
        }
    }

    /**
     * Merges the buckets of {@code another} into this counter. This counter's register is
     * first converted so that it can take the other register's content, then
     * {@code Register.merge} is applied and the result is densified if needed.
     *
     * <p>Both counters must use the same precision and an equal hash function; this is
     * only checked through assertions.
     *
     * @param another counter to merge in; it is not modified here
     */
    public void merge(HLLDistinct another) {
        assert this.p == another.p;
        // equal instances are accepted as well, matching what equals() does
        assert this.hashFunc == another.hashFunc
                || (this.hashFunc != null && this.hashFunc.equals(another.hashFunc));
        switch (register.getRegisterType()) {
            case SINGLE_VALUE:
                switch (another.getRegisterType()) {
                    case SINGLE_VALUE:
                        if (register.getSize() > 0 && another.register.getSize() > 0) {
                            register = ((SingleValueRegister) register).toSparse();
                        } else {
                            // at most one side holds a value, so the result is still single-valued
                            SingleValueRegister sr = (SingleValueRegister) another.register;
                            if (sr.getSize() > 0)
                                register.set(sr.getSingleValuePos(), sr.getValue());
                            return;
                        }
                        break;
                    case SPARSE:
                        register = ((SingleValueRegister) register).toSparse();
                        break;
                    case DENSE:
                        register = ((SingleValueRegister) register).toDense(this.p);
                        break;
                    default:
                        break;
                }

                break;
            case SPARSE:
                if (another.getRegisterType() == RegisterType.DENSE) {
                    register = ((SparseRegister) register).toDense(p);
                }
                break;
            default:
                break;
        }
        register.merge(another.register);
        toDenseIfNeeded();
    }

    /** @return the estimated number of distinct values added so far */
    public long getCountEstimate() {
        return new HLLCSnapshot(this).getCountEstimate();
    }

    /** @return the precision {@code p} */
    public int getPrecision() {
        return this.p;
    }

    /** @return {@code 1.04 / sqrt(m)} */
    public double getErrorRate() {
        return 1.04 / Math.sqrt(m);
    }

    /** @return the count estimate as decimal text */
    @Override
    public String toString() {
        return "" + getCountEstimate();
    }

    // ============================================================================

    // a memory efficient snapshot of HLL registers which can yield count estimate later
    public static class HLLCSnapshot {
        byte p;
        double registerSum;
        int zeroBuckets;

        /**
         * Captures the number of empty buckets and the sum of {@code 2^-value} over all
         * buckets of {@code hllc}.
         *
         * @param hllc counter to snapshot; non-dense registers are converted to a dense
         *             copy for counting
         */
        public HLLCSnapshot(HLLDistinct hllc) {
            int[] registerNums = new int[256];

            p = (byte) hllc.p;
            registerSum = 0;
            zeroBuckets = 0;
            Register register = hllc.getRegister();
            DenseRegister dr;
            if (register.getRegisterType() == RegisterType.SINGLE_VALUE) {
                dr = ((SingleValueRegister) register).toDense(p);
            } else if (register.getRegisterType() == RegisterType.SPARSE) {
                dr = ((SparseRegister) register).toDense(p);
            } else {
                dr = (DenseRegister) register;
            }
            byte[] registers = dr.getRawRegister();
            for (int i = 0; i < hllc.m; i++) {
                // treat the byte as unsigned so that a value above 127 (e.g. from
                // deserialized data) cannot produce a negative array index
                registerNums[registers[i] & 0xFF]++;
            }
            zeroBuckets = registerNums[0];
            for (int i = 1; i < 256; i++)
                registerSum += registerNums[i] * harmonicMean[i];

            // every empty bucket contributes 2^0 == 1
            registerSum += zeroBuckets;
        }

        /** @return the rounded cardinality estimate derived from the captured sums */
        public long getCountEstimate() {
            int m = 1 << p;
            double alpha = 0.7213 / (1 + 1.079 / m);
            double estimate = alpha * m * m / registerSum;

            // small cardinality adjustment
            if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
                estimate = m * Math.log(m * 1.0 / zeroBuckets);
            } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
                estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
            }

            return Math.round(estimate);
        }
    }

    /** Prints the error-rate table produced by {@link #dumpErrorRates()}. */
    public static void main(String[] args) throws IOException {
        dumpErrorRates();
    }

    /** Prints, for p = 10..18, the dense size and the 1x/2x/3x error rates in percent. */
    static void dumpErrorRates() {
        for (int p = 10; p <= 18; p++) {
            double rate = new HLLDistinct(p, RegisterType.SPARSE).getErrorRate();
            double er = Math.round(rate * 10000) / 100D;
            double er2 = Math.round(rate * 2 * 10000) / 100D;
            double er3 = Math.round(rate * 3 * 10000) / 100D;
            long size = 1L << p;
            System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
        }
    }

    /** @return the live register backing this counter (not a copy) */
    public Register getRegister() {
        return register;
    }

    /** Clears the current register via {@code Register.clear()}; the register type is kept. */
    public void clear() {
        register.clear();
    }

    // ============================================================================

    /**
     * Serializes the register content into {@code out}.
     *
     * <p>Layout: one scheme byte, then either
     * <ul>
     * <li>scheme 0 (map): a VInt entry count followed by (bucket index, value) pairs, the
     * index taking {@link #getRegisterIndexSize()} bytes, or</li>
     * <li>scheme 1 (array): the raw dense register bytes.</li>
     * </ul>
     *
     * @param out destination buffer; {@link #maxLength()} gives {@code 1 + m} as the size bound
     * @throws IOException declared for API compatibility
     */
    public void writeRegisters(final ByteBuffer out) throws IOException {

        final int indexLen = getRegisterIndexSize();
        int size = register.getSize();

        // decide output scheme -- map ((indexLen + 1) * size bytes) or array (2^p bytes)
        byte scheme;
        if (register instanceof SingleValueRegister || register instanceof SparseRegister //
                || 5 + (indexLen + 1) * size < m) {
            scheme = 0; // map
        } else {
            scheme = 1; // array
        }
        out.put(scheme);
        if (scheme == 0) { // map scheme
            BytesUtil.writeVInt(size, out);
            if (register.getRegisterType() == RegisterType.SINGLE_VALUE) { //single value register
                if (size > 0) {
                    SingleValueRegister sr = (SingleValueRegister) register;
                    writeUnsigned(sr.getSingleValuePos(), indexLen, out);
                    out.put(sr.getValue());
                }
            } else if (register.getRegisterType() == RegisterType.SPARSE) { //sparse register
                Collection<Map.Entry<Integer, Byte>> allValue = ((SparseRegister) register).getAllValue();
                for (Map.Entry<Integer, Byte> entry : allValue) {
                    writeUnsigned(entry.getKey(), indexLen, out);
                    out.put(entry.getValue());
                }
            } else { //dense register
                byte[] registers = ((DenseRegister) register).getRawRegister();
                for (int i = 0; i < m; i++) {
                    if (registers[i] > 0) {
                        writeUnsigned(i, indexLen, out);
                        out.put(registers[i]);
                    }
                }
            }
        } else { // array scheme, only chosen for a dense register
            out.put(((DenseRegister) register).getRawRegister());
        }
    }

    /**
     * Replaces the register content with data written by {@link #writeRegisters(ByteBuffer)}.
     * For the map scheme the register implementation is chosen from the entry count.
     *
     * @param in source buffer positioned at the scheme byte
     * @throws IllegalArgumentException if the stored entry count is negative or larger than {@code m}
     * @throws IllegalStateException    if the scheme byte is neither 0 nor 1
     * @throws IOException              declared for API compatibility
     */
    public void readRegisters(ByteBuffer in) throws IOException {
        byte scheme = in.get();
        if (scheme == 0) { // map scheme
            clear();
            int size = BytesUtil.readVInt(in);
            if (size < 0)
                throw new IllegalArgumentException("register size (" + size + ") cannot be negative");
            if (size > m)
                throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
            if (isDense(size)) {
                register = new DenseRegister(p);
            } else if (size <= 1) {
                register = new SingleValueRegister();
            } else {
                register = new SparseRegister();
            }
            int indexLen = getRegisterIndexSize();
            for (int i = 0; i < size; i++) {
                int key = readUnsigned(in, indexLen);
                register.set(key, in.get());
            }
        } else if (scheme == 1) { // array scheme
            if (register.getRegisterType() != RegisterType.DENSE) {
                register = new DenseRegister(p);
            }
            in.get(((DenseRegister) register).getRawRegister());
        } else {
            throw new IllegalStateException("unknown register serialization scheme: " + scheme);
        }
    }

    /**
     * Computes the serialized length of the counter at the current position of {@code in}
     * without consuming it; the buffer position is restored before returning.
     *
     * @param in buffer positioned at the scheme byte
     * @return total number of bytes of the serialized counter
     */
    public int peekLength(ByteBuffer in) {
        int mark = in.position();
        int len;
        byte scheme = in.get();
        if (scheme == 0) { // map scheme
            int size = BytesUtil.readVInt(in);
            int indexLen = getRegisterIndexSize();
            len = in.position() - mark + (indexLen + 1) * size;
        } else {
            len = in.position() - mark + m;
        }

        in.position(mark);
        return len;
    }

    /** @return {@code 1 + m}: the scheme byte plus one byte per bucket */
    public int maxLength() {
        return 1 + m;
    }

    /** @return number of bytes used to store one bucket index in the map scheme */
    public int getRegisterIndexSize() {
        return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
        result = prime * result + p;
        result = prime * result + register.hashCode();
        return result;
    }

    /**
     * Two counters are equal when their hash functions, precisions and registers are equal.
     * A null hash function is tolerated, consistent with {@link #hashCode()}.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        HLLDistinct other = (HLLDistinct) obj;
        if (hashFunc == null ? other.hashFunc != null : !hashFunc.equals(other.hashFunc))
            return false;
        if (p != other.p)
            return false;
        return register.equals(other.register);
    }

    /**
     * Orders counters by their count estimate. A {@code null} argument is treated as
     * smaller than any counter.
     */
    @Override
    public int compareTo(HLLDistinct o) {
        if (o == null)
            return 1;

        long e1 = this.getCountEstimate();
        long e2 = o.getCountEstimate();

        if (e1 == e2)
            return 0;
        return e1 > e2 ? 1 : -1;
    }

    /**
     * Writes the low {@code size} bytes of {@code num} to {@code out}, least significant
     * byte first.
     */
    public static void writeUnsigned(int num, int size, ByteBuffer out) {
        for (int i = 0; i < size; i++) {
            out.put((byte) num);
            num >>>= 8;
        }
    }

    /**
     * Reads {@code size} bytes from {@code in} and assembles them, least significant byte
     * first, into a non-sign-extended int. Counterpart of
     * {@link #writeUnsigned(int, int, ByteBuffer)}.
     */
    public static int readUnsigned(ByteBuffer in, int size) {
        int integer = 0;
        int mask = 0xff;
        int shift = 0;
        for (int i = 0; i < size; i++) {
            // the mask strips the sign extension that the byte -> int promotion adds
            integer |= (in.get() << shift) & mask;
            mask = mask << 8;
            shift += 8;
        }
        return integer;
    }

    /** @return the type of the register currently backing this counter */
    public RegisterType getRegisterType() {
        return register.getRegisterType();
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy