All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.questdb.griffin.engine.groupby.hyperloglog.HyperLogLog Maven / Gradle / Ivy

There is a newer version: 8.2.1
Show newest version
/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2024 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.griffin.engine.groupby.hyperloglog;

import io.questdb.griffin.engine.groupby.GroupByAllocator;
import io.questdb.std.Unsafe;
import org.jetbrains.annotations.TestOnly;

/**
 * An implementation of HyperLogLog++ as described in the paper
 * "HyperLogLog in Practice: Algorithmic Engineering of a State of The Art
 * Cardinality Estimation Algorithm".
 * <p>
 * This class assumes that specific HyperLogLog representations reserve the
 * first byte in their memory layouts for information about their type and
 * the subsequent 8 bytes for cached cardinality.
 * <p>
 * An instance starts in the sparse representation (when the configured
 * precision allows a non-empty sparse set) and is promoted to the dense
 * representation once the sparse set fills up.
 */
public class HyperLogLog {
    public static final int DEFAULT_PRECISION = 14;
    public static final int MAX_PRECISION = 18;
    public static final int MIN_PRECISION = 4;
    // Cached cardinality sits right after the single type-tag byte.
    private static final long CACHED_CARDINALITY_OFFSET = Byte.BYTES;
    // Sentinel meaning "no cached estimate; recompute on demand".
    private static final long CARDINALITY_NULL_VALUE = -1;
    private static final byte DENSE = 0;
    private static final byte SPARSE = 1;

    private final HyperLogLogDenseRepresentation dense;
    private final int precision;
    private final HyperLogLogSparseRepresentation sparse;
    // Pointer to the currently active representation's memory.
    private long ptr;

    /**
     * Creates a HyperLogLog with the given precision.
     *
     * @param precision number of index bits; must lie in [MIN_PRECISION, MAX_PRECISION]
     */
    public HyperLogLog(int precision) {
        assert precision >= MIN_PRECISION && precision <= MAX_PRECISION
                : "Precision must be within the range of 4 to 18, inclusive.";
        this.dense = new HyperLogLogDenseRepresentation(precision);
        this.sparse = new HyperLogLogSparseRepresentation(precision);
        this.precision = precision;
    }

    /**
     * Merges two HyperLogLog instances and returns a pointer to the memory
     * holding the merged result. Dispatches on the representation types of
     * both operands; sparse-with-sparse merges copy the smaller set into the
     * larger one.
     *
     * @param first  first operand
     * @param second second operand
     * @return pointer to the merged HyperLogLog's memory
     */
    public static long merge(HyperLogLog first, HyperLogLog second) {
        final byte typeA = first.getType();
        final byte typeB = second.getType();
        if (typeA == SPARSE) {
            if (typeB == SPARSE) {
                // Copy the smaller sparse set into the larger one.
                return second.sparse.size() < first.sparse.size()
                        ? mergeSparseWithSparse(second, first)
                        : mergeSparseWithSparse(first, second);
            }
            if (typeB == DENSE) {
                return mergeSparseWithDense(first, second);
            }
        } else if (typeA == DENSE) {
            if (typeB == DENSE) {
                return mergeDenseWithDense(first, second);
            }
            if (typeB == SPARSE) {
                return mergeSparseWithDense(second, first);
            }
        }
        throw new IllegalStateException("Unexpected combination of HyperLogLog types.");
    }

    /**
     * Adds the provided hash to this instance of HyperLogLog and, if it is
     * computationally inexpensive, calculates the current cardinality.
     * <p>
     * Combining these two operations in one method allows us to take advantage
     * of the data that is already in the CPU cache and avoid cache misses while
     * performing deferred cardinality computation using the
     * {@link HyperLogLog#computeCardinality()} method.
     *
     * @param hash hash to be added to this instance of HyperLogLog
     * @return estimated cardinality or -1
     */
    public long addAndComputeCardinalityFast(long hash) {
        if (getType() != SPARSE) {
            // Dense: adding is cheap, but estimation is not — defer it.
            dense.add(hash);
            setCachedCardinality(CARDINALITY_NULL_VALUE);
            return -1;
        }
        sparse.add(hash);
        // The sparse set may have been reallocated by the insert.
        ptr = sparse.ptr();
        if (sparse.isFull()) {
            convertToDense();
            setCachedCardinality(CARDINALITY_NULL_VALUE);
            return -1;
        }
        final long estimate = sparse.computeCardinality();
        setCachedCardinality(estimate);
        return estimate;
    }

    /**
     * Returns the estimated cardinality, using the cached value when one is
     * available and computing (then caching) it otherwise.
     *
     * @return estimated cardinality
     */
    public long computeCardinality() {
        final long cached = getCachedCardinality();
        if (cached != CARDINALITY_NULL_VALUE) {
            return cached;
        }
        final long estimate = getType() == SPARSE
                ? sparse.computeCardinality()
                : dense.computeCardinality();
        setCachedCardinality(estimate);
        return estimate;
    }

    @TestOnly
    public boolean isSparse() {
        return getType() == SPARSE;
    }

    /**
     * Points this instance at existing HyperLogLog memory, or — when {@code ptr}
     * is 0 — allocates and initializes a fresh representation.
     *
     * @param ptr pointer to existing memory, or 0 to initialize a new instance
     * @return this instance, for chaining
     */
    public HyperLogLog of(long ptr) {
        if (ptr != 0) {
            this.ptr = ptr;
            if (getType() == SPARSE) {
                sparse.of(ptr);
            } else {
                dense.of(ptr);
            }
            return this;
        }
        // Fresh instance: start sparse when the precision permits a non-empty
        // sparse set, otherwise go straight to the dense representation.
        if (HyperLogLogSparseRepresentation.calculateSparseSetMaxSize(precision) > 0) {
            this.ptr = sparse.of(0).ptr();
            setType(SPARSE);
        } else {
            this.ptr = dense.of(0).ptr();
            setType(DENSE);
        }
        setCachedCardinality(CARDINALITY_NULL_VALUE);
        return this;
    }

    public long ptr() {
        return ptr;
    }

    public void resetPtr() {
        ptr = 0;
    }

    public void setAllocator(GroupByAllocator allocator) {
        sparse.setAllocator(allocator);
        dense.setAllocator(allocator);
    }

    // Copies src's dense registers into dst's dense representation.
    private static long mergeDenseWithDense(HyperLogLog src, HyperLogLog dst) {
        src.dense.copyTo(dst.dense);
        dst.of(dst.dense.ptr());
        dst.setCachedCardinality(CARDINALITY_NULL_VALUE);
        return dst.ptr();
    }

    // Replays src's sparse entries into dst's dense representation.
    private static long mergeSparseWithDense(HyperLogLog src, HyperLogLog dst) {
        src.sparse.copyTo(dst.dense);
        dst.of(dst.dense.ptr());
        dst.setCachedCardinality(CARDINALITY_NULL_VALUE);
        return dst.ptr();
    }

    // Copies src's sparse set into dst's; promotes dst to dense if it fills up.
    private static long mergeSparseWithSparse(HyperLogLog src, HyperLogLog dst) {
        src.sparse.copyTo(dst.sparse);
        dst.of(dst.sparse.ptr());
        if (dst.sparse.isFull()) {
            dst.convertToDense();
        }
        dst.setCachedCardinality(CARDINALITY_NULL_VALUE);
        return dst.ptr();
    }

    // Promotes the sparse representation to dense and retargets ptr/type tag.
    private void convertToDense() {
        sparse.convertToDense(dense);
        this.ptr = dense.ptr();
        setType(DENSE);
    }

    private long getCachedCardinality() {
        return Unsafe.getUnsafe().getLong(ptr + CACHED_CARDINALITY_OFFSET);
    }

    private byte getType() {
        return Unsafe.getUnsafe().getByte(ptr);
    }

    private void setCachedCardinality(long estimate) {
        Unsafe.getUnsafe().putLong(ptr + CACHED_CARDINALITY_OFFSET, estimate);
    }

    private void setType(byte type) {
        Unsafe.getUnsafe().putByte(ptr, type);
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy