All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.hll.BaseHllSketch Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.hll;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.apache.datasketches.hll.HllUtil.KEY_BITS_26;
import static org.apache.datasketches.hll.HllUtil.KEY_MASK_26;

import org.apache.datasketches.memory.Memory;

/**
 * Although this class is package-private, it provides a single place to define and document
 * the common public API for both HllSketch and Union.
 * @author Lee Rhodes
 * @author Kevin Lang
 */
abstract class BaseHllSketch {

  /**
   * Gets the size in bytes of the current sketch when serialized using
   * toCompactByteArray().
   * @return the size in bytes of the current sketch when serialized using
   * toCompactByteArray().
   */
  public abstract int getCompactSerializationBytes();

  /**
   * This is less accurate than the {@link #getEstimate()} method and is automatically used
   * when the sketch has gone through union operations where the more accurate HIP estimator
   * cannot be used.
   * This is made public only for error characterization software that exists in separate
   * packages and is not intended for normal use.
   * @return the composite estimate
   */
  public abstract double getCompositeEstimate();

  /**
   * Returns the current mode of the sketch: LIST, SET, HLL
   * @return the current mode of the sketch: LIST, SET, HLL
   */
  abstract CurMode getCurMode();

  /**
   * Return the cardinality estimate
   * @return the cardinality estimate
   */
  public abstract double getEstimate();

  /**
   * Gets the {@link TgtHllType}
   * @return the TgtHllType enum value
   */
  public abstract TgtHllType getTgtHllType();

  /**
   * Gets the lgConfigK.
   * @return the lgConfigK.
   */
  public abstract int getLgConfigK();

  /**
   * Gets the approximate lower error bound given the specified number of Standard Deviations.
   *
   * @param numStdDev This must be an integer between 1 and 3, inclusive.
   * See Number of Standard Deviations
   * @return the lower bound.
   */
  public abstract double getLowerBound(int numStdDev);

  /**
   * Returns the current serialization version.
   * @return the current serialization version.
   */
  public static final int getSerializationVersion() {
    return PreambleUtil.SER_VER;
  }

  /**
   * Returns the current serialization version of the given Memory.
   * @param mem the given Memory containing a serialized HllSketch image.
   * @return the current serialization version.
   */
  public static final int getSerializationVersion(final Memory mem) {
    return mem.getByte(PreambleUtil.SER_VER_BYTE) & 0XFF;
  }

  /**
   * Gets the current (approximate) Relative Error (RE) asymptotic values given several
   * parameters. This is used primarily for testing.
   * @param upperBound return the RE for the Upper Bound, otherwise for the Lower Bound.
   * @param unioned set true if the sketch is the result of a union operation.
   * @param lgConfigK the configured value for the sketch.
   * @param numStdDev the given number of Standard Deviations. This must be an integer between
   * 1 and 3, inclusive.
   * Number of Standard Deviations
   * @return the current (approximate) RelativeError
   */
  public double getRelErr(final boolean upperBound, final boolean unioned,
      final int lgConfigK, final int numStdDev) {
    return RelativeErrorTables.getRelErr(upperBound, unioned, lgConfigK, numStdDev);
  }

  /**
   * Gets the size in bytes of the current sketch when serialized using
   * toUpdatableByteArray().
   * @return the size in bytes of the current sketch when serialized using
   * toUpdatableByteArray().
   */
  public abstract int getUpdatableSerializationBytes();

  /**
   * Gets the approximate upper error bound given the specified number of Standard Deviations.
   *
   * @param numStdDev This must be an integer between 1 and 3, inclusive.
   * Number of Standard Deviations
   * @return the upper bound.
   */
  public abstract double getUpperBound(int numStdDev);

  /**
   * Returns true if empty
   * @return true if empty
   */
  public abstract boolean isEmpty();

  /**
   * Returns true if the backing memory of this sketch is in compact form.
   * @return true if the backing memory of this sketch is in compact form.
   */
  public abstract boolean isCompact();

  /**
   * This HLL family of sketches and operators is always estimating, even for very small values.
   * @return true
   */
  public boolean isEstimationMode() {
    return true;
  }

  /**
   * Returns true if this sketch was created using Memory.
   * @return true if this sketch was created using Memory.
   */
  public abstract boolean isMemory();

  /**
   * Returns true if the backing memory for this sketch is off-heap.
   * @return true if the backing memory for this sketch is off-heap.
   */
  public abstract boolean isOffHeap();

  /**
   * Gets the Out-of-order flag.
   * @return true if the current estimator is the non-HIP estimator, which is slightly less
   * accurate than the HIP estimator.
   */
  abstract boolean isOutOfOrderFlag();

  /**
   * Returns true if the given Memory refers to the same underlying resource as this sketch.
   * The capacities must be the same.  If this is a region,
   * the region offset must also be the same.
   *
   * 

This is only relevant for HLL_4 sketches that have been configured for off-heap * using WritableMemory or Memory. For on-heap sketches or unions this will return false. * *

It is rare, but possible, the the off-heap memory that has been allocated to an HLL_4 * sketch may not be large enough. If this should happen, the sketch makes a request for more * memory from the owner of the resource and then moves itself to this new location. This all * happens transparently to the user. This method provides a means for the user to * inquire of the sketch if it has, in fact, moved itself. * * @param mem the given Memory * @return true if the given Memory refers to the same underlying resource as this sketch or * union. */ public abstract boolean isSameResource(Memory mem); /** * Resets to empty, but does not change the configured values of lgConfigK and tgtHllType. */ public abstract void reset(); /** * Serializes this sketch as a byte array in compact form. The compact form is smaller in size * than the updatable form and read-only. It can be used in union operations as follows: *

{@code
   *     Union union; HllSketch sk, sk2;
   *     int lgK = 12;
   *     sk = new HllSketch(lgK, TgtHllType.HLL_4); //can be 4, 6, or 8
   *     for (int i = 0; i < (2 << lgK); i++) { sk.update(i); }
   *     byte[] arr = HllSketch.toCompactByteArray();
   *     //...
   *     union = Union.heapify(arr); //initializes the union using data from the array.
   *     //OR, if used in an off-heap environment:
   *     union = Union.heapify(Memory.wrap(arr)); //same as above, except from Memory object.
   *
   *     //To recover an updatable heap sketch:
   *     sk2 = HllSketch.heapify(arr);
   *     //OR, if used in an off-heap environment:
   *     sk2 = HllSketch.heapify(Memory.wrap(arr));
   * }
* *

The sketch "wrapping" operation skips actual deserialization thus is quite fast. However, * any attempt to update the derived HllSketch will result in a Read-only exception. * @return this sketch as a compact byte array. */ public abstract byte[] toCompactByteArray(); /** * Serializes this sketch as a byte array in an updatable form. The updatable form is larger than * the compact form. The use of this form is primarily in environments that support updating * sketches in off-heap memory. If the sketch is constructed using HLL_8, sketch updating and * union updating operations can actually occur in WritableMemory, which can be off-heap: *

{@code
   *     Union union; HllSketch sk;
   *     int lgK = 12;
   *     sk = new HllSketch(lgK, TgtHllType.HLL_8) //must be 8
   *     for (int i = 0; i < (2 << lgK); i++) { sk.update(i); }
   *     byte[] arr = sk.toUpdatableByteArray();
   *     WritableMemory wmem = WritableMemory.wrap(arr);
   *     //...
   *     union = Union.writableWrap(wmem); //no deserialization!
   * }
* @return this sketch as an updatable byte array. */ public abstract byte[] toUpdatableByteArray(); /** * Human readable summary as a string. * @return Human readable summary as a string. */ @Override public String toString() { return toString(true, false, false, false); } /** * Human readable summary with optional detail. Does not list empty entries. * @param summary if true, output the sketch summary * @param detail if true, output the internal data array * @param auxDetail if true, output the internal Aux array, if it exists. * @return human readable string with optional detail. */ public String toString(final boolean summary, final boolean detail, final boolean auxDetail) { return toString(summary, detail, auxDetail, false); } /** * Human readable summary with optional detail * @param summary if true, output the sketch summary * @param detail if true, output the internal data array * @param auxDetail if true, output the internal Aux array, if it exists. * @param all if true, outputs all entries including empty ones * @return human readable string with optional detail. */ public abstract String toString(boolean summary, boolean detail, boolean auxDetail, boolean all); /** * Present the given long as a potential unique item. * * @param datum The given long datum. */ public void update(final long datum) { final long[] data = { datum }; couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } /** * Present the given double (or float) datum as a potential unique item. * The double will be converted to a long using Double.doubleToLongBits(datum), * which normalizes all NaN values to a single NaN representation. * Plus and minus zero will be normalized to plus zero. * The special floating-point values NaN and +/- Infinity are treated as distinct. * * @param datum The given double datum. */ public void update(final double datum) { final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0 final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN forms couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } /** * Present the given String as a potential unique item. * The string is converted to a byte array using UTF8 encoding. * If the string is null or empty no update attempt is made and the method returns. * *

Note: About 2X faster performance can be obtained by first converting the String to a * char[] and updating the sketch with that. This bypasses the complexity of the Java UTF_8 * encoding. This, of course, will not produce the same internal hash values as updating directly * with a String. So be consistent! Unioning two sketches, one fed with strings and the other * fed with char[] will be meaningless. *

* * @param datum The given String. */ public void update(final String datum) { if ((datum == null) || datum.isEmpty()) { return; } final byte[] data = datum.getBytes(UTF_8); couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } /** * Present the given byte array as a potential unique item. * If the byte array is null or empty no update attempt is made and the method returns. * * @param data The given byte array. */ public void update(final byte[] data) { if ((data == null) || (data.length == 0)) { return; } couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } /** * Present the given char array as a potential unique item. * If the char array is null or empty no update attempt is made and the method returns. * *

Note: this will not produce the same output hash values as the {@link #update(String)} * method but will be a little faster as it avoids the complexity of the UTF8 encoding.

* * @param data The given char array. */ public void update(final char[] data) { if ((data == null) || (data.length == 0)) { return; } couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } /** * Present the given integer array as a potential unique item. * If the integer array is null or empty no update attempt is made and the method returns. * * @param data The given int array. */ public void update(final int[] data) { if ((data == null) || (data.length == 0)) { return; } couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } /** * Present the given long array as a potential unique item. * If the long array is null or empty no update attempt is made and the method returns. * * @param data The given long array. */ public void update(final long[] data) { if ((data == null) || (data.length == 0)) { return; } couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED))); } private static final int coupon(final long[] hash) { final int addr26 = (int) ((hash[0] & KEY_MASK_26)); final int lz = Long.numberOfLeadingZeros(hash[1]); final int value = ((lz > 62 ? 62 : lz) + 1); return (value << KEY_BITS_26) | addr26; } abstract void couponUpdate(int coupon); }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy