All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.theta.UpdateSketch Maven / Gradle / Ivy

There is a newer version: 0.13.4
Show newest version
/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.theta;

import static com.yahoo.sketches.hash.MurmurHash3.hash;
import static com.yahoo.sketches.theta.UpdateReturnState.RejectedNullOrEmpty;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.yahoo.memory.Memory;
import com.yahoo.sketches.ResizeFactor;

/**
 * The parent class for the Update Sketch families, such as QuickSelect and Alpha.
 * The primary task of an Update Sketch is to consider datums presented via the update() methods
 * for inclusion in its internal cache. This is the sketch building process.
 *
 * @author Lee Rhodes
 */
public abstract class UpdateSketch extends Sketch {

  UpdateSketch() {}

  //Sketch interface

  @Override
  public abstract boolean isEmpty();

  @Override
  public boolean isCompact() {
    return false;
  }

  @Override
  public boolean isOrdered() {
    return false;
  }

  //UpdateSketch interface

  /**
   * Returns a new builder
   *
   * @return a new builder
   */
  public static final UpdateSketchBuilder builder() {
    return new UpdateSketchBuilder();
  }

  /**
   * Resets this sketch back to a virgin empty state.
   */
  public abstract void reset();

  /**
   * Convert this UpdateSketch to a CompactSketch in the chosen form.
   *
   * 

This compacting process converts the hash table form of an UpdateSketch to * a simple list of the valid hash values from the hash table. Any hash values equal to or * greater than theta will be discarded. The number of valid values remaining in the * Compact Sketch depends on a number of factors, but may be larger or smaller than * Nominal Entries (or k). It will never exceed 2k. If it is critical * to always limit the size to no more than k, then rebuild() should be called * on the UpdateSketch prior to this. * * @param dstOrdered * See Destination Ordered * * @param dstMem * See Destination Memory. * * @return this sketch as a CompactSketch in the chosen form */ public CompactSketch compact(boolean dstOrdered, Memory dstMem) { CompactSketch sketchOut = null; int sw = (dstOrdered ? 2 : 0) | ((dstMem != null) ? 1 : 0); switch (sw) { case 0: { //dst not ordered, dstMem == null sketchOut = new HeapCompactSketch(this); break; } case 1: { //dst not ordered, dstMem == valid sketchOut = new DirectCompactSketch(this, dstMem); break; } case 2: { //dst ordered, dstMem == null sketchOut = new HeapCompactOrderedSketch(this); break; } case 3: { //dst ordered, dstMem == valid sketchOut = new DirectCompactOrderedSketch(this, dstMem); break; } //default: //This cannot happen and cannot be tested } return sketchOut; } /** * Converts this UpdateSketch to an ordered CompactSketch on the Java heap. * @return this sketch as an ordered CompactSketch on the Java heap. */ public CompactSketch compact() { return compact(true, null); } /** * Rebuilds the hash table to remove dirty values or to reduce the size * to nominal entries. * @return this sketch */ public abstract UpdateSketch rebuild(); /** * Returns the configured ResizeFactor * @return the configured ResizeFactor */ public abstract ResizeFactor getResizeFactor(); /** * Present this sketch with a long. * * @param datum The given long datum. 
* @return * See Update Return State */ public UpdateReturnState update(long datum) { long[] data = { datum }; return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given double (or float) datum. * The double will be converted to a long using Double.doubleToLongBits(datum), * which normalizes all NaN values to a single NaN representation. * Plus and minus zero will be normalized to plus zero. * The special floating-point values NaN and +/- Infinity are treated as distinct. * * @param datum The given double datum. * @return * See Update Return State */ public UpdateReturnState update(double datum) { double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0 long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN forms return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given String. * The string is converted to a byte array using UTF8 encoding. * If the string is null or empty no update attempt is made and the method returns. * *

Note: this will not produce the same output hash values as the {@link #update(char[])} * method and will generally be a little slower depending on the complexity of the UTF8 encoding. *

* * @param datum The given String. * @return * See Update Return State */ public UpdateReturnState update(String datum) { if (datum == null || datum.isEmpty()) { return RejectedNullOrEmpty; } byte[] data = datum.getBytes(UTF_8); return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given byte array. * If the byte array is null or empty no update attempt is made and the method returns. * * @param data The given byte array. * @return * See Update Return State */ public UpdateReturnState update(byte[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given char array. * If the char array is null or empty no update attempt is made and the method returns. * *

Note: this will not produce the same output hash values as the {@link #update(String)} * method but will be a little faster as it avoids the complexity of the UTF8 encoding.

* * @param data The given char array. * @return * See Update Return State */ public UpdateReturnState update(char[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given integer array. * If the integer array is null or empty no update attempt is made and the method returns. * * @param data The given int array. * @return * See Update Return State */ public UpdateReturnState update(int[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given long array. * If the long array is null or empty no update attempt is made and the method returns. * * @param data The given long array. * @return * See Update Return State */ public UpdateReturnState update(long[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } //restricted methods /** * All potential updates converge here. *

Don't ever call this unless you really know what you are doing!

* * @param hash the given input hash value. A hash of zero or Long.MAX_VALUE is ignored. * A negative hash value will throw an exception. * @return See Update Return State */ abstract UpdateReturnState hashUpdate(long hash); /** * Gets the Log base 2 of the current size of the internal cache * @return the Log base 2 of the current size of the internal cache */ abstract int getLgArrLongs(); /** * Gets the Log base 2 of the configured nominal entries * @return the Log base 2 of the configured nominal entries */ abstract int getLgNomLongs(); /** * Gets the Log base 2 of the Resize Factor * @return the Log base 2 of the Resize Factor */ abstract int getLgResizeFactor(); /** * Gets the configured sampling probability, p. * See Sampling Probability, p * @return the sampling probability, p */ abstract float getP(); /** * Gets the configured seed * @return the configured seed */ abstract long getSeed(); /** * Returns true if the internal cache contains "dirty" values that are greater than or equal * to thetaLong. * @return true if the internal cache is dirty. */ abstract boolean isDirty(); }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy