All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.theta.UpdateSketch Maven / Gradle / Ivy

There is a newer version: 0.6.0
Show newest version
/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.theta;

import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
import static com.yahoo.sketches.Util.MIN_LG_NOM_LONGS;
import static com.yahoo.sketches.hash.MurmurHash3.hash;
import static com.yahoo.sketches.theta.PreambleUtil.BIG_ENDIAN_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.FAMILY_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.MAX_THETA_LONG_AS_DOUBLE;
import static com.yahoo.sketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER;
import static com.yahoo.sketches.theta.PreambleUtil.SER_VER_BYTE;
import static com.yahoo.sketches.theta.PreambleUtil.extractFamilyID;
import static com.yahoo.sketches.theta.PreambleUtil.extractFlags;
import static com.yahoo.sketches.theta.PreambleUtil.extractP;
import static com.yahoo.sketches.theta.PreambleUtil.extractSeedHash;
import static com.yahoo.sketches.theta.PreambleUtil.extractSerVer;
import static com.yahoo.sketches.theta.PreambleUtil.extractThetaLong;
import static com.yahoo.sketches.theta.PreambleUtil.getMemBytes;
import static com.yahoo.sketches.theta.UpdateReturnState.RejectedNullOrEmpty;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.yahoo.memory.Memory;
import com.yahoo.memory.WritableMemory;
import com.yahoo.sketches.Family;
import com.yahoo.sketches.ResizeFactor;
import com.yahoo.sketches.SketchesArgumentException;
import com.yahoo.sketches.Util;

/**
 * The parent class for the  Update Sketch families, such as QuickSelect and Alpha.
 * The primary task of an Update Sketch is to consider datums presented via the update() methods
 * for inclusion in its internal cache. This is the sketch building process.
 *
 * @author Lee Rhodes
 */
public abstract class UpdateSketch extends Sketch {

  UpdateSketch() {}

  /**
  * Wrap takes the sketch image in Memory and refers to it directly. There is no data copying onto
  * the java heap. Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have
  * been explicitly stored as direct objects can be wrapped. This method assumes the
  * {@link Util#DEFAULT_UPDATE_SEED}.
  * Default Update Seed.
  * @param srcMem an image of a Sketch where the image seed hash matches the default seed hash.
  * See Memory
  * @return a Sketch backed by the given Memory
  */
  public static UpdateSketch wrap(final WritableMemory srcMem) {
    return wrap(srcMem, DEFAULT_UPDATE_SEED);
  }

  /**
  * Wrap takes the sketch image in Memory and refers to it directly. There is no data copying onto
  * the java heap. Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have
  * been explicitly stored as direct objects can be wrapped.
  * An attempt to "wrap" earlier version sketches will result in a "heapified", normal
  * Java Heap version of the sketch where all data will be copied to the heap.
  * @param srcMem an image of a Sketch where the image seed hash matches the given seed hash.
  * See Memory
  * @param seed See Update Hash Seed.
  * Compact sketches store a 16-bit hash of the seed, but not the seed itself.
  * @return a UpdateSketch backed by the given Memory
  */
  public static UpdateSketch wrap(final WritableMemory srcMem, final long seed) {
    final int  preLongs = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F;
    final int serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF;
    final int familyID = srcMem.getByte(FAMILY_BYTE) & 0XFF;
    final Family family = Family.idToFamily(familyID);
    if (family != Family.QUICKSELECT) {
      throw new SketchesArgumentException(
        "A " + family + " sketch cannot be wrapped as an UpdateSketch.");
    }
    if ((serVer == 3) && (preLongs == 3)) {
      return DirectQuickSelectSketch.writableWrap(srcMem, seed);
    } else {
      throw new SketchesArgumentException(
        "Corrupted: An UpdateSketch image: must have SerVer = 3 and preLongs = 3");
    }
  }

  /**
   * Instantiates an on-heap UpdateSketch from Memory. This method assumes the
   * {@link Util#DEFAULT_UPDATE_SEED}.
   * @param srcMem See Memory
   * @return an UpdateSketch
   */
  public static UpdateSketch heapify(final Memory srcMem) {
    return heapify(srcMem, DEFAULT_UPDATE_SEED);
  }

  /**
   * Instantiates an on-heap UpdateSketch from Memory.
   * @param srcMem See Memory
   * @param seed See Update Hash Seed.
   * @return an UpdateSketch
   */
  public static UpdateSketch heapify(final Memory srcMem, final long seed) {
    final Family family = Family.idToFamily(srcMem.getByte(FAMILY_BYTE));
    if (family.equals(Family.ALPHA)) {
      return HeapAlphaSketch.heapifyInstance(srcMem, seed);
    }
    return HeapQuickSelectSketch.heapifyInstance(srcMem, seed);
  }

  //Sketch interface

  @Override
  public CompactSketch compact() {
    return compact(true, null);
  }

  @Override
  public CompactSketch compact(final boolean dstOrdered, final WritableMemory dstMem) {
    CompactSketch sketchOut = null;
    final int sw = (dstOrdered ? 2 : 0) | ((dstMem != null) ? 1 : 0);
    switch (sw) {
      case 0: { //dst not ordered, dstMem == null
        sketchOut = HeapCompactUnorderedSketch.compact(this);
        break;
      }
      case 1: { //dst not ordered, dstMem == valid
        sketchOut = DirectCompactUnorderedSketch.compact(this, dstMem);
        break;
      }
      case 2: { //dst ordered, dstMem == null
        sketchOut = HeapCompactOrderedSketch.compact(this);
        break;
      }
      case 3: { //dst ordered, dstMem == valid
        sketchOut = DirectCompactOrderedSketch.compact(this, dstMem);
        break;
      }
      //default: //This cannot happen and cannot be tested
    }
    return sketchOut;
  }

  @Override
  public boolean isCompact() {
    return false;
  }

  @Override
  public boolean isOrdered() {
    return false;
  }

  //UpdateSketch interface

  /**
   * Returns a new builder
   *
   * @return a new builder
   */
  public static final UpdateSketchBuilder builder() {
    return new UpdateSketchBuilder();
  }

  /**
   * Resets this sketch back to a virgin empty state.
   */
  public abstract void reset();

  /**
   * Rebuilds the hash table to remove dirty values or to reduce the size
   * to nominal entries.
   * @return this sketch
   */
  public abstract UpdateSketch rebuild();

  /**
   * Returns the configured ResizeFactor
   * @return the configured ResizeFactor
   */
  public abstract ResizeFactor getResizeFactor();

  /**
   * Present this sketch with a long.
   *
   * @param datum The given long datum.
   * @return
   * See Update Return State
   */
  public UpdateReturnState update(final long datum) {
    final long[] data = { datum };
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
  }

  /**
   * Present this sketch with the given double (or float) datum.
   * The double will be converted to a long using Double.doubleToLongBits(datum),
   * which normalizes all NaN values to a single NaN representation.
   * Plus and minus zero will be normalized to plus zero.
   * The special floating-point values NaN and +/- Infinity are treated as distinct.
   *
   * @param datum The given double datum.
   * @return
   * See Update Return State
   */
  public UpdateReturnState update(final double datum) {
    final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0
    final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN forms
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
  }

  /**
   * Present this sketch with the given String.
   * The string is converted to a byte array using UTF8 encoding.
   * If the string is null or empty no update attempt is made and the method returns.
   *
   * 

Note: this will not produce the same output hash values as the {@link #update(char[])} * method and will generally be a little slower depending on the complexity of the UTF8 encoding. *

* * @param datum The given String. * @return * See Update Return State */ public UpdateReturnState update(final String datum) { if ((datum == null) || datum.isEmpty()) { return RejectedNullOrEmpty; } final byte[] data = datum.getBytes(UTF_8); return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given byte array. * If the byte array is null or empty no update attempt is made and the method returns. * * @param data The given byte array. * @return * See Update Return State */ public UpdateReturnState update(final byte[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given char array. * If the char array is null or empty no update attempt is made and the method returns. * *

Note: this will not produce the same output hash values as the {@link #update(String)} * method but will be a little faster as it avoids the complexity of the UTF8 encoding.

* * @param data The given char array. * @return * See Update Return State */ public UpdateReturnState update(final char[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given integer array. * If the integer array is null or empty no update attempt is made and the method returns. * * @param data The given int array. * @return * See Update Return State */ public UpdateReturnState update(final int[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } /** * Present this sketch with the given long array. * If the long array is null or empty no update attempt is made and the method returns. * * @param data The given long array. * @return * See Update Return State */ public UpdateReturnState update(final long[] data) { if ((data == null) || (data.length == 0)) { return RejectedNullOrEmpty; } return hashUpdate(hash(data, getSeed())[0] >>> 1); } //restricted methods /** * All potential updates converge here. *

Don't ever call this unless you really know what you are doing!

* * @param hash the given input hash value. A hash of zero or Long.MAX_VALUE is ignored. * A negative hash value will throw an exception. * @return See Update Return State */ abstract UpdateReturnState hashUpdate(long hash); /** * Gets the Log base 2 of the current size of the internal cache * @return the Log base 2 of the current size of the internal cache */ abstract int getLgArrLongs(); /** * Gets the Log base 2 of the configured nominal entries * @return the Log base 2 of the configured nominal entries */ public abstract int getLgNomLongs(); /** * Gets the configured sampling probability, p. * See Sampling Probability, p * @return the sampling probability, p */ abstract float getP(); /** * Gets the configured seed * @return the configured seed */ abstract long getSeed(); /** * Returns true if the internal cache contains "dirty" values that are greater than or equal * to thetaLong. * @return true if the internal cache is dirty. */ abstract boolean isDirty(); /** * Returns true if numEntries (curCount) is greater than the hashTableThreshold. * @param numEntries the given number of entries (or current count). * @return true if numEntries (curCount) is greater than the hashTableThreshold. */ abstract boolean isOutOfSpace(int numEntries); static void checkUnionQuickSelectFamily(final Memory mem, final int preambleLongs, final int lgNomLongs) { //Check Family final int familyID = extractFamilyID(mem); //byte 2 final Family family = Family.idToFamily(familyID); if (family.equals(Family.UNION)) { if (preambleLongs != Family.UNION.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for UNION: " + preambleLongs); } } else if (family.equals(Family.QUICKSELECT)) { if (preambleLongs != Family.QUICKSELECT.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preambleLongs); } } else { throw new SketchesArgumentException( "Possible corruption: Invalid Family: " + family.toString()); } //Check lgNomLongs if (lgNomLongs < MIN_LG_NOM_LONGS) { throw new SketchesArgumentException( "Possible corruption: Current Memory lgNomLongs < min required size: " + lgNomLongs + " < " + MIN_LG_NOM_LONGS); } } static void checkMemIntegrity(final Memory srcMem, final long seed, final int preambleLongs, final int lgNomLongs, final int lgArrLongs) { //Check SerVer final int serVer = extractSerVer(srcMem); //byte 1 if (serVer != SER_VER) { throw new SketchesArgumentException( "Possible corruption: Invalid Serialization Version: " + serVer); } //Check flags final int flags = extractFlags(srcMem); //byte 5 final int flagsMask = ORDERED_FLAG_MASK | COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK | BIG_ENDIAN_FLAG_MASK; if ((flags & flagsMask) > 0) { throw new SketchesArgumentException( "Possible corruption: Input srcMem cannot be: big-endian, compact, ordered, or read-only"); } //Check seed hashes final short seedHash = (short)extractSeedHash(srcMem); //byte 6,7 Util.checkSeedHashes(seedHash, Util.computeSeedHash(seed)); //Check mem capacity, lgArrLongs final long curCapBytes = srcMem.getCapacity(); final int minReqBytes = getMemBytes(lgArrLongs, preambleLongs); if (curCapBytes < minReqBytes) { throw new SketchesArgumentException( "Possible corruption: Current Memory size < min required size: " + curCapBytes + " < " + minReqBytes); } //check Theta, p final float p = extractP(srcMem); //bytes 12-15 final long thetaLong = extractThetaLong(srcMem); //bytes 16-23 final double theta = thetaLong / MAX_THETA_LONG_AS_DOUBLE; if ((lgArrLongs <= lgNomLongs) && (theta < p) ) { throw new SketchesArgumentException( "Possible corruption: Theta cannot be < p and lgArrLongs <= lgNomLongs. " + lgArrLongs + " <= " + lgNomLongs + ", Theta: " + theta + ", p: " + p); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy