All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.quantiles.Util Maven / Gradle / Ivy

/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.quantiles;

import static com.yahoo.sketches.Util.ceilingPowerOf2;
import static com.yahoo.sketches.Util.isPowerOf2;
import static com.yahoo.sketches.quantiles.PreambleUtil.COMPACT_FLAG_MASK;
import static com.yahoo.sketches.quantiles.PreambleUtil.EMPTY_FLAG_MASK;
import static com.yahoo.sketches.quantiles.PreambleUtil.FLAGS_BYTE;
import static com.yahoo.sketches.quantiles.PreambleUtil.ORDERED_FLAG_MASK;
import static com.yahoo.sketches.quantiles.PreambleUtil.READ_ONLY_FLAG_MASK;
import static com.yahoo.sketches.quantiles.PreambleUtil.extractFlags;

import com.yahoo.memory.Memory;

import com.yahoo.sketches.Family;
import com.yahoo.sketches.SketchesArgumentException;

/**
 * Utility class for quantiles sketches.
 *
 * 

This class contains a highly specialized sort called blockyTandemMergeSort(). * It also contains methods that are used while building histograms and other common * functions.

* * @author Lee Rhodes */ final class Util { private Util() {} /** * The java line separator character as a String. */ static final String LS = System.getProperty("line.separator"); /** * The tab character */ static final char TAB = '\t'; /** * Checks the validity of the given value k * @param k must be greater than 1 and less than 65536. */ static void checkK(final int k) { if ((k < DoublesSketch.MIN_K) || (k >= (1 << 16)) || !isPowerOf2(k)) { throw new SketchesArgumentException("K must be > 1 and < 65536 and Power of 2: " + k); } } /** * Checks the validity of the given family ID * @param familyID the given family ID */ static void checkFamilyID(final int familyID) { final Family family = Family.idToFamily(familyID); if (!family.equals(Family.QUANTILES)) { throw new SketchesArgumentException( "Possible corruption: Invalid Family: " + family.toString()); } } /** * Checks the consistency of the flag bits and the state of preambleLong and the memory * capacity and returns the empty state. * @param preambleLongs the size of preamble in longs * @param flags the flags field * @param memCapBytes the memory capacity * @return the value of the empty state */ // Used by ItemsSketch, Test; //TODO Need to Deprecate static boolean checkPreLongsFlagsCap(final int preambleLongs, final int flags, final long memCapBytes) { final boolean empty = (flags & EMPTY_FLAG_MASK) > 0; //Preamble flags empty state final int minPre = Family.QUANTILES.getMinPreLongs(); //1 final int maxPre = Family.QUANTILES.getMaxPreLongs(); //2 final boolean valid = ((preambleLongs == minPre) && empty) || ((preambleLongs == maxPre) && !empty); if (!valid) { throw new SketchesArgumentException( "Possible corruption: PreambleLongs inconsistent with empty state: " + preambleLongs); } checkHeapFlags(flags); if (!empty && (memCapBytes < (preambleLongs << 3))) { throw new SketchesArgumentException( "Possible corruption: Insufficient capacity for preamble: " + memCapBytes); } return empty; } /** * Checks just the flags field of the preamble. Allowed flags are Read Only, Empty, Compact, and * ordered. * @param flags the flags field */ static void checkHeapFlags(final int flags) { //only used by checkPreLongsFlagsCap and test final int allowedFlags = READ_ONLY_FLAG_MASK | EMPTY_FLAG_MASK | COMPACT_FLAG_MASK | ORDERED_FLAG_MASK; final int flagsMask = ~allowedFlags; if ((flags & flagsMask) > 0) { throw new SketchesArgumentException( "Possible corruption: Invalid flags field: " + Integer.toBinaryString(flags)); } } /** * Checks just the flags field of an input Memory object. Returns true for a compact * sketch, false for an update sketch. Does not perform additional checks, including sketch * family. * @param srcMem the source Memory containign a sketch * @return true if flags indicate a comapct sketch, otherwise false */ static boolean checkIsCompactMemory(final Memory srcMem) { final int flags; if (srcMem.isReadOnly() && !srcMem.isDirect()) { flags = srcMem.getByte(FLAGS_BYTE) & 0XFF; } else { final Object memObj = srcMem.array(); //may be null final long memAdd = srcMem.getCumulativeOffset(0L); flags = extractFlags(memObj, memAdd); } final int compactFlags = READ_ONLY_FLAG_MASK | COMPACT_FLAG_MASK; return (flags & compactFlags) > 0; } /** * Checks the sequential validity of the given array of fractions. * They must be unique, monotonically increasing and not NaN, not < 0 and not > 1.0. * @param fractions array */ static final void validateFractions(final double[] fractions) { if (fractions == null) { throw new SketchesArgumentException("Fractions cannot be null."); } final int len = fractions.length; if (len == 0) { return; } final double flo = fractions[0]; final double fhi = fractions[fractions.length - 1]; if ((flo < 0.0) || (fhi > 1.0)) { throw new SketchesArgumentException( "A fraction cannot be less than zero or greater than 1.0"); } Util.validateValues(fractions); } /** * Checks the sequential validity of the given array of double values. * They must be unique, monotonically increasing and not NaN. * @param values the given array of double values */ static final void validateValues(final double[] values) { if (values == null) { throw new SketchesArgumentException("Values cannot be null."); } final int lenM1 = values.length - 1; for (int j = 0; j < lenM1; j++) { if (values[j] < values[j + 1]) { continue; } throw new SketchesArgumentException( "Values must be unique, monotonically increasing and not NaN."); } } /** * Returns the number of retained valid items in the sketch given k and n. * @param k the given configured k of the sketch * @param n the current number of items seen by the sketch * @return the number of retained items in the sketch given k and n. */ static int computeRetainedItems(final int k, final long n) { final int bbCnt = computeBaseBufferItems(k, n); final long bitPattern = computeBitPattern(k, n); final int validLevels = computeValidLevels(bitPattern); return bbCnt + validLevels * k; } /** * Returns the total item capacity of an updatable, non-compact combined buffer * given k and n. If total levels = 0, this returns the ceiling power of 2 * size for the base buffer or the MIN_BASE_BUF_SIZE, whichever is larger. * * @param k sketch parameter. This determines the accuracy of the sketch and the * size of the updatable data structure, which is a function of k and n. * * @param n The number of items in the input stream * @return the current item capacity of the combined buffer */ static int computeCombinedBufferItemCapacity(final int k, final long n) { final int totLevels = computeNumLevelsNeeded(k, n); if (totLevels == 0) { final int bbItems = computeBaseBufferItems(k, n); return Math.max(2 * DoublesSketch.MIN_K, ceilingPowerOf2(bbItems)); } return (2 + totLevels) * k; } /** * Computes the number of valid levels above the base buffer * @param bitPattern the bit pattern * @return the number of valid levels above the base buffer */ static int computeValidLevels(final long bitPattern) { return Long.bitCount(bitPattern); } /** * Computes the total number of logarithmic levels above the base buffer given the bitPattern. * @param bitPattern the given bit pattern * @return the total number of logarithmic levels above the base buffer */ static int computeTotalLevels(final long bitPattern) { return hiBitPos(bitPattern) + 1; } /** * Computes the total number of logarithmic levels above the base buffer given k and n. * This is equivalent to max(floor(lg(n/k), 0). * Returns zero if n is less than 2 * k. * @param k the configured size of the sketch * @param n the total values presented to the sketch. * @return the total number of levels needed. */ static int computeNumLevelsNeeded(final int k, final long n) { return 1 + hiBitPos(n / (2L * k)); } /** * Computes the number of base buffer items given k, n * @param k the configured size of the sketch * @param n the total values presented to the sketch * @return the number of base buffer items */ static int computeBaseBufferItems(final int k, final long n) { return (int) (n % (2L * k)); } /** * Computes the levels bit pattern given k, n. * This is computed as n / (2*k). * @param k the configured size of the sketch * @param n the total values presented to the sketch. * @return the levels bit pattern */ static long computeBitPattern(final int k, final long n) { return n / (2L * k); } /** * Returns the log_base2 of x * @param x the given x * @return the log_base2 of x */ static double lg(final double x) { return ( Math.log(x) / Math.log(2.0) ); } /** * Zero-based position of the highest one-bit of the given long. * Returns minus one if num is zero. * @param num the given long * @return Zero-based position of the highest one-bit of the given long */ static int hiBitPos(final long num) { return 63 - Long.numberOfLeadingZeros(num); } /** * Returns the zero-based bit position of the lowest zero bit of bits starting at * startingBit. If input is all ones, this returns 64. * @param bits the input bits as a long * @param startingBit the zero-based starting bit position. Only the low 6 bits are used. * @return the zero-based bit position of the lowest zero bit starting at startingBit. */ static int lowestZeroBitStartingAt(final long bits, final int startingBit) { int pos = startingBit & 0X3F; long myBits = bits >>> pos; while ((myBits & 1L) != 0) { myBits = myBits >>> 1; pos++; } return pos; } /** * Computes epsilon from K. The following table are examples. * * eps eps from inverted * K empirical adjusted formula * ------------------------------------- * 16 0.121094 0.121454102233560 * 32 0.063477 0.063586601346532 * 64 0.033081 0.033169048393679 * 128 0.017120 0.017248096847308 * 256 0.008804 0.008944835012965 * 512 0.004509 0.004627803568920 * 1024 0.002303 0.002389303789572 * * these could be used in a unit test * 2 0.821714930853465 * 16 0.12145410223356 * 1024 0.00238930378957284 * 1073741824 3.42875166500824e-09 * */ static class EpsilonFromK { /** * Used while crunching down the empirical results. If this value is changed the adjustKForEps * value will be incorrect and must also be recomputed. Don't touch this! */ private static final double deltaForEps = 0.01; /** * A heuristic fudge factor that causes the inverted formula to better match the empirical. * The value of 4/3 is directly associated with the deltaForEps value of 0.01. * Don't touch this! */ private static final double adjustKForEps = 4.0 / 3.0; // fudge factor /** * Ridiculously fine tolerance given the fudge factor; 1e-3 would probably suffice */ private static final double bracketedBinarySearchForEpsTol = 1e-15; /** * From extensive empirical testing we recommend most users use this method for deriving * epsilon. This uses a fudge factor of 4/3 times the theoretical calculation of epsilon. * @param k the given k that must be greater than one. * @return the resulting epsilon */ static double getAdjustedEpsilon(final int k) { //used by HeapQS, so far return getTheoreticalEpsilon(k, adjustKForEps); } /** * Finds the epsilon given K and a fudge factor. * See Cormode's Mergeable Summaries paper, Journal version, Theorem 3.6. * This has a good fit between values of k between 16 and 1024. * Beyond that has not been empirically tested. * @param k The given value of k * @param ff The given fudge factor. No fudge factor = 1.0. * @return the resulting epsilon */ //used only by getAdjustedEpsilon() private static double getTheoreticalEpsilon(final int k, final double ff) { if (k < 2) { throw new SketchesArgumentException("K must be greater than one."); } // don't need to check in the other direction because an int is very small final double kf = k * ff; assert kf >= 2.15; // ensures that the bracketing succeeds assert kf < 1e12; // ditto, but could actually be bigger final double lo = 1e-16; final double hi = 1.0 - 1e-16; assert epsForKPredicate(lo, kf); assert !epsForKPredicate(hi, kf); return bracketedBinarySearchForEps(kf, lo, hi); } private static double kOfEpsFormula(final double eps) { return (1.0 / eps) * (Math.sqrt(Math.log(1.0 / (eps * deltaForEps)))); } private static boolean epsForKPredicate(final double eps, final double kf) { return kOfEpsFormula(eps) >= kf; } private static double bracketedBinarySearchForEps(final double kf, final double lo, final double hi) { assert lo < hi; assert epsForKPredicate(lo, kf); assert !epsForKPredicate(hi, kf); if ((hi - lo) / lo < bracketedBinarySearchForEpsTol) { return lo; } final double mid = (lo + hi) / 2.0; assert mid > lo; assert mid < hi; if (epsForKPredicate(mid, kf)) { return bracketedBinarySearchForEps(kf, mid, hi); } else { return bracketedBinarySearchForEps(kf, lo, mid); } } } //End of EpsilonFromK }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy