All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.sketches.demo.DemoImpl Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2015, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package com.yahoo.sketches.demo;

import static com.yahoo.sketches.demo.Util.getMinSecFromMilli;
import static com.yahoo.sketches.demo.Util.nextLong;
import static com.yahoo.sketches.demo.Util.println;
import static com.yahoo.sketches.hash.MurmurHash3.hash;
import static java.lang.Math.sqrt;
import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Random;

import com.yahoo.sketches.Family;
import com.yahoo.sketches.ResizeFactor;
import com.yahoo.sketches.hll.HllSketch;
import com.yahoo.sketches.theta.Sketches;
import com.yahoo.sketches.theta.UpdateSketch;

//CHECKSTYLE.OFF: JavadocMethod
//CHECKSTYLE.OFF: WhitespaceAround
/**
 * A simple demo that compares brute force counting of uniques vs. using sketches.
 *
 * 

This demo computes a stream of values and feeds them first to * an exact sort-based method of computing the number of unique values * in the stream and then feeds a similar stream to two different types of * sketches from the library. * *

This demo becomes most significant in the case where the number of uniques in the * stream exceeds what the computer can hold in memory. * *

This demo utilizes the Unix sort and wc commands for the brute force compuation. * So this needs to be run on a linux or mac machine. A windows machine with a similar unix * library installed should also work, but it has not been tested. */ public class DemoImpl { //Static constants private static final String LS = System.getProperty("line.separator"); private static final byte LS_BYTE = LS.getBytes(UTF_8)[0]; private static Random rand = new Random(9001); private static StandardOpenOption C = StandardOpenOption.CREATE; private static StandardOpenOption W = StandardOpenOption.WRITE; private static StandardOpenOption TE = StandardOpenOption.TRUNCATE_EXISTING; //Stream Configuration private int byteBufCap_ = 1000000; //ByteBuffer capacity private long n_ = (long)1E8; //stream length private int batchSz_ = 1000; //batch size private final double uniquesFrac_; //fraction to be unique //Sketch configuration private int lgK_ = 14; //16K //Internal sketch values private int maxMemSkBytes_; private double rse2_; //RSE for 95% confidence private UpdateSketch tSketch_ = null; private HllSketch hllSketch_ = null; //Other internal values private Path path = Paths.get("tmp/test.txt"); private long[] vArr_ = new long[1]; //reuse this array private long fileBytes_ = 0; private long u_; //global unique count; /** * Constuct the demo. * @param streamLen The total stream length. Must be > 1000. Will be rounded up to nearest * 1000. * @param uniquesFraction the fraction of streamLen values less than 1.0, that will be unique. * The actual # of uniques will vary around this value, because it is computed statistically. */ public DemoImpl(long streamLen, double uniquesFraction) { if ((uniquesFraction <= 0.0) || (uniquesFraction > 1.0)) { throw new IllegalArgumentException( "uniquesFraction must be > 0.0 and <= 1.0: "+uniquesFraction); } uniquesFrac_ = uniquesFraction; long m = streamLen/1000; if (m *1000 == streamLen) { n_ = streamLen; } else { n_ = (m+1)*1000; } lgK_ = 14; //Log-base 2 of the configured sketch size = 16K File dir = new File("tmp"); //new directory tmp if (!dir.exists()) { try { dir.mkdir(); } catch(SecurityException e) { throw new SecurityException(e); } } } /** * Run the demo */ public void runDemo() { println("# COMPUTE DISTINCT COUNT EXACTLY:"); long exactTimeMS; exactTimeMS = buildFile(); //exactTimeMS = buildFileAndSketch(); //used instead only for testing println("## SORT & REMOVE DUPLICATES"); String sortCmd = "sort -u -o tmp/sorted.txt tmp/test.txt"; exactTimeMS += UnixCmd.run("sort", sortCmd); println("\n## LINE COUNT"); String wcCmd = "wc -l tmp/sorted.txt"; exactTimeMS += UnixCmd.run("wc", wcCmd); println("Total Exact "+getMinSecFromMilli(exactTimeMS) +LS+LS); println("# COMPUTE DISTINCT COUNT USING SKETCHES"); configureThetaSketch(); long sketchTimeMS = buildSketch(); double factor = exactTimeMS*1.0/sketchTimeMS; println("Speedup Factor "+String.format("%.1f", factor) + LS); configureHLLSketch(); sketchTimeMS = buildSketch(); factor = exactTimeMS*1.0/sketchTimeMS; println("Speedup Factor "+String.format("%.1f", factor)); } /** * @return total test time in milliseconds */ private long buildFile() { println("## BUILD FILE:"); rand = new Random(9001); ByteBuffer byteBuf = ByteBuffer.allocate(byteBufCap_); u_ = 1; //reset global unique counter fileBytes_ = 0; long testStartTime_mS = System.currentTimeMillis(); try (SeekableByteChannel sbc = Files.newByteChannel(path, C, W, TE)) { for (long i=0; i 0) { //write remainder byteBuf.flip(); fileBytes_ += sbc.write(byteBuf); byteBuf.clear(); } } catch (IOException e) { e.printStackTrace(); } long testTime_mS = System.currentTimeMillis() - testStartTime_mS; //Print common results printCommon(testTime_mS, n_, u_); //Print file results println("File Size Bytes: "+String.format("%,d", fileBytes_) + LS); return testTime_mS; } /** * @return total test time in milliseconds */ private long buildSketch() { rand = new Random(9001); u_ = 1; //reset global unique counter long stLen = 0; long[] vArr = new long[batchSz_]; long testTime_nS = 0; while (stLen < n_) { for (int i = 0; i 0) { byteBuf.flip(); fileBytes_ += sbc.write(byteBuf); byteBuf.clear(); } } catch (IOException e) { e.printStackTrace(); } long testTime_mS = System.currentTimeMillis() - testStartTime_mS; //Print common results printCommon(testTime_mS, n_, u_); //Print file results println("File Size Bytes: "+String.format("%,d", fileBytes_)); //Print sketch results printSketchResults(u_, maxMemSkBytes_, rse2_); return testTime_mS; } /** * @return next hashed long value with (1.0 - uniqueFrac_) as duplicates */ private long nextValue() { long out; if ( (rand.nextDouble() < uniquesFrac_) || (u_ <= 1)) { out = u_++; //unique } else { out = nextLong(0, u_); //duplicate } //return out; vArr_[0] = out; return hash(vArr_, 0L)[0]; } private final void configureThetaSketch() { int k = 1 << lgK_; //14 hllSketch_ = null; maxMemSkBytes_ = k *16; //includes full hash table rse2_ = 2.0/sqrt(k); //Error for 95% confidence tSketch_ = Sketches.updateSketchBuilder() .setResizeFactor(ResizeFactor.X1) .setFamily(Family.ALPHA).build(k ); } private final void configureHLLSketch() { int k = 1 << lgK_; //14 boolean compressed = true; boolean hipEstimator = true; boolean denseMode = true; tSketch_ = null; maxMemSkBytes_ = (compressed)? k/2 : k; rse2_ = 2.0 * ((hipEstimator)? 0.836/sqrt(k) : 1.04/sqrt(k)); //for 95% confidence hllSketch_ = HllSketch.builder().setLogBuckets(lgK_) .setHipEstimator(hipEstimator) .setDenseMode(denseMode) .setCompressedDense(compressed) .build(); } private static void printCommon(long testTimeMilli, long n, long u) { println(getMinSecFromMilli(testTimeMilli)); println("Total Values: "+String.format("%,d",n)); int nSecRate = (int) (testTimeMilli *1000000.0/n); println("Build Rate: "+ String.format("%d nSec/Value", nSecRate)); println("Exact Uniques: "+String.format("%,d", u)); } private void printSketchResults(long u, int maxMemSkBytes, double rse2) { println("## SKETCH STATS"); double rounded = Math.round((tSketch_ != null)? tSketch_.getEstimate() : hllSketch_.getEstimate()); println("Sketch Estimate of Uniques: "+ String.format("%,d", (long)rounded)); double err = (u == 0)? 0 : (rounded/u - 1.0); println("Sketch Actual Relative Error: "+String.format("%.3f%%", err*100)); println("Sketch 95%ile Error Bounds : "+String.format("+/- %.3f%%", rse2*100)); println("Max Sketch Size Bytes: "+ String.format("%,d", maxMemSkBytes)); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy