All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.datasketches.fdt.FdtSketch Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.fdt;

import static org.apache.datasketches.Util.MAX_LG_NOM_LONGS;

import java.util.List;

import org.apache.datasketches.BinomialBoundsN;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.tuple.strings.ArrayOfStringsSketch;

/**
 * A Frequent Distinct Tuples sketch.
 *
 * 

Suppose our data is a stream of pairs {IP address, User ID} and we want to identify the * IP addresses that have the most distinct User IDs. Or conversely, we would like to identify * the User IDs that have the most distinct IP addresses. This is a common challenge in the * analysis of big data and the FDT sketch helps solve this problem using probabilistic techniques. * *

More generally, given a multiset of tuples with dimensions {d1,d2, d3, ..., dN}, * and a primary subset of dimensions M < N, our task is to identify the combinations of * M subset dimensions that have the most frequent number of distinct combinations of * the N-M non-primary dimensions. * *

Please refer to the web page * * https://datasketches.github.io/docs/Frequency/FrequentDistinctTuplesSketch.html for a more * complete discussion about this sketch. * * @author Lee Rhodes */ public class FdtSketch extends ArrayOfStringsSketch { /** * Create new instance of Frequent Distinct Tuples sketch with the given * Log-base2 of required nominal entries. * @param lgK Log-base2 of required nominal entries. */ public FdtSketch(final int lgK) { super(lgK); } /** * Used by deserialization. * @param mem the image of a FdtSketch */ FdtSketch(final Memory mem) { super(mem); } /** * Create a new instance of Frequent Distinct Tuples sketch with a size determined by the given * threshold and rse. * @param threshold : the fraction, between zero and 1.0, of the total distinct stream length * that defines a "Frequent" (or heavy) item. * @param rse the maximum Relative Standard Error for the estimate of the distinct population of a * reported tuple (selected with a primary key) at the threshold. */ public FdtSketch(final double threshold, final double rse) { super(computeLgK(threshold, rse)); } /** * Update the sketch with the given string array tuple. * @param tuple the given string array tuple. */ public void update(final String[] tuple) { super.update(tuple, tuple); } /** * Returns an ordered List of Groups of the most frequent distinct population of subset tuples * represented by the count of entries of each group. * @param priKeyIndices these indices define the dimensions used for the Primary Keys. * @param limit the maximum number of groups to return. If this value is ≤ 0, all * groups will be returned. * @param numStdDev the number of standard deviations for the upper and lower error bounds, * this value is an integer and must be one of 1, 2, or 3. * See Number of Standard Deviations * @param sep the separator character * @return an ordered List of Groups of the most frequent distinct population of subset tuples * represented by the count of entries of each group. 
*/ public List getResult(final int[] priKeyIndices, final int limit, final int numStdDev, final char sep) { final PostProcessor proc = new PostProcessor(this, new Group(), sep); return proc.getGroupList(priKeyIndices, numStdDev, limit); } /** * Returns the PostProcessor that enables multiple queries against the sketch results. * This assumes the default Group and the default separator character '|'. * @return the PostProcessor */ public PostProcessor getPostProcessor() { return getPostProcessor(new Group(), '|'); } /** * Returns the PostProcessor that enables multiple queries against the sketch results. * @param group the Group class to use during post processing. * @param sep the separator character. * @return the PostProcessor */ public PostProcessor getPostProcessor(final Group group, final char sep) { return new PostProcessor(this, group, sep); } /** * Gets the estimate of the true distinct population of subset tuples represented by the count * of entries in a group. This is primarily used internally. * @param numSubsetEntries number of entries for a chosen subset of the sketch. * @return the estimate of the true distinct population of subset tuples represented by the count * of entries in a group. */ public double getEstimate(final int numSubsetEntries) { if (!isEstimationMode()) { return numSubsetEntries; } return numSubsetEntries / getTheta(); } /** * Gets the estimate of the lower bound of the true distinct population represented by the count * of entries in a group. * @param numStdDev * See Number of Standard Deviations * @param numSubsetEntries number of entries for a chosen subset of the sketch. * @return the estimate of the lower bound of the true distinct population represented by the count * of entries in a group. 
*/ public double getLowerBound(final int numStdDev, final int numSubsetEntries) { if (!isEstimationMode()) { return numSubsetEntries; } return BinomialBoundsN.getLowerBound(numSubsetEntries, getTheta(), numStdDev, isEmpty()); } /** * Gets the estimate of the upper bound of the true distinct population represented by the count * of entries in a group. * @param numStdDev * See Number of Standard Deviations * @param numSubsetEntries number of entries for a chosen subset of the sketch. * @return the estimate of the upper bound of the true distinct population represented by the count * of entries in a group. */ public double getUpperBound(final int numStdDev, final int numSubsetEntries) { if (!isEstimationMode()) { return numSubsetEntries; } return BinomialBoundsN.getUpperBound(numSubsetEntries, getTheta(), numStdDev, isEmpty()); } // Restricted /** * Computes LgK given the threshold and RSE. * @param threshold the fraction, between zero and 1.0, of the total stream length that defines * a "Frequent" (or heavy) tuple. * @param rse the maximum Relative Standard Error for the estimate of the distinct population of a * reported tuple (selected with a primary key) at the threshold. * @return LgK */ static int computeLgK(final double threshold, final double rse) { final double v = Math.ceil(1.0 / (threshold * rse * rse)); final int lgK = (int) Math.ceil(Math.log(v) / Math.log(2)); if (lgK > MAX_LG_NOM_LONGS) { throw new SketchesArgumentException("Requested Sketch (LgK = " + lgK + " > 2^26), " + "either increase the threshold, the rse or both."); } return lgK; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy