
org.apache.beam.sdk.extensions.sketching.ApproximateDistinct Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.extensions.sketching;
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;
import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
import com.google.auto.value.AutoValue;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.beam.sdk.coders.ByteArrayCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.CustomCoder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Combine.CombineFn;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
/**
* {@link PTransform}s for computing the approximate number of distinct elements in a stream.
*
* This class relies on the HyperLogLog algorithm, and more precisely HyperLogLog+, the improved
* version of Google.
*
*
References
*
* The implementation comes from Addthis'
* Stream-lib library.
* The original paper of the HyperLogLog is available here.
* A paper from the same authors to have a clearer view of the algorithm is available here.
* Google's HyperLogLog+ version is detailed in this paper.
*
*
Parameters
*
* Two parameters can be tuned in order to control the computation's accuracy:
*
*
* - Precision: {@code p}
* Controls the accuracy of the estimation. The precision value will have an impact on the
* number of buckets used to store information about the distinct elements.
* In general one can expect a relative error of about {@code 1.1 / sqrt(2^p)}. The value
* should be of at least 4 to guarantee a minimal accuracy.
* By default, the precision is set to {@code 12} for a relative error of around {@code 2%}.
* - Sparse Precision: {@code sp}
* Used to create a sparse representation in order to optimize memory and improve accuracy at
* small cardinalities.
* The value of {@code sp} should be greater than {@code p}, but lower than 32.
* By default, the sparse representation is not used ({@code sp = 0}). One should use it if
* the cardinality may be less than {@code 12000}.
*
*
* Examples
*
* There are 2 ways of using this class:
*
*
* - Use the {@link PTransform}s that return {@code PCollection
} corresponding to the
* estimate number of distinct elements in the input {@link PCollection} of objects or for
* each key in a {@link PCollection} of {@link KV}s.
* - Use the {@link ApproximateDistinctFn} {@code CombineFn} that is exposed in order to make
* advanced processing involving the {@link HyperLogLogPlus} structure which resumes the
* stream.
*
*
* Using the Transforms
*
* Example 1: globally default use
*
* {@code
* PCollection input = ...;
* PCollection hllSketch = input.apply(ApproximateDistinct.globally());
* }
*
* Example 2: per key default use
*
* {@code
* PCollection input = ...;
* PCollection hllSketches = input.apply(ApproximateDistinct
* .perKey());
* }
*
* Example 3: tune precision and use sparse representation
*
* One can tune the precision and sparse precision parameters in order to control the accuracy
* and the memory. The tuning works exactly the same for {@link #globally()} and {@link #perKey()}.
*
*
{@code
* int precision = 15;
* int sparsePrecision = 25;
* PCollection input = ...;
* PCollection hllSketch = input.apply(ApproximateDistinct
* .globally()
* .withPrecision(precision)
* .withSparsePrecision(sparsePrecision));
* }
*
* Using the {@link ApproximateDistinctFn} CombineFn
*
* The CombineFn does the same thing as the transform but it can be used in cases where you want
* to manipulate the {@link HyperLogLogPlus} sketch, for example if you want to store it in a
* database to have a backup. It can also be used in stateful processing or in {@link
* org.apache.beam.sdk.transforms.CombineFns.ComposedCombineFn}.
*
*
Example 1: basic use
*
* This example is not really interesting but show how you can properly create an {@link
* ApproximateDistinctFn}. One must always specify a coder using the {@link
* ApproximateDistinctFn#create(Coder)} method.
*
*
{@code
* PCollection input = ...;
* PCollection output = input.apply(Combine.globally(ApproximateDistinctFn
* .create(BigEndianIntegerCoder.of()));
* }
*
* Example 2: use the {@link CombineFn} in a stateful {@link ParDo}
*
* One may want to use the {@link ApproximateDistinctFn} in a stateful ParDo in order to make
* some processing depending on the current cardinality of the stream.
* For more information about stateful processing see the blog spot on this topic here.
*
*
Here is an example of {@link DoFn} using an {@link ApproximateDistinctFn} as a {@link
* org.apache.beam.sdk.state.CombiningState}:
*
*
* {@literal class StatefulCardinality extends DoFn} {
* {@literal @StateId}("hyperloglog")
* {@literal private final StateSpec>}
* indexSpec;
*
* {@literal public StatefulCardinality(ApproximateDistinctFn fn)} {
* indexSpec = StateSpecs.combining(fn);
* }
*
* {@literal @ProcessElement}
* public void processElement(
* ProcessContext context,
* {@literal @StateId}("hllSketch")
* {@literal CombiningState hllSketch)} {
* long current = MoreObjects.firstNonNull(hllSketch.getAccum().cardinality(), 0L);
* hllSketch.add(context.element());
* context.output(...);
* }
* }
*
*
* Then the {@link DoFn} can be called like this:
*
*
{@code
* PCollection input = ...;
* ApproximateDistinctFn myFn = ApproximateDistinctFn.create(input.getCoder());
* PCollection = input.apply(ParDo.of(new StatefulCardinality<>(myFn)));
* }
*
* Example 3: use the {@link RetrieveCardinality} utility class
*
* One may want to retrieve the cardinality as a long after making some advanced processing using
* the {@link HyperLogLogPlus} structure.
* The {@link RetrieveCardinality} utility class provides an easy way to do so:
*
*
{@code
* PCollection input = ...;
* PCollection hll = input.apply(Combine.globally(ApproximateDistinctFn
* .create(new MyObjectCoder())
* .withSparseRepresentation(20)));
*
* // Some advanced processing
* PCollection advancedResult = hll.apply(...);
*
* PCollection cardinality = hll.apply(ApproximateDistinct.RetrieveCardinality.globally());
*
* }
*
* Consider using the {@code HllCount.Init} transform in the {@code zetasketch} extension module if
* you need to create sketches compatible with Google Cloud BigQuery. For more details about using
* {@code HllCount} and the {@code zetasketch} extension module, see
* https://s.apache.org/hll-in-beam#bookmark=id.v6chsij1ixo7
*/
public final class ApproximateDistinct {
/**
* Computes the approximate number of distinct elements in the input {@code PCollection}
* and returns a {@code PCollection}.
*
* @param the type of the elements in the input {@link PCollection}
*/
public static GloballyDistinct globally() {
return GloballyDistinct.builder().build();
}
/**
* Like {@link #globally} but per key, i.e computes the approximate number of distinct values per
* key in a {@code PCollection>} and returns {@code PCollection>}.
*
* @param type of the keys mapping the elements
* @param type of the values being combined per key
*/
public static PerKeyDistinct perKey() {
return PerKeyDistinct.builder().build();
}
/**
* Implementation of {@link #globally()}.
*
* @param the type of the elements in the input {@link PCollection}
*/
@AutoValue
public abstract static class GloballyDistinct
extends PTransform, PCollection> {
abstract int precision();
abstract int sparsePrecision();
abstract Builder toBuilder();
static Builder builder() {
return new AutoValue_ApproximateDistinct_GloballyDistinct.Builder()
.setPrecision(12)
.setSparsePrecision(0);
}
@AutoValue.Builder
abstract static class Builder {
abstract Builder setPrecision(int p);
abstract Builder setSparsePrecision(int sp);
abstract GloballyDistinct build();
}
/**
* Sets the precision {@code p}.
*
* Keep in mind that {@code p} cannot be lower than 4, because the estimation would be too
* inaccurate.
*
*
See {@link ApproximateDistinct#precisionForRelativeError(double)} and {@link
* ApproximateDistinct#relativeErrorForPrecision(int)} to have more information about the
* relationship between precision and relative error.
*
* @param p the precision value for the normal representation
*/
public GloballyDistinct withPrecision(int p) {
return toBuilder().setPrecision(p).build();
}
/**
* Sets the sparse representation's precision {@code sp}.
*
* Values above 32 are not yet supported by the AddThis version of HyperLogLog+.
*
*
Fore more information about the sparse representation, read Google's paper available here.
*
* @param sp the precision of HyperLogLog+' sparse representation
*/
public GloballyDistinct withSparsePrecision(int sp) {
return toBuilder().setSparsePrecision(sp).build();
}
@Override
public PCollection expand(PCollection input) {
return input
.apply(
"Compute HyperLogLog Structure",
Combine.globally(
ApproximateDistinctFn.create(input.getCoder())
.withPrecision(this.precision())
.withSparseRepresentation(this.sparsePrecision())))
.apply("Retrieve Cardinality", ParDo.of(RetrieveCardinality.globally()));
}
}
/**
* Implementation of {@link #perKey()}.
*
* @param type of the keys mapping the elements
* @param type of the values being combined per key
*/
@AutoValue
public abstract static class PerKeyDistinct
extends PTransform>, PCollection>> {
abstract int precision();
abstract int sparsePrecision();
abstract Builder toBuilder();
static Builder builder() {
return new AutoValue_ApproximateDistinct_PerKeyDistinct.Builder()
.setPrecision(12)
.setSparsePrecision(0);
}
@AutoValue.Builder
abstract static class Builder {
abstract Builder setPrecision(int p);
abstract Builder setSparsePrecision(int sp);
abstract PerKeyDistinct build();
}
/**
* Sets the precision {@code p}.
*
* Keep in mind that {@code p} cannot be lower than 4, because the estimation would be too
* inaccurate.
*
*
See {@link ApproximateDistinct#precisionForRelativeError(double)} and {@link
* ApproximateDistinct#relativeErrorForPrecision(int)} to have more information about the
* relationship between precision and relative error.
*
* @param p the precision value for the normal representation
*/
public PerKeyDistinct withPrecision(int p) {
return toBuilder().setPrecision(p).build();
}
/**
* Sets the sparse representation's precision {@code sp}.
*
* Values above 32 are not yet supported by the AddThis version of HyperLogLog+.
*
*
Fore more information about the sparse representation, read Google's paper available here.
*
* @param sp the precision of HyperLogLog+' sparse representation
*/
public PerKeyDistinct withSparsePrecision(int sp) {
return toBuilder().setSparsePrecision(sp).build();
}
@Override
public PCollection> expand(PCollection> input) {
KvCoder inputCoder = (KvCoder) input.getCoder();
return input
.apply(
Combine.perKey(
ApproximateDistinctFn.create(inputCoder.getValueCoder())
.withPrecision(this.precision())
.withSparseRepresentation(this.sparsePrecision())))
.apply("Retrieve Cardinality", ParDo.of(RetrieveCardinality.perKey()));
}
}
/**
* Implements the {@link CombineFn} of {@link ApproximateDistinct} transforms.
*
* @param the type of the elements in the input {@link PCollection}
*/
public static class ApproximateDistinctFn
extends CombineFn {
private final int p;
private final int sp;
private final Coder inputCoder;
private ApproximateDistinctFn(int p, int sp, Coder coder) {
this.p = p;
this.sp = sp;
inputCoder = coder;
}
/**
* Returns an {@link ApproximateDistinctFn} combiner with the given input coder.
*
* @param coder the coder that encodes the elements' type
*/
public static ApproximateDistinctFn create(Coder coder) {
try {
coder.verifyDeterministic();
} catch (Coder.NonDeterministicException e) {
throw new IllegalArgumentException(
"Coder must be deterministic to perform this sketch." + e.getMessage(), e);
}
return new ApproximateDistinctFn<>(12, 0, coder);
}
/**
* Returns an {@link ApproximateDistinctFn} combiner with a new precision {@code p}.
*
* Keep in mind that {@code p} cannot be lower than 4, because the estimation would be too
* inaccurate.
*
*
See {@link ApproximateDistinct#precisionForRelativeError(double)} and {@link
* ApproximateDistinct#relativeErrorForPrecision(int)} to have more information about the
* relationship between precision and relative error.
*
* @param p the precision value for the normal representation
*/
public ApproximateDistinctFn withPrecision(int p) {
checkArgument(p >= 4, "Expected: p >= 4. Actual: p = %s", p);
return new ApproximateDistinctFn<>(p, this.sp, this.inputCoder);
}
/**
* Returns an {@link ApproximateDistinctFn} combiner with a new sparse representation's
* precision {@code sp}.
*
* Values above 32 are not yet supported by the AddThis version of HyperLogLog+.
*
*
Fore more information about the sparse representation, read Google's paper available here.
*
* @param sp the precision of HyperLogLog+' sparse representation
*/
public ApproximateDistinctFn withSparseRepresentation(int sp) {
checkArgument(
(sp > this.p && sp < 32) || (sp == 0),
"Expected: p <= sp <= 32." + "Actual: p = %s, sp = %s",
this.p,
sp);
return new ApproximateDistinctFn<>(this.p, sp, this.inputCoder);
}
@Override
public HyperLogLogPlus createAccumulator() {
return new HyperLogLogPlus(p, sp);
}
@Override
public HyperLogLogPlus addInput(HyperLogLogPlus acc, InputT record) {
try {
acc.offer(CoderUtils.encodeToByteArray(inputCoder, record));
} catch (CoderException e) {
throw new IllegalStateException("The input value cannot be encoded: " + e.getMessage(), e);
}
return acc;
}
/** Output the whole structure so it can be queried, reused or stored easily. */
@Override
public HyperLogLogPlus extractOutput(HyperLogLogPlus accumulator) {
return accumulator;
}
@Override
public HyperLogLogPlus mergeAccumulators(Iterable accumulators) {
HyperLogLogPlus mergedAccum = createAccumulator();
for (HyperLogLogPlus accum : accumulators) {
try {
mergedAccum.addAll(accum);
} catch (CardinalityMergeException e) {
// Should never happen because only HyperLogLogPlus accumulators are instantiated.
throw new IllegalStateException(
"The accumulators cannot be merged: " + e.getMessage(), e);
}
}
return mergedAccum;
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.add(DisplayData.item("p", p).withLabel("precision"))
.add(DisplayData.item("sp", sp).withLabel("sparse representation precision"));
}
}
/** Coder for {@link HyperLogLogPlus} class. */
public static class HyperLogLogPlusCoder extends CustomCoder {
private static final HyperLogLogPlusCoder INSTANCE = new HyperLogLogPlusCoder();
private static final ByteArrayCoder BYTE_ARRAY_CODER = ByteArrayCoder.of();
public static HyperLogLogPlusCoder of() {
return INSTANCE;
}
@Override
public void encode(HyperLogLogPlus value, OutputStream outStream) throws IOException {
if (value == null) {
throw new CoderException("cannot encode a null HyperLogLogPlus sketch");
}
BYTE_ARRAY_CODER.encode(value.getBytes(), outStream);
}
@Override
public HyperLogLogPlus decode(InputStream inStream) throws IOException {
return HyperLogLogPlus.Builder.build(BYTE_ARRAY_CODER.decode(inStream));
}
@Override
public boolean isRegisterByteSizeObserverCheap(HyperLogLogPlus value) {
return true;
}
@Override
protected long getEncodedElementByteSize(HyperLogLogPlus value) throws IOException {
if (value == null) {
throw new CoderException("cannot encode a null HyperLogLogPlus sketch");
}
return value.sizeof();
}
}
/**
* Utility class that provides {@link DoFn}s to retrieve the cardinality from a {@link
* HyperLogLogPlus} structure in a global or perKey context.
*/
private static class RetrieveCardinality {
private static DoFn, KV> perKey() {
return new DoFn, KV>() {
@ProcessElement
public void processElement(ProcessContext c) {
KV kv = c.element();
c.output(KV.of(kv.getKey(), kv.getValue().cardinality()));
}
};
}
private static DoFn globally() {
return new DoFn() {
@ProcessElement
public void apply(ProcessContext c) {
c.output(c.element().cardinality());
}
};
}
}
/**
* Computes the precision based on the desired relative error.
*
* According to the paper, the mean squared error is bounded by the following formula:
*
*
b(m) / sqrt(m)
* Where m is the number of buckets used ({@code p = log2(m)})
* and {@code b(m) < 1.106} for {@code m > 16 (and p > 4)}.
*
*
*
* WARNING:
* This does not mean relative error in the estimation can't be higher.
* This only means that on average the relative error will be lower than the desired relative
* error.
* Nevertheless, the more elements arrive in the {@link PCollection}, the lower the variation will
* be.
* Indeed, this is like when you throw a dice millions of time: the relative frequency of each
* different result {1,2,3,4,5,6}
will get closer to {@code 1/6}.
*
* @param relativeError the mean squared error should be in the interval ]0,1]
* @return the minimum precision p in order to have the desired relative error on average.
*/
public static long precisionForRelativeError(double relativeError) {
return Math.round(
Math.ceil(Math.log(Math.pow(1.106, 2.0) / Math.pow(relativeError, 2.0)) / Math.log(2)));
}
/**
* @param p the precision i.e. the number of bits used for indexing the buckets
* @return the Mean squared error of the Estimation of cardinality to expect for the given value
* of p.
*/
public static double relativeErrorForPrecision(int p) {
if (p < 4) {
return 1.0;
}
double betaM;
switch (p) {
case 4:
betaM = 1.156;
break;
case 5:
betaM = 1.2;
break;
case 6:
betaM = 1.104;
break;
case 7:
betaM = 1.096;
break;
default:
betaM = 1.05;
break;
}
return betaM / Math.sqrt(Math.exp(p * Math.log(2)));
}
}