All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.beam.sdk.extensions.sketching.TDigestQuantiles Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.extensions.sketching;

import com.google.auto.value.AutoValue;
import com.tdunning.math.stats.MergingDigest;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.Iterator;
import org.apache.beam.sdk.coders.ByteArrayCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.CustomCoder;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

/**
 * {@code PTransform}s for getting information about quantiles in a stream.
 *
 * 

This class uses the T-Digest structure introduced by Ted Dunning, and more precisely the * {@link MergingDigest} implementation. * *

References

* *

The paper and implementation are available on Ted Dunning's Github profile * *

Parameters

* *

Only one parameter can be tuned in order to control the tradeoff between the estimation * accuracy and the memory use.
* *

Stream elements are compressed into a linked list of centroids. The compression factor {@code * cf} is used to limit the number of elements represented by each centroid as well as the total * number of centroids.
* The relative error will always be a small fraction of 1% for values at extreme quantiles and * always be less than 3/cf at middle quantiles.
* *

By default the compression factor is set to 100, which guarantees a relative error less than * 3%. * *

Examples

* *

There are 2 ways of using this class: * *

    *
  • Use the {@link PTransform}s that return a {@link PCollection} which contains a {@link * MergingDigest} for querying the value at a given quantile or the approximate quantile * position of an element. *
  • Use the {@link TDigestQuantilesFn} {@code CombineFn} that is exposed in order to make * advanced processing involving the {@link MergingDigest}. *
* *

Example 1: Default use

* *

The simplest use is to call the {@link #globally()} or {@link #perKey()} method in order to * retrieve the digest, and then to query the structure. * *


 * {@literal PCollection} pc = ...;
 * {@literal PCollection} countMinSketch = pc.apply(TDigestQuantiles
 *         .globally()); // .perKey()
 * 
* *

Example 2: tune accuracy parameters

* *

One can tune the compression factor {@code cf} in order to control accuracy and memory.
* This tuning works exactly the same for {@link #globally()} and {@link #perKey()}. * *


 *  double cf = 500;
 * {@literal PCollection} pc = ...;
 * {@literal PCollection} countMinSketch = pc.apply(TDigestQuantiles
 *         .globally() // .perKey()
 *         .withCompression(cf);
 * 
* *

Example 3 : Query the resulting structure

* *

This example shows how to query the resulting structure, for example to build {@code * PCollection} of {@link KV}s with each pair corresponding to a couple (quantile, value). * *


 * {@literal PCollection} pc = ...;
 * {@literal PCollection>} quantiles = pc.apply(ParDo.of(
 *        {@literal new DoFn>()} {
 *          {@literal @ProcessElement}
 *           public void processElement(ProcessContext c) {
 *             double[] quantiles = {0.01, 0.25, 0.5, 0.75, 0.99}
 *             for (double q : quantiles) {
 *                c.output(KV.of(q, c.element().quantile(q));
 *             }
 *           }}));
 * 
* *

One can also retrieve the approximate quantile position of a given element in the stream using * {@code cdf(double)} method instead of {@code quantile(double)}. * *

Example 4: Using the CombineFn

* *

The {@code CombineFn} does the same thing as the {@code PTransform}s but it can be used for * doing stateful processing or in {@link * org.apache.beam.sdk.transforms.CombineFns.ComposedCombineFn}. * *

This example is not really interesting but it shows how one can properly create a {@link * TDigestQuantilesFn}. * *


 *  double cf = 250;
 * {@literal PCollection} input = ...;
 * {@literal PCollection} output = input.apply(Combine
 *         .globally(TDigestQuantilesFn.create(cf)));
 * 
*/ @SuppressWarnings({ "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) }) public final class TDigestQuantiles { /** * Compute the stream in order to build a T-Digest structure (MergingDigest) for keeping track of * the stream distribution and returns a {@code PCollection}.
* The resulting structure can be queried in order to retrieve the approximate value at a given * quantile or the approximate quantile position of a given element. */ public static GlobalDigest globally() { return GlobalDigest.builder().build(); } /** * Like {@link #globally()}, but builds a digest for each key in the stream. * * @param the type of the keys */ public static PerKeyDigest perKey() { return PerKeyDigest.builder().build(); } /** Implementation of {@link #globally()}. */ @AutoValue public abstract static class GlobalDigest extends PTransform, PCollection> { abstract double compression(); abstract Builder toBuilder(); static Builder builder() { return new AutoValue_TDigestQuantiles_GlobalDigest.Builder().setCompression(100); } @AutoValue.Builder abstract static class Builder { abstract Builder setCompression(double cf); abstract GlobalDigest build(); } /** * Sets the compression factor {@code cf}. * *

Keep in mind that a compression factor {@code cf} of c guarantees a relative error less * than 3/c at mid quantiles.
* The accuracy will always be significantly less than 1% at extreme quantiles. * * @param cf the bound value for centroid and digest sizes. */ public GlobalDigest withCompression(double cf) { return toBuilder().setCompression(cf).build(); } @Override public PCollection expand(PCollection input) { return input.apply( "Compute T-Digest Structure", Combine.globally(TDigestQuantilesFn.create(this.compression()))); } } /** Implementation of {@link #perKey()}. */ @AutoValue public abstract static class PerKeyDigest extends PTransform>, PCollection>> { abstract double compression(); abstract Builder toBuilder(); static Builder builder() { return new AutoValue_TDigestQuantiles_PerKeyDigest.Builder().setCompression(100); } @AutoValue.Builder abstract static class Builder { abstract Builder setCompression(double cf); abstract PerKeyDigest build(); } /** * Sets the compression factor {@code cf}. * *

Keep in mind that a compression factor {@code cf} of c guarantees a relative error less * than 3/c at mid quantiles.
* The accuracy will always be significantly less than 1% at extreme quantiles. * * @param cf the bound value for centroid and digest sizes. */ public PerKeyDigest withCompression(double cf) { return toBuilder().setCompression(cf).build(); } @Override public PCollection> expand(PCollection> input) { return input.apply( "Compute T-Digest Structure", Combine.perKey(TDigestQuantilesFn.create(this.compression()))); } } /** Implements the {@link Combine.CombineFn} of {@link TDigestQuantiles} transforms. */ public static class TDigestQuantilesFn extends Combine.CombineFn { private final double compression; private TDigestQuantilesFn(double compression) { this.compression = compression; } /** * Returns {@link TDigestQuantilesFn} combiner with the given compression factor. * *

Keep in mind that a compression factor {@code cf} of c guarantees a relative error less * than 3/c at mid quantiles.
* The accuracy will always be significantly less than 1% at extreme quantiles. * * @param compression the bound value for centroid and digest sizes. */ public static TDigestQuantilesFn create(double compression) { if (compression > 0) { return new TDigestQuantilesFn(compression); } throw new IllegalArgumentException("Compression factor should be greater than 0."); } @Override public MergingDigest createAccumulator() { return new MergingDigest(compression); } @Override public MergingDigest addInput(MergingDigest accum, Double value) { accum.add(value); return accum; } /** Output the whole structure so it can be queried, reused or stored easily. */ @Override public MergingDigest extractOutput(MergingDigest accum) { return accum; } @Override public MergingDigest mergeAccumulators(Iterable accumulators) { Iterator it = accumulators.iterator(); MergingDigest merged = it.next(); while (it.hasNext()) { merged.add(it.next()); } return merged; } @Override public Coder getAccumulatorCoder(CoderRegistry registry, Coder inputCoder) { return new MergingDigestCoder(); } @Override public Coder getDefaultOutputCoder(CoderRegistry registry, Coder inputCoder) { return new MergingDigestCoder(); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.add(DisplayData.item("compression", compression).withLabel("Compression factor")); } } /** Coder for {@link MergingDigest} class. */ static class MergingDigestCoder extends CustomCoder { private static final ByteArrayCoder BYTE_ARRAY_CODER = ByteArrayCoder.of(); @Override public void encode(MergingDigest value, OutputStream outStream) throws IOException { if (value == null) { throw new CoderException("cannot encode a null T-Digest sketch"); } ByteBuffer buf = ByteBuffer.allocate(value.byteSize()); value.asBytes(buf); BYTE_ARRAY_CODER.encode(buf.array(), outStream); } @Override public MergingDigest decode(InputStream inStream) throws IOException { byte[] bytes = BYTE_ARRAY_CODER.decode(inStream); ByteBuffer buf = ByteBuffer.wrap(bytes); return MergingDigest.fromBytes(buf); } @Override public boolean isRegisterByteSizeObserverCheap(MergingDigest value) { return true; } @Override protected long getEncodedElementByteSize(MergingDigest value) throws IOException { if (value == null) { throw new CoderException("cannot encode a null T-Digest sketch"); } return value.byteSize(); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy