/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.operator.aggregation;

import com.clearspring.analytics.stream.Counter;
import com.clearspring.analytics.stream.StreamSummary;
import com.clearspring.analytics.util.ListNode2;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceInput;

import java.util.List;
import java.util.Map;

import static com.google.common.base.Preconditions.checkArgument;
import static io.airlift.slice.SizeOf.instanceSize;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;

/**
 * Approximately computes the histogram of the top-k elements, based on the
 * Space-Saving algorithm and the Stream-Summary data structure described in
 * "Efficient Computation of Frequent and Top-k Elements in Data Streams"
 * by Metwally, Agrawal, and El Abbadi.
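 * <p>
 * A minimal usage sketch (illustrative only: the lambda shapes are inferred from the
 * call sites in this class, and the key type, bucket count, and capacity are made up):
 * <pre>{@code
 * ApproximateMostFrequentHistogram<Long> histogram = new ApproximateMostFrequentHistogram<>(
 *         3,  // maxBuckets: at most 3 buckets in the output
 *         15, // capacity: at most 15 distinct keys tracked internally
 *         (key, count, output) -> {
 *             output.appendLong(key);   // fixed-width key
 *             output.appendLong(count); // count
 *         },
 *         (input, h) -> h.add(input.readLong(), input.readLong()));
 * histogram.add(42L);
 * histogram.add(42L, 10);
 * Slice snapshot = histogram.serialize();
 * }</pre>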
 */
public class ApproximateMostFrequentHistogram<K>
{
    private static final byte FORMAT_TAG = 0;
    private static final int INSTANCE_SIZE = instanceSize(ApproximateMostFrequentHistogram.class);
    private static final int STREAM_SUMMARY_SIZE = instanceSize(StreamSummary.class);
    private static final int LIST_NODE2_SIZE = instanceSize(ListNode2.class);
    private static final int COUNTER_SIZE = instanceSize(Counter.class);

    private final StreamSummary<K> streamSummary;
    private final int maxBuckets;
    private final int capacity;
    private final ApproximateMostFrequentBucketSerializer<K> serializer;
    private final ApproximateMostFrequentBucketDeserializer<K> deserializer;

    /**
     * @param maxBuckets the maximum number of buckets returned or serialized from the histogram
     * @param capacity the maximum number of counters tracked by the stream summary data structure
     * @param serializer serializes a bucket into a varbinary slice
     * @param deserializer deserializes a bucket and appends it to the histogram
     */
    public ApproximateMostFrequentHistogram(
            int maxBuckets,
            int capacity,
            ApproximateMostFrequentBucketSerializer<K> serializer,
            ApproximateMostFrequentBucketDeserializer<K> deserializer)
    {
        this.streamSummary = new StreamSummary<>(capacity);
        this.maxBuckets = maxBuckets;
        this.capacity = capacity;
        this.serializer = requireNonNull(serializer, "serializer is null");
        this.deserializer = requireNonNull(deserializer, "deserializer is null");
    }
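
    /**
     * Reconstructs a histogram from a slice previously produced by {@link #serialize()},
     * replaying each serialized bucket through the supplied deserializer.
     */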
    public ApproximateMostFrequentHistogram(
            Slice serialized,
            ApproximateMostFrequentBucketSerializer<K> serializer,
            ApproximateMostFrequentBucketDeserializer<K> deserializer)
    {
        SliceInput input = serialized.getInput();

        checkArgument(input.readByte() == FORMAT_TAG, "Unsupported format tag");
        this.maxBuckets = input.readInt();
        this.capacity = input.readInt();
        int bucketCount = input.readInt();
        this.streamSummary = new StreamSummary<>(capacity);
        this.serializer = serializer;
        this.deserializer = deserializer;

        for (int i = 0; i < bucketCount; i++) {
            this.deserializer.deserialize(input, this);
        }
    }

    public void add(K value)
    {
        streamSummary.offer(value);
    }

    public void add(K value, long incrementCount)
    {
        streamSummary.offer(value, toIntExact(incrementCount));
    }
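
    /**
     * Serializes the top {@code maxBuckets} buckets. As written below, the layout is a
     * one-byte format tag followed by {@code maxBuckets}, {@code capacity}, and the
     * bucket count as 4-byte ints, then each bucket in the serializer-defined encoding.
     */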
    public Slice serialize()
    {
        List<Counter<K>> counters = streamSummary.topK(maxBuckets);
        int estimatedSliceSize = Byte.BYTES + // FORMAT_TAG
                Integer.BYTES + // maxBuckets
                Integer.BYTES + // capacity
                Integer.BYTES + // number of buckets
                counters.size() * Long.BYTES * 2; // estimated bytes per item and count; an underestimate for variable-length items, but DynamicSliceOutput grows as needed
        DynamicSliceOutput output = new DynamicSliceOutput(estimatedSliceSize);

        output.appendByte(FORMAT_TAG);
        output.appendInt(maxBuckets);
        output.appendInt(capacity);
        output.appendInt(counters.size());

        // Serialize keys and counts.
        for (Counter<K> counter : counters) {
            serializer.serialize(counter.getItem(), counter.getCount(), output);
        }

        return output.slice();
    }
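
    /**
     * Merges {@code other} into this histogram by replaying its tracked counters
     * through {@link #add(Object, long)}. The result remains an approximation:
     * replayed counts may carry the other summary's overestimation error.
     */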
    public void merge(ApproximateMostFrequentHistogram<K> other)
    {
        List<Counter<K>> counters = other.streamSummary.topK(capacity);
        for (Counter<K> counter : counters) {
            add(counter.getItem(), counter.getCount());
        }
    }

    public void forEachBucket(BucketConsumer<K> consumer)
    {
        List<Counter<K>> counters = streamSummary.topK(maxBuckets);
        for (Counter<K> counter : counters) {
            consumer.process(counter.getItem(), counter.getCount());
        }
    }

    @VisibleForTesting
    public Map<K, Long> getBuckets()
    {
        ImmutableMap.Builder<K, Long> buckets = ImmutableMap.builder();
        forEachBucket(buckets::put);
        return buckets.buildOrThrow();
    }

    public long estimatedInMemorySize()
    {
        // Imperfect estimate of the size of the underlying StreamSummary.
        // TODO: reimplement StreamSummary with flat structures and proper size accounting
        return INSTANCE_SIZE +
                STREAM_SUMMARY_SIZE +
                streamSummary.size() * (LIST_NODE2_SIZE + COUNTER_SIZE + Long.BYTES); // Long.BYTES as a proxy for the size of K
    }
}