/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.operator.aggregation;

import com.clearspring.analytics.stream.Counter;
import com.clearspring.analytics.stream.StreamSummary;
import com.clearspring.analytics.util.ListNode2;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceInput;

import java.util.List;
import java.util.Map;

import static com.google.common.base.Preconditions.checkArgument;
import static io.airlift.slice.SizeOf.instanceSize;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;

/**
 * Approximately computes the histogram of the top-k elements, based on the
 * Space-Saving algorithm and the Stream-Summary data structure described in
 * "Efficient Computation of Frequent and Top-k Elements in Data Streams"
 * by Metwally, Agrawal, and El Abbadi.
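 * <p>
 * A minimal usage sketch (illustrative only: the lambda shapes are inferred from the
 * call sites in this class, and the key type, bucket count, and capacity are made up):
 * <pre>{@code
 * ApproximateMostFrequentHistogram<Long> histogram = new ApproximateMostFrequentHistogram<>(
 *         3,  // maxBuckets: at most 3 buckets in the output
 *         15, // capacity: at most 15 distinct keys tracked internally
 *         (key, count, output) -> {
 *             output.appendLong(key);   // fixed-width key
 *             output.appendLong(count); // count
 *         },
 *         (input, h) -> h.add(input.readLong(), input.readLong()));
 * histogram.add(42L);
 * histogram.add(42L, 10);
 * Slice snapshot = histogram.serialize();
 * }</pre>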
 */
public class ApproximateMostFrequentHistogram<K>
{
    private static final byte FORMAT_TAG = 0;
    private static final int INSTANCE_SIZE = instanceSize(ApproximateMostFrequentHistogram.class);
    private static final int STREAM_SUMMARY_SIZE = instanceSize(StreamSummary.class);
    private static final int LIST_NODE2_SIZE = instanceSize(ListNode2.class);
    private static final int COUNTER_SIZE = instanceSize(Counter.class);

    private final StreamSummary<K> streamSummary;
    private final int maxBuckets;
    private final int capacity;
    private final ApproximateMostFrequentBucketSerializer<K> serializer;
    private final ApproximateMostFrequentBucketDeserializer<K> deserializer;

    /**
     * @param maxBuckets the maximum number of buckets returned or serialized from the histogram
     * @param capacity the maximum number of counters tracked by the stream summary data structure
     * @param serializer serializes a bucket into a varbinary slice
     * @param deserializer deserializes a bucket and appends it to the histogram
     */
    public ApproximateMostFrequentHistogram(
            int maxBuckets,
            int capacity,
            ApproximateMostFrequentBucketSerializer<K> serializer,
            ApproximateMostFrequentBucketDeserializer<K> deserializer)
    {
        this.streamSummary = new StreamSummary<>(capacity);
        this.maxBuckets = maxBuckets;
        this.capacity = capacity;
        this.serializer = requireNonNull(serializer, "serializer is null");
        this.deserializer = requireNonNull(deserializer, "deserializer is null");
    }
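
    /**
     * Reconstructs a histogram from a slice previously produced by {@link #serialize()},
     * replaying each serialized bucket through the supplied deserializer.
     */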
    public ApproximateMostFrequentHistogram(
            Slice serialized,
            ApproximateMostFrequentBucketSerializer<K> serializer,
            ApproximateMostFrequentBucketDeserializer<K> deserializer)
    {
        SliceInput input = serialized.getInput();

        checkArgument(input.readByte() == FORMAT_TAG, "Unsupported format tag");
        this.maxBuckets = input.readInt();
        this.capacity = input.readInt();
        int bucketCount = input.readInt();
        this.streamSummary = new StreamSummary<>(capacity);
        this.serializer = serializer;
        this.deserializer = deserializer;

        for (int i = 0; i < bucketCount; i++) {
            this.deserializer.deserialize(input, this);
        }
    }

    public void add(K value)
    {
        streamSummary.offer(value);
    }

    public void add(K value, long incrementCount)
    {
        streamSummary.offer(value, toIntExact(incrementCount));
    }
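
    /**
     * Serializes the top {@code maxBuckets} buckets. As written below, the layout is a
     * one-byte format tag followed by {@code maxBuckets}, {@code capacity}, and the
     * bucket count as 4-byte ints, then each bucket in the serializer-defined encoding.
     */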
    public Slice serialize()
    {
        List<Counter<K>> counters = streamSummary.topK(maxBuckets);
        int estimatedSliceSize = Byte.BYTES + // FORMAT_TAG
                Integer.BYTES + // maxBuckets
                Integer.BYTES + // capacity
                Integer.BYTES + // number of buckets
                counters.size() * Long.BYTES * 2; // estimated bytes per item and count; an underestimate for variable-length items, but DynamicSliceOutput grows as needed
        DynamicSliceOutput output = new DynamicSliceOutput(estimatedSliceSize);

        output.appendByte(FORMAT_TAG);
        output.appendInt(maxBuckets);
        output.appendInt(capacity);
        output.appendInt(counters.size());

        // Serialize keys and counts.
        for (Counter<K> counter : counters) {
            serializer.serialize(counter.getItem(), counter.getCount(), output);
        }

        return output.slice();
    }
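
    /**
     * Merges {@code other} into this histogram by replaying its tracked counters
     * through {@link #add(Object, long)}. The result remains an approximation:
     * replayed counts may carry the other summary's overestimation error.
     */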
    public void merge(ApproximateMostFrequentHistogram<K> other)
    {
        List<Counter<K>> counters = other.streamSummary.topK(capacity);
        for (Counter<K> counter : counters) {
            add(counter.getItem(), counter.getCount());
        }
    }

    public void forEachBucket(BucketConsumer<K> consumer)
    {
        List<Counter<K>> counters = streamSummary.topK(maxBuckets);
        for (Counter<K> counter : counters) {
            consumer.process(counter.getItem(), counter.getCount());
        }
    }

    @VisibleForTesting
    public Map<K, Long> getBuckets()
    {
        ImmutableMap.Builder<K, Long> buckets = ImmutableMap.builder();
        forEachBucket(buckets::put);
        return buckets.buildOrThrow();
    }

    public long estimatedInMemorySize()
    {
        // Imperfect estimate of the size of the underlying StreamSummary.
        // TODO: reimplement StreamSummary with flat structures and proper size accounting
        return INSTANCE_SIZE +
                STREAM_SUMMARY_SIZE +
                streamSummary.size() * (LIST_NODE2_SIZE + COUNTER_SIZE + Long.BYTES); // Long.BYTES as a proxy for the size of K
    }
}