/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.search.aggregations.bucket.histogram;
import org.apache.lucene.util.PriorityQueue;
import org.opensearch.common.io.stream.StreamInput;
import org.opensearch.common.io.stream.StreamOutput;
import org.opensearch.common.xcontent.XContentBuilder;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.aggregations.Aggregations;
import org.opensearch.search.aggregations.InternalAggregation;
import org.opensearch.search.aggregations.InternalAggregations;
import org.opensearch.search.aggregations.InternalMultiBucketAggregation;
import org.opensearch.search.aggregations.KeyComparable;
import org.opensearch.search.aggregations.bucket.IteratorAndCurrent;
import org.opensearch.search.aggregations.bucket.MultiBucketsAggregation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
public class InternalVariableWidthHistogram extends InternalMultiBucketAggregation<
InternalVariableWidthHistogram,
InternalVariableWidthHistogram.Bucket> implements Histogram, HistogramFactory {
public static class Bucket extends InternalMultiBucketAggregation.InternalBucket implements Histogram.Bucket, KeyComparable<Bucket> {
public static class BucketBounds {
public double min;
public double max;
public BucketBounds(double min, double max) {
assert min <= max;
this.min = min;
this.max = max;
}
public BucketBounds(StreamInput in) throws IOException {
this(in.readDouble(), in.readDouble());
}
public void writeTo(StreamOutput out) throws IOException {
out.writeDouble(min);
out.writeDouble(max);
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null || getClass() != obj.getClass()) return false;
BucketBounds that = (BucketBounds) obj;
return min == that.min && max == that.max;
}
@Override
public int hashCode() {
return Objects.hash(getClass(), min, max);
}
}
private final BucketBounds bounds;
private long docCount;
private InternalAggregations aggregations;
protected final transient DocValueFormat format;
private double centroid;
public Bucket(double centroid, BucketBounds bounds, long docCount, DocValueFormat format, InternalAggregations aggregations) {
this.format = format;
this.centroid = centroid;
this.bounds = bounds;
this.docCount = docCount;
this.aggregations = aggregations;
}
/**
* Read from a stream.
*/
public Bucket(StreamInput in, DocValueFormat format) throws IOException {
this.format = format;
centroid = in.readDouble();
docCount = in.readVLong();
bounds = new BucketBounds(in);
aggregations = InternalAggregations.readFrom(in);
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeDouble(centroid);
out.writeVLong(docCount);
bounds.writeTo(out);
aggregations.writeTo(out);
}
@Override
public boolean equals(Object obj) {
if (obj == null || obj.getClass() != InternalVariableWidthHistogram.Bucket.class) {
return false;
}
InternalVariableWidthHistogram.Bucket that = (InternalVariableWidthHistogram.Bucket) obj;
return centroid == that.centroid
&& bounds.equals(that.bounds)
&& docCount == that.docCount
&& Objects.equals(aggregations, that.aggregations);
}
@Override
public int hashCode() {
return Objects.hash(getClass(), centroid, bounds, docCount, aggregations);
}
@Override
public String getKeyAsString() {
return format.format((double) getKey()).toString();
}
/**
* Buckets are compared using their centroids. But, in the final XContent returned by the aggregation,
* we want the bucket's key to be its min. Otherwise, it would look like the distances between centroids
* are buckets, which is incorrect.
*/
@Override
public Object getKey() {
return centroid;
}
public double min() {
return bounds.min;
}
public double max() {
return bounds.max;
}
public double centroid() {
return centroid;
}
@Override
public long getDocCount() {
return docCount;
}
@Override
public Aggregations getAggregations() {
return aggregations;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
String keyAsString = format.format((double) getKey()).toString();
builder.startObject();
builder.field(CommonFields.MIN.getPreferredName(), min());
if (format != DocValueFormat.RAW) {
builder.field(CommonFields.MIN_AS_STRING.getPreferredName(), format.format(min()));
}
builder.field(CommonFields.KEY.getPreferredName(), getKey());
if (format != DocValueFormat.RAW) {
builder.field(CommonFields.KEY_AS_STRING.getPreferredName(), keyAsString);
}
builder.field(CommonFields.MAX.getPreferredName(), max());
if (format != DocValueFormat.RAW) {
builder.field(CommonFields.MAX_AS_STRING.getPreferredName(), format.format(max()));
}
builder.field(CommonFields.DOC_COUNT.getPreferredName(), docCount);
aggregations.toXContentInternal(builder, params);
builder.endObject();
return builder;
}
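// Illustrative output shape, not part of the original source: a bucket with min=0.5, centroid=1.2, max=2.0
// and 7 docs serializes roughly as {"min": 0.5, "key": 1.2, "max": 2.0, "doc_count": 7, ...}, with the
// *_as_string variants emitted only when the bucket has a DocValueFormat other than RAW.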
@Override
public int compareKey(InternalVariableWidthHistogram.Bucket other) {
return Double.compare(centroid, other.centroid); // Use centroid for bucket ordering
}
public DocValueFormat getFormatter() {
return format;
}
}
static class EmptyBucketInfo {
final InternalAggregations subAggregations;
EmptyBucketInfo(InternalAggregations subAggregations) {
this.subAggregations = subAggregations;
}
EmptyBucketInfo(StreamInput in) throws IOException {
this(InternalAggregations.readFrom(in));
}
public void writeTo(StreamOutput out) throws IOException {
subAggregations.writeTo(out);
}
@Override
public boolean equals(Object obj) {
if (obj == null || getClass() != obj.getClass()) {
return false;
}
EmptyBucketInfo that = (EmptyBucketInfo) obj;
return Objects.equals(subAggregations, that.subAggregations);
}
@Override
public int hashCode() {
return Objects.hash(getClass(), subAggregations);
}
}
private List<Bucket> buckets;
private final DocValueFormat format;
private final int targetNumBuckets;
final EmptyBucketInfo emptyBucketInfo;
InternalVariableWidthHistogram(
String name,
List<Bucket> buckets,
EmptyBucketInfo emptyBucketInfo,
int targetNumBuckets,
DocValueFormat formatter,
Map<String, Object> metaData
) {
super(name, metaData);
this.buckets = buckets;
this.emptyBucketInfo = emptyBucketInfo;
this.format = formatter;
this.targetNumBuckets = targetNumBuckets;
}
/**
* Read from a stream.
*/
public InternalVariableWidthHistogram(StreamInput in) throws IOException {
super(in);
emptyBucketInfo = new EmptyBucketInfo(in);
format = in.readNamedWriteable(DocValueFormat.class);
buckets = in.readList(stream -> new Bucket(stream, format));
targetNumBuckets = in.readVInt();
}
@Override
protected void doWriteTo(StreamOutput out) throws IOException {
emptyBucketInfo.writeTo(out);
out.writeNamedWriteable(format);
out.writeList(buckets);
out.writeVInt(targetNumBuckets);
}
@Override
public String getWriteableName() {
return VariableWidthHistogramAggregationBuilder.NAME;
}
@Override
public List<Bucket> getBuckets() {
return Collections.unmodifiableList(buckets);
}
DocValueFormat getFormatter() {
return format;
}
public int getTargetBuckets() {
return targetNumBuckets;
}
public EmptyBucketInfo getEmptyBucketInfo() {
return emptyBucketInfo;
}
@Override
public InternalVariableWidthHistogram create(List<Bucket> buckets) {
return new InternalVariableWidthHistogram(name, buckets, emptyBucketInfo, targetNumBuckets, format, metadata);
}
@Override
public Bucket createBucket(InternalAggregations aggregations, Bucket prototype) {
return new Bucket(prototype.centroid, prototype.bounds, prototype.docCount, prototype.format, aggregations);
}
@Override
public Bucket createBucket(Number key, long docCount, InternalAggregations aggregations) {
return new Bucket(key.doubleValue(), new Bucket.BucketBounds(key.doubleValue(), key.doubleValue()), docCount, format, aggregations);
}
@Override
public Number getKey(MultiBucketsAggregation.Bucket bucket) {
return ((Bucket) bucket).centroid;
}
@Override
public Number nextKey(Number key) {
return nextKey(key.doubleValue());
}
/**
* This method should not be called for this specific subclass of InternalHistogram, since there should not be
* empty buckets when clustering.
*/
private double nextKey(double key) {
return key + 1;
}
@Override
protected Bucket reduceBucket(List<Bucket> buckets, ReduceContext context) {
List<InternalAggregations> aggregations = new ArrayList<>(buckets.size());
long docCount = 0;
double min = Double.POSITIVE_INFINITY;
double max = Double.NEGATIVE_INFINITY;
double sum = 0;
for (InternalVariableWidthHistogram.Bucket bucket : buckets) {
docCount += bucket.docCount;
min = Math.min(min, bucket.bounds.min);
max = Math.max(max, bucket.bounds.max);
sum += bucket.docCount * bucket.centroid;
aggregations.add((InternalAggregations) bucket.getAggregations());
}
InternalAggregations aggs = InternalAggregations.reduce(aggregations, context);
double centroid = sum / docCount;
Bucket.BucketBounds bounds = new Bucket.BucketBounds(min, max);
return new Bucket(centroid, bounds, docCount, format, aggs);
}
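// Illustrative example, not part of the original source: reducing two same-key shard buckets
// {centroid=2.0, docCount=4, bounds=[1.0, 3.0]} and {centroid=6.0, docCount=2, bounds=[5.0, 7.0]}
// yields docCount = 6, bounds = [1.0, 7.0] and centroid = (2.0*4 + 6.0*2) / 6 = 20.0 / 6 ≈ 3.33.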
public List<Bucket> reduceBuckets(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
PriorityQueue<IteratorAndCurrent<Bucket>> pq = new PriorityQueue<IteratorAndCurrent<Bucket>>(aggregations.size()) {
@Override
protected boolean lessThan(IteratorAndCurrent<Bucket> a, IteratorAndCurrent<Bucket> b) {
return Double.compare(a.current().centroid, b.current().centroid) < 0;
}
};
for (InternalAggregation aggregation : aggregations) {
InternalVariableWidthHistogram histogram = (InternalVariableWidthHistogram) aggregation;
if (histogram.buckets.isEmpty() == false) {
pq.add(new IteratorAndCurrent<>(histogram.buckets.iterator()));
}
}
List<Bucket> reducedBuckets = new ArrayList<>();
if (pq.size() > 0) {
double key = pq.top().current().centroid();
// list of buckets coming from different shards that have the same key
List<Bucket> currentBuckets = new ArrayList<>();
do {
IteratorAndCurrent<Bucket> top = pq.top();
if (Double.compare(top.current().centroid(), key) != 0) {
// The key changes, reduce what we already buffered and reset the buffer for current buckets.
final Bucket reduced = reduceBucket(currentBuckets, reduceContext);
reduceContext.consumeBucketsAndMaybeBreak(1);
reducedBuckets.add(reduced);
currentBuckets.clear();
key = top.current().centroid();
}
currentBuckets.add(top.current());
if (top.hasNext()) {
Bucket prev = top.current();
top.next();
assert top.current().compareKey(prev) >= 0 : "shards must return data sorted by centroid";
pq.updateTop();
} else {
pq.pop();
}
} while (pq.size() > 0);
if (currentBuckets.isEmpty() == false) {
final Bucket reduced = reduceBucket(currentBuckets, reduceContext);
reduceContext.consumeBucketsAndMaybeBreak(1);
reducedBuckets.add(reduced);
}
}
mergeBucketsIfNeeded(reducedBuckets, targetNumBuckets, reduceContext);
return reducedBuckets;
}
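// Illustrative walk-through, not part of the original source: given shard A with centroids [1.0, 5.0] and
// shard B with centroids [1.0, 9.0], the priority queue emits 1.0 (A), 1.0 (B), 5.0, 9.0; the two 1.0
// buckets share a key and are reduced into one, so reducedBuckets holds centroids [1.0, 5.0, 9.0] before
// mergeBucketsIfNeeded trims the list down to targetNumBuckets.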
class BucketRange {
int startIdx;
int endIdx;
/**
* These are optional utility fields
* They're useful for determining whether buckets should be merged
*/
double min;
double max;
double centroid;
long docCount;
public void mergeWith(BucketRange other) {
startIdx = Math.min(startIdx, other.startIdx);
endIdx = Math.max(endIdx, other.endIdx);
if (docCount + other.docCount > 0) {
// Avoids div by 0 error. This condition could be false if the optional docCount field was not set
centroid = ((centroid * docCount) + (other.centroid * other.docCount)) / (docCount + other.docCount);
docCount += other.docCount;
}
min = Math.min(min, other.min);
max = Math.max(max, other.max);
}
}
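// Illustrative example, not part of the original source: merging a range {centroid=1.0, docCount=3} with
// {centroid=5.0, docCount=1} gives centroid = (1.0*3 + 5.0*1) / (3 + 1) = 2.0 and docCount = 4, while
// startIdx/endIdx expand to cover both ranges and min/max take the outermost bounds.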
/**
* For each range {startIdx, endIdx} in ranges, all the buckets in that index range
* from buckets are merged, and this merged bucket replaces the entire range.
*/
private void mergeBucketsWithPlan(List<Bucket> buckets, List<BucketRange> plan, ReduceContext reduceContext) {
for (int i = plan.size() - 1; i >= 0; i--) {
BucketRange range = plan.get(i);
int endIdx = range.endIdx;
int startIdx = range.startIdx;
if (startIdx == endIdx) continue;
List<Bucket> toMerge = new ArrayList<>();
for (int idx = endIdx; idx > startIdx; idx--) {
toMerge.add(buckets.get(idx));
buckets.remove(idx);
}
toMerge.add(buckets.get(startIdx)); // Don't remove the startIdx bucket because it will be replaced by the merged bucket
int toRemove = toMerge.stream().mapToInt(b -> countInnerBucket(b) + 1).sum();
reduceContext.consumeBucketsAndMaybeBreak(-toRemove + 1);
Bucket merged_bucket = reduceBucket(toMerge, reduceContext);
buckets.set(startIdx, merged_bucket);
}
}
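// Illustrative example, not part of the original source: with 5 buckets and a plan containing the range
// {startIdx=1, endIdx=3}, buckets 3 and 2 are removed, bucket 1 is replaced by the reduction of buckets
// {1, 2, 3}, and the list shrinks from 5 to 3 entries. Iterating the plan in reverse keeps the indices of
// earlier ranges valid while later elements are removed.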
/**
* Makes a merge plan by simulating the merging of the two closest buckets, until the target number of buckets is reached.
* Distance is determined by centroid comparison.
* Then, this plan is actually executed and the underlying buckets are merged.
*
* Requires: buckets is sorted by centroid.
*/
private void mergeBucketsIfNeeded(List<Bucket> buckets, int targetNumBuckets, ReduceContext reduceContext) {
// Make a plan for getting the target number of buckets
// Each range represents a set of adjacent bucket indices of buckets that will be merged together
List<BucketRange> ranges = new ArrayList<>();
// Initialize each range to represent an individual bucket
for (int i = 0; i < buckets.size(); i++) {
// Since buckets is sorted by centroid, ranges will be as well
BucketRange range = new BucketRange();
range.centroid = buckets.get(i).centroid;
range.docCount = buckets.get(i).getDocCount();
range.startIdx = i;
range.endIdx = i;
ranges.add(range);
}
// Continually merge the two closest ranges until the target is reached
while (ranges.size() > targetNumBuckets) {
// Find two closest ranges (i.e. the two closest buckets after the previous merges are completed)
// We only have to make one pass through the list because it is sorted by centroid
int closestIdx = 0; // After this loop, (closestIdx, closestIdx + 1) will be the 2 closest buckets
double smallest_distance = Double.POSITIVE_INFINITY;
for (int i = 0; i < ranges.size() - 1; i++) {
double new_distance = ranges.get(i + 1).centroid - ranges.get(i).centroid; // Positive because buckets is sorted
if (new_distance < smallest_distance) {
closestIdx = i;
smallest_distance = new_distance;
}
}
// Merge the two closest ranges
ranges.get(closestIdx).mergeWith(ranges.get(closestIdx + 1));
ranges.remove(closestIdx + 1);
}
// Execute the plan (merge the underlying buckets)
mergeBucketsWithPlan(buckets, ranges, reduceContext);
}
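// Illustrative example, not part of the original source: with centroids [1.0, 2.0, 10.0, 11.5], equal doc
// counts and targetNumBuckets = 2, the first pass merges the closest pair (1.0, 2.0) into a range with
// centroid 1.5, the second pass merges (10.0, 11.5), and mergeBucketsWithPlan then produces two buckets.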
private void mergeBucketsWithSameMin(List<Bucket> buckets, ReduceContext reduceContext) {
// Create a merge plan
List<BucketRange> ranges = new ArrayList<>();
// Initialize each range to represent an individual bucket
for (int i = 0; i < buckets.size(); i++) {
BucketRange range = new BucketRange();
range.min = buckets.get(i).min();
range.startIdx = i;
range.endIdx = i;
ranges.add(range);
}
// Merge ranges with same min value
int i = 0;
while (i < ranges.size() - 1) {
BucketRange range = ranges.get(i);
BucketRange nextRange = ranges.get(i + 1);
if (range.min == nextRange.min) {
range.mergeWith(nextRange);
ranges.remove(i + 1);
} else {
i++;
}
}
// Execute the plan (merge the underlying buckets)
mergeBucketsWithPlan(buckets, ranges, reduceContext);
}
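// Illustrative example, not part of the original source: if the bucket mins are [0.0, 0.0, 4.0, 4.0, 9.0],
// each same-min pair collapses into one range, leaving three ranges; mergeBucketsWithPlan then replaces
// each same-min run with a single reduced bucket.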
/**
* When two adjacent buckets A, B overlap (A.max > B.min) then their boundary is set to
* the midpoint: (A.max + B.min) / 2.
*
* After this adjustment, A will contain more values than indicated and B will have less.
*/
private void adjustBoundsForOverlappingBuckets(List<Bucket> buckets, ReduceContext reduceContext) {
for (int i = 1; i < buckets.size(); i++) {
Bucket curBucket = buckets.get(i);
Bucket prevBucket = buckets.get(i - 1);
if (curBucket.bounds.min < prevBucket.bounds.max) {
// We don't want overlapping buckets --> Adjust their bounds
// TODO: Think of a fairer way to do this. Should prev.max = cur.min?
curBucket.bounds.min = (prevBucket.bounds.max + curBucket.bounds.min) / 2;
prevBucket.bounds.max = curBucket.bounds.min;
}
}
}
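// Illustrative example, not part of the original source: if the previous bucket has max = 6.0 and the
// current bucket has min = 4.0, the shared boundary becomes (6.0 + 4.0) / 2 = 5.0, i.e. the previous
// bucket's max and the current bucket's min are both set to 5.0 so the two ranges no longer overlap.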
@Override
public InternalAggregation reduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
List<Bucket> reducedBuckets = reduceBuckets(aggregations, reduceContext);
if (reduceContext.isFinalReduce()) {
reducedBuckets.sort(Comparator.comparing(Bucket::min)); // sort the reduced buckets by min so same-min buckets are adjacent for the merge below
mergeBucketsWithSameMin(reducedBuckets, reduceContext);
adjustBoundsForOverlappingBuckets(reducedBuckets, reduceContext);
}
return new InternalVariableWidthHistogram(getName(), reducedBuckets, emptyBucketInfo, targetNumBuckets, format, metadata);
}
@Override
public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
builder.startArray(CommonFields.BUCKETS.getPreferredName());
for (Bucket bucket : buckets) {
bucket.toXContent(builder, params);
}
builder.endArray();
return builder;
}
@Override
public InternalAggregation createAggregation(List<MultiBucketsAggregation.Bucket> buckets) {
// convert buckets to the right type
List<Bucket> buckets2 = new ArrayList<>(buckets.size());
for (Object b : buckets) {
buckets2.add((Bucket) b);
}
buckets2 = Collections.unmodifiableList(buckets2);
return new InternalVariableWidthHistogram(name, buckets2, emptyBucketInfo, targetNumBuckets, format, getMetadata());
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null || getClass() != obj.getClass()) return false;
if (super.equals(obj) == false) return false;
InternalVariableWidthHistogram that = (InternalVariableWidthHistogram) obj;
return Objects.equals(buckets, that.buckets)
&& Objects.equals(format, that.format)
&& Objects.equals(emptyBucketInfo, that.emptyBucketInfo);
}
@Override
public int hashCode() {
return Objects.hash(super.hashCode(), buckets, format, emptyBucketInfo);
}
}