/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.search.aggregations.bucket.histogram;
import org.apache.lucene.util.PriorityQueue;
import org.opensearch.common.io.stream.StreamInput;
import org.opensearch.common.io.stream.StreamOutput;
import org.opensearch.common.xcontent.XContentBuilder;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.aggregations.Aggregations;
import org.opensearch.search.aggregations.InternalAggregation;
import org.opensearch.search.aggregations.InternalAggregations;
import org.opensearch.search.aggregations.InternalMultiBucketAggregation;
import org.opensearch.search.aggregations.KeyComparable;
import org.opensearch.search.aggregations.bucket.IteratorAndCurrent;
import org.opensearch.search.aggregations.bucket.MultiBucketsAggregation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
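/**
 * Shard-level and reduced result of the variable width histogram aggregation. Each bucket is
 * described by a centroid, a {@code [min, max]} bound pair and a document count; during reduce,
 * buckets from different shards are combined and then merged down until at most
 * {@code targetNumBuckets} buckets remain.
 */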
public class InternalVariableWidthHistogram extends InternalMultiBucketAggregation<
InternalVariableWidthHistogram,
InternalVariableWidthHistogram.Bucket> implements Histogram, HistogramFactory {
public static class Bucket extends InternalMultiBucketAggregation.InternalBucket implements Histogram.Bucket, KeyComparable<Bucket> {
public static class BucketBounds {
public double min;
public double max;
public BucketBounds(double min, double max) {
assert min <= max;
this.min = min;
this.max = max;
}
public BucketBounds(StreamInput in) throws IOException {
this(in.readDouble(), in.readDouble());
}
public void writeTo(StreamOutput out) throws IOException {
out.writeDouble(min);
out.writeDouble(max);
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null || getClass() != obj.getClass()) return false;
BucketBounds that = (BucketBounds) obj;
return min == that.min && max == that.max;
}
@Override
public int hashCode() {
return Objects.hash(getClass(), min, max);
}
}
private final BucketBounds bounds;
private long docCount;
private InternalAggregations aggregations;
protected final transient DocValueFormat format;
private double centroid;
public Bucket(double centroid, BucketBounds bounds, long docCount, DocValueFormat format, InternalAggregations aggregations) {
this.format = format;
this.centroid = centroid;
this.bounds = bounds;
this.docCount = docCount;
this.aggregations = aggregations;
}
/**
* Read from a stream.
*/
public Bucket(StreamInput in, DocValueFormat format) throws IOException {
this.format = format;
centroid = in.readDouble();
docCount = in.readVLong();
bounds = new BucketBounds(in);
aggregations = InternalAggregations.readFrom(in);
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeDouble(centroid);
out.writeVLong(docCount);
bounds.writeTo(out);
aggregations.writeTo(out);
}
@Override
public boolean equals(Object obj) {
if (obj == null || obj.getClass() != InternalVariableWidthHistogram.Bucket.class) {
return false;
}
InternalVariableWidthHistogram.Bucket that = (InternalVariableWidthHistogram.Bucket) obj;
return centroid == that.centroid
&& bounds.equals(that.bounds)
&& docCount == that.docCount
&& Objects.equals(aggregations, that.aggregations);
}
@Override
public int hashCode() {
return Objects.hash(getClass(), centroid, bounds, docCount, aggregations);
}
@Override
public String getKeyAsString() {
return format.format((double) getKey()).toString();
}
/**
* Buckets are compared using their centroids. But, in the final XContent returned by the aggregation,
* we want the bucket's key to be its min. Otherwise, it would look like the distances between centroids
* are buckets, which is incorrect.
*/
@Override
public Object getKey() {
return centroid;
}
public double min() {
return bounds.min;
}
public double max() {
return bounds.max;
}
public double centroid() {
return centroid;
}
@Override
public long getDocCount() {
return docCount;
}
@Override
public Aggregations getAggregations() {
return aggregations;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
String keyAsString = format.format((double) getKey()).toString();
builder.startObject();
builder.field(CommonFields.MIN.getPreferredName(), min());
if (format != DocValueFormat.RAW) {
builder.field(CommonFields.MIN_AS_STRING.getPreferredName(), format.format(min()));
}
builder.field(CommonFields.KEY.getPreferredName(), getKey());
if (format != DocValueFormat.RAW) {
builder.field(CommonFields.KEY_AS_STRING.getPreferredName(), keyAsString);
}
builder.field(CommonFields.MAX.getPreferredName(), max());
if (format != DocValueFormat.RAW) {
builder.field(CommonFields.MAX_AS_STRING.getPreferredName(), format.format(max()));
}
builder.field(CommonFields.DOC_COUNT.getPreferredName(), docCount);
aggregations.toXContentInternal(builder, params);
builder.endObject();
return builder;
}
@Override
public int compareKey(InternalVariableWidthHistogram.Bucket other) {
return Double.compare(centroid, other.centroid); // Use centroid for bucket ordering
}
public DocValueFormat getFormatter() {
return format;
}
}
static class EmptyBucketInfo {
final InternalAggregations subAggregations;
EmptyBucketInfo(InternalAggregations subAggregations) {
this.subAggregations = subAggregations;
}
EmptyBucketInfo(StreamInput in) throws IOException {
this(InternalAggregations.readFrom(in));
}
public void writeTo(StreamOutput out) throws IOException {
subAggregations.writeTo(out);
}
@Override
public boolean equals(Object obj) {
if (obj == null || getClass() != obj.getClass()) {
return false;
}
EmptyBucketInfo that = (EmptyBucketInfo) obj;
return Objects.equals(subAggregations, that.subAggregations);
}
@Override
public int hashCode() {
return Objects.hash(getClass(), subAggregations);
}
}
private List<Bucket> buckets;
private final DocValueFormat format;
private final int targetNumBuckets;
final EmptyBucketInfo emptyBucketInfo;
InternalVariableWidthHistogram(
String name,
List<Bucket> buckets,
EmptyBucketInfo emptyBucketInfo,
int targetNumBuckets,
DocValueFormat formatter,
Map<String, Object> metaData
) {
super(name, metaData);
this.buckets = buckets;
this.emptyBucketInfo = emptyBucketInfo;
this.format = formatter;
this.targetNumBuckets = targetNumBuckets;
}
/**
* Read from a stream.
*/
public InternalVariableWidthHistogram(StreamInput in) throws IOException {
super(in);
emptyBucketInfo = new EmptyBucketInfo(in);
format = in.readNamedWriteable(DocValueFormat.class);
buckets = in.readList(stream -> new Bucket(stream, format));
targetNumBuckets = in.readVInt();
}
@Override
protected void doWriteTo(StreamOutput out) throws IOException {
emptyBucketInfo.writeTo(out);
out.writeNamedWriteable(format);
out.writeList(buckets);
out.writeVInt(targetNumBuckets);
}
@Override
public String getWriteableName() {
return VariableWidthHistogramAggregationBuilder.NAME;
}
@Override
public List<Bucket> getBuckets() {
return Collections.unmodifiableList(buckets);
}
DocValueFormat getFormatter() {
return format;
}
public int getTargetBuckets() {
return targetNumBuckets;
}
public EmptyBucketInfo getEmptyBucketInfo() {
return emptyBucketInfo;
}
@Override
public InternalVariableWidthHistogram create(List<Bucket> buckets) {
return new InternalVariableWidthHistogram(name, buckets, emptyBucketInfo, targetNumBuckets, format, metadata);
}
@Override
public Bucket createBucket(InternalAggregations aggregations, Bucket prototype) {
return new Bucket(prototype.centroid, prototype.bounds, prototype.docCount, prototype.format, aggregations);
}
@Override
public Bucket createBucket(Number key, long docCount, InternalAggregations aggregations) {
return new Bucket(key.doubleValue(), new Bucket.BucketBounds(key.doubleValue(), key.doubleValue()), docCount, format, aggregations);
}
@Override
public Number getKey(MultiBucketsAggregation.Bucket bucket) {
return ((Bucket) bucket).centroid;
}
@Override
public Number nextKey(Number key) {
return nextKey(key.doubleValue());
}
/**
* This method should not be called for this specific subclass of InternalHistogram, since there should not be
* empty buckets when clustering.
*/
private double nextKey(double key) {
return key + 1;
}
@Override
protected Bucket reduceBucket(List<Bucket> buckets, ReduceContext context) {
List<InternalAggregations> aggregations = new ArrayList<>(buckets.size());
long docCount = 0;
double min = Double.POSITIVE_INFINITY;
double max = Double.NEGATIVE_INFINITY;
double sum = 0;
for (InternalVariableWidthHistogram.Bucket bucket : buckets) {
docCount += bucket.docCount;
min = Math.min(min, bucket.bounds.min);
max = Math.max(max, bucket.bounds.max);
sum += bucket.docCount * bucket.centroid;
aggregations.add((InternalAggregations) bucket.getAggregations());
}
InternalAggregations aggs = InternalAggregations.reduce(aggregations, context);
double centroid = sum / docCount;
Bucket.BucketBounds bounds = new Bucket.BucketBounds(min, max);
return new Bucket(centroid, bounds, docCount, format, aggs);
}
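// Illustration for reduceBucket() above (hypothetical values): merging shard buckets
// (centroid 2.0, docCount 10) and (centroid 4.0, docCount 30) yields docCount 40 and
// centroid (2.0 * 10 + 4.0 * 30) / 40 = 3.5; the merged bounds are the min and max across the inputs.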
public List<Bucket> reduceBuckets(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
PriorityQueue<IteratorAndCurrent<Bucket>> pq = new PriorityQueue<IteratorAndCurrent<Bucket>>(aggregations.size()) {
@Override
protected boolean lessThan(IteratorAndCurrent<Bucket> a, IteratorAndCurrent<Bucket> b) {
return Double.compare(a.current().centroid, b.current().centroid) < 0;
}
};
for (InternalAggregation aggregation : aggregations) {
InternalVariableWidthHistogram histogram = (InternalVariableWidthHistogram) aggregation;
if (histogram.buckets.isEmpty() == false) {
pq.add(new IteratorAndCurrent<>(histogram.buckets.iterator()));
}
}
List<Bucket> reducedBuckets = new ArrayList<>();
if (pq.size() > 0) {
double key = pq.top().current().centroid();
// list of buckets coming from different shards that have the same key
List<Bucket> currentBuckets = new ArrayList<>();
do {
IteratorAndCurrent<Bucket> top = pq.top();
if (Double.compare(top.current().centroid(), key) != 0) {
// The key changes, reduce what we already buffered and reset the buffer for current buckets.
final Bucket reduced = reduceBucket(currentBuckets, reduceContext);
reduceContext.consumeBucketsAndMaybeBreak(1);
reducedBuckets.add(reduced);
currentBuckets.clear();
key = top.current().centroid();
}
currentBuckets.add(top.current());
if (top.hasNext()) {
Bucket prev = top.current();
top.next();
assert top.current().compareKey(prev) >= 0 : "shards must return data sorted by centroid";
pq.updateTop();
} else {
pq.pop();
}
} while (pq.size() > 0);
if (currentBuckets.isEmpty() == false) {
final Bucket reduced = reduceBucket(currentBuckets, reduceContext);
reduceContext.consumeBucketsAndMaybeBreak(1);
reducedBuckets.add(reduced);
}
}
mergeBucketsIfNeeded(reducedBuckets, targetNumBuckets, reduceContext);
return reducedBuckets;
}
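// In reduceBuckets() above, the priority queue streams the shard-local buckets in ascending centroid
// order; runs of buckets with exactly equal centroids are combined via reduceBucket(), and the result
// is then shrunk to at most targetNumBuckets by mergeBucketsIfNeeded().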
class BucketRange {
int startIdx;
int endIdx;
/**
* These are optional utility fields
* They're useful for determining whether buckets should be merged
*/
double min;
double max;
double centroid;
long docCount;
public void mergeWith(BucketRange other) {
startIdx = Math.min(startIdx, other.startIdx);
endIdx = Math.max(endIdx, other.endIdx);
if (docCount + other.docCount > 0) {
// Avoids div by 0 error. This condition could be false if the optional docCount field was not set
centroid = ((centroid * docCount) + (other.centroid * other.docCount)) / (docCount + other.docCount);
docCount += other.docCount;
}
min = Math.min(min, other.min);
max = Math.max(max, other.max);
}
}
/**
* For each range {startIdx, endIdx} in ranges, all the buckets in that index range
* from buckets are merged, and this merged bucket replaces the entire range.
*/
private void mergeBucketsWithPlan(List<Bucket> buckets, List<BucketRange> plan, ReduceContext reduceContext) {
for (int i = plan.size() - 1; i >= 0; i--) {
BucketRange range = plan.get(i);
int endIdx = range.endIdx;
int startIdx = range.startIdx;
if (startIdx == endIdx) continue;
List<Bucket> toMerge = new ArrayList<>();
for (int idx = endIdx; idx > startIdx; idx--) {
toMerge.add(buckets.get(idx));
buckets.remove(idx);
}
toMerge.add(buckets.get(startIdx)); // Don't remove the startIdx bucket because it will be replaced by the merged bucket
int toRemove = toMerge.stream().mapToInt(b -> countInnerBucket(b) + 1).sum();
reduceContext.consumeBucketsAndMaybeBreak(-toRemove + 1);
Bucket merged_bucket = reduceBucket(toMerge, reduceContext);
buckets.set(startIdx, merged_bucket);
}
}
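// Illustration for mergeBucketsWithPlan() above (hypothetical indices): with buckets [b0, b1, b2, b3]
// and a single range {startIdx: 1, endIdx: 3}, buckets b3 and b2 are removed, b1..b3 are reduced into
// one bucket, and the merged bucket is written back at index 1, leaving [b0, merged(b1, b2, b3)].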
/**
* Makes a merge plan by simulating the merging of the two closest buckets, until the target number of buckets is reached.
* Distance is determined by centroid comparison.
* Then, this plan is actually executed and the underlying buckets are merged.
*
* Requires: buckets is sorted by centroid.
*/
private void mergeBucketsIfNeeded(List<Bucket> buckets, int targetNumBuckets, ReduceContext reduceContext) {
// Make a plan for getting the target number of buckets
// Each range represents a set of adjacent bucket indices of buckets that will be merged together
List<BucketRange> ranges = new ArrayList<>();
// Initialize each range to represent an individual bucket
for (int i = 0; i < buckets.size(); i++) {
// Since buckets is sorted by centroid, ranges will be as well
BucketRange range = new BucketRange();
range.centroid = buckets.get(i).centroid;
range.docCount = buckets.get(i).getDocCount();
range.startIdx = i;
range.endIdx = i;
ranges.add(range);
}
// Continually merge the two closest ranges until the target is reached
while (ranges.size() > targetNumBuckets) {
// Find two closest ranges (i.e. the two closest buckets after the previous merges are completed)
// We only have to make one pass through the list because it is sorted by centroid
int closestIdx = 0; // After this loop, (closestIdx, closestIdx + 1) will be the 2 closest buckets
double smallest_distance = Double.POSITIVE_INFINITY;
for (int i = 0; i < ranges.size() - 1; i++) {
double new_distance = ranges.get(i + 1).centroid - ranges.get(i).centroid; // Positive because buckets is sorted
if (new_distance < smallest_distance) {
closestIdx = i;
smallest_distance = new_distance;
}
}
// Merge the two closest ranges
ranges.get(closestIdx).mergeWith(ranges.get(closestIdx + 1));
ranges.remove(closestIdx + 1);
}
// Execute the plan (merge the underlying buckets)
mergeBucketsWithPlan(buckets, ranges, reduceContext);
}
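// Illustration for mergeBucketsIfNeeded() above (hypothetical values): for centroids
// [1.0, 1.2, 5.0, 9.0] and targetNumBuckets = 3, the smallest gap is 1.2 - 1.0 = 0.2, so the first
// two ranges are merged (their centroid becomes the doc-count weighted mean), after which three
// ranges remain and mergeBucketsWithPlan() collapses the underlying buckets at indices 0..1.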
private void mergeBucketsWithSameMin(List<Bucket> buckets, ReduceContext reduceContext) {
// Create a merge plan
List<BucketRange> ranges = new ArrayList<>();
// Initialize each range to represent an individual bucket
for (int i = 0; i < buckets.size(); i++) {
BucketRange range = new BucketRange();
range.min = buckets.get(i).min();
range.startIdx = i;
range.endIdx = i;
ranges.add(range);
}
// Merge ranges with same min value
int i = 0;
while (i < ranges.size() - 1) {
BucketRange range = ranges.get(i);
BucketRange nextRange = ranges.get(i + 1);
if (range.min == nextRange.min) {
range.mergeWith(nextRange);
ranges.remove(i + 1);
} else {
i++;
}
}
// Execute the plan (merge the underlying buckets)
mergeBucketsWithPlan(buckets, ranges, reduceContext);
}
/**
* When two adjacent buckets A, B overlap (A.max > B.min) then their boundary is set to
* the midpoint: (A.max + B.min) / 2.
*
* After this adjustment, A will contain more values than indicated and B will have fewer.
*/
private void adjustBoundsForOverlappingBuckets(List<Bucket> buckets, ReduceContext reduceContext) {
for (int i = 1; i < buckets.size(); i++) {
Bucket curBucket = buckets.get(i);
Bucket prevBucket = buckets.get(i - 1);
if (curBucket.bounds.min < prevBucket.bounds.max) {
// We don't want overlapping buckets --> Adjust their bounds
// TODO: Think of a fairer way to do this. Should prev.max = cur.min?
curBucket.bounds.min = (prevBucket.bounds.max + curBucket.bounds.min) / 2;
prevBucket.bounds.max = curBucket.bounds.min;
}
}
}
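// Illustration for adjustBoundsForOverlappingBuckets() above (hypothetical values): if the previous
// bucket spans [0, 6] and the current bucket spans [4, 10], the shared boundary becomes (6 + 4) / 2 = 5,
// so the buckets are adjusted to [0, 5] and [5, 10].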
@Override
public InternalAggregation reduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
List<Bucket> reducedBuckets = reduceBuckets(aggregations, reduceContext);
if (reduceContext.isFinalReduce()) {
reducedBuckets.sort(Comparator.comparing(Bucket::min));
mergeBucketsWithSameMin(reducedBuckets, reduceContext);
adjustBoundsForOverlappingBuckets(reducedBuckets, reduceContext);
}
return new InternalVariableWidthHistogram(getName(), reducedBuckets, emptyBucketInfo, targetNumBuckets, format, metadata);
}
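// In reduce() above, only the final reduce sorts the reduced buckets by min, merges buckets that share
// the same min and splits overlapping bounds at their midpoint; partial (shard-level) reduces return
// the centroid-merged buckets unchanged.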
@Override
public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
builder.startArray(CommonFields.BUCKETS.getPreferredName());
for (Bucket bucket : buckets) {
bucket.toXContent(builder, params);
}
builder.endArray();
return builder;
}
@Override
public InternalAggregation createAggregation(List<MultiBucketsAggregation.Bucket> buckets) {
// convert buckets to the right type
List<Bucket> buckets2 = new ArrayList<>(buckets.size());
for (Object b : buckets) {
buckets2.add((Bucket) b);
}
buckets2 = Collections.unmodifiableList(buckets2);
return new InternalVariableWidthHistogram(name, buckets2, emptyBucketInfo, targetNumBuckets, format, getMetadata());
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null || getClass() != obj.getClass()) return false;
if (super.equals(obj) == false) return false;
InternalVariableWidthHistogram that = (InternalVariableWidthHistogram) obj;
return Objects.equals(buckets, that.buckets)
&& Objects.equals(format, that.format)
&& Objects.equals(emptyBucketInfo, that.emptyBucketInfo);
}
@Override
public int hashCode() {
return Objects.hash(super.hashCode(), buckets, format, emptyBucketInfo);
}
}