/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.search.aggregations.bucket.histogram;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.opensearch.common.Nullable;
import org.opensearch.common.lease.Releasable;
import org.opensearch.common.lease.Releasables;
import org.opensearch.common.util.BigArrays;
import org.opensearch.common.util.DoubleArray;
import org.opensearch.index.fielddata.SortedNumericDoubleValues;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.aggregations.Aggregator;
import org.opensearch.search.aggregations.AggregatorFactories;
import org.opensearch.search.aggregations.BucketOrder;
import org.opensearch.search.aggregations.InternalAggregation;
import org.opensearch.search.aggregations.InternalAggregations;
import org.opensearch.search.aggregations.LeafBucketCollector;
import org.opensearch.search.aggregations.LeafBucketCollectorBase;
import org.opensearch.search.aggregations.bucket.DeferableBucketAggregator;
import org.opensearch.search.aggregations.bucket.DeferringBucketCollector;
import org.opensearch.search.aggregations.bucket.MergingBucketsDeferringCollector;
import org.opensearch.search.aggregations.bucket.nested.NestedAggregator;
import org.opensearch.search.aggregations.support.ValuesSource;
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
import org.opensearch.search.internal.SearchContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.function.LongUnaryOperator;
public class VariableWidthHistogramAggregator extends DeferableBucketAggregator {
/**
* This aggregator goes through multiple phases of collection. Each phase has a different CollectionPhase::collectValue
* implementation
*
     * Running a clustering algorithm like K-Means is infeasible because large indices don't fit into memory.
* But having multiple collection phases lets us accurately bucket the docs in one pass.
*/
private abstract class CollectionPhase implements Releasable {
/**
         * This method will collect the doc and then either return itself or a new CollectionPhase.
         * It is responsible for determining when a phase is over and what phase will run next.
*/
abstract CollectionPhase collectValue(LeafBucketCollector sub, int doc, double val) throws IOException;
/**
* @return the final number of buckets that will be used
* If this is not the final phase, then an instance of the next phase is created and it is asked for this answer.
*/
abstract int finalNumBuckets();
/**
* If this CollectionPhase is the final phase then this method will build and return the i'th bucket
         * Otherwise, it will create an instance of the next phase and ask it for the i'th bucket (naturally, if that phase is
         * not the last phase then it will do the same, and so on...)
*/
abstract InternalVariableWidthHistogram.Bucket buildBucket(int bucketOrd, InternalAggregations subAggregations) throws IOException;
}
/**
* Phase 1: Build up a buffer of docs (i.e. give each new doc its own bucket). No clustering decisions are made here.
* Building this buffer lets us analyze the distribution of the data before we begin clustering.
*/
private class BufferValuesPhase extends CollectionPhase {
private DoubleArray buffer;
private int bufferSize;
private int bufferLimit;
private MergeBucketsPhase mergeBucketsPhase;
BufferValuesPhase(int bufferLimit) {
this.buffer = bigArrays.newDoubleArray(1);
this.bufferSize = 0;
this.bufferLimit = bufferLimit;
this.mergeBucketsPhase = null;
}
@Override
public CollectionPhase collectValue(LeafBucketCollector sub, int doc, double val) throws IOException {
if (bufferSize < bufferLimit) {
                // Add to the buffer, i.e. store the doc in a new bucket
buffer = bigArrays.grow(buffer, bufferSize + 1);
buffer.set((long) bufferSize, val);
collectBucket(sub, doc, bufferSize);
bufferSize += 1;
}
if (bufferSize == bufferLimit) {
// We have hit the buffer limit. Switch to merge mode
CollectionPhase mergeBuckets = new MergeBucketsPhase(buffer, bufferSize);
Releasables.close(this);
return mergeBuckets;
} else {
// There is still room in the buffer
return this;
}
}
        @Override
        int finalNumBuckets() {
return getMergeBucketPhase().finalNumBuckets();
}
@Override
InternalVariableWidthHistogram.Bucket buildBucket(int bucketOrd, InternalAggregations subAggregations) throws IOException {
InternalVariableWidthHistogram.Bucket bucket = getMergeBucketPhase().buildBucket(bucketOrd, subAggregations);
return bucket;
}
MergeBucketsPhase getMergeBucketPhase() {
if (mergeBucketsPhase == null) {
mergeBucketsPhase = new MergeBucketsPhase(buffer, bufferSize);
}
return mergeBucketsPhase;
}
@Override
public void close() {
if (mergeBucketsPhase != null) {
Releasables.close(mergeBucketsPhase);
}
Releasables.close(buffer);
}
}
/**
* Phase 2: This phase is initialized with the buffer created in Phase 1.
* It is responsible for merging the buffered docs into a smaller number of buckets and then determining which existing
* bucket all subsequent docs belong to. New buckets will be created for docs that are distant from all existing ones
*/
private class MergeBucketsPhase extends CollectionPhase {
/**
* "Cluster" refers to intermediate buckets during collection
* They are kept sorted by centroid. The i'th index in all these arrays always refers to the i'th cluster
*/
public DoubleArray clusterMaxes;
public DoubleArray clusterMins;
public DoubleArray clusterCentroids;
public DoubleArray clusterSizes; // clusterSizes != bucketDocCounts when clusters are in the middle of a merge
public int numClusters;
private double avgBucketDistance;
MergeBucketsPhase(DoubleArray buffer, int bufferSize) {
// Cluster the documents to reduce the number of buckets
bucketBufferedDocs(buffer, bufferSize, mergePhaseInitialBucketCount(shardSize));
if (bufferSize > 1) {
updateAvgBucketDistance();
}
}
/**
* Sorts the indices of values by their underlying value
* This will produce a merge map whose application will sort values
*/
private class ClusterSorter extends InPlaceMergeSorter {
final DoubleArray values;
final long[] indexes;
ClusterSorter(DoubleArray values, int length) {
this.values = values;
this.indexes = new long[length];
for (int i = 0; i < indexes.length; i++) {
indexes[i] = i;
}
}
@Override
protected int compare(int i, int j) {
double iVal = values.get(indexes[i]);
double jVal = values.get(indexes[j]);
return Double.compare(iVal, jVal);
}
@Override
protected void swap(int i, int j) {
long hold = indexes[i];
indexes[i] = indexes[j];
indexes[j] = hold;
}
/**
* Produces a merge map where `mergeMap[i]` represents the index that values[i]
             * would be moved to if values were sorted
* In other words, this method produces a merge map that will sort values
*
* See BucketsAggregator::mergeBuckets to learn more about the merge map
*/
public long[] generateMergeMap() {
sort(0, indexes.length);
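                // After sorting, indexes[i] holds the buffer position of the i'th smallest value.
                // Illustrative example (values assumed, not from the original source): for
                // values = [40, 10, 30, 20], sort() leaves indexes = [1, 3, 2, 0].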
return indexes;
}
}
/**
* Sorting the documents by key lets us bucket the documents into groups with a single linear scan
*
* But we can't do this by just sorting buffer, because we also need to generate a merge map
* for every change we make to the list, so that we can apply the changes to the underlying buckets as well.
*
* By just creating a merge map, we eliminate the need to actually sort buffer. We can just
* use the merge map to find any doc's sorted index.
*/
private void bucketBufferedDocs(final DoubleArray buffer, final int bufferSize, final int numBuckets) {
// Allocate space for the clusters about to be created
clusterMins = bigArrays.newDoubleArray(1);
clusterMaxes = bigArrays.newDoubleArray(1);
clusterCentroids = bigArrays.newDoubleArray(1);
clusterSizes = bigArrays.newDoubleArray(1);
numClusters = 0;
ClusterSorter sorter = new ClusterSorter(buffer, bufferSize);
long[] mergeMap = sorter.generateMergeMap();
// Naively use basic linear separation to group the first bufferSize docs into initialNumBuckets buckets
// This will require modifying the merge map, which currently represents a sorted list of buckets with 1 doc / bucket
int docsPerBucket = (int) Math.ceil((double) bufferSize / (double) numBuckets);
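            // Illustrative example (values assumed, not from the original source): bufferSize = 4 and
            // numBuckets = 2 give docsPerBucket = 2, so the two smallest buffered values form cluster 0
            // and the next two form cluster 1.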
int bucketOrd = 0;
for (int i = 0; i < mergeMap.length; i++) {
// mergeMap[i] is the index of the i'th smallest doc
double val = buffer.get(mergeMap[i]);
// Put the i'th smallest doc into the bucket at bucketOrd
mergeMap[i] = (int) (mergeMap[i] / docsPerBucket);
if (bucketOrd == numClusters) {
createAndAppendNewCluster(val);
} else {
addToCluster(bucketOrd, val);
}
if ((i + 1) % docsPerBucket == 0) {
// This bucket is full. Make a new one
bucketOrd += 1;
}
}
mergeBuckets(mergeMap, bucketOrd + 1);
if (deferringCollector != null) {
deferringCollector.mergeBuckets(mergeMap);
}
}
@Override
public CollectionPhase collectValue(LeafBucketCollector sub, int doc, double val) throws IOException {
            int bucketOrd = getNearestBucket(val);
            if (bucketOrd == -1) {
                // No clusters exist yet, so this value starts the first one
                createAndAppendNewCluster(val);
                collectBucket(sub, doc, numClusters - 1);
                return this;
            }
            double distance = Math.abs(clusterCentroids.get(bucketOrd) - val);
            if (distance > (2 * avgBucketDistance) && numClusters < shardSize) {
// Make a new bucket since the document is distant from all existing buckets
// TODO: (maybe) Create a new bucket for all distant docs and merge down to shardSize buckets at end
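                // ("Distant" means more than twice the average gap between adjacent centroids; a new
                // cluster is only created while fewer than shardSize clusters exist.)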
createAndAppendNewCluster(val);
collectBucket(sub, doc, numClusters - 1);
if (val > clusterCentroids.get(bucketOrd)) {
/*
* If the new value is bigger than the nearest bucket then insert
* just ahead of bucketOrd so that the array remains sorted.
*/
bucketOrd += 1;
}
moveLastCluster(bucketOrd);
// We've added a new bucket so update the average distance between the buckets
updateAvgBucketDistance();
} else {
addToCluster(bucketOrd, val);
collectExistingBucket(sub, doc, bucketOrd);
if (bucketOrd == 0 || bucketOrd == numClusters - 1) {
                    // Only update the average distance if the centroid of one of the end buckets is modified.
updateAvgBucketDistance();
}
}
return this;
}
private void updateAvgBucketDistance() {
            // Centroids are sorted, so the gaps between adjacent centroids telescope: their average is (last - first) / (numClusters - 1).
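            // Illustrative example (numbers assumed): centroids [0.0, 2.0, 10.0] give
            // (10.0 - 0.0) / 2 = 5.0, the mean of the adjacent gaps 2.0 and 8.0.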
avgBucketDistance = (clusterCentroids.get(numClusters - 1) - clusterCentroids.get(0)) / (numClusters - 1);
}
/**
* Creates a new cluster with value and appends it to the cluster arrays
*/
private void createAndAppendNewCluster(double value) {
// Ensure there is space for the cluster
            clusterMaxes = bigArrays.grow(clusterMaxes, numClusters + 1); // grow to numClusters + 1 so that index numClusters is valid
clusterMins = bigArrays.grow(clusterMins, numClusters + 1);
clusterCentroids = bigArrays.grow(clusterCentroids, numClusters + 1);
clusterSizes = bigArrays.grow(clusterSizes, numClusters + 1);
// Initialize the cluster at the end of the array
clusterMaxes.set(numClusters, value);
clusterMins.set(numClusters, value);
clusterCentroids.set(numClusters, value);
clusterSizes.set(numClusters, 1);
numClusters += 1;
}
/**
         * Move the last cluster to position index.
         * This is expensive because a merge map of size numClusters is created, so don't call this method too often.
*
* TODO: Make this more efficient
*/
private void moveLastCluster(int index) {
if (index != numClusters - 1) {
// Move the cluster metadata
double holdMax = clusterMaxes.get(numClusters - 1);
double holdMin = clusterMins.get(numClusters - 1);
double holdCentroid = clusterCentroids.get(numClusters - 1);
double holdSize = clusterSizes.get(numClusters - 1);
for (int i = numClusters - 1; i > index; i--) {
                    // The clusters in range {index ... numClusters - 2} move up one slot to make room for the new cluster
clusterMaxes.set(i, clusterMaxes.get(i - 1));
clusterMins.set(i, clusterMins.get(i - 1));
clusterCentroids.set(i, clusterCentroids.get(i - 1));
clusterSizes.set(i, clusterSizes.get(i - 1));
}
clusterMaxes.set(index, holdMax);
clusterMins.set(index, holdMin);
clusterCentroids.set(index, holdCentroid);
clusterSizes.set(index, holdSize);
// Move the underlying buckets
LongUnaryOperator mergeMap = new LongUnaryOperator() {
@Override
public long applyAsLong(long i) {
if (i < index) {
                            // The clusters in range {0 ... index - 1} don't move
return i;
}
if (i == numClusters - 1) {
// The new cluster moves to index
return (long) index;
}
                        // The clusters in range {index ... numClusters - 2} shift up one slot
return i + 1;
}
};
mergeBuckets(numClusters, mergeMap);
if (deferringCollector != null) {
deferringCollector.mergeBuckets(mergeMap);
}
}
}
/**
* Adds val to the cluster at index bucketOrd.
* The cluster's centroid, min, max, and size are recalculated.
*/
private void addToCluster(int bucketOrd, double val) {
assert bucketOrd < numClusters;
double max = Math.max(clusterMaxes.get(bucketOrd), val);
double min = Math.min(clusterMins.get(bucketOrd), val);
// Recalculate the centroid
double oldCentroid = clusterCentroids.get(bucketOrd);
double size = clusterSizes.get(bucketOrd);
double newCentroid = ((oldCentroid * size) + val) / (size + 1);
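            // Illustrative example (numbers assumed): a cluster with centroid 10.0 and size 3
            // absorbing val = 18.0 gets newCentroid = (10.0 * 3 + 18.0) / 4 = 12.0.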
clusterMaxes.set(bucketOrd, max);
clusterMins.set(bucketOrd, min);
clusterCentroids.set(bucketOrd, newCentroid);
clusterSizes.increment(bucketOrd, 1);
}
        /**
         * Returns the ordinal of the bucket whose centroid is closest to value, or -1 if there are no buckets.
         */
private int getNearestBucket(double value) {
if (numClusters == 0) {
return -1;
}
BigArrays.DoubleBinarySearcher binarySearcher = new BigArrays.DoubleBinarySearcher(clusterCentroids);
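            // DoubleBinarySearcher resolves to the index of the closest centroid rather than an insertion
            // point, e.g. (illustrative) centroids [1.0, 5.0, 9.0] and value = 6.2 yield index 1.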
return binarySearcher.search(0, numClusters - 1, value);
}
@Override
int finalNumBuckets() {
return numClusters;
}
@Override
InternalVariableWidthHistogram.Bucket buildBucket(int bucketOrd, InternalAggregations subAggregations) {
return new InternalVariableWidthHistogram.Bucket(
clusterCentroids.get(bucketOrd),
new InternalVariableWidthHistogram.Bucket.BucketBounds(clusterMins.get(bucketOrd), clusterMaxes.get(bucketOrd)),
bucketDocCount(bucketOrd),
formatter,
subAggregations
);
}
@Override
public void close() {
Releasables.close(clusterMaxes, clusterMins, clusterCentroids, clusterSizes);
}
}
private final ValuesSource.Numeric valuesSource;
private final DocValueFormat formatter;
// Aggregation parameters
private final int numBuckets;
private final int shardSize;
private final int bufferLimit;
final BigArrays bigArrays;
private CollectionPhase collector;
private MergingBucketsDeferringCollector deferringCollector;
VariableWidthHistogramAggregator(
String name,
AggregatorFactories factories,
int numBuckets,
int shardSize,
int initialBuffer,
@Nullable ValuesSourceConfig valuesSourceConfig,
SearchContext context,
Aggregator parent,
        Map<String, Object> metadata
) throws IOException {
super(name, factories, context, parent, metadata);
this.numBuckets = numBuckets;
this.valuesSource = (ValuesSource.Numeric) valuesSourceConfig.getValuesSource();
this.formatter = valuesSourceConfig.format();
this.shardSize = shardSize;
this.bufferLimit = initialBuffer;
bigArrays = context.bigArrays();
collector = new BufferValuesPhase(this.bufferLimit);
String scoringAgg = subAggsNeedScore();
String nestedAgg = descendsFromNestedAggregator(parent);
if (scoringAgg != null && nestedAgg != null) {
/*
* Terms agg would force the collect mode to depth_first here, because
* we need to access the score of nested documents in a sub-aggregation
* and we are not able to generate this score while replaying deferred documents.
*
* But the VariableWidthHistogram agg _must_ execute in breadth first since it relies on
* deferring execution, so we just have to throw up our hands and refuse
*/
throw new IllegalStateException(
"VariableWidthHistogram agg ["
+ name()
+ "] is the child of the nested agg ["
+ nestedAgg
+ "], and also has a scoring child agg ["
+ scoringAgg
+ "]. This combination is not supported because "
+ "it requires executing in [depth_first] mode, which the VariableWidthHistogram agg cannot do."
);
}
}
private String subAggsNeedScore() {
for (Aggregator subAgg : subAggregators) {
if (subAgg.scoreMode().needsScores()) {
return subAgg.name();
}
}
return null;
}
private String descendsFromNestedAggregator(Aggregator parent) {
while (parent != null) {
if (parent.getClass() == NestedAggregator.class) {
return parent.name();
}
parent = parent.parent();
}
return null;
}
@Override
public ScoreMode scoreMode() {
if (valuesSource != null && valuesSource.needsScores()) {
return ScoreMode.COMPLETE;
}
return super.scoreMode();
}
@Override
protected boolean shouldDefer(Aggregator aggregator) {
return true;
}
@Override
public DeferringBucketCollector getDeferringCollector() {
deferringCollector = new MergingBucketsDeferringCollector(context, descendsFromGlobalAggregator(parent()));
return deferringCollector;
}
@Override
protected LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
if (valuesSource == null) {
return LeafBucketCollector.NO_OP_COLLECTOR;
}
final SortedNumericDoubleValues values = valuesSource.doubleValues(ctx);
return new LeafBucketCollectorBase(sub, values) {
@Override
public void collect(int doc, long bucket) throws IOException {
assert bucket == 0;
if (values.advanceExact(doc)) {
final int valuesCount = values.docValueCount();
double prevVal = Double.NEGATIVE_INFINITY;
for (int i = 0; i < valuesCount; ++i) {
double val = values.nextValue();
assert val >= prevVal;
                        if (val == prevVal) {
                            // Values are sorted, so equal neighbors are duplicates within this doc; collect each distinct value once
                            continue;
                        }
                        collector = collector.collectValue(sub, doc, val);
                        prevVal = val;
}
}
}
};
}
@Override
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
int numClusters = collector.finalNumBuckets();
long[] bucketOrdsToCollect = new long[numClusters];
for (int i = 0; i < numClusters; i++) {
bucketOrdsToCollect[i] = i;
}
InternalAggregations[] subAggregationResults = buildSubAggsForBuckets(bucketOrdsToCollect);
        List<InternalVariableWidthHistogram.Bucket> buckets = new ArrayList<>(numClusters);
for (int bucketOrd = 0; bucketOrd < numClusters; bucketOrd++) {
buckets.add(collector.buildBucket(bucketOrd, subAggregationResults[bucketOrd]));
}
        Function<List<InternalVariableWidthHistogram.Bucket>, InternalAggregation> resultBuilder = bucketsToFormat -> {
// The contract of the histogram aggregation is that shards must return
// buckets ordered by centroid in ascending order
CollectionUtil.introSort(bucketsToFormat, BucketOrder.key(true).comparator());
InternalVariableWidthHistogram.EmptyBucketInfo emptyBucketInfo = new InternalVariableWidthHistogram.EmptyBucketInfo(
buildEmptySubAggregations()
);
return new InternalVariableWidthHistogram(name, bucketsToFormat, emptyBucketInfo, numBuckets, formatter, metadata());
};
return new InternalAggregation[] { resultBuilder.apply(buckets) };
}
@Override
public InternalAggregation buildEmptyAggregation() {
InternalVariableWidthHistogram.EmptyBucketInfo emptyBucketInfo = new InternalVariableWidthHistogram.EmptyBucketInfo(
buildEmptySubAggregations()
);
return new InternalVariableWidthHistogram(name(), Collections.emptyList(), emptyBucketInfo, numBuckets, formatter, metadata());
}
@Override
public void doClose() {
Releasables.close(collector);
}
public static int mergePhaseInitialBucketCount(int shardSize) {
        // Target shardSize * (3/4) buckets so that there's room for more distant buckets to be added during the rest of collection
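        // Illustrative example (numbers assumed): shardSize = 100 yields 75 initial clusters; the long cast guards against int overflow in shardSize * 3.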
return (int) ((long) shardSize * 3 / 4);
}
}