All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.search.aggregations.support.ValuesSource Maven / Gradle / Ivy

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */
package org.elasticsearch.search.aggregations.support;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Rounding;
import org.elasticsearch.common.Rounding.Prepared;
import org.elasticsearch.common.lucene.ScorerAware;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.index.fielddata.AbstractSortingNumericDocValues;
import org.elasticsearch.index.fielddata.DocValueBits;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexGeoPointFieldData;
import org.elasticsearch.index.fielddata.IndexNumericFieldData;
import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.LeafOrdinalsFieldData;
import org.elasticsearch.index.fielddata.MultiGeoPointValues;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import org.elasticsearch.index.fielddata.SortingBinaryDocValues;
import org.elasticsearch.index.fielddata.SortingNumericDoubleValues;
import org.elasticsearch.index.mapper.RangeType;
import org.elasticsearch.script.AggregationScript;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.bucket.geogrid.GeoTileCellIdSource;
import org.elasticsearch.search.aggregations.bucket.range.RangeAggregator;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator;
import org.elasticsearch.search.aggregations.support.values.ScriptBytesValues;
import org.elasticsearch.search.aggregations.support.values.ScriptDoubleValues;
import org.elasticsearch.search.aggregations.support.values.ScriptLongValues;

import java.io.IOException;
import java.util.function.Function;
import java.util.function.LongUnaryOperator;

/**
 * A unified interface to different ways of getting input data for
 * {@link Aggregator}s like DocValues from Lucene or script output. The
 * top level sub-classes define type-specific behavior, such as
 * {@link ValuesSource.Numeric#isFloatingPoint()}. Second level subclasses are
 * then specialized based on where they read values from, e.g. script or field
 * cases. There are also adapter classes like {@link GeoTileCellIdSource} which do
 * run-time conversion from one type to another, often dependent on a user
 * specified parameter (precision in that case).
 */
public abstract class ValuesSource {

    /**
     * Get a byte array like view into the values. This is the "native" way
     * to access {@link Bytes}-style values.
     */
    public abstract SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException;

    /**
     * Get a "has any values" view into the values. It'll try to pick the
     * "most native" way to check if there are any values, but it builds its
     * own view into the values so if you need any of the actual values its
     * best to use something like {@link #bytesValues} or
     * {@link Numeric#doubleValues} but if you just
     * need to know if there are any values then use this.
     */
    public abstract DocValueBits docsWithValue(LeafReaderContext context) throws IOException;

    /** Whether this values source needs scores. */
    public boolean needsScores() {
        return false;
    }

    /**
     * Build a function to prepare {@link Rounding}s.
     * 

* This returns a {@linkplain Function} because auto date histogram will * need to call it many times over the course of running the aggregation. * Other aggregations should feel free to call it once. */ protected abstract Function roundingPreparer() throws IOException; /** * Check if this values source supports using global and segment ordinals. *

* If this returns {@code true} then it is safe to cast it to {@link ValuesSource.Bytes.WithOrdinals}. */ public boolean hasOrdinals() { return false; } /** * {@linkplain ValuesSource} for fields who's values are best thought of * as byte arrays without any other meaning like {@code keyword} or * {@code ip}. Aggregations that operate on these values presume only * that {@link DocValueFormat#format(BytesRef)} will correctly convert * the resulting {@link BytesRef} into something human readable. */ public abstract static class Bytes extends ValuesSource { @Override public DocValueBits docsWithValue(LeafReaderContext context) throws IOException { final SortedBinaryDocValues bytes = bytesValues(context); return org.elasticsearch.index.fielddata.FieldData.docsWithValue(bytes); } @Override public final Function roundingPreparer() throws IOException { throw new AggregationExecutionException("can't round a [BYTES]"); } /** * Specialization of {@linkplain Bytes} who's underlying storage * de-duplicates its bytes by storing them in a per-leaf sorted * lookup table. Aggregations that are aware of these lookup tables * can operate directly on the value's position in the table, know as * the "ordinal". They can then later translate the ordinal into * the {@link BytesRef} value. */ public abstract static class WithOrdinals extends Bytes { public static final WithOrdinals EMPTY = new WithOrdinals() { @Override public SortedSetDocValues ordinalsValues(LeafReaderContext context) { return DocValues.emptySortedSet(); } @Override public SortedSetDocValues globalOrdinalsValues(LeafReaderContext context) { return DocValues.emptySortedSet(); } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { return org.elasticsearch.index.fielddata.FieldData.emptySortedBinary(); } @Override public LongUnaryOperator globalOrdinalsMapping(LeafReaderContext context) throws IOException { return LongUnaryOperator.identity(); } @Override public boolean supportsGlobalOrdinalsMapping() { return true; } }; @Override public DocValueBits docsWithValue(LeafReaderContext context) throws IOException { final SortedSetDocValues ordinals = ordinalsValues(context); return org.elasticsearch.index.fielddata.FieldData.docsWithValue(ordinals); } /** * Get a view into the leaf's ordinals and their {@link BytesRef} values. *

* Use {@link SortedSetDocValues#advanceExact}, {@link SortedSetDocValues#getValueCount}, * and {@link SortedSetDocValues#nextOrd} to fetch the ordinals. Use * {@link SortedSetDocValues#lookupOrd} to convert form the ordinal * number into the {@link BytesRef} value. Make sure to * {@link BytesRef#deepCopyOf(BytesRef) copy} the result if you need * to keep it. *

* Each leaf may have a different ordinal for the same byte array. * Imagine, for example, an index where one leaf has the values * {@code "a", "b", "d"} and another leaf has the values * {@code "b", "c", "d"}. {@code "a"} has the ordinal {@code 0} in * the first leaf and doesn't exist in the second leaf. * {@code "b"} has the ordinal {@code 1} in the first leaf * and {@code 0} in the second leaf. {@code "c"} doesn't exist in * the first leaf and has the ordinal {@code 1} in the second leaf. * And {@code "d"} gets the ordinal {@code 2} in both leaves. *

* If you have to compare the ordinals of values from different * segments then you'd need to somehow merge them. {@link #globalOrdinalsValues} * provides such a merging at the cost of longer startup times when * the index has been modified. */ public abstract SortedSetDocValues ordinalsValues(LeafReaderContext context) throws IOException; /** * Get a "global" view into the leaf's ordinals. This can require * construction of fairly large set of lookups in memory so prefer * {@link #ordinalsValues} unless you need the global view. *

* This functions just like {@link #ordinalsValues} except that the * ordinals that {@link SortedSetDocValues#nextOrd} and * {@link SortedSetDocValues#lookupOrd(long)} operate on are "global" * to all segments in the shard. They are ordinals into a lookup * table containing all values on the shard. *

* Compare this to the example in the docs for {@link #ordinalsValues}. * Imagine, again, an index where one leaf has the values * {@code "a", "b", "d"} and another leaf has the values * {@code "b", "c", "d"}. The global ordinal for {@code "a"} is {@code 0}. * The global ordinal for {@code "b"} is {@code 1}. The global ordinal * for {@code "c"} is {@code 2}. And the global ordinal for {@code "d"} * is, you guessed it, {@code 3}. *

* This makes comparing the values from different segments much simpler. * But it comes with a fairly high memory cost and a substantial * performance hit when this method is first called after modifying the index. * If the global ordinals lookup hasn't been built then this method's runtime * is roughly proportional to the number of distinct values on the field. * If there are very few distinct values then the runtime'll be dominated * by factors related to the number of segments. But in that case it'll * be fast enough that you won't usually care. */ public abstract SortedSetDocValues globalOrdinalsValues(LeafReaderContext context) throws IOException; /** * Whether this values source is able to provide a mapping between global and segment ordinals, * by returning the underlying {@link OrdinalMap}. If this method returns false, then calling * {@link #globalOrdinalsMapping} will result in an {@link UnsupportedOperationException}. */ public abstract boolean supportsGlobalOrdinalsMapping(); @Override public boolean hasOrdinals() { return true; } /** * Returns a mapping from segment ordinals to global ordinals. This * allows you to post process segment ordinals into global ordinals * which could save you a few lookups. Also, operating on segment * ordinals is likely to produce a more "dense" list of, say, counts. *

* Anyone looking to use this strategy rather than looking up on the * fly should benchmark well and update this documentation with what * they learn. */ public abstract LongUnaryOperator globalOrdinalsMapping(LeafReaderContext context) throws IOException; /** * Get the maximum global ordinal. Requires {@link #globalOrdinalsValues} * so see the note about its performance. */ public long globalMaxOrd(IndexSearcher indexSearcher) throws IOException { IndexReader indexReader = indexSearcher.getIndexReader(); if (indexReader.leaves().isEmpty()) { return 0; } else { LeafReaderContext atomicReaderContext = indexReader.leaves().get(0); SortedSetDocValues values = globalOrdinalsValues(atomicReaderContext); return values.getValueCount(); } } public static class FieldData extends WithOrdinals { protected final IndexOrdinalsFieldData indexFieldData; public FieldData(IndexOrdinalsFieldData indexFieldData) { this.indexFieldData = indexFieldData; } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) { final LeafOrdinalsFieldData atomicFieldData = indexFieldData.load(context); return atomicFieldData.getBytesValues(); } @Override public SortedSetDocValues ordinalsValues(LeafReaderContext context) { final LeafOrdinalsFieldData atomicFieldData = indexFieldData.load(context); return atomicFieldData.getOrdinalsValues(); } @Override public SortedSetDocValues globalOrdinalsValues(LeafReaderContext context) { final IndexOrdinalsFieldData global = indexFieldData.loadGlobal((DirectoryReader) context.parent.reader()); final LeafOrdinalsFieldData atomicFieldData = global.load(context); return atomicFieldData.getOrdinalsValues(); } @Override public boolean supportsGlobalOrdinalsMapping() { return indexFieldData.supportsGlobalOrdinalsMapping(); } @Override public LongUnaryOperator globalOrdinalsMapping(LeafReaderContext context) throws IOException { final IndexOrdinalsFieldData global = indexFieldData.loadGlobal((DirectoryReader) context.parent.reader()); final OrdinalMap map = global.getOrdinalMap(); if (map == null) { // segments and global ordinals are the same return LongUnaryOperator.identity(); } final org.apache.lucene.util.LongValues segmentToGlobalOrd = map.getGlobalOrds(context.ord); return segmentToGlobalOrd::get; } } } public static class FieldData extends Bytes { protected final IndexFieldData indexFieldData; public FieldData(IndexFieldData indexFieldData) { this.indexFieldData = indexFieldData; } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) { return indexFieldData.load(context).getBytesValues(); } } /** * {@link ValuesSource} implementation for stand alone scripts returning a Bytes value */ public static class Script extends Bytes { private final AggregationScript.LeafFactory script; public Script(AggregationScript.LeafFactory script) { this.script = script; } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { return new ScriptBytesValues(script.newInstance(context)); } @Override public boolean needsScores() { return script.needs_score(); } } // No need to implement ReaderContextAware here, the delegate already takes care of updating data structures /** * {@link ValuesSource} subclass for Bytes fields with a Value Script applied */ public static class WithScript extends Bytes { private final ValuesSource delegate; private final AggregationScript.LeafFactory script; public WithScript(ValuesSource delegate, AggregationScript.LeafFactory script) { this.delegate = delegate; this.script = script; } @Override public boolean needsScores() { return script.needs_score(); } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { return new BytesValues(delegate.bytesValues(context), script.newInstance(context)); } static class BytesValues extends SortingBinaryDocValues implements ScorerAware { private final SortedBinaryDocValues bytesValues; private final AggregationScript script; BytesValues(SortedBinaryDocValues bytesValues, AggregationScript script) { this.bytesValues = bytesValues; this.script = script; } @Override public void setScorer(Scorable scorer) { script.setScorer(scorer); } @Override public boolean advanceExact(int doc) throws IOException { if (bytesValues.advanceExact(doc)) { count = bytesValues.docValueCount(); grow(); script.setDocument(doc); for (int i = 0; i < count; ++i) { final BytesRef value = bytesValues.nextValue(); script.setNextAggregationValue(value.utf8ToString()); Object run = script.execute(); CollectionUtils.ensureNoSelfReferences(run, "ValuesSource.BytesValues script"); values[i].copyChars(run.toString()); } sort(); return true; } else { count = 0; grow(); return false; } } } } } /** * {@linkplain ValuesSource} for fields who's values are best thought of * as numbers. Aggregations that operate on these values often may chose * to operate on double precision floating point * {@link Numeric#doubleValues values} or on 64 bit signed two's complement * {@link Numeric#longValues values}. They'll do normal "number stuff" * to those values like add, multiply, and compare them to other numbers. */ public abstract static class Numeric extends ValuesSource { public static final Numeric EMPTY = new Numeric() { @Override public boolean isFloatingPoint() { return false; } @Override public SortedNumericDocValues longValues(LeafReaderContext context) { return DocValues.emptySortedNumeric(); } @Override public SortedNumericDoubleValues doubleValues(LeafReaderContext context) throws IOException { return org.elasticsearch.index.fielddata.FieldData.emptySortedNumericDoubles(); } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { return org.elasticsearch.index.fielddata.FieldData.emptySortedBinary(); } }; /** * Are values of this field better represented as a double precision * floating point numbers ({@code true}) or 64 bit signed * numbers ({@code false})? *

* Aggregations may, if they feel it is important, use this to pick * which of {@link #longValues} and {@link #doubleValues} is better for * the field values. Most metric aggregations are quite happy to operate * on floating point numbers all the time and never call this. Bucketing * aggregations that want to enumerate all values * (like {@link TermsAggregator}) will want to check this but bucketing * aggregations that just compare values ({@link RangeAggregator}) are, * like metric aggregators, fine ignoring it. */ public abstract boolean isFloatingPoint(); /** * Get a 64 bit signed view into the values in this leaf. *

* If the values have precision beyond the decimal point then they'll be * "narrowed" * but they'll accurately represent values up to {@link Long#MAX_VALUE}. */ public abstract SortedNumericDocValues longValues(LeafReaderContext context) throws IOException; /** * Get a double precision floating point view into the values in this leaf. *

* These values will preserve any precision beyond the decimal point but * are limited to {@code double}'s standard 53 bit mantissa. If the "native" * field has values that can't be accurately represented in those 53 bits * they'll be "widened" */ public abstract SortedNumericDoubleValues doubleValues(LeafReaderContext context) throws IOException; @Override public DocValueBits docsWithValue(LeafReaderContext context) throws IOException { // We try and pick the lowest overhead implementation. if (isFloatingPoint()) { final SortedNumericDoubleValues values = doubleValues(context); return org.elasticsearch.index.fielddata.FieldData.docsWithValue(values); } else { final SortedNumericDocValues values = longValues(context); return org.elasticsearch.index.fielddata.FieldData.docsWithValue(values); } } @Override public Function roundingPreparer() throws IOException { return Rounding::prepareForUnknown; } /** * {@link ValuesSource} subclass for Numeric fields with a Value Script applied */ public static class WithScript extends Numeric { private final Numeric delegate; private final AggregationScript.LeafFactory script; public WithScript(Numeric delegate, AggregationScript.LeafFactory script) { this.delegate = delegate; this.script = script; } @Override public boolean isFloatingPoint() { return true; // even if the underlying source produces longs, scripts can change them to doubles } @Override public boolean needsScores() { return script.needs_score(); } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { return new Bytes.WithScript.BytesValues(delegate.bytesValues(context), script.newInstance(context)); } @Override public SortedNumericDocValues longValues(LeafReaderContext context) throws IOException { return new LongValues(delegate.longValues(context), script.newInstance(context)); } @Override public SortedNumericDoubleValues doubleValues(LeafReaderContext context) throws IOException { return new DoubleValues(delegate.doubleValues(context), script.newInstance(context)); } static class LongValues extends AbstractSortingNumericDocValues implements ScorerAware { private final SortedNumericDocValues longValues; private final AggregationScript script; LongValues(SortedNumericDocValues values, AggregationScript script) { this.longValues = values; this.script = script; } @Override public void setScorer(Scorable scorer) { script.setScorer(scorer); } @Override public boolean advanceExact(int target) throws IOException { if (longValues.advanceExact(target)) { resize(longValues.docValueCount()); script.setDocument(target); for (int i = 0; i < docValueCount(); ++i) { script.setNextAggregationValue(longValues.nextValue()); values[i] = script.runAsLong(); } sort(); return true; } return false; } } static class DoubleValues extends SortingNumericDoubleValues implements ScorerAware { private final SortedNumericDoubleValues doubleValues; private final AggregationScript script; DoubleValues(SortedNumericDoubleValues values, AggregationScript script) { this.doubleValues = values; this.script = script; } @Override public void setScorer(Scorable scorer) { script.setScorer(scorer); } @Override public boolean advanceExact(int target) throws IOException { if (doubleValues.advanceExact(target)) { resize(doubleValues.docValueCount()); script.setDocument(target); for (int i = 0; i < docValueCount(); ++i) { script.setNextAggregationValue(doubleValues.nextValue()); values[i] = script.runAsDouble(); } sort(); return true; } return false; } } } public static class FieldData extends Numeric { protected final IndexNumericFieldData indexFieldData; public FieldData(IndexNumericFieldData indexFieldData) { this.indexFieldData = indexFieldData; } @Override public boolean isFloatingPoint() { return indexFieldData.getNumericType().isFloatingPoint(); } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) { return indexFieldData.load(context).getBytesValues(); } @Override public SortedNumericDocValues longValues(LeafReaderContext context) { return indexFieldData.load(context).getLongValues(); } @Override public SortedNumericDoubleValues doubleValues(LeafReaderContext context) { return indexFieldData.load(context).getDoubleValues(); } } /** * {@link ValuesSource} implementation for stand alone scripts returning a Numeric value */ public static class Script extends Numeric { private final AggregationScript.LeafFactory script; private final ValueType scriptValueType; public Script(AggregationScript.LeafFactory script, ValueType scriptValueType) { this.script = script; this.scriptValueType = scriptValueType; } @Override public boolean isFloatingPoint() { return scriptValueType != null ? scriptValueType == ValueType.DOUBLE : true; } @Override public SortedNumericDocValues longValues(LeafReaderContext context) throws IOException { return new ScriptLongValues(script.newInstance(context)); } @Override public SortedNumericDoubleValues doubleValues(LeafReaderContext context) throws IOException { return new ScriptDoubleValues(script.newInstance(context)); } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { return new ScriptBytesValues(script.newInstance(context)); } @Override public boolean needsScores() { return script.needs_score(); } } } /** * {@linkplain ValuesSource} for fields who's values are best thought of * as ranges of numbers, dates, or IP addresses. */ public static class Range extends ValuesSource { private final RangeType rangeType; protected final IndexFieldData indexFieldData; public Range(IndexFieldData indexFieldData, RangeType rangeType) { this.indexFieldData = indexFieldData; this.rangeType = rangeType; } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) { return indexFieldData.load(context).getBytesValues(); } @Override public DocValueBits docsWithValue(LeafReaderContext context) throws IOException { final SortedBinaryDocValues bytes = bytesValues(context); return org.elasticsearch.index.fielddata.FieldData.docsWithValue(bytes); } @Override public Function roundingPreparer() throws IOException { // TODO lookup the min and max rounding when appropriate return Rounding::prepareForUnknown; } public RangeType rangeType() { return rangeType; } } /** * {@linkplain ValuesSource} for fields who's values are best thought of * as points on a globe. */ public abstract static class GeoPoint extends ValuesSource { public static final GeoPoint EMPTY = new GeoPoint() { @Override public MultiGeoPointValues geoPointValues(LeafReaderContext context) { return org.elasticsearch.index.fielddata.FieldData.emptyMultiGeoPoints(); } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { return org.elasticsearch.index.fielddata.FieldData.emptySortedBinary(); } }; @Override public DocValueBits docsWithValue(LeafReaderContext context) throws IOException { final MultiGeoPointValues geoPoints = geoPointValues(context); return org.elasticsearch.index.fielddata.FieldData.docsWithValue(geoPoints); } @Override public final Function roundingPreparer() throws IOException { throw new AggregationExecutionException("can't round a [GEO_POINT]"); } public abstract MultiGeoPointValues geoPointValues(LeafReaderContext context); public static class Fielddata extends GeoPoint { protected final IndexGeoPointFieldData indexFieldData; public Fielddata(IndexGeoPointFieldData indexFieldData) { this.indexFieldData = indexFieldData; } @Override public SortedBinaryDocValues bytesValues(LeafReaderContext context) { return indexFieldData.load(context).getBytesValues(); } public org.elasticsearch.index.fielddata.MultiGeoPointValues geoPointValues(LeafReaderContext context) { return indexFieldData.load(context).getGeoPointValues(); } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy