org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.search.aggregations.bucket.terms;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.hppc.BitMixer;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.ToXContentFragment;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;
import java.io.IOException;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
/**
* Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
* exclusion has precedence, where the {@code include} is evaluated first and then the {@code exclude}.
*/
public class IncludeExclude implements Writeable, ToXContentFragment {
/** Name of the field that carries the include clause. */
public static final ParseField INCLUDE_FIELD = new ParseField("include");
/** Name of the field that carries the exclude clause. */
public static final ParseField EXCLUDE_FIELD = new ParseField("exclude");
/** Name of the field, inside a partition-based include object, that selects the zero-based partition. */
public static final ParseField PARTITION_FIELD = new ParseField("partition");
/** Name of the field, inside a partition-based include object, that gives the total number of partitions. */
public static final ParseField NUM_PARTITIONS_FIELD = new ParseField("num_partitions");
// Needed to add this seed for a deterministic term hashing policy
// otherwise tests fail to get expected results and worse, shards
// can disagree on which terms hash to the required partition.
private static final int HASH_PARTITIONING_SEED = 31;
// for parsing purposes only
// TODO: move all aggs to the same package so that this stuff could be pkg-private
/**
 * Combines a separately parsed include clause and exclude clause into a single instance.
 *
 * @param include the parsed include clause, or {@code null} if there was none
 * @param exclude the parsed exclude clause, or {@code null} if there was none
 * @return the other argument when one of them is {@code null} (possibly {@code null} itself),
 *         otherwise a new instance taking the include side from {@code include} and the
 *         exclude side from {@code exclude}
 * @throws IllegalArgumentException if both are present and {@code include} is partition based
 */
public static IncludeExclude merge(IncludeExclude include, IncludeExclude exclude) {
    if (include == null || exclude == null) {
        // at most one side is present: hand back whichever one it is
        return include == null ? exclude : include;
    }
    if (include.isPartitionBased()) {
        throw new IllegalArgumentException("Cannot specify any excludes when using a partition-based include");
    }
    final String includePattern = include.include == null ? null : include.include.getOriginalString();
    final String excludePattern = exclude.exclude == null ? null : exclude.exclude.getOriginalString();
    return new IncludeExclude(includePattern, excludePattern, include.includeValues, exclude.excludeValues);
}
/**
 * Parses an include clause from the parser's current token.
 * A string is taken as a regular expression, an array as a set of exact values, and an object
 * as a partition-based include made of {@code partition} and {@code num_partitions}.
 *
 * @param parser parser positioned on the first token of the include clause
 * @return the parsed include
 * @throws ElasticsearchParseException if the object form contains an unknown parameter
 * @throws IllegalArgumentException    if the token type is not supported or a partition parameter is missing
 * @throws IOException                 if the parser fails to read
 */
public static IncludeExclude parseInclude(XContentParser parser) throws IOException {
    XContentParser.Token token = parser.currentToken();
    if (token == XContentParser.Token.VALUE_STRING) {
        return new IncludeExclude(parser.text(), null, null, null);
    }
    if (token == XContentParser.Token.START_ARRAY) {
        return new IncludeExclude(null, null, new TreeSet<>(parseArrayToSet(parser)), null);
    }
    if (token != XContentParser.Token.START_OBJECT) {
        throw new IllegalArgumentException("Unrecognized token for an include [" + token + "]");
    }

    // object form: { "partition": <int>, "num_partitions": <int> }
    String fieldName = null;
    Integer partition = null;
    Integer numPartitions = null;
    for (token = parser.nextToken(); token != XContentParser.Token.END_OBJECT; token = parser.nextToken()) {
        if (token == XContentParser.Token.FIELD_NAME) {
            fieldName = parser.currentName();
            continue;
        }
        if (NUM_PARTITIONS_FIELD.match(fieldName, parser.getDeprecationHandler())) {
            numPartitions = parser.intValue();
        } else if (PARTITION_FIELD.match(fieldName, parser.getDeprecationHandler())) {
            partition = parser.intValue();
        } else {
            throw new ElasticsearchParseException("Unknown parameter in Include/Exclude clause: " + fieldName);
        }
    }

    if (partition == null) {
        throw new IllegalArgumentException(
            "Missing [" + PARTITION_FIELD.getPreferredName() + "] parameter for partition-based include"
        );
    }
    if (numPartitions == null) {
        throw new IllegalArgumentException(
            "Missing [" + NUM_PARTITIONS_FIELD.getPreferredName() + "] parameter for partition-based include"
        );
    }
    return new IncludeExclude(partition, numPartitions);
}
/**
 * Parses an exclude clause from the parser's current token.
 * A string is taken as a regular expression and an array as a set of exact values.
 *
 * @param parser parser positioned on the first token of the exclude clause
 * @return the parsed exclude
 * @throws IllegalArgumentException if the current token is neither a string nor the start of an array
 * @throws IOException              if the parser fails to read
 */
public static IncludeExclude parseExclude(XContentParser parser) throws IOException {
    final XContentParser.Token first = parser.currentToken();
    if (first == XContentParser.Token.VALUE_STRING) {
        return new IncludeExclude(null, parser.text(), null, null);
    }
    if (first == XContentParser.Token.START_ARRAY) {
        return new IncludeExclude(null, null, null, new TreeSet<>(parseArrayToSet(parser)));
    }
    throw new IllegalArgumentException("Unrecognized token for an exclude [" + first + "]");
}
/** Common base type of the filters returned by the {@code convertTo*Filter} methods of this class. */
public abstract static class Filter {}
/**
 * Filter over {@code long} values.
 * The include and exclude values, which are {@link BytesRef}s after parsing, are converted into a
 * {@code LongFilter} when used on numeric fields in the index.
 */
public abstract static class LongFilter extends Filter {
/**
 * @param value the value to test
 * @return {@code true} if the value passes the filter
 */
public abstract boolean accept(long value);
}
/**
 * {@link LongFilter} that accepts only the values falling into the partition configured on the
 * enclosing {@link IncludeExclude}.
 */
public class PartitionedLongFilter extends LongFilter {
    @Override
    public boolean accept(long value) {
        // mix the bits before taking the modulo to keep the distribution across partitions even
        final long bucket = Math.floorMod(BitMixer.mix64(value), incNumPartitions);
        return bucket == incZeroBasedPartition;
    }
}
/**
 * {@link LongFilter} backed by an optional set of accepted values and an optional set of rejected values.
 * A missing set imposes no restriction. Instances are not thread-safe because {@link #accept(long)}
 * reuses a single mutable lookup key.
 */
public static class SetBackedLongFilter extends LongFilter {
    // Autoboxing long could cause allocations when doing Set.contains, so
    // this alternative to java.lang.Long is not final so that a preallocated instance
    // can be used in accept (note that none of this is threadsafe!)
    private static class Long {
        private long value;

        private Long(long value) {
            this.value = value;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            Long that = (Long) o;
            return value == that.value;
        }

        @Override
        public int hashCode() {
            return java.lang.Long.hashCode(value);
        }
    }

    // Both sets stay null when the corresponding expected size is zero.
    private Set<Long> valids;
    private Set<Long> invalids;
    // Reusable lookup key, overwritten on every call to accept.
    private final Long spare = new Long(0);

    /**
     * @param numValids   expected number of accepted values; the set is only allocated when positive
     * @param numInvalids expected number of rejected values; the set is only allocated when positive
     */
    private SetBackedLongFilter(int numValids, int numInvalids) {
        if (numValids > 0) {
            valids = Sets.newHashSetWithExpectedSize(numValids);
        }
        if (numInvalids > 0) {
            invalids = Sets.newHashSetWithExpectedSize(numInvalids);
        }
    }

    /**
     * @return {@code true} if the value is in the accepted set (when there is one) and is not in
     *         the rejected set (when there is one)
     */
    @Override
    public boolean accept(long value) {
        spare.value = value;
        return (valids == null || valids.contains(spare)) && (invalids == null || invalids.contains(spare) == false);
    }

    /** Registers an accepted value; the filter must have been created with a positive {@code numValids}. */
    private void addAccept(long val) {
        valids.add(new Long(val));
    }

    /** Registers a rejected value; the filter must have been created with a positive {@code numInvalids}. */
    private void addReject(long val) {
        invalids.add(new Long(val));
    }
}
/**
 * Filter over terms given as {@link BytesRef}s.
 * Only used for the 'map' execution mode (ie. scripts).
 */
public abstract static class StringFilter extends Filter {
/**
 * @param value the term to test
 * @return {@code true} if the term passes the filter
 */
public abstract boolean accept(BytesRef value);
}
/**
 * {@link StringFilter} that accepts only the terms whose seeded murmur3 hash falls into the
 * partition configured on the enclosing {@link IncludeExclude}.
 */
class PartitionedStringFilter extends StringFilter {
    @Override
    public boolean accept(BytesRef value) {
        final int termHash = StringHelper.murmurhash3_x86_32(value, HASH_PARTITIONING_SEED);
        final int bucket = Math.floorMod(termHash, incNumPartitions);
        return bucket == incZeroBasedPartition;
    }
}
/**
 * {@link StringFilter} built from the include/exclude value sets and regular expressions of the
 * enclosing {@link IncludeExclude}.
 */
class SetAndRegexStringFilter extends StringFilter {
    // null when neither an include nor an exclude pattern is set
    private final ByteRunAutomaton runAutomaton;
    // null when no include values were given
    private final Set<BytesRef> valids;
    // null when no exclude values were given
    private final Set<BytesRef> invalids;

    /**
     * @param format format used to convert the user-supplied values into their doc-values form
     */
    private SetAndRegexStringFilter(DocValueFormat format) {
        Automaton automaton = toAutomaton();
        this.runAutomaton = automaton == null ? null : new ByteRunAutomaton(automaton);
        this.valids = parseForDocValues(includeValues, format);
        this.invalids = parseForDocValues(excludeValues, format);
    }

    /**
     * Returns whether the given value is accepted based on the {@code includeValues} and {@code excludeValues}
     * sets, as well as the {@code include} and {@code exclude} patterns.
     */
    @Override
    public boolean accept(BytesRef value) {
        if (valids != null && valids.contains(value) == false) {
            return false;
        }
        if (runAutomaton != null && runAutomaton.run(value.bytes, value.offset, value.length) == false) {
            return false;
        }
        return invalids == null || invalids.contains(value) == false;
    }
}
/** Filter that selects terms by their global ordinal. */
public abstract static class OrdinalsFilter extends Filter {
/**
 * @param globalOrdinals the doc values whose ordinals are filtered
 * @return a bit set with one bit set per accepted global ordinal
 * @throws IOException if reading the doc values fails
 */
public abstract LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException;
}
/**
 * {@link OrdinalsFilter} that accepts the ordinals of the terms whose seeded murmur3 hash falls
 * into the partition configured on the enclosing {@link IncludeExclude}.
 */
class PartitionedOrdinalsFilter extends OrdinalsFilter {
    @Override
    public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
        final LongBitSet accepted = new LongBitSet(globalOrdinals.getValueCount());
        final TermsEnum terms = globalOrdinals.termsEnum();
        // visit every term once and flag the ordinals that hash to the requested partition
        for (BytesRef term = terms.next(); term != null; term = terms.next()) {
            final int bucket = Math.floorMod(StringHelper.murmurhash3_x86_32(term, HASH_PARTITIONING_SEED), incNumPartitions);
            if (bucket == incZeroBasedPartition) {
                accepted.set(terms.ord());
            }
        }
        return accepted;
    }
}
/**
 * {@link OrdinalsFilter} built from the include/exclude value sets and regular expressions of the
 * enclosing {@link IncludeExclude}.
 */
class SetAndRegexOrdinalsFilter extends OrdinalsFilter {
    // null when neither an include nor an exclude pattern is set
    private final CompiledAutomaton compiled;
    // null when no include values were given
    private final SortedSet<BytesRef> valids;
    // null when no exclude values were given
    private final SortedSet<BytesRef> invalids;

    /**
     * @param format format used to convert the user-supplied values into their doc-values form
     */
    private SetAndRegexOrdinalsFilter(DocValueFormat format) {
        Automaton automaton = toAutomaton();
        this.compiled = automaton == null ? null : new CompiledAutomaton(automaton);
        this.valids = parseForDocValues(includeValues, format);
        this.invalids = parseForDocValues(excludeValues, format);
    }

    /**
     * Computes which global ordinals are accepted by this IncludeExclude instance, based on the combination of
     * the {@code includeValues} and {@code excludeValues} sets, as well as the {@code include} and
     * {@code exclude} patterns.
     *
     * @param globalOrdinals the doc values whose ordinals are filtered
     * @return a bit set with one bit set per accepted global ordinal
     * @throws IOException if reading the doc values fails
     */
    @Override
    public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
        LongBitSet acceptedGlobalOrdinals = null;
        if (valids != null) {
            // start from the explicitly included terms that actually exist in the doc values
            acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
            for (BytesRef term : valids) {
                long ord = globalOrdinals.lookupTerm(term);
                if (ord >= 0) {
                    acceptedGlobalOrdinals.set(ord);
                }
            }
        }
        if (compiled != null) {
            LongBitSet automatonGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
            TermsEnum globalTermsEnum;
            Terms globalTerms = new DocValuesTerms(globalOrdinals);
            // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
            globalTermsEnum = compiled.getTermsEnum(globalTerms);
            for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
                automatonGlobalOrdinals.set(globalTermsEnum.ord());
            }
            if (acceptedGlobalOrdinals == null) {
                acceptedGlobalOrdinals = automatonGlobalOrdinals;
            } else {
                // a term must satisfy both the include set and the automaton
                acceptedGlobalOrdinals.and(automatonGlobalOrdinals);
            }
        }
        if (acceptedGlobalOrdinals == null) {
            acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
            if (acceptedGlobalOrdinals.length() > 0) {
                // default to all terms being acceptable
                acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
            }
        }
        if (invalids != null) {
            // explicit excludes are applied last, so they override everything accepted above
            for (BytesRef term : invalids) {
                long ord = globalOrdinals.lookupTerm(term);
                if (ord >= 0) {
                    acceptedGlobalOrdinals.clear(ord);
                }
            }
        }
        return acceptedGlobalOrdinals;
    }
}
// Regular expressions for the terms to include / exclude; each is null when not specified.
private final RegExp include, exclude;
// Exact terms to include / exclude; each is null when not specified.
// The element type must be spelled out: the code iterates these sets as BytesRef.
private final SortedSet<BytesRef> includeValues, excludeValues;
// Zero-based partition to keep for a partition-based include.
private final int incZeroBasedPartition;
// Total number of partitions; isPartitionBased() reports true only when this is positive.
private final int incNumPartitions;
/**
 * Creates an include/exclude based on regular expressions and/or sets of exact values.
 * At least one argument must be non-null, and each side (include, exclude) can be given either
 * as a pattern or as a set of values, but not both.
 *
 * @param include The regular expression pattern for the terms to be included
 * @param exclude The regular expression pattern for the terms to be excluded
 * @param includeValues The exact terms to be included
 * @param excludeValues The exact terms to be excluded
 * @throws IllegalArgumentException if all arguments are {@code null}, or if one side is given
 *                                  both as a pattern and as a set of values
 */
public IncludeExclude(
    @Nullable String include,
    @Nullable String exclude,
    @Nullable SortedSet<BytesRef> includeValues,
    @Nullable SortedSet<BytesRef> excludeValues
) {
    if (include == null && exclude == null && includeValues == null && excludeValues == null) {
        throw new IllegalArgumentException("At least one of include, exclude, includeValues or excludeValues must be specified");
    }
    if (include != null && includeValues != null) {
        throw new IllegalArgumentException("Cannot specify both an include pattern and a set of include values");
    }
    if (exclude != null && excludeValues != null) {
        throw new IllegalArgumentException("Cannot specify both an exclude pattern and a set of exclude values");
    }
    this.include = include == null ? null : new RegExp(include);
    this.exclude = exclude == null ? null : new RegExp(exclude);
    this.includeValues = includeValues;
    this.excludeValues = excludeValues;
    // zero partitions marks this instance as not partition based
    this.incZeroBasedPartition = 0;
    this.incNumPartitions = 0;
}
public IncludeExclude(int partition, int numPartitions) {
if (partition < 0 || partition >= numPartitions) {
throw new IllegalArgumentException("Partition must be >=0 and < numPartition which is " + numPartitions);
}
this.incZeroBasedPartition = partition;
this.incNumPartitions = numPartitions;
this.include = null;
this.exclude = null;
this.includeValues = null;
this.excludeValues = null;
}
/**
 * Read from a stream, consuming the fields in the order {@code writeTo} emits them: the optional
 * patterns, the optional include values, the optional exclude values, then the partition settings.
 */
public IncludeExclude(StreamInput in) throws IOException {
    RegExp includeRegex = null;
    RegExp excludeRegex = null;
    if (in.readBoolean()) {
        final String includePattern = in.readOptionalString();
        if (includePattern != null) {
            includeRegex = new RegExp(includePattern);
        }
        final String excludePattern = in.readOptionalString();
        if (excludePattern != null) {
            excludeRegex = new RegExp(excludePattern);
        }
    }
    this.include = includeRegex;
    this.exclude = excludeRegex;

    SortedSet<BytesRef> includes = null;
    if (in.readBoolean()) {
        includes = new TreeSet<>();
        for (int remaining = in.readVInt(); remaining > 0; remaining--) {
            includes.add(in.readBytesRef());
        }
    }
    this.includeValues = includes;

    SortedSet<BytesRef> excludes = null;
    if (in.readBoolean()) {
        excludes = new TreeSet<>();
        for (int remaining = in.readVInt(); remaining > 0; remaining--) {
            excludes.add(in.readBytesRef());
        }
    }
    this.excludeValues = excludes;

    this.incNumPartitions = in.readVInt();
    this.incZeroBasedPartition = in.readVInt();
}
/**
 * Writes this instance to the stream: a flag plus the optional patterns, a flag plus the include
 * values, a flag plus the exclude values, then the number of partitions and the partition.
 */
@Override
public void writeTo(StreamOutput out) throws IOException {
    if (isRegexBased()) {
        out.writeBoolean(true);
        out.writeOptionalString(include == null ? null : include.getOriginalString());
        out.writeOptionalString(exclude == null ? null : exclude.getOriginalString());
    } else {
        out.writeBoolean(false);
    }

    if (includeValues == null) {
        out.writeBoolean(false);
    } else {
        out.writeBoolean(true);
        out.writeCollection(includeValues, StreamOutput::writeBytesRef);
    }

    if (excludeValues == null) {
        out.writeBoolean(false);
    } else {
        out.writeBoolean(true);
        out.writeCollection(excludeValues, StreamOutput::writeBytesRef);
    }

    out.writeVInt(incNumPartitions);
    out.writeVInt(incZeroBasedPartition);
}
/**
 * Adapter exposing the terms of a {@link SortedSetDocValues} through the {@link Terms} API.
 * Only {@link #iterator()} is backed by the doc values; every statistic reports {@code -1} and
 * every capability flag reports {@code false}.
 */
private static class DocValuesTerms extends Terms {
    private final SortedSetDocValues docValues;

    DocValuesTerms(SortedSetDocValues values) {
        this.docValues = values;
    }

    @Override
    public TermsEnum iterator() throws IOException {
        return docValues.termsEnum();
    }

    // ---- statistics: all reported as -1 ----

    @Override
    public long size() throws IOException {
        return -1;
    }

    @Override
    public int getDocCount() throws IOException {
        return -1;
    }

    @Override
    public long getSumDocFreq() throws IOException {
        return -1;
    }

    @Override
    public long getSumTotalTermFreq() throws IOException {
        return -1;
    }

    // ---- capabilities: none ----

    @Override
    public boolean hasFreqs() {
        return false;
    }

    @Override
    public boolean hasPositions() {
        return false;
    }

    @Override
    public boolean hasOffsets() {
        return false;
    }

    @Override
    public boolean hasPayloads() {
        return false;
    }
}
/**
 * Reads an array of values from the parser into a set of terms.
 *
 * @param parser parser positioned on the start of the array
 * @return the text of each array element as a {@link BytesRef}, without duplicates
 * @throws ElasticsearchParseException if the parser is not on the start of an array, or if an
 *                                     element is not a value token
 * @throws IOException                 if the parser fails to read
 */
private static Set<BytesRef> parseArrayToSet(XContentParser parser) throws IOException {
    if (parser.currentToken() != XContentParser.Token.START_ARRAY) {
        throw new ElasticsearchParseException("Missing start of array in include/exclude clause");
    }
    final Set<BytesRef> set = new HashSet<>();
    while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
        if (parser.currentToken().isValue() == false) {
            throw new ElasticsearchParseException("Array elements in include/exclude clauses should be string values");
        }
        set.add(new BytesRef(parser.text()));
    }
    return set;
}
/** Returns {@code true} if an include pattern, an exclude pattern, or both are set. */
public boolean isRegexBased() {
    final boolean noPatterns = include == null && exclude == null;
    return noPatterns == false;
}
/** Returns {@code true} if this instance selects a partition, i.e. the number of partitions is positive. */
public boolean isPartitionBased() {
    return incNumPartitions >= 1;
}
/**
 * Builds the automaton accepting the terms matched by the include pattern (any string when there
 * is none) minus the terms matched by the exclude pattern.
 *
 * @return the automaton, or {@code null} when neither pattern is set
 */
private Automaton toAutomaton() {
    if (include == null && exclude == null) {
        return null;
    }
    Automaton accepted = include == null ? Automata.makeAnyString() : include.toAutomaton();
    if (exclude != null) {
        accepted = Operations.minus(accepted, exclude.toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
    }
    return accepted;
}
/**
 * Creates the filter to apply to terms given as {@link BytesRef}s.
 *
 * @param format format used to convert the user-supplied values into their doc-values form
 * @return a partition filter when this instance is partition based, otherwise a set/regex filter
 */
public StringFilter convertToStringFilter(DocValueFormat format) {
    return isPartitionBased() ? new PartitionedStringFilter() : new SetAndRegexStringFilter(format);
}
/**
 * Converts values as written by the end user into the form stored in doc values.
 *
 * @param endUserFormattedValues the user-supplied values, may be {@code null}
 * @param format                 the format used to parse each value
 * @return the input itself when it is {@code null} or the format is {@link DocValueFormat#RAW},
 *         otherwise a new sorted set holding every value parsed with {@code format}
 */
private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUserFormattedValues, DocValueFormat format) {
    if (endUserFormattedValues == null || format == DocValueFormat.RAW) {
        // nothing to convert: hand the input back untouched
        return endUserFormattedValues;
    }
    final SortedSet<BytesRef> result = new TreeSet<>();
    for (BytesRef formattedVal : endUserFormattedValues) {
        result.add(format.parseBytesRef(formattedVal.utf8ToString()));
    }
    return result;
}
/**
 * Creates the filter to apply to global ordinals.
 *
 * @param format format used to convert the user-supplied values into their doc-values form
 * @return a partition filter when this instance is partition based, otherwise a set/regex filter
 */
public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {
    return isPartitionBased() ? new PartitionedOrdinalsFilter() : new SetAndRegexOrdinalsFilter(format);
}
/**
 * Creates the filter to apply to {@code long} values.
 *
 * @param format format used to parse each include/exclude value into a {@code long}
 * @return a partition filter when this instance is partition based, otherwise a filter backed by
 *         the parsed include and exclude values
 */
public LongFilter convertToLongFilter(DocValueFormat format) {
    if (isPartitionBased()) {
        return new PartitionedLongFilter();
    }
    final boolean hasIncludes = includeValues != null;
    final boolean hasExcludes = excludeValues != null;
    final SetBackedLongFilter filter = new SetBackedLongFilter(
        hasIncludes ? includeValues.size() : 0,
        hasExcludes ? excludeValues.size() : 0
    );
    if (hasIncludes) {
        for (BytesRef accepted : includeValues) {
            filter.addAccept(format.parseLong(accepted.utf8ToString(), false, null));
        }
    }
    if (hasExcludes) {
        for (BytesRef rejected : excludeValues) {
            filter.addReject(format.parseLong(rejected.utf8ToString(), false, null));
        }
    }
    return filter;
}
/**
 * Creates the filter to apply to {@code double} values encoded as sortable longs.
 * Each include/exclude value is parsed with {@link Double#parseDouble(String)} and stored in its
 * sortable-long encoding.
 *
 * @return a partition filter when this instance is partition based, otherwise a filter backed by
 *         the encoded include and exclude values
 */
public LongFilter convertToDoubleFilter() {
    if (isPartitionBased()) {
        return new PartitionedLongFilter();
    }
    final boolean hasIncludes = includeValues != null;
    final boolean hasExcludes = excludeValues != null;
    final SetBackedLongFilter filter = new SetBackedLongFilter(
        hasIncludes ? includeValues.size() : 0,
        hasExcludes ? excludeValues.size() : 0
    );
    if (hasIncludes) {
        for (BytesRef accepted : includeValues) {
            filter.addAccept(NumericUtils.doubleToSortableLong(Double.parseDouble(accepted.utf8ToString())));
        }
    }
    if (hasExcludes) {
        for (BytesRef rejected : excludeValues) {
            filter.addReject(NumericUtils.doubleToSortableLong(Double.parseDouble(rejected.utf8ToString())));
        }
    }
    return filter;
}
/**
 * Writes the include clause (pattern, value array or partition object) followed by the exclude
 * clause (pattern or value array) into the builder. A clause that is not set is omitted.
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    final String includeName = INCLUDE_FIELD.getPreferredName();
    if (include != null) {
        builder.field(includeName, include.getOriginalString());
    } else if (includeValues != null) {
        builder.startArray(includeName);
        for (BytesRef term : includeValues) {
            builder.value(term.utf8ToString());
        }
        builder.endArray();
    } else if (isPartitionBased()) {
        builder.startObject(includeName);
        builder.field(PARTITION_FIELD.getPreferredName(), incZeroBasedPartition);
        builder.field(NUM_PARTITIONS_FIELD.getPreferredName(), incNumPartitions);
        builder.endObject();
    }

    final String excludeName = EXCLUDE_FIELD.getPreferredName();
    if (exclude != null) {
        builder.field(excludeName, exclude.getOriginalString());
    } else if (excludeValues != null) {
        builder.startArray(excludeName);
        for (BytesRef term : excludeValues) {
            builder.value(term.utf8ToString());
        }
        builder.endArray();
    }
    return builder;
}
/**
 * Hashes the same state that {@code equals} compares: the pattern strings, the value sets and
 * the partition settings.
 */
@Override
public int hashCode() {
    final String includePattern = include == null ? null : include.getOriginalString();
    final String excludePattern = exclude == null ? null : exclude.getOriginalString();
    return Objects.hash(includePattern, excludePattern, includeValues, excludeValues, incZeroBasedPartition, incNumPartitions);
}
/**
 * Two instances are equal when they have the same class, the same pattern strings, the same
 * value sets and the same partition settings.
 */
@Override
public boolean equals(Object obj) {
    if (obj == null || getClass() != obj.getClass()) {
        return false;
    }
    final IncludeExclude that = (IncludeExclude) obj;
    // patterns are compared through their original strings
    final String thisInclude = include == null ? null : include.getOriginalString();
    final String thatInclude = that.include == null ? null : that.include.getOriginalString();
    final String thisExclude = exclude == null ? null : exclude.getOriginalString();
    final String thatExclude = that.exclude == null ? null : that.exclude.getOriginalString();
    return Objects.equals(thisInclude, thatInclude)
        && Objects.equals(thisExclude, thatExclude)
        && Objects.equals(includeValues, that.includeValues)
        && Objects.equals(excludeValues, that.excludeValues)
        && incZeroBasedPartition == that.incZeroBasedPartition
        && incNumPartitions == that.incNumPartitions;
}
}