// com.yahoo.search.predicate.PredicateIndexBuilder Maven / Gradle / Ivy
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.predicate;
import com.yahoo.api.annotations.Beta;
import com.google.common.base.Preconditions;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Shorts;
import com.yahoo.document.predicate.BooleanPredicate;
import com.yahoo.document.predicate.Predicate;
import com.yahoo.search.predicate.annotator.PredicateTreeAnnotations;
import com.yahoo.search.predicate.annotator.PredicateTreeAnnotator;
import com.yahoo.search.predicate.index.Feature;
import com.yahoo.search.predicate.index.Interval;
import com.yahoo.search.predicate.index.IntervalWithBounds;
import com.yahoo.search.predicate.index.Posting;
import com.yahoo.search.predicate.index.PredicateIntervalStore;
import com.yahoo.search.predicate.index.PredicateOptimizer;
import com.yahoo.search.predicate.index.SimpleIndex;
import com.yahoo.search.predicate.index.conjunction.ConjunctionIndexBuilder;
import com.yahoo.search.predicate.index.conjunction.IndexableFeatureConjunction;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import static java.util.stream.Collectors.joining;
/**
* A builder for {@link PredicateIndex}.
*
* When creating a PredicateIndexBuilder, you must specify an arity. The arity is used for
* range features and is a trade-off between index size and query speed: a higher
* arity gives a larger index but faster searches.
*
*
* {@link #indexDocument(int, Predicate)}
* takes a document id and a predicate to insert into the index.
* Predicates should be specified using the predicate syntax described in the documentation.
* Create the {@link Predicate} objects using {@link Predicate#fromString(String)}.
*
*
* Use {@link #build()} to create an instance of {@link PredicateIndex}.
*
* @author bjorncs
*/
@Beta
public class PredicateIndexBuilder {
// Unique ids / mapping from internal to external id. LinkedHashSet as the insertion order is crucial.
private final Set<Integer> seenIds = new LinkedHashSet<>();
// Interval end of each indexed document, appended in indexing order (narrowed to short).
private final List<Short> intervalEndsBuilder = new ArrayList<>();
// Min-feature of each indexed document, appended in indexing order (narrowed to byte).
private final List<Byte> minFeatureIndexBuilder = new ArrayList<>();
// Internal ids of the documents whose predicate always matches.
private final List<Integer> zeroConstraintDocuments = new ArrayList<>();
// Postings for interval features, bounds features and conjunctions respectively.
private final SimpleIndex.Builder intervalIndexBuilder = new SimpleIndex.Builder();
private final SimpleIndex.Builder boundsIndexBuilder = new SimpleIndex.Builder();
private final SimpleIndex.Builder conjunctionIntervalIndexBuilder = new SimpleIndex.Builder();
private final ConjunctionIndexBuilder conjunctionIndexBuilder = new ConjunctionIndexBuilder();
// Stores the interval lists that the postings above refer to.
private final PredicateIntervalStore.Builder intervalStoreBuilder;
private final PredicateOptimizer optimizer;
private final Config config;
// Internal id that will be assigned to the next indexed document.
private int documentIdCounter = 0;
// Statistics: number of documents having ZStar intervals, and the total number of such intervals.
private int nZStarDocuments = 0;
private int nZStarIntervals = 0;
// Largest interval end seen across all indexed documents (never below 1).
private int highestIntervalEnd = 1;
/**
 * Creates a PredicateIndexBuilder that uses the default lower and upper bounds.
 * Delegates to {@link #PredicateIndexBuilder(Config)} with a config where only the arity is set.
 *
 * @param arity the arity to use when indexing range predicates.
 *              A small arity gives a smaller index, but more expensive searches.
 */
public PredicateIndexBuilder(int arity) {
    this(new Config.Builder()
            .setArity(arity)
            .build());
}
/**
 * Creates a PredicateIndexBuilder with an explicit value range for range predicates.
 * Limiting the range of possible values in range predicates reduces index size
 * and increases search performance.
 * Delegates to {@link #PredicateIndexBuilder(Config)}.
 *
 * @param arity the arity to use when indexing range predicates.
 *              A small arity gives a smaller index, but more expensive searches.
 * @param lowerBound the lower bound for the range of values used by range predicates
 * @param upperBound the upper bound for the range of values used by range predicates
 */
public PredicateIndexBuilder(int arity, long lowerBound, long upperBound) {
    this(new Config.Builder()
            .setArity(arity)
            .setLowerBound(lowerBound)
            .setUpperBound(upperBound)
            .build());
}
/**
 * Creates a PredicateIndexBuilder from a Config object.
 * The same config is handed to the predicate optimizer and later to the built index.
 *
 * @param config configuration for the PredicateIndexBuilder
 */
public PredicateIndexBuilder(Config config) {
    this.intervalStoreBuilder = new PredicateIntervalStore.Builder();
    this.optimizer = new PredicateOptimizer(config);
    this.config = config;
}
/**
 * Indexes a predicate with the given id.
 * <p>
 * A predicate that is the constant {@code false} can never match; it is skipped
 * without being indexed and without reserving {@code docId}.
 * <p>
 * The document id is only registered once the predicate has been indexed. If indexing
 * is rejected (for instance because the predicate is too complex), neither the id
 * mapping nor the internal id counter is advanced, so the builder remains usable.
 *
 * @param docId a 32-bit document id, returned in the Hit objects when the predicate matches
 * @param predicate the predicate to index
 * @throws IllegalStateException if the index is full, or if the predicate is too complex to index
 * @throws IllegalArgumentException if {@code docId} is already in use
 */
public void indexDocument(int docId, Predicate predicate) {
    if (documentIdCounter == Integer.MAX_VALUE) {
        throw new IllegalStateException("Index is full, max number of documents is: " + Integer.MAX_VALUE);
    }
    if (seenIds.contains(docId)) {
        throw new IllegalArgumentException("Document id is already in use: " + docId);
    }
    if (isNeverMatchingDocument(predicate)) {
        return;
    }

    Predicate optimized = optimizer.optimizePredicate(predicate);
    int internalId = documentIdCounter;
    if (isAlwaysMatchingDocument(optimized)) {
        indexZeroConstraintDocument(internalId);
    } else {
        // Validates the annotations before touching any builder state, and throws if they are out of range.
        indexDocument(internalId, PredicateTreeAnnotator.createPredicateTreeAnnotations(optimized));
    }

    // Commit the id last: registering it before a failed validation would leave seenIds and the
    // counter one step ahead of the per-document lists and shift every later internal id.
    seenIds.add(docId);
    documentIdCounter = internalId + 1;
}
/** Returns true if the predicate is a boolean constant with the value {@code true}. */
private static boolean isAlwaysMatchingDocument(Predicate p) {
    if (p instanceof BooleanPredicate constant) {
        return constant.getValue();
    }
    return false;
}
/** Returns true if the predicate is a boolean constant with the value {@code false}. */
private static boolean isNeverMatchingDocument(Predicate p) {
    if (!(p instanceof BooleanPredicate constant)) {
        return false;
    }
    return !constant.getValue();
}
/**
 * Registers a document whose predicate always matches.
 * Appends a min-feature of 0 and the zero-constraint interval end for the document,
 * and records its id in the list of zero-constraint documents.
 *
 * @param docId the id to record for the document
 */
private void indexZeroConstraintDocument(int docId) {
    final byte noRequiredFeatures = (byte) 0;
    final short zeroConstraintEnd = (short) Interval.ZERO_CONSTRAINT_RANGE;

    zeroConstraintDocuments.add(docId);
    intervalEndsBuilder.add(zeroConstraintEnd);
    minFeatureIndexBuilder.add(noRequiredFeatures);
}
/**
 * Indexes an annotated predicate under the given id.
 * <p>
 * Both range checks run before any builder state is modified, so a rejected
 * predicate leaves the per-document lists and the indexes untouched.
 *
 * @param docId the id to store in the postings for this document
 * @param annotations the annotations computed for the document's predicate
 * @throws IllegalStateException if the min-feature or the interval end is too large to be indexed
 */
private void indexDocument(int docId, PredicateTreeAnnotations annotations) {
    // Guava's Preconditions only substitutes %s placeholders; %d would be left in the message verbatim.
    int minFeature = annotations.minFeature;
    Preconditions.checkState(minFeature <= 0xFF,
            "Predicate is too complex. Expected min-feature at most %s, was %s.", 0xFF, minFeature);
    int intervalEnd = annotations.intervalEnd;
    Preconditions.checkState(intervalEnd <= Interval.MAX_INTERVAL_END,
            "Predicate is too complex. Expected interval end at most %s, was %s.",
            Interval.MAX_INTERVAL_END, intervalEnd);

    highestIntervalEnd = Math.max(highestIntervalEnd, intervalEnd);
    // Narrowing casts keep the low bits only; values above the signed maximum wrap to negative.
    // NOTE(review): presumably readers treat these as unsigned - confirm against PredicateIndex.
    intervalEndsBuilder.add((short) intervalEnd);
    minFeatureIndexBuilder.add((byte) minFeature);

    indexDocumentFeatures(docId, annotations.intervalMap);
    indexDocumentBoundsFeatures(docId, annotations.boundsMap);
    indexDocumentConjunctions(docId, annotations.featureConjunctions);
    aggregateZStarStatistics(annotations.intervalMap);
}
private void aggregateZStarStatistics(Map> intervalMap) {
List intervals = intervalMap.get(Feature.Z_STAR_COMPRESSED_ATTRIBUTE_HASH);
if (intervals != null) {
++nZStarDocuments;
nZStarIntervals += intervals.size();
}
}
private void indexDocumentFeatures(int docId, Map> intervalMap) {
intervalMap.entrySet().stream()
.forEach(entry -> intervalIndexBuilder.insert(entry.getKey(),
new Posting(docId,
intervalStoreBuilder.insert(entry.getValue()))));
}
private void indexDocumentBoundsFeatures(int docId, Map> boundsMap) {
boundsMap.entrySet().stream()
.forEach(entry -> boundsIndexBuilder.insert(entry.getKey(),
new Posting(docId,
intervalStoreBuilder.insert(
entry.getValue().stream().flatMap(IntervalWithBounds::stream).toList()))));
}
private void indexDocumentConjunctions(
int docId, Map> featureConjunctions) {
for (Map.Entry> e : featureConjunctions.entrySet()) {
IndexableFeatureConjunction fc = e.getKey();
List intervals = e.getValue();
Posting posting = new Posting(docId, intervalStoreBuilder.insert(intervals));
conjunctionIntervalIndexBuilder.insert(fc.id, posting);
conjunctionIndexBuilder.indexConjunction(fc);
}
}
/**
 * Builds a {@link PredicateIndex} from everything indexed so far.
 * The per-document collections are copied into primitive arrays and each
 * sub-builder is asked to build its part of the index.
 *
 * @return the resulting index
 */
public PredicateIndex build() {
    // Copied in iteration order, i.e. the order in which documents were indexed.
    int[] externalIds = Ints.toArray(seenIds);
    byte[] minFeatures = Bytes.toArray(minFeatureIndexBuilder);
    short[] intervalEnds = Shorts.toArray(intervalEndsBuilder);

    return new PredicateIndex(
            config,
            externalIds,
            minFeatures,
            intervalEnds,
            highestIntervalEnd,
            intervalIndexBuilder.build(),
            boundsIndexBuilder.build(),
            conjunctionIntervalIndexBuilder.build(),
            intervalStoreBuilder.build(),
            conjunctionIndexBuilder.build(),
            Ints.toArray(zeroConstraintDocuments));
}
/**
 * Returns the number of indexed documents whose predicate always matches.
 *
 * @return the size of the zero-constraint document list
 */
public int getZeroConstraintDocCount() {
    final int count = zeroConstraintDocuments.size();
    return count;
}
/**
 * Collects metrics describing the current state of this builder.
 * The returned object captures the values at the time of the call.
 *
 * @return an object containing metrics
 */
public PredicateIndexStats getStats() {
    return new PredicateIndexStats(
            zeroConstraintDocuments,
            intervalIndexBuilder,
            boundsIndexBuilder,
            intervalStoreBuilder,
            conjunctionIndexBuilder,
            nZStarDocuments,
            nZStarIntervals);
}
/**
 * A collection of metrics about the currently built {@link PredicateIndex}.
 * All values are read from the builders when the object is constructed.
 */
public static class PredicateIndexStats {

    // Metric name -> value. A TreeMap, so metrics are iterated sorted by name.
    private final Map<String, Object> metrics = new TreeMap<>();

    /**
     * Reads the metrics from the given builders.
     *
     * @param zeroConstraintDocuments the zero-constraint documents; only the size is used
     * @param intervalIndex builder of the interval index
     * @param boundsIndex builder of the bounds index
     * @param intervalStore builder of the interval store
     * @param conjunctionIndex builder of the conjunction index
     * @param nZStarDocuments number of documents with ZStar intervals
     * @param nZStarIntervals total number of ZStar intervals
     */
    public PredicateIndexStats(
            List<Integer> zeroConstraintDocuments,
            SimpleIndex.Builder intervalIndex,
            SimpleIndex.Builder boundsIndex,
            PredicateIntervalStore.Builder intervalStore,
            ConjunctionIndexBuilder conjunctionIndex,
            int nZStarDocuments,
            int nZStarIntervals) {
        metrics.put("Zero-constraint documents", zeroConstraintDocuments.size());
        metrics.put("Interval index keys", intervalIndex.getKeyCount());
        metrics.put("Interval index entries", intervalIndex.getEntryCount());
        metrics.put("Bounds index keys", boundsIndex.getKeyCount());
        metrics.put("Bounds index entries", boundsIndex.getEntryCount());
        metrics.put("Conjunction index feature count", conjunctionIndex.calculateFeatureCount());
        metrics.put("Conjunction index unique conjunction count", conjunctionIndex.getUniqueConjunctionCount());
        metrics.put("Conjunction index conjunction count", conjunctionIndex.getConjunctionsSeen());
        metrics.put("Conjunction index Z list size", conjunctionIndex.getZListSize());
        metrics.put("Interval store cache hits", intervalStore.getCacheHits());
        metrics.put("Interval store insert count", intervalStore.getTotalInserts());
        metrics.put("Interval store interval count", intervalStore.getNumberOfIntervals());
        metrics.put("Documents with ZStar intervals", nZStarDocuments);
        metrics.put("Total ZStar intervals", nZStarIntervals);
        // One metric per interval-list size; the entry for size 0 is left out.
        intervalStore.getEntriesForSize().forEach((size, count) -> {
            if (size != 0) {
                metrics.put("Size " + size + " intervals", count);
            }
        });
    }

    /**
     * Copies all metrics into the given map.
     *
     * @param valueMap the map receiving the metric name/value pairs
     */
    public void putValues(Map<String, Object> valueMap) {
        valueMap.putAll(metrics);
    }

    /** Returns one line per metric, with the name right-aligned in a 50 character wide column. */
    @Override
    public String toString() {
        return metrics.entrySet().stream()
                .map(e -> String.format("%50s: %s", e.getKey(), e.getValue()))
                .collect(joining("\n"));
    }
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy