All downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.lucene.sandbox.search.DocValuesTermsQuery Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.sandbox.search;

import java.io.IOException;
import java.util.AbstractList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Objects;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * A {@link Query} that only accepts documents whose term value in the specified field is contained
 * in the provided set of allowed terms.
 *
 * 

This is the same functionality as TermsQuery (from queries/), but because of drastically * different implementations, they also have different performance characteristics, as described * below. * *

NOTE: be very careful using this query: it is typically much slower than using {@code * TermsQuery}, but in certain specialized cases may be faster. * *

With each search, this query translates the specified set of Terms into a private {@link * LongBitSet} keyed by term number per unique {@link IndexReader} (normally one reader per * segment). Then, during matching, the term number for each docID is retrieved from the cache and * then checked for inclusion using the {@link LongBitSet}. Since all testing is done using RAM * resident data structures, performance should be very fast, most likely fast enough to not require * further caching of the DocIdSet for each possible combination of terms. However, because docIDs * are simply scanned linearly, an index with a great many small documents may find this linear scan * too costly. * *

In contrast, TermsQuery builds up an {@link FixedBitSet}, keyed by docID, every time it's * created, by enumerating through all matching docs using {@link * org.apache.lucene.index.PostingsEnum} to seek and scan through each term's docID list. While * there is no linear scan of all docIDs, besides the allocation of the underlying array in the * {@link FixedBitSet}, this approach requires a number of "disk seeks" in proportion to the number * of terms, which can be exceptionally costly when there are cache misses in the OS's IO cache. * *

Generally, this filter will be slower on the first invocation for a given field, but * subsequent invocations, even if you change the allowed set of Terms, should be faster than * TermsQuery, especially as the number of Terms being matched increases. If you are matching only a * very small number of terms, and those terms in turn match a very small number of documents, * TermsQuery may perform faster. * *

Which query is best is very application dependent. * * @lucene.experimental */ public class DocValuesTermsQuery extends Query implements Accountable { private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(DocValuesTermsQuery.class); private final String field; private final PrefixCodedTerms termData; private final int termDataHashCode; // cached hashcode of termData public DocValuesTermsQuery(String field, Collection terms) { this.field = Objects.requireNonNull(field); Objects.requireNonNull(terms, "Collection of terms must not be null"); BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]); ArrayUtil.timSort(sortedTerms); PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); BytesRef previous = null; for (BytesRef term : sortedTerms) { if (term.equals(previous) == false) { builder.add(field, term); } previous = term; } termData = builder.finish(); termDataHashCode = termData.hashCode(); } public DocValuesTermsQuery(String field, BytesRef... terms) { this(field, Arrays.asList(terms)); } public DocValuesTermsQuery(String field, String... 
terms) { this( field, new AbstractList() { @Override public BytesRef get(int index) { return new BytesRef(terms[index]); } @Override public int size() { return terms.length; } }); } @Override public boolean equals(Object other) { return sameClassAs(other) && equalsTo(getClass().cast(other)); } private boolean equalsTo(DocValuesTermsQuery other) { // termData might be heavy to compare so check the hash code first return termDataHashCode == other.termDataHashCode && termData.equals(other.termData); } @Override public int hashCode() { return 31 * classHash() + termDataHashCode; } @Override public String toString(String defaultField) { StringBuilder builder = new StringBuilder(); boolean first = true; TermIterator iterator = termData.iterator(); for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { if (!first) { builder.append(' '); } first = false; builder.append(new Term(iterator.field(), term).toString()); } return builder.toString(); } /** @return the name of the field searched by this query. */ public String getField() { return field; } /** @return the terms looked up by this query, prefix-encoded. 
*/ public PrefixCodedTerms getTerms() { return termData; } @Override public long ramBytesUsed() { return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(field) + RamUsageEstimator.sizeOfObject(termData); } @Override public void visit(QueryVisitor visitor) { if (visitor.acceptField(field)) { visitor.visitLeaf(this); } } @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { return new ConstantScoreWeight(this, boost) { @Override public Scorer scorer(LeafReaderContext context) throws IOException { final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field); final LongBitSet bits = new LongBitSet(values.getValueCount()); boolean matchesAtLeastOneTerm = false; TermIterator iterator = termData.iterator(); for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { final long ord = values.lookupTerm(term); if (ord >= 0) { matchesAtLeastOneTerm = true; bits.set(ord); } } if (matchesAtLeastOneTerm == false) { return null; } return new ConstantScoreScorer( this, score(), scoreMode, new TwoPhaseIterator(values) { @Override public boolean matches() throws IOException { for (int i = 0; i < values.docValueCount(); i++) { if (bits.get(values.nextOrd())) { return true; } } return false; } @Override public float matchCost() { return 3; // lookup in a bitset } }); } @Override public boolean isCacheable(LeafReaderContext ctx) { return DocValues.isCacheable(ctx, field); } }; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy