All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.lucene.search.TermInSetQuery Maven / Gradle / Ivy

There is a newer version: 6.4.2_1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collection;
import java.util.Collections;
import java.util.SortedSet;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.*;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;

/**
 * Specialization for a disjunction over many terms that, by default, behaves like a {@link
 * ConstantScoreQuery} over a {@link BooleanQuery} containing only {@link
 * org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
 *
 * 

For instance in the following example, both {@code q1} and {@code q2} would yield the same * scores: * *

 * Query q1 = new TermInSetQuery("field", new BytesRef("foo"), new BytesRef("bar"));
 *
 * BooleanQuery bq = new BooleanQuery();
 * bq.add(new TermQuery(new Term("field", "foo")), Occur.SHOULD);
 * bq.add(new TermQuery(new Term("field", "bar")), Occur.SHOULD);
 * Query q2 = new ConstantScoreQuery(bq);
 * 
* *

Unless a custom {@link MultiTermQuery.RewriteMethod} is provided, this query executes like a * regular disjunction where there are few terms. However, when there are many terms, instead of * merging iterators on the fly, it will populate a bit set with matching docs for the least-costly * terms and maintain a size-limited set of more costly iterators that are merged on the fly. For * more details, see {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE}. * *

Users may also provide a custom {@link MultiTermQuery.RewriteMethod} to define different * execution behavior, such as relying on doc values (see: {@link * MultiTermQuery#DOC_VALUES_REWRITE}), or if scores are required (see: {@link * MultiTermQuery#SCORING_BOOLEAN_REWRITE}). See {@link MultiTermQuery} documentation for more * rewrite options. * *

NOTE: This query produces scores that are equal to its boost */ public class TermInSetQuery extends MultiTermQuery implements Accountable { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermInSetQuery.class); private final String field; private final PrefixCodedTerms termData; private final int termDataHashCode; // cached hashcode of termData public TermInSetQuery(String field, Collection terms) { this(field, packTerms(field, terms)); } /** Creates a new {@link TermInSetQuery} from the given collection of terms. */ public TermInSetQuery(RewriteMethod rewriteMethod, String field, Collection terms) { super(field, rewriteMethod); this.field = field; this.termData = packTerms(field, terms); termDataHashCode = termData.hashCode(); } private TermInSetQuery(String field, PrefixCodedTerms termData) { super(field, MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE); this.field = field; this.termData = termData; termDataHashCode = termData.hashCode(); } private static PrefixCodedTerms packTerms(String field, Collection terms) { BytesRef[] sortedTerms = terms.toArray(new BytesRef[0]); // already sorted if we are a SortedSet with natural order boolean sorted = terms instanceof SortedSet && ((SortedSet) terms).comparator() == null; if (sorted == false) { new StringSorter(BytesRefComparator.NATURAL) { @Override protected void get(BytesRefBuilder builder, BytesRef result, int i) { BytesRef term = sortedTerms[i]; result.length = term.length; result.offset = term.offset; result.bytes = term.bytes; } @Override protected void swap(int i, int j) { BytesRef b = sortedTerms[i]; sortedTerms[i] = sortedTerms[j]; sortedTerms[j] = b; } }.sort(0, sortedTerms.length); } PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); BytesRefBuilder previous = null; for (BytesRef term : sortedTerms) { if (previous == null) { previous = new BytesRefBuilder(); } else if (previous.get().equals(term)) { continue; // deduplicate } builder.add(field, term); 
previous.copyBytes(term); } return builder.finish(); } @Override public long getTermsCount() { return termData.size(); } /** * Get an iterator over the encoded terms for query inspection. * * @lucene.experimental */ public BytesRefIterator getBytesRefIterator() { final TermIterator iterator = this.termData.iterator(); return () -> iterator.next(); } @Override public void visit(QueryVisitor visitor) { if (visitor.acceptField(field) == false) { return; } if (termData.size() == 1) { visitor.consumeTerms(this, new Term(field, termData.iterator().next())); } if (termData.size() > 1) { visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton); } } // TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery // we won't have to do this (see GH#12176). private ByteRunAutomaton asByteRunAutomaton() { try { Automaton a = Automata.makeBinaryStringUnion(termData.iterator()); return new ByteRunAutomaton(a, true); } catch (IOException e) { // Shouldn't happen since termData.iterator() provides an interator implementation that // never throws: throw new UncheckedIOException(e); } } @Override public boolean equals(Object other) { return sameClassAs(other) && equalsTo(getClass().cast(other)); } private boolean equalsTo(TermInSetQuery other) { // no need to check 'field' explicitly since it is encoded in 'termData' // termData might be heavy to compare so check the hash code first return termDataHashCode == other.termDataHashCode && termData.equals(other.termData); } @Override public int hashCode() { return 31 * classHash() + termDataHashCode; } @Override public String toString(String defaultField) { StringBuilder builder = new StringBuilder(); builder.append(field); builder.append(":("); TermIterator iterator = termData.iterator(); boolean first = true; for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { if (!first) { builder.append(' '); } first = false; builder.append(Term.toString(term)); } 
builder.append(')'); return builder.toString(); } @Override public long ramBytesUsed() { return BASE_RAM_BYTES_USED + termData.ramBytesUsed(); } @Override public Collection getChildResources() { return Collections.emptyList(); } @Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { return new SetEnum(terms.iterator()); } /** * Like a baby {@link org.apache.lucene.index.AutomatonTermsEnum}, ping-pong intersects the terms * dict against our encoded query terms. */ private class SetEnum extends FilteredTermsEnum { private final TermIterator iterator; private BytesRef seekTerm; SetEnum(TermsEnum termsEnum) { super(termsEnum); iterator = termData.iterator(); seekTerm = iterator.next(); } @Override protected AcceptStatus accept(BytesRef term) throws IOException { // next() our iterator until it is >= the incoming term // if it matches exactly, it's a hit, otherwise it's a miss int cmp = 0; while (seekTerm != null && (cmp = seekTerm.compareTo(term)) < 0) { seekTerm = iterator.next(); } if (seekTerm == null) { return AcceptStatus.END; } else if (cmp == 0) { return AcceptStatus.YES_AND_SEEK; } else { return AcceptStatus.NO_AND_SEEK; } } @Override protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException { // next() our iterator until it is > the currentTerm, must always make progress. if (currentTerm == null) { return seekTerm; } while (seekTerm != null && seekTerm.compareTo(currentTerm) <= 0) { seekTerm = iterator.next(); } return seekTerm; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy