org.apache.lucene.search.TermInSetQuery Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.servicemix.bundles.lucene
This OSGi bundle wraps ${pkgArtifactId} ${pkgVersion} jar file.
There is a newer version: 6.4.2_1
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collection;
import java.util.Collections;
import java.util.SortedSet;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.*;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;

/**
 * Specialization for a disjunction over many terms that, by default, behaves like a {@link
 * ConstantScoreQuery} over a {@link BooleanQuery} containing only {@link
 * org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
 *
 * <p>For instance in the following example, both {@code q1} and {@code q2} would yield the same
 * scores:
 *
 * <pre class="prettyprint">
 * Query q1 = new TermInSetQuery("field", List.of(new BytesRef("foo"), new BytesRef("bar")));
 *
 * BooleanQuery bq = new BooleanQuery();
 * bq.add(new TermQuery(new Term("field", "foo")), Occur.SHOULD);
 * bq.add(new TermQuery(new Term("field", "bar")), Occur.SHOULD);
 * Query q2 = new ConstantScoreQuery(bq);
 * </pre>
 *
 * <p>Unless a custom {@link MultiTermQuery.RewriteMethod} is provided, this query executes like a
 * regular disjunction where there are few terms. However, when there are many terms, instead of
 * merging iterators on the fly, it will populate a bit set with matching docs for the least-costly
 * terms and maintain a size-limited set of more costly iterators that are merged on the fly. For
 * more details, see {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE}.
 *
 * <p>Users may also provide a custom {@link MultiTermQuery.RewriteMethod} to define different
 * execution behavior, such as relying on doc values (see: {@link
 * MultiTermQuery#DOC_VALUES_REWRITE}), or if scores are required (see: {@link
 * MultiTermQuery#SCORING_BOOLEAN_REWRITE}). See {@link MultiTermQuery} documentation for more
 * rewrite options.
 *
 * <p>NOTE: This query produces scores that are equal to its boost
 */
public class TermInSetQuery extends MultiTermQuery implements Accountable {

  /** Shallow size of one instance; the fixed part of {@link #ramBytesUsed()}. */
  private static final long BASE_RAM_BYTES_USED =
      RamUsageEstimator.shallowSizeOfInstance(TermInSetQuery.class);

  /** Name of the field every term of this query belongs to. */
  private final String field;

  /** The query terms, sorted and de-duplicated by {@link #packTerms}, in encoded form. */
  private final PrefixCodedTerms termData;

  private final int termDataHashCode; // cached hashcode of termData

  /**
   * Creates a new {@link TermInSetQuery} from the given collection of terms, using {@link
   * MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE} as the rewrite method.
   *
   * @param field the field the terms belong to
   * @param terms the terms to match; need not be sorted and may contain duplicates
   */
  public TermInSetQuery(String field, Collection<BytesRef> terms) {
    this(field, packTerms(field, terms));
  }

  /**
   * Creates a new {@link TermInSetQuery} from the given collection of terms.
   *
   * @param rewriteMethod the rewrite method handed to {@link MultiTermQuery}
   * @param field the field the terms belong to
   * @param terms the terms to match; need not be sorted and may contain duplicates
   */
  public TermInSetQuery(RewriteMethod rewriteMethod, String field, Collection<BytesRef> terms) {
    super(field, rewriteMethod);
    this.field = field;
    this.termData = packTerms(field, terms);
    termDataHashCode = termData.hashCode();
  }

  /** Shared tail of the default-rewrite constructor; {@code termData} is already packed. */
  private TermInSetQuery(String field, PrefixCodedTerms termData) {
    super(field, MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE);
    this.field = field;
    this.termData = termData;
    termDataHashCode = termData.hashCode();
  }

  /**
   * Sorts the given terms (unless they already come from a naturally ordered {@link SortedSet}),
   * drops duplicates and encodes the result as {@link PrefixCodedTerms} for {@code field}.
   *
   * @param field the field every term is registered under
   * @param terms the terms to encode
   * @return the encoded, sorted, duplicate-free terms
   */
  private static PrefixCodedTerms packTerms(String field, Collection<BytesRef> terms) {
    // Work on an array copy so sorting never reorders the caller's collection.
    BytesRef[] sortedTerms = terms.toArray(new BytesRef[0]);
    // already sorted if we are a SortedSet with natural order
    boolean sorted =
        terms instanceof SortedSet && ((SortedSet<BytesRef>) terms).comparator() == null;
    if (sorted == false) {
      new StringSorter(BytesRefComparator.NATURAL) {

        @Override
        protected void get(BytesRefBuilder builder, BytesRef result, int i) {
          // Point 'result' at the i-th term's backing bytes; nothing is copied.
          BytesRef term = sortedTerms[i];
          result.length = term.length;
          result.offset = term.offset;
          result.bytes = term.bytes;
        }

        @Override
        protected void swap(int i, int j) {
          BytesRef b = sortedTerms[i];
          sortedTerms[i] = sortedTerms[j];
          sortedTerms[j] = b;
        }
      }.sort(0, sortedTerms.length);
    }
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    // Copy of the last term added; created lazily so the first term is always added.
    BytesRefBuilder previous = null;
    for (BytesRef term : sortedTerms) {
      if (previous == null) {
        previous = new BytesRefBuilder();
      } else if (previous.get().equals(term)) {
        continue; // deduplicate
      }
      builder.add(field, term);
      previous.copyBytes(term);
    }

    return builder.finish();
  }

  /** Returns the size of the encoded term set. */
  @Override
  public long getTermsCount() {
    return termData.size();
  }

  /**
   * Get an iterator over the encoded terms for query inspection.
   *
   * @lucene.experimental
   */
  public BytesRefIterator getBytesRefIterator() {
    final TermIterator iterator = this.termData.iterator();
    return iterator::next;
  }

  /**
   * Reports this query's terms to {@code visitor}, provided the visitor accepts the field. A
   * single term is reported directly; several terms are reported as an automaton supplier; an
   * empty term set reports nothing.
   */
  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(field) == false) {
      return;
    }
    if (termData.size() == 1) {
      visitor.consumeTerms(this, new Term(field, termData.iterator().next()));
    } else if (termData.size() > 1) {
      visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton);
    }
  }

  // TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery
  // we won't have to do this (see GH#12176).
  /** Builds a {@link ByteRunAutomaton} from the union of all encoded terms. */
  private ByteRunAutomaton asByteRunAutomaton() {
    try {
      Automaton a = Automata.makeBinaryStringUnion(termData.iterator());
      return new ByteRunAutomaton(a, true);
    } catch (IOException e) {
      // Shouldn't happen since termData.iterator() provides an iterator implementation that
      // never throws:
      throw new UncheckedIOException(e);
    }
  }

  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) && equalsTo(getClass().cast(other));
  }

  /** Compares the encoded terms of two queries, using the cached hash code as a fast path. */
  private boolean equalsTo(TermInSetQuery other) {
    // no need to check 'field' explicitly since it is encoded in 'termData'
    // termData might be heavy to compare so check the hash code first
    // NOTE(review): the rewrite method is not part of this comparison (nor of hashCode()) --
    // confirm that is intended.
    return termDataHashCode == other.termDataHashCode && termData.equals(other.termData);
  }

  @Override
  public int hashCode() {
    return 31 * classHash() + termDataHashCode;
  }

  /**
   * Renders the query as {@code field:(term1 term2 ...)}. The field is always printed;
   * {@code defaultField} is not consulted.
   */
  @Override
  public String toString(String defaultField) {
    StringBuilder builder = new StringBuilder();
    builder.append(field);
    builder.append(":(");

    TermIterator iterator = termData.iterator();
    boolean first = true;
    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
      if (!first) {
        builder.append(' ');
      }
      first = false;
      builder.append(Term.toString(term));
    }
    builder.append(')');

    return builder.toString();
  }

  /** Returns the shallow instance size plus the memory reported by the encoded terms. */
  @Override
  public long ramBytesUsed() {
    return BASE_RAM_BYTES_USED + termData.ramBytesUsed();
  }

  /** Always returns an empty list. */
  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.emptyList();
  }

  /**
   * Returns an enum that filters {@code terms} down to those contained in this query.
   * {@code atts} is not used.
   */
  @Override
  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    return new SetEnum(terms.iterator());
  }

  /**
   * Like a baby {@link org.apache.lucene.index.AutomatonTermsEnum}, ping-pong intersects the terms
   * dict against our encoded query terms.
   */
  private class SetEnum extends FilteredTermsEnum {
    /** Cursor over the enclosing query's encoded terms. */
    private final TermIterator iterator;

    /** Current query term under the cursor; {@code null} once the cursor is exhausted. */
    private BytesRef seekTerm;

    SetEnum(TermsEnum termsEnum) {
      super(termsEnum);
      iterator = termData.iterator();
      seekTerm = iterator.next();
    }

    @Override
    protected AcceptStatus accept(BytesRef term) throws IOException {
      // next() our iterator until it is >= the incoming term
      // if it matches exactly, it's a hit, otherwise it's a miss
      int cmp = 0;
      while (seekTerm != null && (cmp = seekTerm.compareTo(term)) < 0) {
        seekTerm = iterator.next();
      }
      if (seekTerm == null) {
        // No query terms left, so nothing further can match.
        return AcceptStatus.END;
      } else if (cmp == 0) {
        return AcceptStatus.YES_AND_SEEK;
      } else {
        return AcceptStatus.NO_AND_SEEK;
      }
    }

    @Override
    protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
      // next() our iterator until it is > the currentTerm, must always make progress.
      if (currentTerm == null) {
        return seekTerm;
      }
      while (seekTerm != null && seekTerm.compareTo(currentTerm) <= 0) {
        seekTerm = iterator.next();
      }
      return seekTerm;
    }
  }
}