// org.apache.lucene.sandbox.search.DocValuesTermsQuery (from the lucene-sandbox module)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.search;
import java.io.IOException;
import java.util.AbstractList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Objects;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.RamUsageEstimator;
/**
 * A {@link Query} that only accepts documents whose term value in the specified field is contained
 * in the provided set of allowed terms.
 *
 * <p>This is the same functionality as TermsQuery (from queries/), but because of drastically
 * different implementations, they also have different performance characteristics, as described
 * below.
 *
 * <p>NOTE: be very careful using this query: it is typically much slower than using {@code
 * TermsQuery}, but in certain specialized cases may be faster.
 *
 * <p>With each search, this query translates the specified set of Terms into a private {@link
 * LongBitSet} keyed by term number per unique {@link IndexReader} (normally one reader per
 * segment). Then, during matching, the term number for each docID is retrieved from the cache and
 * then checked for inclusion using the {@link LongBitSet}. Since all testing is done using RAM
 * resident data structures, performance should be very fast, most likely fast enough to not require
 * further caching of the DocIdSet for each possible combination of terms. However, because docIDs
 * are simply scanned linearly, an index with a great many small documents may find this linear scan
 * too costly.
 *
 * <p>In contrast, TermsQuery builds up a {@link FixedBitSet}, keyed by docID, every time it's
 * created, by enumerating through all matching docs using {@link
 * org.apache.lucene.index.PostingsEnum} to seek and scan through each term's docID list. While
 * there is no linear scan of all docIDs, besides the allocation of the underlying array in the
 * {@link FixedBitSet}, this approach requires a number of "disk seeks" in proportion to the number
 * of terms, which can be exceptionally costly when there are cache misses in the OS's IO cache.
 *
 * <p>Generally, this filter will be slower on the first invocation for a given field, but
 * subsequent invocations, even if you change the allowed set of Terms, should be faster than
 * TermsQuery, especially as the number of Terms being matched increases. If you are matching only a
 * very small number of terms, and those terms in turn match a very small number of documents,
 * TermsQuery may perform faster.
 *
 * <p>Which query is best is very application dependent.
 *
 * @lucene.experimental
 */
public class DocValuesTermsQuery extends Query implements Accountable {

  private static final long BASE_RAM_BYTES =
      RamUsageEstimator.shallowSizeOfInstance(DocValuesTermsQuery.class);

  private final String field;
  private final PrefixCodedTerms termData;
  private final int termDataHashCode; // cached hashcode of termData

  /**
   * Creates a new {@link DocValuesTermsQuery} matching documents whose sorted-set doc values for
   * {@code field} contain at least one of the given terms.
   *
   * @param field the doc-values field to look up; must not be null
   * @param terms the set of allowed term values; must not be null. Duplicates are removed.
   * @throws NullPointerException if {@code field} or {@code terms} is null
   */
  public DocValuesTermsQuery(String field, Collection<BytesRef> terms) {
    this.field = Objects.requireNonNull(field);
    Objects.requireNonNull(terms, "Collection of terms must not be null");
    // Sort and de-duplicate the terms so they can be compactly prefix-coded in order.
    BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
    ArrayUtil.timSort(sortedTerms);
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    BytesRef previous = null;
    for (BytesRef term : sortedTerms) {
      if (term.equals(previous) == false) {
        builder.add(field, term);
      }
      previous = term;
    }
    termData = builder.finish();
    // PrefixCodedTerms may be large; cache the hash once so equals/hashCode stay cheap.
    termDataHashCode = termData.hashCode();
  }

  /** Creates a new {@link DocValuesTermsQuery} from the given array of terms. */
  public DocValuesTermsQuery(String field, BytesRef... terms) {
    this(field, Arrays.asList(terms));
  }

  /**
   * Creates a new {@link DocValuesTermsQuery} from the given string terms, converting each to a
   * {@link BytesRef} lazily (no intermediate copy of the array is made).
   */
  public DocValuesTermsQuery(String field, String... terms) {
    this(
        field,
        new AbstractList<BytesRef>() {
          @Override
          public BytesRef get(int index) {
            return new BytesRef(terms[index]);
          }

          @Override
          public int size() {
            return terms.length;
          }
        });
  }

  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) && equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(DocValuesTermsQuery other) {
    // termData might be heavy to compare so check the cached hash code first
    return termDataHashCode == other.termDataHashCode && termData.equals(other.termData);
  }

  @Override
  public int hashCode() {
    return 31 * classHash() + termDataHashCode;
  }

  @Override
  public String toString(String defaultField) {
    StringBuilder builder = new StringBuilder();
    boolean first = true;
    TermIterator iterator = termData.iterator();
    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
      if (!first) {
        builder.append(' ');
      }
      first = false;
      builder.append(new Term(iterator.field(), term).toString());
    }
    return builder.toString();
  }

  /** @return the name of the field searched by this query. */
  public String getField() {
    return field;
  }

  /** @return the terms looked up by this query, prefix-encoded. */
  public PrefixCodedTerms getTerms() {
    return termData;
  }

  @Override
  public long ramBytesUsed() {
    return BASE_RAM_BYTES
        + RamUsageEstimator.sizeOfObject(field)
        + RamUsageEstimator.sizeOfObject(termData);
  }

  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(field)) {
      visitor.visitLeaf(this);
    }
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new ConstantScoreWeight(this, boost) {
      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field);
        // Translate the query terms into segment-local ordinals, collected in a bit set.
        final LongBitSet bits = new LongBitSet(values.getValueCount());
        boolean matchesAtLeastOneTerm = false;
        TermIterator iterator = termData.iterator();
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
          final long ord = values.lookupTerm(term);
          if (ord >= 0) {
            matchesAtLeastOneTerm = true;
            bits.set(ord);
          }
        }
        if (matchesAtLeastOneTerm == false) {
          // None of the query terms exist in this segment: nothing can match.
          return null;
        }
        return new ConstantScoreScorer(
            this,
            score(),
            scoreMode,
            new TwoPhaseIterator(values) {
              @Override
              public boolean matches() throws IOException {
                // A doc matches if any of its ordinals is in the allowed set.
                for (int i = 0; i < values.docValueCount(); i++) {
                  if (bits.get(values.nextOrd())) {
                    return true;
                  }
                }
                return false;
              }

              @Override
              public float matchCost() {
                return 3; // lookup in a bitset
              }
            });
      }

      @Override
      public boolean isCacheable(LeafReaderContext ctx) {
        return DocValues.isCacheable(ctx, field);
      }
    };
  }
}