org.apache.lucene.search.PhraseQuery
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOSupplier;

/**
 * A Query that matches documents containing a particular sequence of terms. A PhraseQuery is built
 * by QueryParser for input like "new york".
 *
 * <p>This query may be combined with other terms or queries with a {@link BooleanQuery}.
 *
 * <p>NOTE: All terms in the phrase must match, even those at the same position. If you have
 * terms at the same position, perhaps synonyms, you probably want {@link MultiPhraseQuery} instead,
 * which only requires one term at a position to match.
 *
 * <p>Also, leading holes don't have any particular meaning for this query and will be ignored. For
 * instance this query:
 *
 * <pre class="prettyprint">
 * PhraseQuery.Builder builder = new PhraseQuery.Builder();
 * builder.add(new Term("body", "one"), 4);
 * builder.add(new Term("body", "two"), 5);
 * PhraseQuery pq = builder.build();
 * </pre>
 *
 * is equivalent to the below query:
 *
 * <pre class="prettyprint">
 * PhraseQuery.Builder builder = new PhraseQuery.Builder();
 * builder.add(new Term("body", "one"), 0);
 * builder.add(new Term("body", "two"), 1);
 * PhraseQuery pq = builder.build();
 * </pre>
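 *
 * <p>As an illustrative sketch (the field and term values below are arbitrary, not part of the
 * examples above), such a phrase can be combined with other clauses through a {@link BooleanQuery}:
 *
 * <pre class="prettyprint">
 * Query phrase = new PhraseQuery("body", "new", "york");
 * Query combined = new BooleanQuery.Builder()
 *     .add(phrase, BooleanClause.Occur.MUST)
 *     .add(new TermQuery(new Term("category", "travel")), BooleanClause.Occur.FILTER)
 *     .build();
 * </pre>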
 */
public class PhraseQuery extends Query {

  /** A builder for phrase queries. */
  public static class Builder {

    private int slop;
    private final List<Term> terms;
    private final IntArrayList positions;

    /** Sole constructor. */
    public Builder() {
      slop = 0;
      terms = new ArrayList<>();
      positions = new IntArrayList();
    }

    /**
     * Set the slop.
     *
     * @see PhraseQuery#getSlop()
     */
    public Builder setSlop(int slop) {
      this.slop = slop;
      return this;
    }

    /**
     * Adds a term to the end of the query phrase. The relative position of the term is the one
     * immediately after the last term added.
     */
    public Builder add(Term term) {
      return add(term, positions.isEmpty() ? 0 : 1 + positions.get(positions.size() - 1));
    }

    /**
     * Adds a term to the end of the query phrase. The relative position of the term within the
     * phrase is specified explicitly, but must be greater than or equal to that of the previously
     * added term. A greater position allows phrases with gaps (e.g. in connection with stopwords).
     * If the position is equal, you most likely should be using {@link MultiPhraseQuery} instead
     * which only requires one term at each position to match; this class requires all of them.
     */
    public Builder add(Term term, int position) {
      Objects.requireNonNull(term, "Cannot add a null term to PhraseQuery");
      if (position < 0) {
        throw new IllegalArgumentException("Positions must be >= 0, got " + position);
      }
      if (positions.isEmpty() == false) {
        final int lastPosition = positions.get(positions.size() - 1);
        if (position < lastPosition) {
          throw new IllegalArgumentException(
              "Positions must be added in order, got " + position + " after " + lastPosition);
        }
      }
      if (terms.isEmpty() == false && term.field().equals(terms.get(0).field()) == false) {
        throw new IllegalArgumentException(
            "All terms must be on the same field, got "
                + term.field()
                + " and "
                + terms.get(0).field());
      }
      terms.add(term);
      positions.add(position);
      return this;
    }

    /** Build a phrase query based on the terms that have been added. */
    public PhraseQuery build() {
      Term[] terms = this.terms.toArray(new Term[0]);
      return new PhraseQuery(slop, terms, positions.toArray());
    }
  }
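
  // Illustrative sketch only (field and terms are arbitrary): the Builder above can also express
  // a phrase with an explicit gap, e.g. where a stopword was removed at indexing time. Leading
  // holes are ignored, so positions 1 and 3 behave like 0 and 2:
  //
  //   PhraseQuery gapped = new PhraseQuery.Builder()
  //       .add(new Term("body", "quick"), 1)
  //       .add(new Term("body", "fox"), 3)
  //       .build();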

  private final int slop;
  private final String field;
  private final Term[] terms;
  private final int[] positions;

  private PhraseQuery(int slop, Term[] terms, int[] positions) {
    if (terms.length != positions.length) {
      throw new IllegalArgumentException("Must have as many terms as positions");
    }
    if (slop < 0) {
      throw new IllegalArgumentException("Slop must be >= 0, got " + slop);
    }
    for (Term term : terms) {
      Objects.requireNonNull(term, "Cannot add a null term to PhraseQuery");
    }
    for (int i = 1; i < terms.length; ++i) {
      if (terms[i - 1].field().equals(terms[i].field()) == false) {
        throw new IllegalArgumentException("All terms should have the same field");
      }
    }
    for (int position : positions) {
      if (position < 0) {
        throw new IllegalArgumentException("Positions must be >= 0, got " + position);
      }
    }
    for (int i = 1; i < positions.length; ++i) {
      if (positions[i] < positions[i - 1]) {
        throw new IllegalArgumentException(
            "Positions should not go backwards, got "
                + positions[i - 1]
                + " before "
                + positions[i]);
      }
    }
    this.slop = slop;
    this.terms = terms;
    this.positions = positions;
    this.field = terms.length == 0 ? null : terms[0].field();
  }

  private static int[] incrementalPositions(int length) {
    int[] positions = new int[length];
    for (int i = 0; i < length; ++i) {
      positions[i] = i;
    }
    return positions;
  }

  private static Term[] toTerms(String field, String... termStrings) {
    Term[] terms = new Term[termStrings.length];
    for (int i = 0; i < terms.length; ++i) {
      Objects.requireNonNull(termStrings[i], "Cannot add a null term to PhraseQuery");
      terms[i] = new Term(field, termStrings[i]);
    }
    return terms;
  }

  private static Term[] toTerms(String field, BytesRef... termBytes) {
    Term[] terms = new Term[termBytes.length];
    for (int i = 0; i < terms.length; ++i) {
      Objects.requireNonNull(termBytes[i], "Cannot add a null term to PhraseQuery");
      terms[i] = new Term(field, termBytes[i]);
    }
    return terms;
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}, and at a maximum edit distance of {@code slop}. For
   * more complicated use-cases, use {@link PhraseQuery.Builder}.
   *
   * @see #getSlop()
   */
  public PhraseQuery(int slop, String field, String... terms) {
    this(slop, toTerms(field, terms), incrementalPositions(terms.length));
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}.
   */
  public PhraseQuery(String field, String... terms) {
    this(0, field, terms);
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}, and at a maximum edit distance of {@code slop}. For
   * more complicated use-cases, use {@link PhraseQuery.Builder}.
   *
   * @see #getSlop()
   */
  public PhraseQuery(int slop, String field, BytesRef... terms) {
    this(slop, toTerms(field, terms), incrementalPositions(terms.length));
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}.
   */
  public PhraseQuery(String field, BytesRef... terms) {
    this(0, field, terms);
  }

  /**
   * Return the slop for this {@link PhraseQuery}.
   *
   * <p>The slop is an edit distance between respective positions of terms as defined in this
   * {@link PhraseQuery} and the positions of terms in a document.
   *
   * <p>For instance, when searching for {@code "quick fox"}, it is expected that the difference
   * between the positions of {@code fox} and {@code quick} is 1. So {@code "a quick brown fox"}
   * would be at an edit distance of 1 since the difference of the positions of {@code fox} and
   * {@code quick} is 2. Similarly, {@code "the fox is quick"} would be at an edit distance of 3
   * since the difference of the positions of {@code fox} and {@code quick} is -2. The slop defines
   * the maximum edit distance for a document to match.
   *
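   * <p>For example (an illustrative sketch; the field and terms are arbitrary), a slop of 1
   * tolerates one extra position between the two terms:
   *
   * <pre class="prettyprint">
   * // matches "quick fox" and "quick brown fox", but not "quick red brown fox"
   * PhraseQuery pq = new PhraseQuery(1, "body", "quick", "fox");
   * </pre>
   *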
   * <p>More exact matches are scored higher than sloppier matches, thus search results are sorted
   * by exactness.
   */
  public int getSlop() {
    return slop;
  }

  /** Returns the field this query applies to */
  public String getField() {
    return field;
  }

  /** Returns the list of terms in this phrase. */
  public Term[] getTerms() {
    return terms;
  }

  /** Returns the relative positions of terms in this phrase. */
  public int[] getPositions() {
    return positions;
  }

  @Override
  public Query rewrite(IndexSearcher indexSearcher) throws IOException {
    if (terms.length == 0) {
      return new MatchNoDocsQuery("empty PhraseQuery");
    } else if (terms.length == 1) {
      return new TermQuery(terms[0]);
    } else if (positions[0] != 0) {
      int[] newPositions = new int[positions.length];
      for (int i = 0; i < positions.length; ++i) {
        newPositions[i] = positions[i] - positions[0];
      }
      return new PhraseQuery(slop, terms, newPositions);
    } else {
      return super.rewrite(indexSearcher);
    }
  }

  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(field) == false) {
      return;
    }
    QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.MUST, this);
    v.consumeTerms(this, terms);
  }

  /**
   * Term postings and position information for phrase matching
   *
   * @lucene.internal
   */
  public static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
    final PostingsEnum postings;
    final ImpactsEnum impacts;
    final int position;
    final Term[] terms;
    final int nTerms; // for faster comparisons

    /** Creates PostingsAndFreq instance */
    public PostingsAndFreq(
        PostingsEnum postings, ImpactsEnum impacts, int position, Term... terms) {
      this.postings = postings;
      this.impacts = impacts;
      this.position = position;
      nTerms = terms == null ? 0 : terms.length;
      if (nTerms > 0) {
        if (terms.length == 1) {
          this.terms = terms;
        } else {
          Term[] terms2 = new Term[terms.length];
          System.arraycopy(terms, 0, terms2, 0, terms.length);
          Arrays.sort(terms2);
          this.terms = terms2;
        }
      } else {
        this.terms = null;
      }
    }

    public PostingsAndFreq(
        PostingsEnum postings, ImpactsEnum impacts, int position, List<Term> terms) {
      this.postings = postings;
      this.impacts = impacts;
      this.position = position;
      nTerms = terms == null ? 0 : terms.size();
      if (nTerms > 0) {
        Term[] terms2 = terms.toArray(new Term[0]);
        if (nTerms > 1) {
          Arrays.sort(terms2);
        }
        this.terms = terms2;
      } else {
        this.terms = null;
      }
    }

    @Override
    public int compareTo(PostingsAndFreq other) {
      if (position != other.position) {
        return position - other.position;
      }
      if (nTerms != other.nTerms) {
        return nTerms - other.nTerms;
      }
      if (nTerms == 0) {
        return 0;
      }
      for (int i = 0; i < terms.length; i++) {
        int res = terms[i].compareTo(other.terms[i]);
        if (res != 0) return res;
      }
      return 0;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + position;
      for (int i = 0; i < nTerms; i++) {
        result = prime * result + terms[i].hashCode();
      }
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj) return true;
      if (obj == null) return false;
      if (getClass() != obj.getClass()) return false;
      PostingsAndFreq other = (PostingsAndFreq) obj;
      if (position != other.position) return false;
      if (terms == null) return other.terms == null;
      return Arrays.equals(terms, other.terms);
    }
  }

  /**
   * A guess of the average number of simple operations for the initial seek and buffer refill per
   * document for the positions of a term. See also {@link
   * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
   *
   * <p>Aside: Instead of being constant this could depend among others on {@link
   * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
   * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
   * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
   * size of the device storing the index.
   */
  private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

  /**
   * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
   * when no seek or buffer refill is done.
   */
  private static final int TERM_OPS_PER_POS = 7;

  /**
   * Returns an expected cost in simple operations of processing the occurrences of a term in a
   * document that contains the term. This is for use by {@link TwoPhaseIterator#matchCost}
   * implementations.
   *
   * @param termsEnum The term is the term at which this TermsEnum is positioned.
   */
  public static float termPositionsCost(TermsEnum termsEnum) throws IOException {
    int docFreq = termsEnum.docFreq();
    assert docFreq > 0;
    long totalTermFreq = termsEnum.totalTermFreq();
    float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
    return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new PhraseWeight(this, field, searcher, scoreMode) {

      private transient TermStates[] states;

      @Override
      protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException {
        final int[] positions = PhraseQuery.this.getPositions();
        if (positions.length < 2) {
          throw new IllegalStateException(
              "PhraseWeight does not support less than 2 terms, call rewrite first");
        } else if (positions[0] != 0) {
          throw new IllegalStateException(
              "PhraseWeight requires that the first position is 0, call rewrite first");
        }
        states = new TermStates[terms.length];
        TermStatistics[] termStats = new TermStatistics[terms.length];
        int termUpTo = 0;
        for (int i = 0; i < terms.length; i++) {
          final Term term = terms[i];
          states[i] = TermStates.build(searcher, term, scoreMode.needsScores());
          if (scoreMode.needsScores()) {
            TermStates ts = states[i];
            if (ts.docFreq() > 0) {
              termStats[termUpTo++] =
                  searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
            }
          }
        }
        if (termUpTo > 0) {
          return similarity.scorer(
              boost,
              searcher.collectionStatistics(field),
              ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
        } else {
          return null; // no terms at all, we won't use similarity
        }
      }

      @Override
      protected PhraseMatcher getPhraseMatcher(
          LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
        assert terms.length > 0;
        final LeafReader reader = context.reader();
        PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length];

        final Terms fieldTerms = reader.terms(field);
        if (fieldTerms == null) {
          return null;
        }

        if (fieldTerms.hasPositions() == false) {
          throw new IllegalStateException(
              "field \""
                  + field
                  + "\" was indexed without position data; cannot run PhraseQuery (phrase="
                  + getQuery()
                  + ")");
        }

        // Reuse single TermsEnum below:
        final TermsEnum te = fieldTerms.iterator();
        float totalMatchCost = 0;

        for (int i = 0; i < terms.length; i++) {
          final Term t = terms[i];
          final IOSupplier<TermState> supplier = states[i].get(context);
          final TermState state = supplier == null ? null : supplier.get();
          if (state == null) {
            /* term doesn't exist in this segment */
            assert termNotInReader(reader, t) : "no termstate found but term exists in reader";
            return null;
          }
          te.seekExact(t.bytes(), state);
          PostingsEnum postingsEnum;
          ImpactsEnum impactsEnum;
          if (scoreMode == ScoreMode.TOP_SCORES) {
            postingsEnum =
                impactsEnum =
                    te.impacts(exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
          } else {
            postingsEnum =
                te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
            impactsEnum = new SlowImpactsEnum(postingsEnum);
          }
          postingsFreqs[i] = new PostingsAndFreq(postingsEnum, impactsEnum, positions[i], t);
          totalMatchCost += termPositionsCost(te);
        }

        // sort by increasing docFreq order
        if (slop == 0) {
          ArrayUtil.timSort(postingsFreqs);
          return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
        } else {
          return new SloppyPhraseMatcher(
              postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
        }
      }
    };
  }

  // only called from assert
  private static boolean termNotInReader(LeafReader reader, Term term) throws IOException {
    return reader.docFreq(term) == 0;
  }

  /** Prints a user-readable version of this query. */
  @Override
  public String toString(String f) {
    StringBuilder buffer = new StringBuilder();
    if (field != null && !field.equals(f)) {
      buffer.append(field);
      buffer.append(":");
    }
    buffer.append("\"");
    final int maxPosition;
    if (positions.length == 0) {
      maxPosition = -1;
    } else {
      maxPosition = positions[positions.length - 1];
    }
    String[] pieces = new String[maxPosition + 1];
    for (int i = 0; i < terms.length; i++) {
      int pos = positions[i];
      String s = pieces[pos];
      if (s == null) {
        s = (terms[i]).text();
      } else {
        s = s + "|" + (terms[i]).text();
      }
      pieces[pos] = s;
    }
    for (int i = 0; i < pieces.length; i++) {
      if (i > 0) {
        buffer.append(' ');
      }
      String s = pieces[i];
      if (s == null) {
        buffer.append('?');
      } else {
        buffer.append(s);
      }
    }
    buffer.append("\"");

    if (slop != 0) {
      buffer.append("~");
      buffer.append(slop);
    }

    return buffer.toString();
  }

  /** Returns true iff o is equal to this. */
  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) && equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(PhraseQuery other) {
    return slop == other.slop
        && Arrays.equals(terms, other.terms)
        && Arrays.equals(positions, other.positions);
  }

  /** Returns a hash code value for this object. */
  @Override
  public int hashCode() {
    int h = classHash();
    h = 31 * h + slop;
    h = 31 * h + Arrays.hashCode(terms);
    h = 31 * h + Arrays.hashCode(positions);
    return h;
  }
}