org.apache.lucene.search.PhraseQuery
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOSupplier;

/**
 * A Query that matches documents containing a particular sequence of terms. A PhraseQuery is built
 * by QueryParser for input like "new york".
 *
 * <p>This query may be combined with other terms or queries with a {@link BooleanQuery}.
 *
 * <p>NOTE: All terms in the phrase must match, even those at the same position. If you have
 * terms at the same position, perhaps synonyms, you probably want {@link MultiPhraseQuery} instead,
 * which only requires one term at a position to match.
 *
 * <p>Also, leading holes don't have any particular meaning for this query and will be ignored. For
 * instance this query:
 *
 * <pre class="prettyprint">
 * PhraseQuery.Builder builder = new PhraseQuery.Builder();
 * builder.add(new Term("body", "one"), 4);
 * builder.add(new Term("body", "two"), 5);
 * PhraseQuery pq = builder.build();
 * </pre>
 *
 * is equivalent to the below query:
 *
 * <pre class="prettyprint">
 * PhraseQuery.Builder builder = new PhraseQuery.Builder();
 * builder.add(new Term("body", "one"), 0);
 * builder.add(new Term("body", "two"), 1);
 * PhraseQuery pq = builder.build();
 * </pre>
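 *
 * <p>As an illustrative sketch (the field and term values below are arbitrary, not part of the
 * examples above), such a phrase can be combined with other clauses through a {@link BooleanQuery}:
 *
 * <pre class="prettyprint">
 * Query phrase = new PhraseQuery("body", "new", "york");
 * Query combined = new BooleanQuery.Builder()
 *     .add(phrase, BooleanClause.Occur.MUST)
 *     .add(new TermQuery(new Term("category", "travel")), BooleanClause.Occur.FILTER)
 *     .build();
 * </pre>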
 */
public class PhraseQuery extends Query {

  /** A builder for phrase queries. */
  public static class Builder {

    private int slop;
    private final List<Term> terms;
    private final IntArrayList positions;

    /** Sole constructor. */
    public Builder() {
      slop = 0;
      terms = new ArrayList<>();
      positions = new IntArrayList();
    }

    /**
     * Set the slop.
     *
     * @see PhraseQuery#getSlop()
     */
    public Builder setSlop(int slop) {
      this.slop = slop;
      return this;
    }

    /**
     * Adds a term to the end of the query phrase. The relative position of the term is the one
     * immediately after the last term added.
     */
    public Builder add(Term term) {
      return add(term, positions.isEmpty() ? 0 : 1 + positions.get(positions.size() - 1));
    }

    /**
     * Adds a term to the end of the query phrase. The relative position of the term within the
     * phrase is specified explicitly, but must be greater than or equal to that of the previously
     * added term. A greater position allows phrases with gaps (e.g. in connection with stopwords).
     * If the position is equal, you most likely should be using {@link MultiPhraseQuery} instead
     * which only requires one term at each position to match; this class requires all of them.
     */
    public Builder add(Term term, int position) {
      Objects.requireNonNull(term, "Cannot add a null term to PhraseQuery");
      if (position < 0) {
        throw new IllegalArgumentException("Positions must be >= 0, got " + position);
      }
      if (positions.isEmpty() == false) {
        final int lastPosition = positions.get(positions.size() - 1);
        if (position < lastPosition) {
          throw new IllegalArgumentException(
              "Positions must be added in order, got " + position + " after " + lastPosition);
        }
      }
      if (terms.isEmpty() == false && term.field().equals(terms.get(0).field()) == false) {
        throw new IllegalArgumentException(
            "All terms must be on the same field, got "
                + term.field()
                + " and "
                + terms.get(0).field());
      }
      terms.add(term);
      positions.add(position);
      return this;
    }

    /** Build a phrase query based on the terms that have been added. */
    public PhraseQuery build() {
      Term[] terms = this.terms.toArray(new Term[0]);
      return new PhraseQuery(slop, terms, positions.toArray());
    }
  }
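
  // Illustrative sketch only (field and terms are arbitrary): the Builder above can also express
  // a phrase with an explicit gap, e.g. where a stopword was removed at indexing time. Leading
  // holes are ignored, so positions 1 and 3 behave like 0 and 2:
  //
  //   PhraseQuery gapped = new PhraseQuery.Builder()
  //       .add(new Term("body", "quick"), 1)
  //       .add(new Term("body", "fox"), 3)
  //       .build();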

  private final int slop;
  private final String field;
  private final Term[] terms;
  private final int[] positions;

  private PhraseQuery(int slop, Term[] terms, int[] positions) {
    if (terms.length != positions.length) {
      throw new IllegalArgumentException("Must have as many terms as positions");
    }
    if (slop < 0) {
      throw new IllegalArgumentException("Slop must be >= 0, got " + slop);
    }
    for (Term term : terms) {
      Objects.requireNonNull(term, "Cannot add a null term to PhraseQuery");
    }
    for (int i = 1; i < terms.length; ++i) {
      if (terms[i - 1].field().equals(terms[i].field()) == false) {
        throw new IllegalArgumentException("All terms should have the same field");
      }
    }
    for (int position : positions) {
      if (position < 0) {
        throw new IllegalArgumentException("Positions must be >= 0, got " + position);
      }
    }
    for (int i = 1; i < positions.length; ++i) {
      if (positions[i] < positions[i - 1]) {
        throw new IllegalArgumentException(
            "Positions should not go backwards, got "
                + positions[i - 1]
                + " before "
                + positions[i]);
      }
    }
    this.slop = slop;
    this.terms = terms;
    this.positions = positions;
    this.field = terms.length == 0 ? null : terms[0].field();
  }

  private static int[] incrementalPositions(int length) {
    int[] positions = new int[length];
    for (int i = 0; i < length; ++i) {
      positions[i] = i;
    }
    return positions;
  }

  private static Term[] toTerms(String field, String... termStrings) {
    Term[] terms = new Term[termStrings.length];
    for (int i = 0; i < terms.length; ++i) {
      Objects.requireNonNull(termStrings[i], "Cannot add a null term to PhraseQuery");
      terms[i] = new Term(field, termStrings[i]);
    }
    return terms;
  }

  private static Term[] toTerms(String field, BytesRef... termBytes) {
    Term[] terms = new Term[termBytes.length];
    for (int i = 0; i < terms.length; ++i) {
      Objects.requireNonNull(termBytes[i], "Cannot add a null term to PhraseQuery");
      terms[i] = new Term(field, termBytes[i]);
    }
    return terms;
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}, and at a maximum edit distance of {@code slop}. For
   * more complicated use-cases, use {@link PhraseQuery.Builder}.
   *
   * @see #getSlop()
   */
  public PhraseQuery(int slop, String field, String... terms) {
    this(slop, toTerms(field, terms), incrementalPositions(terms.length));
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}.
   */
  public PhraseQuery(String field, String... terms) {
    this(0, field, terms);
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}, and at a maximum edit distance of {@code slop}. For
   * more complicated use-cases, use {@link PhraseQuery.Builder}.
   *
   * @see #getSlop()
   */
  public PhraseQuery(int slop, String field, BytesRef... terms) {
    this(slop, toTerms(field, terms), incrementalPositions(terms.length));
  }

  /**
   * Create a phrase query which will match documents that contain the given list of terms at
   * consecutive positions in {@code field}.
   */
  public PhraseQuery(String field, BytesRef... terms) {
    this(0, field, terms);
  }

  /**
   * Return the slop for this {@link PhraseQuery}.
   *
   * <p>The slop is an edit distance between respective positions of terms as defined in this
   * {@link PhraseQuery} and the positions of terms in a document.
   *
   * <p>For instance, when searching for {@code "quick fox"}, it is expected that the difference
   * between the positions of {@code fox} and {@code quick} is 1. So {@code "a quick brown fox"}
   * would be at an edit distance of 1 since the difference of the positions of {@code fox} and
   * {@code quick} is 2. Similarly, {@code "the fox is quick"} would be at an edit distance of 3
   * since the difference of the positions of {@code fox} and {@code quick} is -2. The slop defines
   * the maximum edit distance for a document to match.
   *
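   * <p>For example (an illustrative sketch; the field and terms are arbitrary), a slop of 1
   * tolerates one extra position between the two terms:
   *
   * <pre class="prettyprint">
   * // matches "quick fox" and "quick brown fox", but not "quick red brown fox"
   * PhraseQuery pq = new PhraseQuery(1, "body", "quick", "fox");
   * </pre>
   *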
   * <p>More exact matches are scored higher than sloppier matches, thus search results are sorted
   * by exactness.
   */
  public int getSlop() {
    return slop;
  }

  /** Returns the field this query applies to */
  public String getField() {
    return field;
  }

  /** Returns the list of terms in this phrase. */
  public Term[] getTerms() {
    return terms;
  }

  /** Returns the relative positions of terms in this phrase. */
  public int[] getPositions() {
    return positions;
  }

  @Override
  public Query rewrite(IndexSearcher indexSearcher) throws IOException {
    if (terms.length == 0) {
      return new MatchNoDocsQuery("empty PhraseQuery");
    } else if (terms.length == 1) {
      return new TermQuery(terms[0]);
    } else if (positions[0] != 0) {
      int[] newPositions = new int[positions.length];
      for (int i = 0; i < positions.length; ++i) {
        newPositions[i] = positions[i] - positions[0];
      }
      return new PhraseQuery(slop, terms, newPositions);
    } else {
      return super.rewrite(indexSearcher);
    }
  }

  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(field) == false) {
      return;
    }
    QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.MUST, this);
    v.consumeTerms(this, terms);
  }

  /**
   * Term postings and position information for phrase matching
   *
   * @lucene.internal
   */
  public static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
    final PostingsEnum postings;
    final ImpactsEnum impacts;
    final int position;
    final Term[] terms;
    final int nTerms; // for faster comparisons

    /** Creates PostingsAndFreq instance */
    public PostingsAndFreq(
        PostingsEnum postings, ImpactsEnum impacts, int position, Term... terms) {
      this.postings = postings;
      this.impacts = impacts;
      this.position = position;
      nTerms = terms == null ? 0 : terms.length;
      if (nTerms > 0) {
        if (terms.length == 1) {
          this.terms = terms;
        } else {
          Term[] terms2 = new Term[terms.length];
          System.arraycopy(terms, 0, terms2, 0, terms.length);
          Arrays.sort(terms2);
          this.terms = terms2;
        }
      } else {
        this.terms = null;
      }
    }

    public PostingsAndFreq(
        PostingsEnum postings, ImpactsEnum impacts, int position, List<Term> terms) {
      this.postings = postings;
      this.impacts = impacts;
      this.position = position;
      nTerms = terms == null ? 0 : terms.size();
      if (nTerms > 0) {
        Term[] terms2 = terms.toArray(new Term[0]);
        if (nTerms > 1) {
          Arrays.sort(terms2);
        }
        this.terms = terms2;
      } else {
        this.terms = null;
      }
    }

    @Override
    public int compareTo(PostingsAndFreq other) {
      if (position != other.position) {
        return position - other.position;
      }
      if (nTerms != other.nTerms) {
        return nTerms - other.nTerms;
      }
      if (nTerms == 0) {
        return 0;
      }
      for (int i = 0; i < terms.length; i++) {
        int res = terms[i].compareTo(other.terms[i]);
        if (res != 0) return res;
      }
      return 0;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + position;
      for (int i = 0; i < nTerms; i++) {
        result = prime * result + terms[i].hashCode();
      }
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj) return true;
      if (obj == null) return false;
      if (getClass() != obj.getClass()) return false;
      PostingsAndFreq other = (PostingsAndFreq) obj;
      if (position != other.position) return false;
      if (terms == null) return other.terms == null;
      return Arrays.equals(terms, other.terms);
    }
  }

  /**
   * A guess of the average number of simple operations for the initial seek and buffer refill per
   * document for the positions of a term. See also {@link
   * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}.
   *
   * <p>Aside: Instead of being constant this could depend among others on {@link
   * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
   * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
   * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
   * size of the device storing the index.
   */
  private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

  /**
   * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}
   * when no seek or buffer refill is done.
   */
  private static final int TERM_OPS_PER_POS = 7;

  /**
   * Returns an expected cost in simple operations of processing the occurrences of a term in a
   * document that contains the term. This is for use by {@link TwoPhaseIterator#matchCost}
   * implementations.
   *
   * @param termsEnum The term is the term at which this TermsEnum is positioned.
   */
  public static float termPositionsCost(TermsEnum termsEnum) throws IOException {
    int docFreq = termsEnum.docFreq();
    assert docFreq > 0;
    long totalTermFreq = termsEnum.totalTermFreq();
    float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
    return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new PhraseWeight(this, field, searcher, scoreMode) {

      private transient TermStates[] states;

      @Override
      protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException {
        final int[] positions = PhraseQuery.this.getPositions();
        if (positions.length < 2) {
          throw new IllegalStateException(
              "PhraseWeight does not support less than 2 terms, call rewrite first");
        } else if (positions[0] != 0) {
          throw new IllegalStateException(
              "PhraseWeight requires that the first position is 0, call rewrite first");
        }
        states = new TermStates[terms.length];
        TermStatistics[] termStats = new TermStatistics[terms.length];
        int termUpTo = 0;
        for (int i = 0; i < terms.length; i++) {
          final Term term = terms[i];
          states[i] = TermStates.build(searcher, term, scoreMode.needsScores());
          if (scoreMode.needsScores()) {
            TermStates ts = states[i];
            if (ts.docFreq() > 0) {
              termStats[termUpTo++] =
                  searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
            }
          }
        }
        if (termUpTo > 0) {
          return similarity.scorer(
              boost,
              searcher.collectionStatistics(field),
              ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
        } else {
          return null; // no terms at all, we won't use similarity
        }
      }

      @Override
      protected PhraseMatcher getPhraseMatcher(
          LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
        assert terms.length > 0;
        final LeafReader reader = context.reader();
        PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length];

        final Terms fieldTerms = reader.terms(field);
        if (fieldTerms == null) {
          return null;
        }

        if (fieldTerms.hasPositions() == false) {
          throw new IllegalStateException(
              "field \""
                  + field
                  + "\" was indexed without position data; cannot run PhraseQuery (phrase="
                  + getQuery()
                  + ")");
        }

        // Reuse single TermsEnum below:
        final TermsEnum te = fieldTerms.iterator();
        float totalMatchCost = 0;

        for (int i = 0; i < terms.length; i++) {
          final Term t = terms[i];
          final IOSupplier<TermState> supplier = states[i].get(context);
          final TermState state = supplier == null ? null : supplier.get();
          if (state == null) {
            /* term doesn't exist in this segment */
            assert termNotInReader(reader, t) : "no termstate found but term exists in reader";
            return null;
          }
          te.seekExact(t.bytes(), state);
          PostingsEnum postingsEnum;
          ImpactsEnum impactsEnum;
          if (scoreMode == ScoreMode.TOP_SCORES) {
            postingsEnum =
                impactsEnum =
                    te.impacts(exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
          } else {
            postingsEnum =
                te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
            impactsEnum = new SlowImpactsEnum(postingsEnum);
          }
          postingsFreqs[i] = new PostingsAndFreq(postingsEnum, impactsEnum, positions[i], t);
          totalMatchCost += termPositionsCost(te);
        }

        // sort by increasing docFreq order
        if (slop == 0) {
          ArrayUtil.timSort(postingsFreqs);
          return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
        } else {
          return new SloppyPhraseMatcher(
              postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
        }
      }
    };
  }

  // only called from assert
  private static boolean termNotInReader(LeafReader reader, Term term) throws IOException {
    return reader.docFreq(term) == 0;
  }

  /** Prints a user-readable version of this query. */
  @Override
  public String toString(String f) {
    StringBuilder buffer = new StringBuilder();
    if (field != null && !field.equals(f)) {
      buffer.append(field);
      buffer.append(":");
    }
    buffer.append("\"");
    final int maxPosition;
    if (positions.length == 0) {
      maxPosition = -1;
    } else {
      maxPosition = positions[positions.length - 1];
    }
    String[] pieces = new String[maxPosition + 1];
    for (int i = 0; i < terms.length; i++) {
      int pos = positions[i];
      String s = pieces[pos];
      if (s == null) {
        s = (terms[i]).text();
      } else {
        s = s + "|" + (terms[i]).text();
      }
      pieces[pos] = s;
    }
    for (int i = 0; i < pieces.length; i++) {
      if (i > 0) {
        buffer.append(' ');
      }
      String s = pieces[i];
      if (s == null) {
        buffer.append('?');
      } else {
        buffer.append(s);
      }
    }
    buffer.append("\"");

    if (slop != 0) {
      buffer.append("~");
      buffer.append(slop);
    }

    return buffer.toString();
  }

  /** Returns true iff o is equal to this. */
  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) && equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(PhraseQuery other) {
    return slop == other.slop
        && Arrays.equals(terms, other.terms)
        && Arrays.equals(positions, other.positions);
  }

  /** Returns a hash code value for this object. */
  @Override
  public int hashCode() {
    int h = classHash();
    h = 31 * h + slop;
    h = 31 * h + Arrays.hashCode(terms);
    h = 31 * h + Arrays.hashCode(positions);
    return h;
  }
}