org.apache.lucene.search.similarities.Similarity Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similarities;


import java.util.Collections;

import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.SmallFloat;

/** 
 * Similarity defines the components of Lucene scoring.
 * 
 * Expert: Scoring API.
 * 

 * This is a low-level API, you should only extend this API if you want to implement 
 * an information retrieval model.  If you are instead looking for a convenient way 
 * to alter Lucene's scoring, consider just tweaking the default implementation:
 * {@link BM25Similarity} or extend {@link SimilarityBase}, which makes it easy to compute
 * a score from index statistics.
 * 

 * Similarity determines how Lucene weights terms, and Lucene interacts with
 * this class at both index-time and 
 * query-time.
 * 

 * Indexing Time
 * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
 * the Similarity implementation to set a per-document value for the field that will 
 * be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}.
 * Lucene makes no assumption about what is in this norm, but it is most useful for
 * encoding length normalization information.
 * 

 * Implementations should carefully consider how the normalization is encoded: while
 * Lucene's {@link BM25Similarity} encodes length normalization information with
 * {@link SmallFloat} into a single byte, this might not be suitable for all purposes.
 * 

 * Many formulas require the use of average document length, which can be computed via a 
 * combination of {@link CollectionStatistics#sumTotalTermFreq()} and 
 * {@link CollectionStatistics#docCount()}.
 * 

 * Additional scoring factors can be stored in named {@link NumericDocValuesField}s and
 * accessed at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
 * However this should not be done in the {@link Similarity} but externally, for instance
 * by using FunctionScoreQuery.
 * 

 * Finally, using index-time boosts (either via folding into the normalization byte or
 * via DocValues), is an inefficient way to boost the scores of different fields if the
 * boost will be the same for every document, instead the Similarity can simply take a constant
 * boost parameter C, and {@link PerFieldSimilarityWrapper} can return different 
 * instances with different boosts depending upon field name.
 * 

 * Query time
 * At query-time, Queries interact with the Similarity via these steps:
 * 

 * <ol>
 *   <li>The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a single time,
 *       allowing the implementation to compute any statistics (such as IDF, average document length, etc)
 *       across the entire collection. The {@link TermStatistics} and {@link CollectionStatistics} passed in 
 *       already contain all of the raw statistics involved, so a Similarity can freely use any combination
 *       of statistics without causing any additional I/O. Lucene makes no assumption about what is 
 *       stored in the returned {@link Similarity.SimScorer} object.
 *   <li>Then {@link SimScorer#score(float, long)} is called for every matching document to compute its score.
 * </ol>
 * 
 * 
 * Explanations
 * When {@link IndexSearcher#explain(org.apache.lucene.search.Query, int)} is called, queries consult the Similarity's
 * {@link SimScorer} for an explanation of how it computed its score. The query passes in the encoded norm and an
 * explanation of how the frequency was computed.
 *
 * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity)
 * @see IndexSearcher#setSimilarity(Similarity)
 * @lucene.experimental
 */
public abstract class Similarity {

  /**
   * Sole constructor, normally invoked implicitly by subclass constructors.
   */
  public Similarity() {}

  // NOTE(review): the "smaller values for longer fields" guidance below sits
  // oddly with the unsigned-norm ordering rule (greater norm => lower or equal
  // score); confirm which convention implementations are expected to follow.
  /**
   * Computes the normalization value for a field from the state accumulated
   * while processing its terms (see {@link FieldInvertState}).
   *
   * <p>Matches in longer fields are less precise, so implementations of this
   * method usually set smaller values when {@code state.getLength()} is large,
   * and larger values when {@code state.getLength()} is small.
   *
   * <p>For a given term-document frequency, greater unsigned norms must
   * produce scores that are lower or equal: for two encoded norms
   * {@code n1} and {@code n2} such that
   * {@code Long.compareUnsigned(n1, n2) > 0},
   * {@code SimScorer.score(freq, n1) <= SimScorer.score(freq, n2)}
   * must hold for any legal {@code freq}.
   *
   * <p>{@code 0} is not a legal norm, so {@code 1} is the norm that produces
   * the highest scores.
   *
   * @lucene.experimental
   *
   * @param state current processing state for this field
   * @return computed norm value
   */
  public abstract long computeNorm(FieldInvertState state);

  /**
   * Computes any collection-level weight (e.g. IDF, average document length,
   * etc) needed for scoring a query.
   *
   * @param boost a multiplicative factor to apply to the produced scores
   * @param collectionStats collection-level statistics, such as the number of
   *        tokens in the collection.
   * @param termStats term-level statistics, such as the document frequency of
   *        a term across the collection.
   * @return {@link SimScorer} object with the information this Similarity
   *         needs to score a query.
   */
  public abstract SimScorer scorer(
      float boost, CollectionStatistics collectionStats, TermStatistics... termStats);

  /**
   * Stores the weight for a query across the indexed collection and scores
   * individual documents with it. This abstract class holds no state of its
   * own; descendants of {@code Similarity} should subclass {@code SimScorer}
   * and define the statistics they require in the subclass. Examples include
   * idf, average field length, etc.
   */
  public abstract static class SimScorer {

    /**
     * Sole constructor, for invocation by subclass constructors.
     */
    protected SimScorer() {}

    /**
     * Scores a single document.
     *
     * <p>{@code freq} is the document-term sloppy frequency and must be
     * finite and positive. {@code norm} is the encoded normalization factor
     * as computed by {@link Similarity#computeNorm(FieldInvertState)} at
     * index time, or {@code 1} if norms are disabled. {@code norm} is never
     * {@code 0}.
     *
     * <p>The score must not decrease when {@code freq} increases: if
     * {@code freq1 > freq2}, then
     * {@code score(freq1, norm) >= score(freq2, norm)} for any value of
     * {@code norm} that may be produced by
     * {@link Similarity#computeNorm(FieldInvertState)}.
     *
     * <p>The score must not increase when the unsigned {@code norm}
     * increases: if {@code Long.compareUnsigned(norm1, norm2) > 0}, then
     * {@code score(freq, norm1) <= score(freq, norm2)} for any legal
     * {@code freq}.
     *
     * <p>As a consequence, the maximum score that this scorer can produce is
     * bound by {@code score(Float.MAX_VALUE, 1)}.
     *
     * @param freq sloppy term frequency, must be finite and positive
     * @param norm encoded normalization factor or {@code 1} if norms are disabled
     * @return document's score
     */
    public abstract float score(float freq, long norm);

    /**
     * Explains the score for a single document.
     *
     * <p>This default implementation scores the frequency carried by
     * {@code freq} against {@code norm} and wraps the result in a matching
     * {@link Explanation} whose only detail is {@code freq} itself.
     *
     * @param freq Explanation of how the sloppy term frequency was computed
     * @param norm encoded normalization factor, as returned by
     *        {@link Similarity#computeNorm}, or {@code 1} if norms are disabled
     * @return an explanation of the document's score
     */
    public Explanation explain(Explanation freq, long norm) {
      // Score first, then build the description, mirroring argument evaluation order.
      final float value = score(freq.getValue().floatValue(), norm);
      final String description = "score(freq=" + freq.getValue() + "), with freq of:";
      return Explanation.match(value, description, Collections.singleton(freq));
    }

  }
}