All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.similarities.SimilarityBase Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similarities;


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;

/**
 * A subclass of {@code Similarity} that provides a simplified API for its
 * descendants. Subclasses are only required to implement the {@link #score}
 * and {@link #toString()} methods. Implementing
 * {@link #explain(List, BasicStats, int, float, float)} is optional,
 * inasmuch as SimilarityBase already provides a basic explanation of the score
 * and the term frequency. However, implementers of a subclass are encouraged to
 * include as much detail about the scoring method as possible.
 * 

* Note: multi-word queries such as phrase queries are scored in a different way * than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for * the phrase as a whole (since it does not know it), this class instead scores * phrases as a summation of the individual term scores. * @lucene.experimental */ public abstract class SimilarityBase extends Similarity { /** For {@link #log2(double)}. Precomputed for efficiency reasons. */ private static final double LOG_2 = Math.log(2); /** * True if overlap tokens (tokens with a position of increment of zero) are * discounted from the document's length. */ protected boolean discountOverlaps = true; /** * Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ public SimilarityBase() {} /** Determines whether overlap tokens (Tokens with * 0 position increment) are ignored when computing * norm. By default this is true, meaning overlap * tokens do not count when computing norms. * * @lucene.experimental * * @see #computeNorm */ public void setDiscountOverlaps(boolean v) { discountOverlaps = v; } /** * Returns true if overlap tokens are discounted from the document's length. * @see #setDiscountOverlaps */ public boolean getDiscountOverlaps() { return discountOverlaps; } @Override public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) { BasicStats stats[] = new BasicStats[termStats.length]; for (int i = 0; i < termStats.length; i++) { stats[i] = newStats(collectionStats.field()); fillBasicStats(stats[i], collectionStats, termStats[i]); } return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats); } /** Factory method to return a custom stats object */ protected BasicStats newStats(String field) { return new BasicStats(field); } /** Fills all member fields defined in {@code BasicStats} in {@code stats}. * Subclasses can override this method to fill additional stats. */ protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { // #positions(field) must be >= #positions(term) assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq(); long numberOfDocuments = collectionStats.maxDoc(); long docFreq = termStats.docFreq(); long totalTermFreq = termStats.totalTermFreq(); // codec does not supply totalTermFreq: substitute docFreq if (totalTermFreq == -1) { totalTermFreq = docFreq; } final long numberOfFieldTokens; final float avgFieldLength; long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { // field does not exist; // We have to provide something if codec doesnt supply these measures, // or if someone omitted frequencies for the field... negative values cause // NaN/Inf for some scorers. numberOfFieldTokens = docFreq; avgFieldLength = 1; } else { numberOfFieldTokens = sumTotalTermFreq; avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments; } // TODO: add sumDocFreq for field (numberOfFieldPostings) stats.setNumberOfDocuments(numberOfDocuments); stats.setNumberOfFieldTokens(numberOfFieldTokens); stats.setAvgFieldLength(avgFieldLength); stats.setDocFreq(docFreq); stats.setTotalTermFreq(totalTermFreq); } /** * Scores the document {@code doc}. *

Subclasses must apply their scoring formula in this class.

* @param stats the corpus level statistics. * @param freq the term frequency. * @param docLen the document length. * @return the score. */ protected abstract float score(BasicStats stats, float freq, float docLen); /** * Subclasses should implement this method to explain the score. {@code expl} * already contains the score, the name of the class and the doc id, as well * as the term frequency and its explanation; subclasses can add additional * clauses to explain details of their scoring formulae. *

The default implementation does nothing.

* * @param subExpls the list of details of the explanation to extend * @param stats the corpus level statistics. * @param doc the document id. * @param freq the term frequency. * @param docLen the document length. */ protected void explain( List subExpls, BasicStats stats, int doc, float freq, float docLen) {} /** * Explains the score. The implementation here provides a basic explanation * in the format score(name-of-similarity, doc=doc-id, * freq=term-frequency), computed from:, and * attaches the score (computed via the {@link #score(BasicStats, float, float)} * method) and the explanation for the term frequency. Subclasses content with * this format may add additional details in * {@link #explain(List, BasicStats, int, float, float)}. * * @param stats the corpus level statistics. * @param doc the document id. * @param freq the term frequency and its explanation. * @param docLen the document length. * @return the explanation. */ protected Explanation explain( BasicStats stats, int doc, Explanation freq, float docLen) { List subs = new ArrayList<>(); explain(subs, stats, doc, freq.getValue(), docLen); return Explanation.match( score(stats, freq.getValue(), docLen), "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:", subs); } @Override public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException { if (stats instanceof MultiSimilarity.MultiStats) { // a multi term query (e.g. phrase). return the summation, // scoring almost as if it were boolean query SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats; SimScorer subScorers[] = new SimScorer[subStats.length]; for (int i = 0; i < subScorers.length; i++) { BasicStats basicstats = (BasicStats) subStats[i]; subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field)); } return new MultiSimilarity.MultiSimScorer(subScorers); } else { BasicStats basicstats = (BasicStats) stats; return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field)); } } /** * Subclasses must override this method to return the name of the Similarity * and preferably the values of parameters (if any) as well. */ @Override public abstract String toString(); // ------------------------------ Norm handling ------------------------------ /** Norm to document length map. */ private static final float[] NORM_TABLE = new float[256]; static { for (int i = 1; i < 256; i++) { float floatNorm = SmallFloat.byte315ToFloat((byte)i); NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm); } NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf } /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */ @Override public long computeNorm(FieldInvertState state) { final float numTerms; if (discountOverlaps) numTerms = state.getLength() - state.getNumOverlap(); else numTerms = state.getLength(); return encodeNormValue(state.getBoost(), numTerms); } /** Decodes a normalization factor (document length) stored in an index. * @see #encodeNormValue(float,float) */ protected float decodeNormValue(byte norm) { return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127 } /** Encodes the length to a byte via SmallFloat. */ protected byte encodeNormValue(float boost, float length) { return SmallFloat.floatToByte315((boost / (float) Math.sqrt(length))); } // ----------------------------- Static methods ------------------------------ /** Returns the base two logarithm of {@code x}. */ public static double log2(double x) { // Put this to a 'util' class if we need more of these. return Math.log(x) / LOG_2; } // --------------------------------- Classes --------------------------------- /** Delegates the {@link #score(int, float)} and * {@link #explain(int, Explanation)} methods to * {@link SimilarityBase#score(BasicStats, float, float)} and * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)}, * respectively. */ private class BasicSimScorer extends SimScorer { private final BasicStats stats; private final NumericDocValues norms; BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException { this.stats = stats; this.norms = norms; } @Override public float score(int doc, float freq) { // We have to supply something in case norms are omitted return SimilarityBase.this.score(stats, freq, norms == null ? 1F : decodeNormValue((byte)norms.get(doc))); } @Override public Explanation explain(int doc, Explanation freq) { return SimilarityBase.this.explain(stats, doc, freq, norms == null ? 1F : decodeNormValue((byte)norms.get(doc))); } @Override public float computeSlopFactor(int distance) { return 1.0f / (distance + 1); } @Override public float computePayloadFactor(int doc, int start, int end, BytesRef payload) { return 1f; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy