All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.nlp.relevance.BM25 Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 ******************************************************************************/

package smile.nlp.relevance;

import smile.nlp.Corpus;
import smile.nlp.TextTerms;

/**
 * The BM25 weighting scheme, often called Okapi weighting, after the system in
 * which it was first implemented, was developed as a way of building a
 * probabilistic model sensitive to term frequency and document length while
 * not introducing too many additional parameters into the model. It is not
 * a single function, but actually a whole family of scoring functions, with
 * slightly different components and parameters.
 * 

* At the extreme values of the coefficient b, BM25 turns into ranking functions * known as BM11 (for b = 1) and BM15 (for b = 0). BM25F is a modification of * BM25 in which the document is considered to be composed from several fields * (such as headlines, main text, anchor text) with possibly different degrees * of importance. *

* BM25 and its newer variants represent state-of-the-art TF-IDF-like retrieval * functions used in document retrieval, such as web search. * * @see TFIDF * * @author Haifeng Li */ public class BM25 implements RelevanceRanker { /** * Free parameter, usually chosen as k1 = 2.0. */ private double k1; /** * Free parameter, usually chosen as b = 0.75. */ private double b; /** * The control parameter in BM25+. The standard BM25 in which the * component of term frequency normalization by document length * is not properly lower-bounded; as a result of this deficiency, * long documents which do match the query term can often be scored * unfairly by BM25 as having a similar relevance to shorter * documents that do not contain the query term at all. */ private double delta; /** * BM25F parameters */ private double kf = 4.9; // k1 in BM25F private double bTitle = 0.6; private double bBody = 0.5; private double bAnchor = 0.6; private double wTitle = 13.5; private double wBody = 1.0; private double wAnchor = 11.5; /** * Default constructor with k1 = 1.2, b = 0.75, delta = 1.0. */ public BM25() { this(1.2, 0.75, 1.0); } /** * Constructor. * * @param k1 is a positive tuning parameter that calibrates * the document term frequency scaling. A k1 value of 0 corresponds to a * binary model (no term frequency), and a large value corresponds to using * raw term frequency. * @param b b is another tuning parameter (0 ≤ b ≤ 1) which determines * the scaling by document length: b = 1 corresponds to fully scaling the * term weight by the document length, while b = 0 corresponds to no length * normalization. 
*/ public BM25(double k1, double b, double delta) { if (k1 < 0) { throw new IllegalArgumentException("Negative k1 = " + k1); } if (b < 0 || b > 1) { throw new IllegalArgumentException("Invalid b = " + b); } if (delta < 0) { throw new IllegalArgumentException("Invalid delta = " + delta); } this.k1 = k1; this.b = b; this.delta = delta; } /** * Returns a relevance score between a term and a document based on a corpus. * @param termFreq normalized term frequency of searching term in the document to rank. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; */ public double score(int termFreq, int docLen, double avgDocLen, int titleTermFreq, int titleLen, double avgTitleLen, int anchorTermFreq, int anchorLen, double avgAnchorLen, long N, long n) { if (termFreq <= 0) return 0.0; double tf = wBody * termFreq / (1.0 + bBody * (docLen / avgDocLen - 1.0)); if (titleTermFreq > 0) { tf += wTitle * titleTermFreq / (1.0 + bTitle * (titleLen / avgTitleLen - 1.0)); } if (anchorTermFreq > 0) { tf += wAnchor * anchorTermFreq / (1.0 + bAnchor * (anchorLen / avgAnchorLen - 1.0)); } tf = tf / (kf + tf); double idf = Math.log((N - n + 0.5) / (n + 0.5)); return (tf + delta) * idf; } /** * Returns a relevance score between a term and a document based on a corpus. * @param freq normalized term frequency of searching term in the document to rank. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; */ public double score(double freq, long N, long n) { if (freq <= 0) return 0.0; double tf = (k1 + 1) * freq / (freq + k1); double idf = Math.log((N - n + 0.5) / (n + 0.5)); return (tf + delta) * idf; } /** * Returns a relevance score between a term and a document based on a corpus. * @param freq the frequency of searching term in the document to rank. * @param docSize the size of document to rank. 
* @param avgDocSize the average size of documents in the corpus. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; */ public double score(double freq, int docSize, double avgDocSize, long N, long n) { if (freq <= 0) return 0.0; double tf = freq * (k1 + 1) / (freq + k1 * (1 - b + b * docSize / avgDocSize)); double idf = Math.log((N - n + 0.5) / (n + 0.5)); return (tf + delta) * idf; } @Override public double rank(Corpus corpus, TextTerms doc, String term, int tf, int n) { if (tf <= 0) return 0.0; int N = corpus.getNumDocuments(); int docSize = doc.size(); int avgDocSize = corpus.getAverageDocumentSize(); return score(tf, docSize, avgDocSize, N, n); } @Override public double rank(Corpus corpus, TextTerms doc, String[] terms, int[] tf, int n) { int N = corpus.getNumDocuments(); int docSize = doc.size(); int avgDocSize = corpus.getAverageDocumentSize(); double r = 0.0; for (int i = 0; i < terms.length; i++) { r += score(tf[i], docSize, avgDocSize, N, n); } return r; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy