/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.nlp.relevance;

import smile.nlp.Corpus;
import smile.nlp.TextTerms;

/**
* The BM25 weighting scheme, often called Okapi weighting, after the system in
* which it was first implemented, was developed as a way of building a
* probabilistic model sensitive to term frequency and document length while
* not introducing too many additional parameters into the model. It is not
* a single function, but actually a whole family of scoring functions, with
* slightly different components and parameters.
*
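* <p>
* As a rough sketch of the single-field scoring used by this class (the
* BM25+ variant, implemented in {@link #score(double, int, double, long, long)}),
* a term with frequency freq in a document of length docSize scores
* approximately
* <pre>
*     tf    = freq * (k1 + 1) / (freq + k1 * (1 - b + b * docSize / avgDocSize))
*     idf   = log((N - n + 0.5) / (n + 0.5) + 1)
*     score = (tf + delta) * idf
* </pre>
* where N is the number of documents in the corpus and n is the number of
* documents containing the term.
*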
* <p>
* At the extreme values of the coefficient b, BM25 turns into ranking functions
* known as BM11 (for b = 1) and BM15 (for b = 0). BM25F is a modification of
* BM25 in which the document is considered to be composed from several fields
* (such as headlines, main text, anchor text) with possibly different degrees
* of importance.
*
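* <p>
* The multi-field score method in this class follows the BM25F idea: the
* body, title, and anchor term frequencies are each length-normalized and
* weighted by a field weight, summed into a single pseudo frequency, and
* then saturated and combined with the idf term. Roughly,
* <pre>
*     tf    = wBody * bodyFreq / (1 + bBody * (docSize / avgDocSize - 1))
*           + wTitle * titleFreq / (1 + bTitle * (titleSize / avgTitleSize - 1))
*           + wAnchor * anchorFreq / (1 + bAnchor * (anchorSize / avgAnchorSize - 1))
*     score = (tf / (kf + tf) + delta) * idf
* </pre>
*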
* <p>
* BM25 and its newer variants represent state-of-the-art TF-IDF-like retrieval
* functions used in document retrieval, such as web search.
*
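* <p>
* A minimal usage sketch (the counts below are hypothetical):
* <pre>{@code
*     BM25 bm25 = new BM25();  // k1 = 1.2, b = 0.75, delta = 1.0
*     // a term occurring 3 times in a 250-word document, in a corpus of
*     // 10,000 documents with average length 300, 50 of which contain the term
*     double relevance = bm25.score(3, 250, 300.0, 10_000, 50);
* }</pre>
*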
* @see TFIDF
*
* @author Haifeng Li
*/
public class BM25 implements RelevanceRanker {
/**
* Free parameter, usually chosen between 1.2 and 2.0.
*/
private final double k1;
/**
* Free parameter, usually chosen as b = 0.75.
*/
private final double b;
/**
* The lower-bound parameter in BM25+. In standard BM25, the component
* of term frequency normalization by document length is not properly
* lower-bounded; as a result of this deficiency, long documents that
* do match the query term can be scored unfairly by BM25 as having
* similar relevance to shorter documents that do not contain the query
* term at all. BM25+ adds delta as a lower bound on each matching
* term's contribution.
*/
private final double delta;
/**
* Default constructor with k1 = 1.2, b = 0.75, delta = 1.0.
*/
public BM25() {
this(1.2, 0.75, 1.0);
}
/**
* Constructor.
*
* @param k1 a positive tuning parameter that calibrates
* the document term frequency scaling. A k1 value of 0
* corresponds to a binary model (no term frequency),
* and a large value corresponds to using raw term frequency.
*
* @param b another tuning parameter ({@code 0 <= b <= 1}) that
* determines the scaling by document length: b = 1 corresponds
* to fully scaling the term weight by the document length,
* while b = 0 corresponds to no length normalization.
*
* @param delta the lower-bound parameter in BM25+. In standard BM25,
* the component of term frequency normalization by document
* length is not properly lower-bounded; as a result, long
* documents that do match the query term can be scored
* unfairly as having similar relevance to shorter documents
* that do not contain the query term at all. BM25+ adds
* delta as a lower bound on each matching term's contribution.
*/
public BM25(double k1, double b, double delta) {
if (k1 < 0) {
throw new IllegalArgumentException("Negative k1 = " + k1);
}
if (b < 0 || b > 1) {
throw new IllegalArgumentException("Invalid b = " + b);
}
if (delta < 0) {
throw new IllegalArgumentException("Invalid delta = " + delta);
}
this.k1 = k1;
this.b = b;
this.delta = delta;
}
/**
* Returns the relevance score between a term and a document based on a corpus.
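* <p>
* For example (with hypothetical counts), a term occurring 3 times in a
* 250-word body and once in an 8-word title, but not in the anchor text,
* might be scored as
* <pre>{@code
*     double s = bm25.score(3, 250, 300.0, 1, 8, 7.0, 0, 0, 5.0, 10_000, 50);
* }</pre>
* where {@code bm25} is an instance of this class.
*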
* @param termFreq the term frequency in the text body.
* @param docSize the text length.
* @param avgDocSize the average text length in the corpus.
* @param titleTermFreq the term frequency in the title.
* @param titleSize the title length.
* @param avgTitleSize the average title length in the corpus.
* @param anchorTermFreq the term frequency in the anchor.
* @param anchorSize the anchor length.
* @param avgAnchorSize the average anchor length in the corpus.
* @param N the number of documents in the corpus.
* @param n the number of documents containing the given term in the corpus.
* @return the relevance score.
*/
public double score(int termFreq, int docSize, double avgDocSize, int titleTermFreq,
int titleSize, double avgTitleSize, int anchorTermFreq, int anchorSize,
double avgAnchorSize, long N, long n) {
if (termFreq <= 0) return 0.0;
// BM25F parameters
final double kf = 4.9; // k1 in BM25F
final double bTitle = 0.6;
final double bBody = 0.5;
final double bAnchor = 0.6;
final double wTitle = 13.5;
final double wBody = 1.0;
final double wAnchor = 11.5;
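// Length-normalized, weighted term frequency for each field (body, title, anchor),
// summed into a single BM25F-style pseudo frequency.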
double tf = wBody * termFreq / (1.0 + bBody * (docSize / avgDocSize - 1.0));
if (titleTermFreq > 0) {
tf += wTitle * titleTermFreq / (1.0 + bTitle * (titleSize / avgTitleSize - 1.0));
}
if (anchorTermFreq > 0) {
tf += wAnchor * anchorTermFreq / (1.0 + bAnchor * (anchorSize / avgAnchorSize - 1.0));
}
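// Saturate the combined pseudo frequency with the BM25F k parameter.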
tf = tf / (kf + tf);
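// Probabilistic idf; the +1 inside the log keeps it non-negative.
// The BM25+ delta lower-bounds the term frequency component below.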
double idf = Math.log((N - n + 0.5) / (n + 0.5) + 1);
return (tf + delta) * idf;
}
/**
* Returns the relevance score between a term and a document based on a corpus.
* @param freq the normalized term frequency of the search term in the document to rank.
* @param N the number of documents in the corpus.
* @param n the number of documents containing the given term in the corpus.
* @return the relevance score.
*/
public double score(double freq, long N, long n) {
if (freq <= 0) return 0.0;
double tf = (k1 + 1) * freq / (freq + k1);
double idf = Math.log((N - n + 0.5) / (n + 0.5) + 1);
return (tf + delta) * idf;
}
/**
* Returns the relevance score between a term and a document based on a corpus.
* @param freq the frequency of the search term in the document to rank.
* @param docSize the size of document to rank.
* @param avgDocSize the average size of documents in the corpus.
* @param N the number of documents in the corpus.
* @param n the number of documents containing the given term in the corpus.
* @return the relevance score.
*/
public double score(double freq, int docSize, double avgDocSize, long N, long n) {
if (freq <= 0) return 0.0;
double tf = freq * (k1 + 1) / (freq + k1 * (1 - b + b * docSize / avgDocSize));
double idf = Math.log((N - n + 0.5) / (n + 0.5) + 1);
return (tf + delta) * idf;
}
@Override
public double rank(Corpus corpus, TextTerms doc, String term, int tf, int n) {
if (tf <= 0) return 0.0;
int N = corpus.ndoc();
int docSize = doc.size();
int avgDocSize = corpus.avgDocSize();
return score(tf, docSize, avgDocSize, N, n);
}
@Override
public double rank(Corpus corpus, TextTerms doc, String[] terms, int[] tf, int n) {
int N = corpus.ndoc();
int docSize = doc.size();
int avgDocSize = corpus.avgDocSize();
double r = 0.0;
for (int i = 0; i < terms.length; i++) {
r += score(tf[i], docSize, avgDocSize, N, n);
}
return r;
}
}