All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.nlp.relevance.BM25 Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.nlp.relevance;

import smile.nlp.Corpus;
import smile.nlp.TextTerms;

/**
 * The BM25 weighting scheme, often called Okapi weighting, after the system in
 * which it was first implemented, was developed as a way of building a
 * probabilistic model sensitive to term frequency and document length while
 * not introducing too many additional parameters into the model. It is not
 * a single function, but actually a whole family of scoring functions, with
 * slightly different components and parameters.
 * 

* At the extreme values of the coefficient b, BM25 turns into ranking functions * known as BM11 (for b = 1) and BM15 (for b = 0). BM25F is a modification of * BM25 in which the document is considered to be composed from several fields * (such as headlines, main text, anchor text) with possibly different degrees * of importance. *

* BM25 and its newer variants represent state-of-the-art TF-IDF-like retrieval * functions used in document retrieval, such as web search. * * @see TFIDF * * @author Haifeng Li */ public class BM25 implements RelevanceRanker { /** * Free parameter, usually chosen as k1 = 2.0. */ private final double k1; /** * Free parameter, usually chosen as b = 0.75. */ private final double b; /** * The control parameter in BM25+. The standard BM25 in which the * component of term frequency normalization by document length * is not properly lower-bounded; as a result of this deficiency, * long documents which do match the query term can often be scored * unfairly by BM25 as having a similar relevance to shorter * documents that do not contain the query term at all. */ private final double delta; /** * Default constructor with k1 = 1.2, b = 0.75, delta = 1.0. */ public BM25() { this(1.2, 0.75, 1.0); } /** * Constructor. * * @param k1 is a positive tuning parameter that calibrates * the document term frequency scaling. A k1 value of 0 * corresponds to a binary model (no term frequency), * and a large value corresponds to using raw term frequency. * * @param b b is another tuning parameter ({@code 0 <= b <= 1}) which * determines the scaling by document length: b = 1 corresponds * to fully scaling the term weight by the document length, * while b = 0 corresponds to no length normalization. * * @param delta the control parameter in BM25+. The standard BM25 in * which the component of term frequency normalization * by document length is not properly lower-bounded; * as a result of this deficiency, long documents which * do match the query term can often be scored unfairly * by BM25 as having a similar relevance to shorter * documents that do not contain the query term at all. 
*/ public BM25(double k1, double b, double delta) { if (k1 < 0) { throw new IllegalArgumentException("Negative k1 = " + k1); } if (b < 0 || b > 1) { throw new IllegalArgumentException("Invalid b = " + b); } if (delta < 0) { throw new IllegalArgumentException("Invalid delta = " + delta); } this.k1 = k1; this.b = b; this.delta = delta; } /** * Returns the relevance score between a term and a document based on a corpus. * @param termFreq the term frequency in the text body. * @param docSize the text length. * @param avgDocSize the average text length in the corpus. * @param titleTermFreq the term frequency in the title. * @param titleSize the title length. * @param avgTitleSize the average title length in the corpus. * @param anchorTermFreq the term frequency in the anchor. * @param anchorSize the anchor length. * @param avgAnchorSize the average anchor length in the corpus. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; * @return the relevance score. */ public double score(int termFreq, int docSize, double avgDocSize, int titleTermFreq, int titleSize, double avgTitleSize, int anchorTermFreq, int anchorSize, double avgAnchorSize, long N, long n) { if (termFreq <= 0) return 0.0; // BM25F parameters final double kf = 4.9; // k1 in BM25F final double bTitle = 0.6; final double bBody = 0.5; final double bAnchor = 0.6; final double wTitle = 13.5; final double wBody = 1.0; final double wAnchor = 11.5; double tf = wBody * termFreq / (1.0 + bBody * (docSize / avgDocSize - 1.0)); if (titleTermFreq > 0) { tf += wTitle * titleTermFreq / (1.0 + bTitle * (titleSize / avgTitleSize - 1.0)); } if (anchorTermFreq > 0) { tf += wAnchor * anchorTermFreq / (1.0 + bAnchor * (anchorSize / avgAnchorSize - 1.0)); } tf = tf / (kf + tf); double idf = Math.log((N - n + 0.5) / (n + 0.5) + 1); return (tf + delta) * idf; } /** * Returns the relevance score between a term and a document based on a corpus. 
* @param freq the normalized term frequency of searching term in the document to rank. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; * @return the relevance score. */ public double score(double freq, long N, long n) { if (freq <= 0) return 0.0; double tf = (k1 + 1) * freq / (freq + k1); double idf = Math.log((N - n + 0.5) / (n + 0.5) + 1); return (tf + delta) * idf; } /** * Returns the relevance score between a term and a document based on a corpus. * @param freq the frequency of searching term in the document to rank. * @param docSize the size of document to rank. * @param avgDocSize the average size of documents in the corpus. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; * @return the relevance score. */ public double score(double freq, int docSize, double avgDocSize, long N, long n) { if (freq <= 0) return 0.0; double tf = freq * (k1 + 1) / (freq + k1 * (1 - b + b * docSize / avgDocSize)); double idf = Math.log((N - n + 0.5) / (n + 0.5) + 1); return (tf + delta) * idf; } @Override public double rank(Corpus corpus, TextTerms doc, String term, int tf, int n) { if (tf <= 0) return 0.0; int N = corpus.ndoc(); int docSize = doc.size(); int avgDocSize = corpus.avgDocSize(); return score(tf, docSize, avgDocSize, N, n); } @Override public double rank(Corpus corpus, TextTerms doc, String[] terms, int[] tf, int n) { int N = corpus.ndoc(); int docSize = doc.size(); int avgDocSize = corpus.avgDocSize(); double r = 0.0; for (int i = 0; i < terms.length; i++) { r += score(tf[i], docSize, avgDocSize, N, n); } return r; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy