All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.nlp.relevance.BM25 Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 ******************************************************************************/

package smile.nlp.relevance;

import smile.nlp.Corpus;
import smile.nlp.TextTerms;

/**
 * The BM25 weighting scheme, often called Okapi weighting, after the system in
 * which it was first implemented, was developed as a way of building a
 * probabilistic model sensitive to term frequency and document length while
 * not introducing too many additional parameters into the model. It is not
 * a single function, but actually a whole family of scoring functions, with
 * slightly different components and parameters.
 * 

* At the extreme values of the coefficient b, BM25 turns into ranking functions * known as BM11 (for b = 1) and BM15 (for b = 0). BM25F is a modification of * BM25 in which the document is considered to be composed from several fields * (such as headlines, main text, anchor text) with possibly different degrees * of importance. *

* BM25 and its newer variants represent state-of-the-art TF-IDF-like retrieval * functions used in document retrieval, such as web search. * * @see TFIDF * * @author Haifeng Li */ public class BM25 implements RelevanceRanker { /** * Free parameter, usually chosen as k1 = 2.0. */ private double k1; /** * Free parameter, usually chosen as b = 0.75. */ private double b; /** * The control parameter in BM25+. The standard BM25 in which the * component of term frequency normalization by document length * is not properly lower-bounded; as a result of this deficiency, * long documents which do match the query term can often be scored * unfairly by BM25 as having a similar relevance to shorter * documents that do not contain the query term at all. */ private double delta; /** * BM25F parameters */ private double kf = 4.9; // k1 in BM25F private double bTitle = 0.6; private double bBody = 0.5; private double bAnchor = 0.6; private double wTitle = 13.5; private double wBody = 1.0; private double wAnchor = 11.5; /** * Default constructor with k1 = 1.2, b = 0.75, delta = 1.0. */ public BM25() { this(1.2, 0.75, 1.0); } /** * Constructor. * * @param k1 is a positive tuning parameter that calibrates * the document term frequency scaling. A k1 value of 0 corresponds to a * binary model (no term frequency), and a large value corresponds to using * raw term frequency. * @param b b is another tuning parameter (0 ≤ b ≤ 1) which determines * the scaling by document length: b = 1 corresponds to fully scaling the * term weight by the document length, while b = 0 corresponds to no length * normalization. 
*/ public BM25(double k1, double b, double delta) { if (k1 < 0) { throw new IllegalArgumentException("Negative k1 = " + k1); } if (b < 0 || b > 1) { throw new IllegalArgumentException("Invalid b = " + b); } if (delta < 0) { throw new IllegalArgumentException("Invalid delta = " + delta); } this.k1 = k1; this.b = b; this.delta = delta; } /** * Returns a relevance score between a term and a document based on a corpus. * @param termFreq normalized term frequency of searching term in the document to rank. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; */ public double score(int termFreq, int docLen, double avgDocLen, int titleTermFreq, int titleLen, double avgTitleLen, int anchorTermFreq, int anchorLen, double avgAnchorLen, long N, long n) { if (termFreq <= 0) return 0.0; double tf = wBody * termFreq / (1.0 + bBody * (docLen / avgDocLen - 1.0)); if (titleTermFreq > 0) { tf += wTitle * titleTermFreq / (1.0 + bTitle * (titleLen / avgTitleLen - 1.0)); } if (anchorTermFreq > 0) { tf += wAnchor * anchorTermFreq / (1.0 + bAnchor * (anchorLen / avgAnchorLen - 1.0)); } tf = tf / (kf + tf); double idf = Math.log((N - n + 0.5) / (n + 0.5)); return (tf + delta) * idf; } /** * Returns a relevance score between a term and a document based on a corpus. * @param freq normalized term frequency of searching term in the document to rank. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; */ public double score(double freq, long N, long n) { if (freq <= 0) return 0.0; double tf = (k1 + 1) * freq / (freq + k1); double idf = Math.log((N - n + 0.5) / (n + 0.5)); return (tf + delta) * idf; } /** * Returns a relevance score between a term and a document based on a corpus. * @param freq the frequency of searching term in the document to rank. * @param docSize the size of document to rank. 
* @param avgDocSize the average size of documents in the corpus. * @param N the number of documents in the corpus. * @param n the number of documents containing the given term in the corpus; */ public double score(double freq, int docSize, double avgDocSize, long N, long n) { if (freq <= 0) return 0.0; double tf = freq * (k1 + 1) / (freq + k1 * (1 - b + b * docSize / avgDocSize)); double idf = Math.log((N - n + 0.5) / (n + 0.5)); return (tf + delta) * idf; } @Override public double rank(Corpus corpus, TextTerms doc, String term, int tf, int n) { if (tf <= 0) return 0.0; int N = corpus.getNumDocuments(); int docSize = doc.size(); int avgDocSize = corpus.getAverageDocumentSize(); return score(tf, docSize, avgDocSize, N, n); } @Override public double rank(Corpus corpus, TextTerms doc, String[] terms, int[] tf, int n) { int N = corpus.getNumDocuments(); int docSize = doc.size(); int avgDocSize = corpus.getAverageDocumentSize(); double r = 0.0; for (int i = 0; i < terms.length; i++) { r += score(tf[i], docSize, avgDocSize, N, n); } return r; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy