com.hankcs.hanlp.summary.BM25 Maven / Gradle / Ivy
/*
*
* He Han
* [email protected]
* 2014/8/22 14:17
*
*
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
*
*/
package com.hankcs.hanlp.summary;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
 * BM25 relevance scoring over the sentences of a single document.
 * <p>
 * The document is supplied as a list of sentences, each sentence being a list of words.
 * All statistics (term frequencies, document frequencies, IDF) are computed once in the
 * constructor; {@link #sim(List, int)} and {@link #simAll(List)} then score a query
 * sentence against the stored sentences.
 *
 * @author hankcs
 */
public class BM25
{
    /**
     * Number of sentences in the document.
     */
    int D;

    /**
     * Average sentence length, measured in words. Stays 0 for an empty document.
     */
    double avgdl;

    /**
     * The document, split into the form [sentence[word]].
     */
    List<List<String>> docs;

    /**
     * For each sentence (by index), every word in it mapped to its frequency in that sentence.
     */
    Map<String, Integer>[] f;

    /**
     * Every word of the document mapped to the number of sentences it occurs in.
     */
    Map<String, Integer> df;

    /**
     * Inverse document frequency of every word of the document.
     */
    Map<String, Double> idf;

    /**
     * Tuning factor: term-frequency saturation.
     */
    final static float k1 = 1.5f;

    /**
     * Tuning factor: strength of the sentence-length normalisation.
     */
    final static float b = 0.75f;

    /**
     * Builds the scorer and precomputes all statistics for the given document.
     *
     * @param docs the document as a list of sentences, each a list of words; the list is
     *             stored by reference, not copied
     */
    @SuppressWarnings("unchecked") // generic array creation; only Map<String, Integer> values are ever stored
    public BM25(List<List<String>> docs)
    {
        this.docs = docs;
        D = docs.size();
        for (List<String> sentence : docs)
        {
            avgdl += sentence.size();
        }
        // Guard against an empty document: 0.0 / 0 would leave avgdl as NaN.
        if (D > 0)
        {
            avgdl /= D;
        }
        f = new Map[D];
        df = new TreeMap<String, Integer>();
        idf = new TreeMap<String, Double>();
        init();
    }

    /**
     * Initialises all derived statistics (per-sentence term frequency, document
     * frequency and IDF). Called once from the constructor.
     */
    private void init()
    {
        int index = 0;
        for (List<String> sentence : docs)
        {
            // Term frequency within this sentence.
            Map<String, Integer> tf = new TreeMap<String, Integer>();
            for (String word : sentence)
            {
                Integer freq = tf.get(word);
                tf.put(word, (freq == null ? 0 : freq) + 1);
            }
            f[index] = tf;

            // Each distinct word of the sentence counts once towards its document frequency.
            for (String word : tf.keySet())
            {
                Integer freq = df.get(word);
                df.put(word, (freq == null ? 0 : freq) + 1);
            }
            ++index;
        }

        // idf = ln((D - n + 0.5) / (n + 0.5)); this is negative for a word that occurs
        // in more than half of the sentences.
        for (Map.Entry<String, Integer> entry : df.entrySet())
        {
            Integer freq = entry.getValue();
            idf.put(entry.getKey(), Math.log(D - freq + 0.5) - Math.log(freq + 0.5));
        }
    }

    /**
     * Scores a query sentence against one stored sentence.
     * <p>
     * For every word occurrence in the query that also appears in the stored sentence,
     * adds {@code idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avgdl))}, where
     * {@code tf} is the word's frequency in the stored sentence and {@code len} is the
     * stored sentence's length. Words absent from the stored sentence contribute nothing;
     * a word repeated in the query is counted once per occurrence.
     *
     * @param sentence the query sentence as a list of words
     * @param index    index of the stored sentence to score against
     * @return the accumulated BM25 score
     */
    public double sim(List<String> sentence, int index)
    {
        double score = 0;
        for (String word : sentence)
        {
            // Single lookup: a missing word yields null, so no separate containsKey is needed.
            Integer wf = f[index].get(word);
            if (wf == null) continue;
            int d = docs.get(index).size();
            score += (idf.get(word) * wf * (k1 + 1)
                    / (wf + k1 * (1 - b + b * d
                    / avgdl)));
        }
        return score;
    }

    /**
     * Scores a query sentence against every stored sentence.
     *
     * @param sentence the query sentence as a list of words
     * @return an array of length {@code D} whose i-th element is {@code sim(sentence, i)}
     */
    public double[] simAll(List<String> sentence)
    {
        double[] scores = new double[D];
        for (int i = 0; i < D; ++i)
        {
            scores[i] = sim(sentence, i);
        }
        return scores;
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy