
package org.bhu.nlp.keyword;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.bhu.nlp.utils.MaxHeap;

/**
 * Keyword extraction based on the TextRank algorithm; intended for single documents
 */
public class TextRankKeyword {
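    // A minimal usage sketch (illustrative only): the caller is assumed to have
    // segmented the text into a List<String> with some external tokenizer.
    //
    //     List<String> words = java.util.Arrays.asList("graph", "based", "ranking",
    //             "ranking", "model", "for", "graph", "extraction");
    //     List<String> keywords = new TextRankKeyword().getKeywordList(words, 3);
    //     // keywords now holds the three highest-ranked words of the document
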
    /**
     * Number of keywords to extract
     */
    private int nKeyword = 10;

    /**
     * Damping factor, conventionally set to 0.85
     */
    private final static float d = 0.85f;

    /**
     * Maximum number of iterations
     */
    private final static int max_iter = 200;

    /**
     * Convergence threshold: iteration stops once the largest score change falls below this value
     */
    private final static float min_diff = 0.001f;

    /**
     * Extract keywords
     *
     * @param words document content, already segmented into a word list
     * @param size  how many keywords to extract
     * @return keyword list
     */
    public List<String> getKeywordList(List<String> words, int size) {
        TextRankKeyword textRankKeyword = new TextRankKeyword();
        textRankKeyword.nKeyword = size;
        return textRankKeyword.getKeyword(words);
    }

    /**
     * Extract keywords
     *
     * @param words word list of the text to process
     * @return keyword result
     */
    private List<String> getKeyword(List<String> words) {
        Set<Map.Entry<String, Float>> entrySet = getTermAndRank(words, nKeyword).entrySet();
        List<String> result = new ArrayList<String>(entrySet.size());
        for (Map.Entry<String, Float> entry : entrySet) {
            result.add(entry.getKey());
        }
        return result;
    }

    /**
     * Return the top-size words by score together with their rank values
     *
     * @param words word list of the text to process
     * @param size  number of keywords
     * @return map from keyword to its rank value
     */
    private Map<String, Float> getTermAndRank(List<String> words, Integer size) {
        Map<String, Float> map = getRank(words);
        Map<String, Float> result = new LinkedHashMap<String, Float>();
        // MaxHeap (org.bhu.nlp.utils) is expected to retain the `size` largest entries
        // under the given comparator and hand them back via toList().
        for (Map.Entry<String, Float> entry : new MaxHeap<Map.Entry<String, Float>>(size,
                new Comparator<Map.Entry<String, Float>>() {
                    @Override
                    public int compare(Map.Entry<String, Float> o1, Map.Entry<String, Float> o2) {
                        return o1.getValue().compareTo(o2.getValue());
                    }
                }).addAll(map.entrySet()).toList()) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }

    /**
     * Compute rank values from an already segmented word list
     *
     * @param wordList word list of the text to process
     * @return rank value of each word
     */
    private Map<String, Float> getRank(List<String> wordList) {
        // Build the co-occurrence graph: every word is linked (undirected) to each
        // other word that appears with it inside a sliding window of 5 tokens.
        Map<String, Set<String>> words = new TreeMap<String, Set<String>>();
        Queue<String> que = new LinkedList<String>();
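        // Worked example (hypothetical tokens): for the sequence [t1, t2, t3, t4, t5, t6],
        // once t6 arrives the window holds [t2, t3, t4, t5, t6], so undirected edges
        // t6-t2, t6-t3, t6-t4 and t6-t5 are recorded (plus the symmetric entries),
        // on top of the edges already added for the earlier windows.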
        for (String w : wordList) {
            if (!words.containsKey(w)) {
                words.put(w, new TreeSet<String>());
            }
            que.offer(w);
            if (que.size() > 5) {
                que.poll();
            }
            for (String w1 : que) {
                for (String w2 : que) {
                    if (w1.equals(w2)) {
                        continue;
                    }
                    words.get(w1).add(w2);
                    words.get(w2).add(w1);
                }
            }
        }
        Map<String, Float> score = new HashMap<String, Float>();
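        // Iterative TextRank update: for each word v,
        //     WS(v) = (1 - d) + d * sum over neighbours u of WS(u) / |adj(u)|
        // where |adj(u)| is the number of neighbours of u and missing scores count as 0.
        // Iteration runs for at most max_iter rounds and stops early once the largest
        // per-word change drops to min_diff or below.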
        for (int i = 0; i < max_iter; ++i) {
            Map<String, Float> m = new HashMap<String, Float>();
            float max_diff = 0;
            for (Map.Entry<String, Set<String>> entry : words.entrySet()) {
                String key = entry.getKey();
                Set<String> value = entry.getValue();
                m.put(key, 1 - d);
                for (String element : value) {
                    int size = words.get(element).size();
                    if (key.equals(element) || size == 0)
                        continue;
                    m.put(key, m.get(key) + d / size * (score.get(element) == null ? 0 : score.get(element)));
                }
                max_diff = Math.max(max_diff, Math.abs(m.get(key) - (score.get(key) == null ? 0 : score.get(key))));
            }
            score = m;
            if (max_diff <= min_diff)
                break;
        }
        return score;
    }
}