
org.bhu.nlp.keyword.TextRankKeyword Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Baturu Show documentation
Show all versions of Baturu Show documentation
This is an IO library for NLP
package org.bhu.nlp.keyword;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.bhu.nlp.utils.MaxHeap;
/**
 * Keyword extraction based on the TextRank algorithm; intended for a single document.
 *
 * <p>Words that occur close to each other in the token sequence are linked in an
 * undirected graph, and every word is then scored iteratively from the scores of
 * its neighbours.
 */
public class TextRankKeyword {
    /** Damping factor of the TextRank score update. */
    private static final float DAMPING_FACTOR = 0.85f;

    /** Maximum number of score-update iterations. */
    private static final int MAX_ITERATIONS = 200;

    /** Iteration stops once no score changes by more than this amount. */
    private static final float MIN_DIFF = 0.001f;

    /** Number of most recent tokens (current token included) that count as co-occurring. */
    private static final int WINDOW_SIZE = 5;

    /**
     * Extracts keywords from a tokenized document.
     *
     * @param words the document's tokens, in document order
     * @param size  how many keywords to extract
     * @return at most {@code size} keywords, highest score first; an empty list when
     *         {@code words} is {@code null} or empty, or when {@code size} is not positive
     * @throws NullPointerException if {@code words} contains a {@code null} token
     */
    public List<String> getKeywordList(List<String> words, int size) {
        if (words == null || words.isEmpty() || size <= 0) {
            return new ArrayList<String>();
        }
        return new ArrayList<String>(getTermAndRank(words, size).keySet());
    }

    /**
     * Returns the {@code size} best-scoring words together with their scores.
     *
     * @param words the document's tokens
     * @param size  number of entries to keep; must be positive
     * @return an insertion-ordered map from word to score, highest score first
     */
    private Map<String, Float> getTermAndRank(List<String> words, int size) {
        // NOTE(review): this selection was previously delegated to the project's MaxHeap
        // helper, whose source is not visible here; it is presumed to yield the
        // highest-scoring entries first - confirm if callers rely on a specific tie order.
        // Ties are now broken by the word's natural order so the output is deterministic.
        Set<Map.Entry<String, Float>> ordered = new TreeSet<Map.Entry<String, Float>>(
                new Comparator<Map.Entry<String, Float>>() {
                    @Override
                    public int compare(Map.Entry<String, Float> o1, Map.Entry<String, Float> o2) {
                        int byScore = Float.compare(o2.getValue(), o1.getValue());
                        return byScore != 0 ? byScore : o1.getKey().compareTo(o2.getKey());
                    }
                });
        ordered.addAll(getRank(words).entrySet());

        Map<String, Float> result = new LinkedHashMap<String, Float>();
        for (Map.Entry<String, Float> entry : ordered) {
            if (result.size() >= size) {
                break;
            }
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }

    /**
     * Computes a score for every distinct word of an already tokenized document.
     *
     * @param wordList the document's tokens
     * @return map from each distinct word to its score
     */
    private Map<String, Float> getRank(List<String> wordList) {
        Map<String, Set<String>> graph = buildCooccurrenceGraph(wordList);
        Map<String, Float> score = new HashMap<String, Float>();
        for (int i = 0; i < MAX_ITERATIONS; ++i) {
            Map<String, Float> next = new HashMap<String, Float>();
            float maxDiff = 0;
            for (Map.Entry<String, Set<String>> entry : graph.entrySet()) {
                String word = entry.getKey();
                float rank = 1 - DAMPING_FACTOR;
                for (String neighbour : entry.getValue()) {
                    // Edges are always added in both directions and never to the word
                    // itself, so a neighbour's degree is at least 1 here.
                    int degree = graph.get(neighbour).size();
                    Float previous = score.get(neighbour);
                    rank += DAMPING_FACTOR / degree * (previous == null ? 0 : previous);
                }
                next.put(word, rank);
                Float old = score.get(word);
                maxDiff = Math.max(maxDiff, Math.abs(rank - (old == null ? 0 : old)));
            }
            score = next;
            if (maxDiff <= MIN_DIFF) {
                break;
            }
        }
        return score;
    }

    /**
     * Links every token with the other tokens inside its sliding window.
     *
     * @param wordList the document's tokens
     * @return map from each distinct word to the set of distinct words it co-occurs with
     */
    private Map<String, Set<String>> buildCooccurrenceGraph(List<String> wordList) {
        Map<String, Set<String>> graph = new TreeMap<String, Set<String>>();
        Queue<String> window = new LinkedList<String>();
        for (String word : wordList) {
            Set<String> neighbours = graph.get(word);
            if (neighbours == null) {
                neighbours = new TreeSet<String>();
                graph.put(word, neighbours);
            }
            window.offer(word);
            if (window.size() > WINDOW_SIZE) {
                window.poll();
            }
            // Every pair that does not involve the new word was already linked while an
            // earlier window was processed, so only the new word's pairs need adding.
            for (String recent : window) {
                if (!recent.equals(word)) {
                    neighbours.add(recent);
                    graph.get(recent).add(word);
                }
            }
        }
        return graph;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy