package com.hankcs.hanlp.mining.word;
import com.hankcs.hanlp.algorithm.MaxHeap;
import com.hankcs.hanlp.utility.LexiconUtility;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Pattern;
/**
 * New word discovery tool.
 * The implementation references: https://github.com/Moonshile/ChineseWordSegmentation
 *
 * @author hankcs
 */
public class NewWordDiscover
{
    private int max_word_len;
    private float min_freq;
    private float min_entropy;
    private float min_aggregation;
    private boolean filter;
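    /**
     * Constructs a discoverer with default thresholds
     * (max_word_len=4, min_freq=0.00005, min_entropy=0.4, min_aggregation=1.2, no lexicon filtering).
     */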
    public NewWordDiscover()
    {
        this(4, 0.00005f, .4f, 1.2f, false);
    }
    /**
     * Constructs a new word discovery tool.
     *
     * @param max_word_len    maximum word length
     * @param min_freq        minimum word frequency
     * @param min_entropy     minimum word entropy
     * @param min_aggregation minimum word aggregation (mutual information)
     * @param filter          whether to filter out words already present in HanLP's lexicon
     */
    public NewWordDiscover(int max_word_len, float min_freq, float min_entropy, float min_aggregation, boolean filter)
    {
        this.max_word_len = max_word_len;
        this.min_freq = min_freq;
        this.min_entropy = min_entropy;
        this.min_aggregation = min_aggregation;
        this.filter = filter;
    }
    /**
     * Extracts words.
     *
     * @param reader a large text
     * @param size   the number of words to extract
     * @return a list of words
     */
    public List<WordInfo> discover(BufferedReader reader, int size) throws IOException
    {
        String doc;
        Map<String, WordInfo> word_cands = new TreeMap<String, WordInfo>();
        int totalLength = 0;
        Pattern delimiter = Pattern.compile("[\\s\\d,.<>/?:;'\"\\[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+");
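        // Candidate generation: delimiters are replaced with a NUL placeholder, then every
        // substring of length <= max_word_len is treated as a candidate word. For each
        // occurrence, the characters immediately to its left and right are recorded; these
        // neighbor counts feed the entropy computation below.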
        while ((doc = reader.readLine()) != null)
        {
            doc = delimiter.matcher(doc).replaceAll("\0");
            int docLength = doc.length();
            for (int i = 0; i < docLength; ++i)
            {
                int end = Math.min(i + 1 + max_word_len, docLength + 1);
                for (int j = i + 1; j < end; ++j)
                {
                    String word = doc.substring(i, j);
                    WordInfo info = word_cands.get(word);
                    if (info == null)
                    {
                        info = new WordInfo(word);
                        word_cands.put(word, info);
                    }
                    info.update(i == 0 ? '\0' : doc.charAt(i - 1), j < docLength ? doc.charAt(j) : '\0');
                }
            }
            totalLength += docLength;
        }
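        // First pass: turn raw counts into a relative frequency p and compute the entropy
        // of the characters observed adjacent to each candidate; varied neighbors on both
        // sides suggest the candidate is a free-standing word rather than a fragment.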
        for (WordInfo info : word_cands.values())
        {
            info.computeProbabilityEntropy(totalLength);
        }
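        // Second pass: compute aggregation (internal cohesion). Roughly, this is the
        // minimum over all binary splits of p(word) / (p(prefix) * p(suffix)); a tightly
        // bound word scores far higher than a chance co-occurrence of its parts.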
        for (WordInfo info : word_cands.values())
        {
            info.computeAggregation(word_cands);
        }
        // Filter out candidates that fail any threshold
        ArrayList<WordInfo> wordInfoList = new ArrayList<WordInfo>(word_cands.values());
        ListIterator<WordInfo> listIterator = wordInfoList.listIterator();
        while (listIterator.hasNext())
        {
            WordInfo info = listIterator.next();
            if (info.text.trim().length() < 2 || info.p < min_freq || info.entropy < min_entropy || info.aggregation < min_aggregation
                    || (filter && LexiconUtility.getFrequency(info.text) > 0)
            )
            {
                listIterator.remove();
            }
        }
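        // MaxHeap is HanLP's top-K helper: it retains only the `size` largest elements
        // under the comparator, so the surviving candidates never need a full sort.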
        // Sort by frequency
        MaxHeap<WordInfo> topN = new MaxHeap<WordInfo>(size, new Comparator<WordInfo>()
        {
            @Override
            public int compare(WordInfo o1, WordInfo o2)
            {
                return Float.compare(o1.p, o2.p);
            }
        });
        topN.addAll(wordInfoList);
        return topN.toList();
    }
    /**
     * Extracts words.
     *
     * @param doc  a large text
     * @param size the number of words to extract
     * @return a list of words
     */
    public List<WordInfo> discover(String doc, int size)
    {
        try
        {
            return discover(new BufferedReader(new StringReader(doc)), size);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }
}
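
/*
 * A minimal usage sketch (not part of the original file). The corpus string and the
 * size argument are illustrative assumptions; in practice you would feed a large
 * raw-text corpus, e.g. via a BufferedReader over a file. Living in the same
 * package, the demo can read WordInfo.text directly.
 */
class NewWordDiscoverDemo
{
    public static void main(String[] args)
    {
        // filter=true drops candidates already present in HanLP's lexicon
        NewWordDiscover discover = new NewWordDiscover(4, 0.00005f, .4f, 1.2f, true);
        String corpus = "...";  // replace with real text; short input yields nothing useful
        for (WordInfo info : discover.discover(corpus, 100))
        {
            System.out.println(info.text);  // the candidate word itself
        }
    }
}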