org.ansj.dic.LearnTool Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ansj_seg Show documentation
Show all versions of ansj_seg Show documentation
Best Java Chinese word segmentation!
package org.ansj.dic;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import org.ansj.app.crf.SplitWord;
import org.ansj.domain.Nature;
import org.ansj.domain.NewWord;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.util.Graph;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.CollectionUtil;
/**
* 新词发现,这是个线程安全的.所以可以多个对象公用一个
*
* @author ansj
*
*/
public class LearnTool {
private SplitWord splitWord = null;
/**
* 是否开启学习机
*/
public boolean isAsianName = true;
public boolean isForeignName = true;
/**
* 告诉大家你学习了多少个词了
*/
public int count;
/**
* 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做.
*/
private final SmartForest sf = new SmartForest();
/**
* 公司名称学习.
*
* @param graph
*/
public void learn(Graph graph, SplitWord splitWord) {
this.splitWord = splitWord;
// 亚洲人名识别
if (isAsianName) {
findAsianPerson(graph);
}
// 外国人名识别
if (isForeignName) {
findForeignPerson(graph);
}
}
private void findAsianPerson(Graph graph) {
List newWords = new AsianPersonRecognition().getNewWords(graph.terms);
addListToTerm(newWords);
}
private void findForeignPerson(Graph graph) {
List newWords = new ForeignPersonRecognition().getNewWords(graph.terms);
addListToTerm(newWords);
}
// 批量将新词加入到词典中
private void addListToTerm(List newWords) {
if (newWords.size() == 0)
return;
for (NewWord newWord : newWords) {
addTerm(newWord);
}
}
/**
* 增加一个新词到树中
*
* @param newWord
*/
public void addTerm(NewWord newWord) {
NewWord temp = null;
SmartForest smartForest = null;
if ((smartForest = sf.getBranch(newWord.getName())) != null && smartForest.getParam() != null) {
temp = smartForest.getParam();
temp.update(newWord.getNature(), newWord.getAllFreq());
} else {
count++;
if(splitWord==null){
newWord.setScore(-1);
}else{
newWord.setScore(-splitWord.cohesion(newWord.getName()));
}
synchronized (sf) {
sf.add(newWord.getName(), newWord);
}
}
}
public SmartForest getForest() {
return this.sf;
}
/**
* 返回学习到的新词.
*
* @param num
* 返回数目.0为全部返回
* @return
*/
public List> getTopTree(int num) {
return getTopTree(num, null);
}
public List> getTopTree(int num, Nature nature) {
if (sf.branches == null) {
return null;
}
HashMap hm = new HashMap();
for (int i = 0; i < sf.branches.length; i++) {
valueResult(sf.branches[i], hm, nature);
}
List> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1);
if (num == 0) {
return sortMapByValue;
} else {
num = Math.min(num, sortMapByValue.size());
return sortMapByValue.subList(0, num);
}
}
private void valueResult(SmartForest smartForest, HashMap hm, Nature nature) {
if (smartForest == null || smartForest.branches == null) {
return;
}
for (int i = 0; i < smartForest.branches.length; i++) {
NewWord param = smartForest.branches[i].getParam();
if (smartForest.branches[i].getStatus() == 3) {
if (param.isActive() && (nature == null || param.getNature().equals(nature))) {
hm.put(param.getName(), param.getScore());
}
} else if (smartForest.branches[i].getStatus() == 2) {
if (param.isActive() && (nature == null || param.getNature().equals(nature))) {
hm.put(param.getName(), param.getScore());
}
valueResult(smartForest.branches[i], hm, nature);
} else {
valueResult(smartForest.branches[i], hm, nature);
}
}
}
/**
* 尝试激活,新词
*
* @param name
*/
public void active(String name) {
SmartForest branch = sf.getBranch(name);
if (branch != null && branch.getParam() != null) {
branch.getParam().setActive(true);
}
}
}