All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.ansj.dic.LearnTool Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
package org.ansj.dic;

import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;

import org.ansj.app.crf.SplitWord;
import org.ansj.domain.Nature;
import org.ansj.domain.NewWord;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.util.Graph;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.CollectionUtil;

/**
 * 新词发现,这是个线程安全的.所以可以多个对象公用一个
 * 
 * @author ansj
 * 
 */
public class LearnTool {

	private SplitWord splitWord = null;

	/**
	 * 是否开启学习机
	 */
	public boolean isAsianName = true;

	public boolean isForeignName = true;

	/**
	 * 告诉大家你学习了多少个词了
	 */
	public int count;

	/**
	 * 新词发现的结果集.可以序列化到硬盘.然后可以当做训练集来做.
	 */
	private final SmartForest sf = new SmartForest();

	/**
	 * 公司名称学习.
	 * 
	 * @param graph
	 */
	public void learn(Graph graph, SplitWord splitWord) {

		this.splitWord = splitWord;

		// 亚洲人名识别
		if (isAsianName) {
			findAsianPerson(graph);
		}

		// 外国人名识别
		if (isForeignName) {
			findForeignPerson(graph);
		}

	}

	private void findAsianPerson(Graph graph) {
		List newWords = new AsianPersonRecognition().getNewWords(graph.terms);
		addListToTerm(newWords);
	}

	private void findForeignPerson(Graph graph) {
		List newWords = new ForeignPersonRecognition().getNewWords(graph.terms);
		addListToTerm(newWords);
	}

	// 批量将新词加入到词典中
	private void addListToTerm(List newWords) {
		if (newWords.size() == 0)
			return;
		for (NewWord newWord : newWords) {
			addTerm(newWord);
		}
	}

	/**
	 * 增加一个新词到树中
	 * 
	 * @param newWord
	 */
	public void addTerm(NewWord newWord) {
		NewWord temp = null;
		SmartForest smartForest = null;
		if ((smartForest = sf.getBranch(newWord.getName())) != null && smartForest.getParam() != null) {
			temp = smartForest.getParam();
			temp.update(newWord.getNature(), newWord.getAllFreq());
		} else {
			count++;
			if(splitWord==null){
				newWord.setScore(-1);
			}else{
				newWord.setScore(-splitWord.cohesion(newWord.getName()));	
			}
			
			synchronized (sf) {
				sf.add(newWord.getName(), newWord);
			}
		}
	}

	public SmartForest getForest() {
		return this.sf;
	}

	/**
	 * 返回学习到的新词.
	 * 
	 * @param num
	 *            返回数目.0为全部返回
	 * @return
	 */
	public List> getTopTree(int num) {
		return getTopTree(num, null);
	}

	public List> getTopTree(int num, Nature nature) {
		if (sf.branches == null) {
			return null;
		}
		HashMap hm = new HashMap();
		for (int i = 0; i < sf.branches.length; i++) {
			valueResult(sf.branches[i], hm, nature);
		}
		List> sortMapByValue = CollectionUtil.sortMapByValue(hm, -1);
		if (num == 0) {
			return sortMapByValue;
		} else {
			num = Math.min(num, sortMapByValue.size());
			return sortMapByValue.subList(0, num);
		}
	}

	private void valueResult(SmartForest smartForest, HashMap hm, Nature nature) {
		
		if (smartForest == null || smartForest.branches == null) {
			return;
		}
		for (int i = 0; i < smartForest.branches.length; i++) {
			NewWord param = smartForest.branches[i].getParam();
			if (smartForest.branches[i].getStatus() == 3) {
				if (param.isActive() && (nature == null || param.getNature().equals(nature))) {
					hm.put(param.getName(), param.getScore());
				}
			} else if (smartForest.branches[i].getStatus() == 2) {
				if (param.isActive() && (nature == null || param.getNature().equals(nature))) {
					hm.put(param.getName(), param.getScore());
				}
				valueResult(smartForest.branches[i], hm, nature);
			} else {
				valueResult(smartForest.branches[i], hm, nature);
			}
		}
	}

	/**
	 * 尝试激活,新词
	 * 
	 * @param name
	 */
	public void active(String name) {
		SmartForest branch = sf.getBranch(name);
		if (branch != null && branch.getParam() != null) {
			branch.getParam().setActive(true);
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy