All downloads are free. Search and download functionality uses the official Maven repository.

org.ansj.recognition.impl.NatureRecognition Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
package org.ansj.recognition.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.ansj.domain.AnsjItem;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.DATDictionary;
import org.ansj.library.DicLibrary;
import org.ansj.recognition.Recognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MathUtil;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

/**
 * Part-of-speech ("nature") tagging utility.
 *
 * <p>
 * Given the terms of a {@link Result}, builds a lattice with one row of
 * candidate natures per term, scores every transition between adjacent rows
 * and writes the nature found on the best-scoring path back into each term.
 *
 * <p>
 * An instance keeps the lattice of the current run in its fields, so a single
 * instance must not be used by several threads at the same time.
 *
 * @author ansj
 */
public class NatureRecognition implements Recognition {

	private static final long serialVersionUID = 1L;

	private static final Log logger = LogFactory.getLog();

	/**
	 * Suffix dictionary used by {@link #guessNature(String)}. Words are stored
	 * reversed so that they can be matched while walking a word from its last
	 * character to its first.
	 */
	private static final Forest SUFFIX_FOREST = new Forest();

	/** User dictionaries consulted by {@link #getParams(String)}, in order. */
	private Forest[] forests = null;

	/** Start sentinel of the lattice. */
	private NatureTerm root = new NatureTerm(TermNature.BEGIN);

	/** End sentinel row of the lattice; replaced on every recognition run. */
	private NatureTerm[] end = { new NatureTerm(TermNature.END) };

	/** Terms of the run currently being processed. */
	private List<Term> terms = null;

	/** One row of candidate natures per term, followed by the {@link #end} row. */
	private NatureTerm[][] natureTermTable = null;

	static {
		try (BufferedReader reader = MyStaticValue.getNatureClassSuffix()) {
			String line = null;
			while ((line = reader.readLine()) != null) {
				String[] split = line.split("\t");
				if (split.length < 2) {
					// A line without "word<TAB>nature" cannot be indexed; skipping it keeps
					// a single bad line from aborting class initialization.
					logger.warn("skip malformed nature suffix line: " + line);
					continue;
				}
				String word = split[0];
				if (word.length() > 1) {
					word = new StringBuilder(word).reverse().toString();
				}
				SUFFIX_FOREST.add(word, new String[] { split[1] });
			}
		} catch (IOException e) {
			logger.warn("IO异常", e);
		}
	}

	/**
	 * Creates a recognizer that looks words up in the forest returned by
	 * {@code DicLibrary.get()}.
	 */
	public NatureRecognition() {
		forests = new Forest[] { DicLibrary.get() };
	}

	/**
	 * Creates a recognizer that looks words up in the given forests.
	 *
	 * @param forests
	 *            dictionaries to consult, in order; {@code null} entries are
	 *            skipped
	 */
	public NatureRecognition(Forest... forests) {
		this.forests = forests;
	}

	/**
	 * Finds the best nature for every term of {@code result}. The terms are
	 * updated in place, so nothing is returned.
	 *
	 * @param result
	 *            the result whose terms are to be tagged
	 */
	public void recognition(Result result) {
		this.terms = result.getTerms();

		// The END node accumulates score/from while walking. A fresh node per run
		// prevents a previous run's back-pointer from surviving when the new scores
		// happen to be lower than the old one.
		this.end = new NatureTerm[] { new NatureTerm(TermNature.END) };

		int length = terms.size();
		natureTermTable = new NatureTerm[length + 1][];
		natureTermTable[length] = end;

		for (int i = 0; i < length; i++) {
			natureTermTable[i] = getNatureTermArr(terms.get(i).termNatures().termNatures);
		}
		walk();
	}

	/**
	 * Tags a list of words, numbering their offsets from 0.
	 *
	 * @param words
	 *            the words to tag, in sentence order
	 * @return one tagged term per word
	 */
	public List<Term> recognition(List<String> words) {
		return recognition(words, 0);
	}

	/**
	 * Tags a list of words.
	 *
	 * @param words
	 *            the words to tag, in sentence order
	 * @param offe
	 *            offset assigned to the first word; each following word is offset
	 *            by the accumulated length of the words before it
	 * @return one tagged term per word
	 */
	public List<Term> recognition(List<String> words, int offe) {
		List<Term> terms = new ArrayList<Term>(words.size());
		int tempOffe = 0;
		for (String word : words) {
			TermNatures tn = getTermNatures(word);

			terms.add(new Term(word, offe + tempOffe, tn));
			tempOffe += word.length();
		}
		// The lattice walk runs on a separate instance so that this instance's
		// lattice fields are left untouched.
		new NatureRecognition().recognition(new Result(terms));
		return terms;
	}

	/**
	 * Returns the candidate natures of a single word.
	 *
	 * <p>
	 * Sources are tried in this order: the system dictionary
	 * ({@code DATDictionary}), the user forests ({@link #getParams(String)}),
	 * the English check, the number check. When none applies
	 * {@code TermNatures.NULL} is returned.
	 *
	 * @param word
	 *            the word to look up
	 * @return the candidate natures of {@code word}
	 */
	public TermNatures getTermNatures(String word) {
		// System dictionary first, then the user-defined dictionaries.
		AnsjItem ansjItem = DATDictionary.getItem(word);
		if (ansjItem != AnsjItem.NULL) {
			return ansjItem.termNatures;
		}

		String[] params = getParams(word);
		if (params != null) {
			return new TermNatures(new TermNature(params[0], 1));
		}
		if (WordAlert.isEnglish(word)) {
			return TermNatures.EN;
		}
		if (WordAlert.isNumber(word)) {
			return TermNatures.M;
		}
		return TermNatures.NULL;
	}

	/**
	 * Looks a word up in the user forests and returns the parameters stored
	 * with it.
	 *
	 * <p>
	 * Forests are consulted in order and the first one that holds parameters
	 * for the word wins; a forest that does not contain the word no longer
	 * ends the search.
	 *
	 * @param word
	 *            the word to look up
	 * @return the parameters of the first match, or {@code null} when no forest
	 *         contains the word
	 */
	public String[] getParams(String word) {
		if (forests == null) {
			return null;
		}
		for (Forest forest : forests) {
			if (forest == null) {
				continue;
			}
			String[] params = findParams(forest, word);
			if (params != null) {
				return params;
			}
		}
		return null;
	}

	/**
	 * Walks {@code forest} along the characters of {@code word}.
	 *
	 * @return the parameters stored at the node reached, or {@code null} when the
	 *         path does not exist or the node's status is not greater than 1
	 */
	private static String[] findParams(Forest forest, String word) {
		SmartForest<String[]> branch = forest;
		for (int i = 0; i < word.length(); i++) {
			branch = branch.get(word.charAt(i));
			if (branch == null) {
				return null;
			}
		}
		// NOTE(review): a status above 1 presumably marks a complete word - confirm
		// against SmartForest.
		return branch.getStatus() > 1 ? branch.getParam() : null;
	}

	/**
	 * Guesses the nature of a word by rule.
	 *
	 * <p>
	 * The word is matched from its last character backwards against the suffix
	 * dictionary. A suffix tagged {@code nt} (accepted only when more than one
	 * character matched or the word is longer than three characters) yields
	 * {@code TermNatures.NT}; a suffix tagged {@code ns} yields
	 * {@code TermNatures.NS}. Otherwise a word shorter than five characters is
	 * segmented with {@code ToAnalysis} and yields {@code TermNatures.NR} if
	 * any resulting term is tagged {@code nr}, while a longer word is checked
	 * with {@code ForeignPersonRecognition.isFName}. Everything else yields
	 * {@code TermNatures.NW}.
	 *
	 * @param word
	 *            the word whose nature is to be guessed
	 * @return the guessed natures, never {@code null}
	 */
	public static TermNatures guessNature(String word) {
		String nature = null;
		SmartForest<String[]> smartForest = SUFFIX_FOREST;
		int len = 0;
		for (int i = word.length() - 1; i >= 0; i--) {
			smartForest = smartForest.get(word.charAt(i));
			if (smartForest == null) {
				break;
			}
			len++;
			if (smartForest.getStatus() == 2) {
				// Remember this match but keep going: a longer suffix may still match.
				nature = smartForest.getParam()[0];
			} else if (smartForest.getStatus() == 3) {
				nature = smartForest.getParam()[0];
				break;
			}
		}

		if ("nt".equals(nature) && (len > 1 || word.length() > 3)) {
			return TermNatures.NT;
		} else if ("ns".equals(nature)) {
			return TermNatures.NS;
		} else if (word.length() < 5) {
			Result parse = ToAnalysis.parse(word);
			for (Term term : parse.getTerms()) {
				if ("nr".equals(term.getNatureStr())) {
					return TermNatures.NR;
				}
			}
		} else if (ForeignPersonRecognition.isFName(word)) {
			return TermNatures.NRF;
		}

		return TermNatures.NW;
	}

	/**
	 * Scores the lattice row by row, starting from the start sentinel, and then
	 * writes the natures of the best path back into the terms.
	 */
	public void walk() {
		int length = natureTermTable.length - 1;
		setScore(root, natureTermTable[0]);
		for (int i = 0; i < length; i++) {
			for (int j = 0; j < natureTermTable[i].length; j++) {
				setScore(natureTermTable[i][j], natureTermTable[i + 1]);
			}
		}
		optimalRoot();
	}

	/**
	 * Offers {@code natureTerm} as predecessor to every node of the next row.
	 */
	private void setScore(NatureTerm natureTerm, NatureTerm[] natureTerms) {
		for (NatureTerm next : natureTerms) {
			next.setScore(natureTerm);
		}
	}

	/**
	 * Wraps every candidate nature of a term in a lattice node.
	 */
	private NatureTerm[] getNatureTermArr(TermNature[] termNatures) {
		NatureTerm[] natureTerms = new NatureTerm[termNatures.length];
		for (int i = 0; i < natureTerms.length; i++) {
			natureTerms[i] = new NatureTerm(termNatures[i]);
		}
		return natureTerms;
	}

	/**
	 * Follows the back-pointers from the end sentinel and assigns the nature of
	 * each node on that path to the corresponding term.
	 */
	private void optimalRoot() {
		NatureTerm to = end[0];
		NatureTerm from = null;
		int index = natureTermTable.length - 1;
		// index > 0 stops before the start sentinel, which has no term of its own.
		while ((from = to.from) != null && index > 0) {
			terms.get(--index).setNature(from.termNature.nature);
			to = from;
		}
	}

	/**
	 * A lattice node: one candidate nature of one term.
	 *
	 * @author ansj
	 */
	public class NatureTerm {

		/** The candidate nature this node stands for. */
		public TermNature termNature;

		/** Best score found so far for reaching this node. */
		public double score = 0;

		/** Frequency of the nature plus one. */
		public double selfScore;

		/** Predecessor that produced {@link #score}; {@code null} until scored. */
		public NatureTerm from;

		protected NatureTerm(TermNature termNature) {
			this.termNature = termNature;
			selfScore = termNature.frequency + 1;
		}

		/**
		 * Offers {@code natureTerm} as predecessor of this node. It is accepted
		 * when this node has no predecessor yet or when the score computed by
		 * {@code MathUtil.compuNatureFreq} is higher than the current one.
		 *
		 * @param natureTerm
		 *            the candidate predecessor
		 */
		public void setScore(NatureTerm natureTerm) {
			double tempScore = MathUtil.compuNatureFreq(natureTerm, this);
			if (from == null || score < tempScore) {
				this.score = tempScore;
				this.from = natureTerm;
			}
		}

		@Override
		public String toString() {
			return termNature.nature.natureStr + "/" + selfScore;
		}

	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy