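/*
 * Source listing of org.ansj.recognition.impl.NatureRecognition, from the ansj_seg artifact
 * (Maven / Gradle / Ivy). Artifact description: "best java chinese word seg !"
 *
 * The repository-browser navigation text that preceded this listing ("Go to download",
 * "Show more of this group", "Show more artifacts with this name", "Show all versions of ansj_seg",
 * "Show documentation") is not part of the source code.
 */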
package org.ansj.recognition.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.ansj.domain.AnsjItem;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.DATDictionary;
import org.ansj.library.DicLibrary;
import org.ansj.recognition.Recognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MathUtil;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
 * Part-of-speech ("nature") tagging utility.
 *
 * <p>For every term of a {@link Result} the candidate natures are laid out as one column of a
 * table; each candidate remembers its best-scoring predecessor in the previous column, and the
 * chain of predecessors starting from the END sentinel is written back onto the terms.
 *
 * <p>An instance keeps per-call working state in fields ({@code terms}, {@code natureTermTable},
 * the sentinels), so a single instance must not be used by several threads at the same time.
 *
 * @author ansj
 */
public class NatureRecognition implements Recognition {

    private static final long serialVersionUID = 1L;

    private static final Log logger = LogFactory.getLog();

    /**
     * Suffix rules used by {@link #guessNature(String)}. Keys longer than one character are stored
     * reversed so that a word can be matched from its last character backwards.
     */
    private static final Forest SUFFIX_FOREST = new Forest();

    static {
        try (BufferedReader reader = MyStaticValue.getNatureClassSuffix()) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] split = line.split("\t");
                if (split.length < 2) {
                    // Blank or malformed line: skip it instead of failing class initialization
                    // with an ArrayIndexOutOfBoundsException.
                    continue;
                }
                String word = split[0];
                if (word.length() > 1) {
                    word = new StringBuilder(word).reverse().toString();
                }
                SUFFIX_FOREST.add(word, new String[] { split[1] });
            }
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
    }

    /** Dictionaries consulted by {@link #getParams(String)}, in order. */
    private Forest[] forests = null;

    /** BEGIN sentinel placed in front of the first column of the table. */
    private NatureTerm root = new NatureTerm(TermNature.BEGIN);

    /** END sentinel column placed after the last term's column. */
    private NatureTerm[] end = { new NatureTerm(TermNature.END) };

    /** Terms of the result currently being tagged. */
    private List<Term> terms = null;

    /** One row per term holding its candidate natures, plus a final row holding {@link #end}. */
    private NatureTerm[][] natureTermTable = null;

    /**
     * Creates a recognizer that looks words up in the dictionary returned by
     * {@code DicLibrary.get()}.
     */
    public NatureRecognition() {
        forests = new Forest[] { DicLibrary.get() };
    }

    /**
     * Creates a recognizer that looks words up in the given dictionaries.
     *
     * @param forests dictionaries to consult; {@code null} entries are skipped
     */
    public NatureRecognition(Forest... forests) {
        this.forests = forests;
    }

    /**
     * Finds the best nature for every term of {@code result}. The terms are updated in place, so
     * nothing is returned.
     *
     * @param result the segmentation result whose terms get their nature set
     */
    public void recognition(Result result) {
        this.terms = result.getTerms();

        // Use fresh sentinels for every run. NatureTerm.setScore only replaces `from` when the new
        // score is higher, so a reused END node would keep pointing into the previous call's table.
        this.root = new NatureTerm(TermNature.BEGIN);
        this.end = new NatureTerm[] { new NatureTerm(TermNature.END) };

        int length = terms.size();
        natureTermTable = new NatureTerm[length + 1][];
        natureTermTable[length] = end;
        for (int i = 0; i < length; i++) {
            natureTermTable[i] = getNatureTermArr(terms.get(i).termNatures().termNatures);
        }
        walk();
    }

    /**
     * Tags a list of already-segmented words, numbering their offsets from 0.
     *
     * @param words the words to tag
     * @return one term per word, in order, with its nature set
     */
    public List<Term> recognition(List<String> words) {
        return recognition(words, 0);
    }

    /**
     * Tags a list of already-segmented words.
     *
     * @param words the words to tag
     * @param offe  offset assigned to the first word; each following word's offset is advanced by
     *              the length of the words before it
     * @return one term per word, in order, with its nature set
     */
    public List<Term> recognition(List<String> words, int offe) {
        List<Term> terms = new ArrayList<>(words.size());
        int tempOffe = 0;
        for (String word : words) {
            TermNatures tn = getTermNatures(word);
            terms.add(new Term(word, offe + tempOffe, tn));
            tempOffe += word.length();
        }
        // The scoring pass runs on a separate instance, so this instance's working fields are
        // left untouched.
        new NatureRecognition().recognition(new Result(terms));
        return terms;
    }

    /**
     * Returns the candidate natures of a single word.
     *
     * <p>Lookup order: the system dictionary ({@code DATDictionary}), then the configured forests
     * (see {@link #getParams(String)}), then the English and number checks. If none of them
     * matches, {@code TermNatures.NULL} is returned.
     *
     * @param word the word to look up
     * @return the natures of the word, never {@code null}
     */
    public TermNatures getTermNatures(String word) {
        String[] params = null;
        // System dictionary first, then the user-defined dictionaries.
        AnsjItem ansjItem = DATDictionary.getItem(word);
        TermNatures tn = null;
        if (ansjItem != AnsjItem.NULL) {
            tn = ansjItem.termNatures;
        } else if ((params = getParams(word)) != null) {
            tn = new TermNatures(new TermNature(params[0], 1));
        } else if (WordAlert.isEnglish(word)) {
            tn = TermNatures.EN;
        } else if (WordAlert.isNumber(word)) {
            tn = TermNatures.M;
        } else {
            tn = TermNatures.NULL;
        }
        return tn;
    }

    /**
     * Returns the parameters stored for a word in the configured forests.
     *
     * <p>The forests are consulted in order and the first one that holds the word as a complete
     * entry (status greater than 1) with non-null parameters wins. A miss in one forest falls
     * through to the next one.
     *
     * @param word the word to look up
     * @return the parameters of the word, or {@code null} if no forest contains it
     */
    public String[] getParams(String word) {
        if (forests == null) {
            return null;
        }
        for (Forest forest : forests) {
            if (forest == null) {
                continue;
            }
            String[] params = lookup(forest, word);
            if (params != null) {
                return params;
            }
        }
        return null;
    }

    /** Walks {@code word} character by character through one forest and returns its parameters, if any. */
    private static String[] lookup(SmartForest<String[]> forest, String word) {
        SmartForest<String[]> node = forest;
        for (int i = 0; i < word.length(); i++) {
            node = node.get(word.charAt(i));
            if (node == null) {
                return null;
            }
        }
        return node.getStatus() > 1 ? node.getParam() : null;
    }

    /**
     * Guesses the nature of a word by rule.
     *
     * <p>The word is matched from its last character backwards against the suffix rules. An "nt"
     * match (with more than one matched character, or a word longer than 3 characters) or an "ns"
     * match decides the result directly. Otherwise words shorter than 5 characters are segmented
     * with {@code ToAnalysis} and reported as {@code NR} if any resulting term is tagged "nr",
     * and longer words are checked with {@code ForeignPersonRecognition.isFName}.
     *
     * @param word the word whose nature is guessed
     * @return the guessed natures; {@code TermNatures.NW} when no rule applies
     */
    public static TermNatures guessNature(String word) {
        String nature = null;
        SmartForest<String[]> smartForest = SUFFIX_FOREST;
        int len = 0;
        for (int i = word.length() - 1; i >= 0; i--) {
            smartForest = smartForest.get(word.charAt(i));
            if (smartForest == null) {
                break;
            }
            len++;
            int status = smartForest.getStatus();
            if (status == 2) {
                // Status 2: remember this match and keep looking for a longer suffix.
                nature = smartForest.getParam()[0];
            } else if (status == 3) {
                // Status 3: take this match and stop walking.
                nature = smartForest.getParam()[0];
                break;
            }
        }

        if ("nt".equals(nature) && (len > 1 || word.length() > 3)) {
            return TermNatures.NT;
        } else if ("ns".equals(nature)) {
            return TermNatures.NS;
        } else if (word.length() < 5) {
            Result parse = ToAnalysis.parse(word);
            for (Term term : parse.getTerms()) {
                if ("nr".equals(term.getNatureStr())) {
                    return TermNatures.NR;
                }
            }
        } else if (ForeignPersonRecognition.isFName(word)) {
            return TermNatures.NRF;
        }
        return TermNatures.NW;
    }

    /**
     * Scores every candidate against the candidates of the following column, then writes the best
     * path back onto the terms. Does nothing if no table has been built yet.
     */
    public void walk() {
        if (natureTermTable == null) {
            return;
        }
        int length = natureTermTable.length - 1;
        setScore(root, natureTermTable[0]);
        for (int i = 0; i < length; i++) {
            for (int j = 0; j < natureTermTable[i].length; j++) {
                setScore(natureTermTable[i][j], natureTermTable[i + 1]);
            }
        }
        optimalRoot();
    }

    /** Offers {@code natureTerm} as predecessor to every candidate in {@code natureTerms}. */
    private void setScore(NatureTerm natureTerm, NatureTerm[] natureTerms) {
        for (int i = 0; i < natureTerms.length; i++) {
            natureTerms[i].setScore(natureTerm);
        }
    }

    /** Wraps each {@link TermNature} of a term in a {@link NatureTerm} table cell. */
    private NatureTerm[] getNatureTermArr(TermNature[] termNatures) {
        NatureTerm[] natureTerms = new NatureTerm[termNatures.length];
        for (int i = 0; i < natureTerms.length; i++) {
            natureTerms[i] = new NatureTerm(termNatures[i]);
        }
        return natureTerms;
    }

    /**
     * Follows the {@code from} links backwards from the END sentinel and sets the nature of each
     * term to the one on that path.
     */
    private void optimalRoot() {
        NatureTerm to = end[0];
        NatureTerm from = null;
        int index = natureTermTable.length - 1;
        while ((from = to.from) != null && index > 0) {
            terms.get(--index).setNature(from.termNature.nature);
            to = from;
        }
    }

    /**
     * One candidate nature of a term, together with its best predecessor and score.
     *
     * @author ansj
     */
    public class NatureTerm {

        public TermNature termNature;

        /** Best score found so far over all offered predecessors. */
        public double score = 0;

        /** Frequency of the nature plus one. */
        public double selfScore;

        /** Predecessor that produced {@link #score}; {@code null} until one has been offered. */
        public NatureTerm from;

        protected NatureTerm(TermNature termNature) {
            this.termNature = termNature;
            selfScore = termNature.frequency + 1;
        }

        /**
         * Offers {@code natureTerm} as predecessor. It is adopted if it is the first one offered
         * or if it scores strictly higher than the current best.
         *
         * @param natureTerm the candidate predecessor
         */
        public void setScore(NatureTerm natureTerm) {
            double tempScore = MathUtil.compuNatureFreq(natureTerm, this);
            if (from == null || score < tempScore) {
                this.score = tempScore;
                this.from = natureTerm;
            }
        }

        @Override
        public String toString() {
            return termNature.nature.natureStr + "/" + selfScore;
        }
    }
}