All downloads are free. Search and download functionality uses the official Maven repository.

org.ansj.app.crf.SplitWord Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
package org.ansj.app.crf;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.ansj.app.crf.pojo.Element;
import org.ansj.util.MatrixUtil;
import org.nlpcn.commons.lang.util.StringUtil;

/**
 * Word segmentation driven by a CRF {@link Model}.
 * 
 * <p>
 * The input line is turned into a list of {@link Element}s by
 * {@code Config.wordAlert}; every element is then labelled with one of four
 * tags and the tagged sequence is cut into words.
 * 
 * <p>
 * Tag indices as used throughout this class: 0 = element is a word on its own
 * (S), 1 = first element of a word (B), 2 = inner element of a word (M), 3 =
 * last element of a word (E).
 * 
 * @author ansj
 * 
 */
public class SplitWord {

	/** Tag of an element that forms a word by itself (S). */
	private static final int TAG_SINGLE = 0;

	/** Tag of the first element of a multi-element word (B). */
	private static final int TAG_BEGIN = 1;

	/** Tag of an inner element of a multi-element word (M). */
	private static final int TAG_MIDDLE = 2;

	/** Tag of the last element of a multi-element word (E). */
	private static final int TAG_END = 3;

	/** Score used to make a tag practically unselectable at a position. */
	private static final float IMPOSSIBLE_SCORE = -1000;

	/**
	 * Length of the per-element score array. The original code notes this as
	 * Config.TAG_NUM*Config.TAG_NUM+Config.TAG_NUM and questions whether 20 is
	 * larger than needed; the value is kept unchanged.
	 */
	private static final int TAG_SCORE_SIZE = 20;

	private final Model model;

	/**
	 * @param model
	 *            the model used to look up feature weights and to score tag
	 *            transitions
	 */
	public SplitWord(Model model) {
		this.model = model;
	}

	/**
	 * Convenience overload of {@link #cut(String)} for a character array.
	 * 
	 * @param chars
	 *            the characters to segment
	 * @return the words, in input order
	 */
	public List<String> cut(char[] chars) {
		return cut(new String(chars));
	}

	/**
	 * Splits a line into words.
	 * 
	 * <p>
	 * An element tagged S becomes a word of its own. An element tagged B opens
	 * a word that extends up to and including the next element tagged E; if no
	 * E follows, the word extends to the end of the line. Elements tagged M or
	 * E that are not preceded by a B are kept and prepended to the next emitted
	 * word (or emitted as a final word), so that no input text is dropped.
	 * 
	 * @param line
	 *            the text to segment
	 * @return the words in input order; an empty list when
	 *         {@code StringUtil.isBlank(line)} is true
	 */
	public List<String> cut(String line) {

		if (StringUtil.isBlank(line)) {
			return Collections.emptyList();
		}

		List<Element> elements = viterbi(line);

		List<String> result = new ArrayList<String>(elements.size());

		int last = elements.size() - 1;
		int begin = 0; // offset in line where the pending word starts
		int end = 0; // offset in line just past the elements consumed so far

		for (int i = 0; i <= last; i++) {
			Element e = elements.get(i);
			end += e.len;
			switch (e.getTag()) {
			case TAG_SINGLE:
				result.add(line.substring(begin, end));
				begin = end;
				break;
			case TAG_BEGIN:
				// Consume elements up to and including the closing E. Each
				// element's length is added exactly once, so an unterminated
				// run can no longer push `end` past the end of the line.
				boolean closed = false;
				while (!closed && i < last) {
					e = elements.get(++i);
					end += e.len;
					closed = e.getTag() == TAG_END;
				}
				result.add(line.substring(begin, end));
				begin = end;
				break;
			default:
				// Stray M/E without a preceding B: `end` has already been
				// advanced above, so the offsets stay aligned with the line
				// and the characters join the next emitted word.
				break;
			}
		}

		// Flush text left pending by trailing stray tags.
		if (begin < end) {
			result.add(line.substring(begin, end));
		}
		return result;
	}

	/**
	 * Tags every element of the line with the best-scoring tag sequence.
	 * 
	 * <p>
	 * Per-element scores are computed first, then each element picks its best
	 * predecessor via {@code Element.maxFrom}, and finally the tags are read
	 * back from the last element to the first by following {@code from}.
	 * 
	 * @param line
	 *            the text to tag
	 * @return the elements of the line with their tags updated
	 */
	private List<Element> viterbi(String line) {
		List<Element> elements = Config.wordAlert(line);

		int length = elements.size();

		if (length == 0) { // nothing to tag; also avoids get(0) below
			return elements;
		}
		if (length == 1) {
			elements.get(0).updateTag(TAG_SINGLE);
			return elements;
		}

		// Fill in the per-element tag scores.
		for (int i = 0; i < length; i++) {
			computeTagScore(elements, i);
		}

		// A line cannot start with M or E, so give those tags a very low score.
		Element first = elements.get(0);
		first.tagScore[TAG_MIDDLE] = IMPOSSIBLE_SCORE;
		first.tagScore[TAG_END] = IMPOSSIBLE_SCORE;

		for (int i = 1; i < length; i++) {
			elements.get(i).maxFrom(model, elements.get(i - 1));
		}

		// The last position can only be S or E (0 or 3); ties go to E.
		Element tail = elements.get(length - 1);
		int maxStatus = tail.tagScore[TAG_SINGLE] > tail.tagScore[TAG_END] ? TAG_SINGLE : TAG_END;
		tail.updateTag(maxStatus);
		maxStatus = tail.from[maxStatus];

		// Walk backwards, following each element's recorded predecessor tag.
		for (int i = length - 2; i > 0; i--) {
			Element current = elements.get(i);
			current.updateTag(maxStatus);
			maxStatus = current.from[current.getTag()];
		}
		elements.get(0).updateTag(maxStatus);

		return elements;

	}

	/**
	 * Computes the score array of the element at {@code index} and stores it in
	 * that element's {@code tagScore} field.
	 * 
	 * <p>
	 * For every feature produced by the model's config for this position, the
	 * model's weights for that feature are combined into the score array via
	 * {@code MatrixUtil.dot} (presumably an in-place accumulation into the
	 * first argument; verify against MatrixUtil).
	 * 
	 * @param elements
	 *            the elements of the line
	 * @param index
	 *            position of the element to score
	 */
	private void computeTagScore(List<Element> elements, int index) {

		char[][] features = model.getConfig().makeFeatureArr(elements, index);

		float[] tagScore = new float[TAG_SCORE_SIZE];

		for (int i = 0; i < features.length; i++) {
			MatrixUtil.dot(tagScore, model.getFeature(features[i]));
		}

		elements.get(index).tagScore = tagScore;
	}

	/**
	 * Computes a cohesion score for an arbitrary string, which can be read as a
	 * measure of how plausible the string is as a single word.
	 * 
	 * <p>
	 * The raw total is the B score of the first element, plus the M scores of
	 * the inner elements, plus the E score of the last element. For a
	 * single-element input the first and last element are the same, so its B
	 * and E scores are both added.
	 * 
	 * @param word
	 *            the candidate word
	 * @return {@code Integer.MIN_VALUE} (as a float) when {@code word} is null,
	 *         empty, or yields no elements; {@code 1} when the raw total is
	 *         negative; otherwise the raw total plus one
	 */
	public float cohesion(String word) {

		if (word == null || word.length() == 0) {
			return Integer.MIN_VALUE;
		}

		List<Element> elements = Config.wordAlert(word);

		// Guard against an empty element list so get(0) below cannot fail.
		if (elements.isEmpty()) {
			return Integer.MIN_VALUE;
		}

		for (int i = 0; i < elements.size(); i++) {
			computeTagScore(elements, i);
		}

		float value = elements.get(0).tagScore[TAG_BEGIN];

		int len = elements.size() - 1;

		for (int i = 1; i < len; i++) {
			value += elements.get(i).tagScore[TAG_MIDDLE];
		}

		value += elements.get(len).tagScore[TAG_END];

		if (value < 0) {
			return 1;
		}

		return value + 1;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy