All downloads are free. The search and download functionality uses the official Maven repository.

org.ansj.app.crf.Config Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
package org.ansj.app.crf;

import java.util.ArrayList;
import java.util.List;

import org.ansj.app.crf.pojo.Element;
import org.nlpcn.commons.lang.util.WordAlert;

/**
 * Configuration and helper routines for turning text into CRF elements and
 * per-position feature arrays.
 *
 * <p>
 * Runs of digits and runs of letters are collapsed into a single
 * {@link Element} whose name is a marker character derived from the run
 * length (see {@link #getNum(String)} and {@link #getEn(String)}); every
 * other character becomes its own element.
 */
public class Config {

	/** Number of tag types; hard-coded to 4 (S, B, M, E). */
	public static final int TAG_NUM = 4;

	// Marker characters used in place of real characters.

	/** Name returned for positions before the start of the element list. */
	public static final char BEGIN = 128;

	/** Name returned for positions past the end of the element list. */
	public static final char END = 129;

	/** Base marker for digit runs; the run length (0-9) is added to it. */
	public static final char NUM_BEGIN = 130;

	/** Base marker for letter runs; the run length (0-9) is added to it. */
	public static final char EN_BEGIN = 140;

	/** Base marker appended to each feature; the template row index is added to it. */
	public static final char FEATURE_BEGIN = 150;

	// Character tag types.
	// NOTE(review): these are not final, yet getTagName(int) hard-codes the
	// values 0..3 - confirm nobody reassigns them before making them final.
	public static int S = 0;
	public static int B = 1;
	public static int M = 2;
	public static int E = 3;

	// Scanner states used by wordAlert: what kind of run is currently buffered.
	private static final int RUN_NONE = 0;
	private static final int RUN_NUM = 1;
	private static final int RUN_EN = 2;

	/**
	 * Split pattern exposed to callers; defaults to one or more whitespace
	 * characters. It is not read by any method of this class.
	 */
	public String splitStr = "\\s+";

	/**
	 * Feature template: each row lists the position offsets (relative to the
	 * current index) whose element names make up one feature. This initial
	 * value is replaced by the constructor argument.
	 */
	private int[][] template = { { -2 }, { -1 }, { 0 }, { 1 }, { 2 }, { -2, -1 }, { -1, 0 }, { 0, 1 }, { 1, 2 }, { -1, 1 } };

	/**
	 * Creates a configuration that uses the given feature template.
	 *
	 * @param template
	 *            rows of relative offsets; stored by reference, not copied
	 */
	public Config(int[][] template) {
		this.template = template;
	}

	/**
	 * Returns the marker character for a run of digits.
	 *
	 * @param str
	 *            the digit run
	 * @return {@code NUM_BEGIN + str.length()} for lengths 0 to 9, or
	 *         {@code NUM_BEGIN} itself when the run is longer than 9
	 */
	public static char getNum(String str) {
		if (str.length() > 9) {
			return NUM_BEGIN;
		} else {
			return (char) (NUM_BEGIN + str.length());
		}
	}

	/**
	 * Returns the marker character for a run of letters.
	 *
	 * @param str
	 *            the letter run
	 * @return {@code EN_BEGIN + str.length()} for lengths 0 to 9, or
	 *         {@code EN_BEGIN} itself when the run is longer than 9
	 */
	public static char getEn(String str) {
		if (str.length() > 9) {
			return EN_BEGIN;
		} else {
			return (char) (EN_BEGIN + str.length());
		}
	}

	/**
	 * Returns the feature template currently in use (the internal array, not a
	 * copy).
	 */
	public int[][] getTemplate() {
		return template;
	}

	/**
	 * Replaces the feature template.
	 *
	 * @param template
	 *            rows of relative offsets; stored by reference, not copied
	 */
	public void setTemplate(int[][] template) {
		this.template = template;
	}

	/**
	 * Normalizes a word into a list of elements.
	 *
	 * <p>
	 * The word is first passed through {@code WordAlert.alertStr} (presumably
	 * a character normalization step - see that class). Consecutive digits
	 * {@code '0'..'9'} are then merged into one element named by
	 * {@link #getNum(String)}, consecutive characters in {@code 'A'..'z'} are
	 * merged into one element named by {@link #getEn(String)}, and any other
	 * character becomes an element of its own. Merged elements record the run
	 * length in {@code len}.
	 *
	 * <p>
	 * Each run is encoded according to its own kind. Earlier revisions encoded
	 * a letter run with the digit marker (and a digit run with the letter
	 * marker) when the run was directly followed by a run of the other kind.
	 * NOTE(review): models trained with the earlier encoding may see different
	 * features for mixed alphanumeric tokens - confirm before deploying with
	 * an existing model.
	 *
	 * @param word
	 *            the text to normalize
	 * @return the elements in order of appearance
	 */
	public static List<Element> wordAlert(String word) {

		char[] chars = WordAlert.alertStr(word);

		List<Element> list = new ArrayList<Element>();

		StringBuilder run = new StringBuilder();

		int status = RUN_NONE;

		for (char c : chars) {
			int next;
			if (c >= '0' && c <= '9') {
				next = RUN_NUM;
			} else if (c >= 'A' && c <= 'z') {
				// NOTE(review): this range also covers the six ASCII
				// punctuation characters between 'Z' and 'a'; kept as-is -
				// confirm whether that is intended.
				next = RUN_EN;
			} else {
				next = RUN_NONE;
			}

			// The kind of character changed: emit whatever run was pending.
			if (next != status) {
				flushRun(list, run, status);
			}

			if (next == RUN_NONE) {
				list.add(new Element(c));
			} else {
				run.append(c);
			}
			status = next;
		}

		// Emit the run that reaches the end of the input, if any.
		flushRun(list, run, status);

		return list;
	}

	/**
	 * Emits the buffered digit or letter run as a single element and clears the
	 * buffer; does nothing when no run is pending.
	 */
	private static void flushRun(List<Element> list, StringBuilder run, int status) {
		if (status == RUN_NONE || run.length() == 0) {
			return;
		}
		String text = run.toString();
		Element element = new Element(status == RUN_NUM ? getNum(text) : getEn(text));
		element.len = text.length();
		list.add(element);
		run.setLength(0);
	}

	/**
	 * Splits the text into words, normalizes each word with
	 * {@link #wordAlert(String)} and tags the resulting elements by their
	 * position inside the word: a single element gets {@code S}; otherwise the
	 * first gets {@code B}, the last gets {@code E} and everything in between
	 * gets {@code M}.
	 *
	 * @param temp
	 *            the text to convert
	 * @param splitStr
	 *            regular expression used to split {@code temp} into words
	 * @return the tagged elements of all words, concatenated in order
	 */
	public static List<Element> makeToElementList(String temp, String splitStr) {
		String[] split = temp.split(splitStr);
		List<Element> list = new ArrayList<Element>(temp.length());

		for (String word : split) {

			List<Element> elements = wordAlert(word);

			int len = elements.size();

			if (len == 1) {
				elements.get(0).updateTag(Config.S);
			} else if (len == 2) {
				elements.get(0).updateTag(Config.B);
				elements.get(1).updateTag(Config.E);
			} else if (len > 2) {
				elements.get(0).updateTag(Config.B);
				for (int i = 1; i < len - 1; i++) {
					elements.get(i).updateTag(Config.M);
				}
				elements.get(len - 1).updateTag(Config.E);
			}

			list.addAll(elements);
		}
		return list;
	}

	/**
	 * Converts the text into elements without splitting it into words and
	 * without assigning tags; equivalent to {@link #wordAlert(String)}.
	 *
	 * @param temp
	 *            the text to convert
	 * @return the elements in order of appearance
	 */
	public List<Element> makeToElementList(String temp) {
		return wordAlert(temp);
	}

	/**
	 * Returns the name of the element at {@code index}, substituting
	 * {@link #BEGIN} for negative indexes and {@link #END} for indexes at or
	 * beyond the end of the list.
	 *
	 * @param list
	 *            the element list
	 * @param index
	 *            the position to read; may be out of range
	 * @return the element name or a boundary marker
	 */
	public char getNameIfOutArr(List<Element> list, int index) {
		if (index < 0) {
			return Config.BEGIN;
		} else if (index >= list.size()) {
			return Config.END;
		} else {
			return list.get(index).name;
		}
	}

	/**
	 * Returns the tag of the element at {@code index} as a char, or 0 when the
	 * index is outside the list.
	 *
	 * @param list
	 *            the element list
	 * @param index
	 *            the position to read; may be out of range
	 * @return the element tag cast to {@code char}, or 0 when out of range
	 */
	public char getTagIfOutArr(List<Element> list, int index) {
		if (index < 0 || index >= list.size()) {
			return 0;
		} else {
			return (char) list.get(index).getTag();
		}
	}

	/**
	 * Builds all features for one position.
	 *
	 * <p>
	 * For template row {@code i} the feature is the names of the elements at
	 * {@code index + offset} for every offset in the row (boundary markers are
	 * used outside the list), followed by the character
	 * {@code FEATURE_BEGIN + i} identifying the row. Rows with no offsets are
	 * skipped and leave a {@code null} entry in the result.
	 *
	 * @param list
	 *            the element list
	 * @param index
	 *            the position for which features are built
	 * @return one char array per template row, {@code null} for empty rows
	 */
	public char[][] makeFeatureArr(List<Element> list, int index) {
		char[][] result = new char[template.length][];
		for (int i = 0; i < template.length; i++) {
			int[] offsets = template[i];
			if (offsets.length == 0) {
				continue;
			}
			char[] feature = new char[offsets.length + 1];
			for (int j = 0; j < offsets.length; j++) {
				feature[j] = getNameIfOutArr(list, index + offsets[j]);
			}
			// The trailing char makes features of different rows distinct.
			feature[offsets.length] = (char) (FEATURE_BEGIN + i);
			result[i] = feature;
		}

		return result;
	}

	/**
	 * Returns the letter for a numeric tag.
	 *
	 * @param tag
	 *            the numeric tag
	 * @return {@code 'S'}, {@code 'B'}, {@code 'M'} or {@code 'E'} for 0 to 3,
	 *         {@code '?'} for any other value
	 */
	public static char getTagName(int tag) {
		switch (tag) {
		case 0:
			return 'S';
		case 1:
			return 'B';
		case 2:
			return 'M';
		case 3:
			return 'E';
		default:
			return '?';
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy