All downloads are free. Search and download functionality uses the official Maven repository.

org.nlpcn.commons.lang.standardization.WordUtil Maven / Gradle / Ivy

package org.nlpcn.commons.lang.standardization;

import org.nlpcn.commons.lang.util.WordAlert;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Sentence standardization.
 * English letters and digits are converted from full-width to half-width.
 * Runs of digits and runs of English letters are each merged into a single element.
 *
 * @author ansj
 */
public class WordUtil {

	/**
	 * Replacement character for every run of digits; if {@code null}, digits are not replaced.
	 */
	private final Character num2Value;

	/**
	 * Replacement character for every run of English letters; if {@code null}, letters are not replaced.
	 */
	private final Character en2Value;

	/**
	 * Creates a standardizer. Pass {@code null} for a category that should be left untouched.
	 *
	 * @param num2Value character that stands in for each run of digits, or {@code null}
	 * @param enValue   character that stands in for each run of English letters, or {@code null}
	 */
	public WordUtil(Character num2Value, Character enValue) {
		this.num2Value = num2Value;
		this.en2Value = enValue;
	}

	/**
	 * Splits a string into elements, collapsing each run of digits (including embedded
	 * {@code '.'} and {@code '%'}) and each run of lowercase ASCII letters into one element
	 * carrying the configured replacement character. Every other character becomes its own element.
	 *
	 * @param str the text to standardize; may be {@code null}
	 * @return the list of elements, or an empty immutable list when {@code str} is {@code null} or blank
	 */
	public List<Element> str2Elements(String str) {

		if (str == null || str.trim().length() == 0) {
			return Collections.emptyList();
		}

		// NOTE(review): alertStr presumably performs the full-width -> half-width (and case)
		// normalization described in the class comment; only 'a'..'z' is matched below -- confirm.
		char[] chars = WordAlert.alertStr(str);
		List<Element> list = new ArrayList<Element>();

		int i = 0;
		while (i < chars.length) {
			char current = chars[i];
			if (num2Value != null && isDigit(current)) {
				Element element = new Element(num2Value);
				list.add(element);
				i++;
				// Call len() once for every further character merged into this run, including
				// the final character of the input (previously skipped when the run hit the end).
				while (i < chars.length && isNumberPart(chars[i])) {
					element.len();
					i++;
				}
			} else if (en2Value != null && isLowerLetter(current)) {
				Element element = new Element(en2Value);
				list.add(element);
				i++;
				while (i < chars.length && isLowerLetter(chars[i])) {
					element.len();
					i++;
				}
			} else {
				list.add(new Element(current));
				i++;
			}
		}
		return list;
	}

	/** Tells whether {@code c} is an ASCII digit. */
	private static boolean isDigit(char c) {
		return c >= '0' && c <= '9';
	}

	/** Tells whether {@code c} may continue a number run: a digit, {@code '.'} or {@code '%'}. */
	private static boolean isNumberPart(char c) {
		return c == '.' || c == '%' || isDigit(c);
	}

	/** Tells whether {@code c} is a lowercase ASCII letter. */
	private static boolean isLowerLetter(char c) {
		return c >= 'a' && c <= 'z';
	}

	/**
	 * Standardizes a string and returns the result as a new string.
	 *
	 * @param str the text to standardize; may be {@code null}
	 * @return a string built from the characters returned by {@link #str2Chars(String)}
	 */
	public String str2Str(String str) {
		return new String(str2Chars(str));
	}

	/**
	 * Standardizes a string and returns one character per element.
	 *
	 * @param str the text to standardize; may be {@code null}
	 * @return an array holding the {@code name} of each element from {@link #str2Elements(String)}
	 */
	public char[] str2Chars(String str) {
		List<Element> elements = str2Elements(str);
		char[] chars = new char[elements.size()];
		for (int i = 0; i < chars.length; i++) {
			chars[i] = elements.get(i).name;
		}
		return chars;
	}

	/**
	 * Small demo: prints the three output forms for a mixed sample sentence.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		WordUtil wordUtil = new WordUtil('1', 'A');
		System.out.println(wordUtil.str2Elements("123中国CHINA456你好!"));
		System.out.println(Arrays.toString(wordUtil.str2Chars("123中国CHINA456你好!")));
		System.out.println(wordUtil.str2Str("123中国CHINA456你好!"));
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy