All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.until.keyword.criterion.CharStandardization Maven / Gradle / Ivy

There is a newer version: 2.0.3
Show newest version
package org.until.keyword.criterion;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

public class CharStandardization {

	public static final int SYMBOL_MASK = 0x1 << 16;
	public static final int TRAD_MASK = 0x2 << 16;
	public static final int DBC_MASK = 0x4 << 16;
	public static final int UPPER_MASK = 0x8 << 16;
	public static final int HAN_MASK = 0x10 << 16;
	public static final int SYNONYMY_MASK = 0x20 << 16;

	public static final String TRAD2SIMP_FILE = "Trad2Simp_CT.txt";
	public static final String UPPER_FILE = "Upper2Lower_CT.txt";
	public static final String SYMBOL_FILE = "Symbol_CT.txt";
	public static final String DBC_FILE = "DBC_CT.txt";
	public static final String SYNONYMY_FILE = "Synonymy_CT.txt";

	public static final char PARAGRAPH_BREAK = '\n';
	/**
	 * 默认控制字符。
	 */
	public static final char DEFAULT_BLANK_CHAR = (char) 0xffff;
	/**
	 * 中文代码页分隔符
	 */
	public static final char CJK_UNIFIED_IDEOGRAPHS_START = 0x4e00;
	public static final char CJK_UNIFIED_IDEOGRAPHS_END = 0xA000;

	/**
	 * 半角空格的值,在ASCII中为32(Decimal)
	 */
	public static final char DBC_SPACE = ' '; // 半角空格

	// 字符表编码
	private static int[] codeTable;

	static {
		loadCodeTable();
	}

	/**
	 * 
	 * 转换一个字符串,为了性能所有代码写在一个方法内,减少方法调用时间。 主要完成以下工作: <br>
	 * 1、繁体转简体(可选) <br>
	 * 2、全角转半角(可选)<br>
	 * 3、转小写(可选)<br>
	 * 4、过滤非汉字字符(可选)<br>
	 * 5、过滤空白字符(包括"\n""\r"" ""\t"等)(可选)<br>
	 * 6、连续空白字符是否保留一个(可选)<br>
	 * 
	 * 
* * @param src * 源字符串 * @param needT2S * 繁简体转换 * @param needDBC * 全半角转换 * @param ignoreCase * 大写转小写 * @param filterNoneHanLetter * 过滤非汉字字符 * @param convertSynonymy * 同义字符转换 * @param filterSymbol * 是否过滤symbol字符(包括"\n""\r"" ""\t"等,见Symbol_CT.txt), * @param keepLastSymbol * 连续symbol字符是否保留一个 * @return 转换后的字符串,长度可能比转换前的短 */ public static String compositeTextConvert(String src, boolean needT2S, boolean needDBC, boolean ignoreCase, boolean filterNoneHanLetter, boolean convertSynonymy, boolean filterSymbol, boolean keepLastSymbol) { if (src == null || src.length() == 0) { return src; } char[] chs = src.toCharArray(); StringBuilder buffer = new StringBuilder(chs.length); for (int i = 0; i < chs.length; i++) { char c = compositeCharConvert(chs[i], needT2S, needDBC, ignoreCase, filterNoneHanLetter, convertSynonymy, filterSymbol); if (keepLastSymbol) { if (c == DEFAULT_BLANK_CHAR && i < chs.length - 1) { char next = chs[i + 1]; if (isSeperatorSymbol(chs[i]) && isSeperatorSymbol(next)) { continue; } else { c = (char) codeTable[(char) chs[i]]; } } } if (c != DEFAULT_BLANK_CHAR) { buffer.append(c); } } return new String(buffer); } public static boolean isSeperatorSymbol(char c) { int i = codeTable[convertCharDBC(c)]; return (i & SYMBOL_MASK) != 0; } /** *
	 * 转换一个字符串,为了性能所有代码写在一个方法内,减少方法调用时间。 主要完成以下工作: <br>
	 * 1、繁体转简体(可选) <br>
	 * 2、全角转半角(可选)<br>
	 * 3、转小写(可选)<br>
	 * 4、过滤非汉字字符(可选)<br>
	 * 5、过滤symbol字符(包括"\n""\r"" ""\t"等)(可选)<br>
	 * 6、连续symbol字符是否保留一个(可选)<br>
	 * 
	 * 
* * @param c * 目标字符 * @param needT2S * 繁简体转换 * @param needDBC * 全半角转换 * @param ignoreCase * 大写转小写 * @param filterNoneHanLetter * 过滤非汉字字符 * @param filterSymbol * 是否过滤symbol字符(包括"\n""\r"" ""\t"等,见Symbol_CT.txt), * @param convertSynonymy * 同义字符转换 * @return 转换后的字符串,长度可能比转换前的短 */ public static final char compositeCharConvert(char c, boolean needT2S, boolean needDBC, boolean ignoreCase, boolean filterNoneHanLetter, boolean convertSynonymy, boolean filterSymbol) { if (needT2S) { c = convertCharT2S(c); } if (needDBC) { c = convertCharDBC(c); } if (ignoreCase) { c = convertChar2Lower(c); } if (filterNoneHanLetter) { c = filterNonHan(c); } if (convertSynonymy) { c = convertCharSynonymy(c); } if (filterSymbol) { c = filterSymbol(c); } return c; } public static char convertCharT2S(char ch) { int ret = codeTable[ch]; if ((ret & TRAD_MASK) != 0) { return (char) ret; } return ch; } public static char convertChar2Lower(char ch) { int ret = codeTable[ch]; if ((ret & UPPER_MASK) != 0) { return (char) ret; } return ch; } public static char convertCharDBC(char ch) { int ret = codeTable[ch]; if ((ret & DBC_MASK) != 0) { return (char) ret; } return ch; } public static char convertCharSynonymy(char ch) { int ret = codeTable[ch]; if ((ret & SYNONYMY_MASK) != 0) { return (char) ret; } return ch; } public static char filterNonHan(char ch) { int ret = codeTable[ch]; if ((ret & HAN_MASK) != 0) { return ch; } return DEFAULT_BLANK_CHAR; } public static char filterSymbol(char ch) { int ret = codeTable[ch]; if ((ret & SYMBOL_MASK) != 0) { return DEFAULT_BLANK_CHAR; } return ch; } public static char getCharFromTable(char c) { return (char) codeTable[c]; } /** * 装载编码表,顺序非常重要,过滤类往往只是打标记,应先处理。 */ private static final void loadCodeTable() { if (codeTable == null) { codeTable = new int[65536]; // 初始化数组 for (int i = 0; i < codeTable.length; i++) { codeTable[i] = i; } String ENCODING = "UTF-8"; // 处理汉字过滤 for (int i = CJK_UNIFIED_IDEOGRAPHS_START; i < CJK_UNIFIED_IDEOGRAPHS_END; i++) { codeTable[i] = HAN_MASK | codeTable[i]; } // 处理Symbol过滤 loadCodeTable(SYMBOL_FILE, ENCODING, codeTable, SYMBOL_MASK); // 处理繁体转简体 loadCodeTable(TRAD2SIMP_FILE, ENCODING, codeTable, TRAD_MASK); // 处理大写转小写 loadCodeTable(UPPER_FILE, ENCODING, codeTable, UPPER_MASK); // 处理全角转半角 loadCodeTable(DBC_FILE, ENCODING, codeTable, DBC_MASK); // 处理汉字过滤 for (int i = CJK_UNIFIED_IDEOGRAPHS_START; i < CJK_UNIFIED_IDEOGRAPHS_END; i++) { codeTable[i] = HAN_MASK | codeTable[i]; } // 处理Synonymy loadCodeTable(SYNONYMY_FILE, ENCODING, codeTable, SYNONYMY_MASK); } } private static final void loadCodeTable(String file, String encoding, int[] codeTbl, int mask) { InputStream istream = CharStandardization.class.getResourceAsStream("resources/" + file); if (istream == null) { throw new RuntimeException("Could not find code table: " + file); } BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader(istream, encoding), 2048); String line = null; int i = 0; char c = 0; while ((line = reader.readLine()) != null) { if (line.startsWith("#")) continue; if (line.length() > 5) { i = Integer.parseInt(line.substring(1, 5), 16);// 将U0020类似的转换为整数字符值 if (i == ',' || i == '\n' || i == '\r' || i == '\t') {// ',','\n','\r'特殊处理 c = (char) i; } else { String[] tokens = line.split(","); if (tokens.length == 3) {// 非','情况 String last = tokens[2]; if (last.length() == 1) { c = last.charAt(0); } else { last = last.trim(); if (last.length() == 0) {// 如果是多个空格,trim可能使之为空,取空格 c = DBC_SPACE; } else { c = last.charAt(0); } } } else { c = ','; } } } int ret = codeTbl[i]; if (ret == 65291 || ret == 65292) line.split(","); if (ret == i) { codeTbl[i] = (mask | c); } else { ret = mask | ret; codeTbl[i] = (ret & 0xffff0000) | c; } } } catch (IOException e) { throw new RuntimeException("Could not read code table: " + file, e); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { } } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy