All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.paoding.analysis.knife.NumberKnife Maven / Gradle / Ivy

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.knife;

import java.math.BigInteger;

import net.paoding.analysis.dictionary.Dictionary;
import net.paoding.analysis.dictionary.Hit;

/**
 * 
 * @author Zhiliang Wang [[email protected]]
 * 
 */
public class NumberKnife extends CombinatoricsKnife implements DictionariesWare {

	private Dictionary units;
	
	public NumberKnife() {
	}

	public NumberKnife(Dictionaries dictionaries) {
		setDictionaries(dictionaries);
	}

	@Override
	public void setDictionaries(Dictionaries dictionaries) {
		super.setDictionaries(dictionaries);
		units = dictionaries.getUnitsDictionary();
	}
	

	@Override
	public int assignable(Beef beef, int offset, int index) {
		char ch = beef.charAt(index);
		if (CharSet.isArabianNumber(ch))
			return ASSIGNED;
		if (index > offset) {
			if (CharSet.isLantingLetter(ch) || ch == '.' || ch == '-' || ch == '_') {
				if (ch == '-' || ch == '_' || CharSet.isLantingLetter(ch)
						|| !CharSet.isArabianNumber(beef.charAt(index + 1))) {
					//分词效果
					//123.456		->123.456/
					//123.abc.34	->123/123.abc.34/abc/34/	["abc"、"abc/34"系由LetterKnife分出,非NumberKnife]
					//没有或判断!CharSet.isArabianNumber(beef.charAt(index + 1)),则分出"123.",而非"123"
					//123.abc.34	->123./123.abc.34/abc/34/
					return POINT;
				}
				return ASSIGNED;
			}
		}
		return LIMIT;
	}
	
	@Override
	protected int collectLimit(Collector collector, Beef beef,
			int offset, int point, int limit, int dicWordVote) {
		// "123abc"的直接调用super的
		if (point != -1) {
			return super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
		}
		// 
		// 2.2两
		//    ^=_point
		//     
		final int _point = limit;
		// 当前尝试判断的字符的位置
		int curTail = offset;
		/*
		 * Fix issue 56: 中文数字解析问题后续
		 */				
		BigInteger number1 = BigInteger.valueOf(-1);
		BigInteger number2 = BigInteger.valueOf(-1);
		int bitValue = 0;
		int maxUnit = 0;
		//TODO:这里又重复从curTail(其值为offset)判断,重新遍历判断是否为数字,算是一个重复计算
		//但考虑这个计算对中文分词性能影响微乎其微暂时先不优化
		for (; (bitValue = CharSet.toNumber(beef.charAt(curTail))) >= 0; curTail++) {
			// 
			if (bitValue == 2
					&& (beef.charAt(curTail) == '两' || beef.charAt(curTail) == '俩' || beef
							.charAt(curTail) == '倆')) {
				if (curTail != offset) {
					break;
				}
			}
			// 处理连续汉字个位值的数字:"三四五六"	->"3456"
			if (bitValue >= 0 && bitValue < 10) {
				if (number2.compareTo(BigInteger.ZERO) < 0)
					number2 = BigInteger.valueOf(bitValue);
				else {
					number2 = number2.multiply(BigInteger.valueOf(10));
					number2 = number2.add(BigInteger.valueOf(bitValue));
				}
			} else {
				if (number2.compareTo(BigInteger.ZERO) < 0) {
					if (number1.compareTo(BigInteger.ZERO) < 0) {
						number1 = BigInteger.ONE;
					}
					number1 = number1.multiply(BigInteger.valueOf(bitValue));
				} else {
					if (number1.compareTo(BigInteger.ZERO) < 0) {
						number1 = BigInteger.ZERO;
					}
					if (bitValue >= maxUnit) {
						number1 = number1.add(number2);
						number1 = number1.multiply(BigInteger.valueOf(bitValue));
						maxUnit = bitValue;
					} else {
						number1 = number1.add(number2.multiply(BigInteger.valueOf(bitValue)));
					}
				}
				number2 = BigInteger.valueOf(-1);
			}
		}
		if (number2.compareTo(BigInteger.ZERO) > 0) {
			if (number1.compareTo(BigInteger.ZERO) < 0) {
				number1 = number2;
			} else {
				number1 = number1.add(number2);
			}
		}
		if (number1.compareTo(BigInteger.ZERO) >= 0 && curTail > _point) {
			doCollect(collector, String.valueOf(number1), beef, offset, curTail);
		}
		else {
			super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
		}
		
		curTail = curTail > limit ? curTail : limit;
		
		//
		// 后面可能跟了计量单位
		if (units != null && CharSet.isCjkUnifiedIdeographs(beef.charAt(curTail))) {
			Hit wd = null;
			Hit wd2 = null;
			int i = curTail + 1;
			
			/*
			 * Fix issue 48: 查找计量单位引起的高亮越界错误
			 */
			while (i <= limit && (wd = units.search(beef, curTail, i - curTail)).isHit()) {
				wd2 = wd;
				i++;
				if (!wd.isUnclosed()) {
					break;
				}
			}
			i --;
			if (wd2 != null) {
				collector.collect(wd2.getWord().getText(), curTail, i);
				return i;
			}
		}
		//
		
		return curTail > limit ? curTail : -1;
	}


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy