org.wltea.analyzer.core.Lexeme Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of ik-analyzers Show documentation
bestwu ik-analyzer
The newest version!
package org.wltea.analyzer.core;

/**
 * IK lexeme (token) object.
 *
 * <p>A lexeme is described by a segment {@code offset}, a {@code begin} position relative
 * to that offset, a {@code length}, an optional text and an integer type tag (one of the
 * {@code TYPE_*} constants).
 *
 * <p>Note: {@link #compareTo(Lexeme)} orders by {@code begin} and {@code length} only,
 * whereas {@link #equals(Object)} additionally compares {@code offset}; the natural
 * ordering is therefore not fully consistent with equals.
 */
public class Lexeme implements Comparable<Lexeme> {
	// lexemeType constants

	/** Unknown type. */
	public static final int TYPE_UNKNOWN = 0;
	/** English. */
	public static final int TYPE_ENGLISH = 1;
	/** Digits. */
	public static final int TYPE_ARABIC = 2;
	/** Mixed English letters and digits. */
	public static final int TYPE_LETTER = 3;
	/** Chinese word. */
	public static final int TYPE_CNWORD = 4;
	/** Single Chinese character. */
	public static final int TYPE_CNCHAR = 64;
	/** Japanese / Korean characters. */
	public static final int TYPE_OTHER_CJK = 8;
	/** Chinese numeral. */
	public static final int TYPE_CNUM = 16;
	/** Chinese measure word. */
	public static final int TYPE_COUNT = 32;
	/** Chinese numeral followed by a measure word (16 + 32). */
	public static final int TYPE_CQUAN = 48;

	/** Starting offset of the lexeme. */
	private int offset;
	/** Start position of the lexeme, relative to {@link #offset}. */
	private int begin;
	/** Length of the lexeme. */
	private int length;
	/** Text of the lexeme; may be {@code null} until {@link #setLexemeText(String)} is called. */
	private String lexemeText;
	/** Type of the lexeme (one of the {@code TYPE_*} constants). */
	private int lexemeType;

	/**
	 * Creates a lexeme without text.
	 *
	 * @param offset     starting offset
	 * @param begin      start position relative to {@code offset}
	 * @param length     length of the lexeme, must not be negative
	 * @param lexemeType type tag
	 * @throws IllegalArgumentException if {@code length} is negative
	 */
	public Lexeme(int offset, int begin, int length, int lexemeType) {
		this.offset = offset;
		this.begin = begin;
		if (length < 0) {
			throw new IllegalArgumentException("length < 0");
		}
		this.length = length;
		this.lexemeType = lexemeType;
	}

	/**
	 * Two lexemes are equal when their offset, begin position and length are all equal.
	 * Text and type are not compared.
	 *
	 * @see java.lang.Object#equals(Object o)
	 */
	@Override
	public boolean equals(Object o) {
		if (o == null) {
			return false;
		}

		if (this == o) {
			return true;
		}

		if (o instanceof Lexeme) {
			Lexeme other = (Lexeme) o;
			return this.offset == other.getOffset()
					&& this.begin == other.getBegin()
					&& this.length == other.getLength();
		} else {
			return false;
		}
	}

	/**
	 * Hash code derived from the absolute begin position, absolute end position and length.
	 *
	 * @see java.lang.Object#hashCode()
	 */
	@Override
	public int hashCode() {
		int absBegin = getBeginPosition();
		int absEnd = getEndPosition();
		int len = getLength();
		// A zero-length lexeme is legal (constructor accepts 0, setLexemeText(null) sets 0);
		// "% 0" would throw ArithmeticException, so the mixing term is dropped in that case.
		int mix = (len == 0) ? 0 : (absBegin * absEnd) % len;
		return (absBegin * 37) + (absEnd * 31) + mix * 11;
	}

	/**
	 * Ordering used in sorted collections: smaller begin position first; for the same
	 * begin position the longer lexeme comes first.
	 *
	 * @param other the lexeme to compare against
	 * @return negative, zero or positive as this lexeme sorts before, equal to or after {@code other}
	 * @see java.lang.Comparable#compareTo(java.lang.Object)
	 */
	@Override
	public int compareTo(Lexeme other) {
		// begin position takes precedence
		if (this.begin < other.getBegin()) {
			return -1;
		} else if (this.begin == other.getBegin()) {
			// then the longer lexeme sorts first
			if (this.length > other.getLength()) {
				return -1;
			} else if (this.length == other.getLength()) {
				return 0;
			} else { // this.length < other.getLength()
				return 1;
			}

		} else { // this.begin > other.getBegin()
			return 1;
		}
	}

	/**
	 * @return the starting offset
	 */
	public int getOffset() {
		return offset;
	}

	/**
	 * @param offset the starting offset
	 */
	public void setOffset(int offset) {
		this.offset = offset;
	}

	/**
	 * @return the begin position relative to the offset
	 */
	public int getBegin() {
		return begin;
	}

	/**
	 * Returns the start position of the lexeme in the text ({@code offset + begin}).
	 *
	 * @return int
	 */
	public int getBeginPosition() {
		return offset + begin;
	}

	/**
	 * @param begin the begin position relative to the offset
	 */
	public void setBegin(int begin) {
		this.begin = begin;
	}

	/**
	 * Returns the end position of the lexeme in the text ({@code offset + begin + length}).
	 *
	 * @return int
	 */
	public int getEndPosition() {
		return offset + begin + length;
	}

	/**
	 * Returns the character length of the lexeme.
	 *
	 * @return int
	 */
	public int getLength() {
		return this.length;
	}

	/**
	 * Sets the length of the lexeme.
	 *
	 * @param length new length, must not be negative
	 * @throws IllegalArgumentException if {@code length} is negative
	 */
	public void setLength(int length) {
		// Validate the incoming value, not the current field.
		if (length < 0) {
			throw new IllegalArgumentException("length < 0");
		}
		this.length = length;
	}

	/**
	 * Returns the text of the lexeme, or an empty string when no text has been set.
	 *
	 * @return String
	 */
	public String getLexemeText() {
		if (lexemeText == null) {
			return "";
		}
		return lexemeText;
	}

	/**
	 * Sets the text of the lexeme and updates the length to match it.
	 * A {@code null} text is stored as the empty string with length 0.
	 *
	 * @param lexemeText the new text, may be {@code null}
	 */
	public void setLexemeText(String lexemeText) {
		if (lexemeText == null) {
			this.lexemeText = "";
			this.length = 0;
		} else {
			this.lexemeText = lexemeText;
			this.length = lexemeText.length();
		}
	}

	/**
	 * Returns the lexeme type.
	 *
	 * @return int
	 */
	public int getLexemeType() {
		return lexemeType;
	}

	/**
	 * Returns the string label of the lexeme type.
	 *
	 * @return String
	 */
	public String getLexemeTypeString() {
		switch (lexemeType) {

		case TYPE_ENGLISH:
			return "ENGLISH";

		case TYPE_ARABIC:
			return "ARABIC";

		case TYPE_LETTER:
			return "LETTER";

		case TYPE_CNWORD:
			return "CN_WORD";

		case TYPE_CNCHAR:
			return "CN_CHAR";

		case TYPE_OTHER_CJK:
			return "OTHER_CJK";

		case TYPE_COUNT:
			return "COUNT";

		case TYPE_CNUM:
			return "TYPE_CNUM";

		case TYPE_CQUAN:
			return "TYPE_CQUAN";

		default:
			// Spelling kept as-is: callers may already match on this exact label.
			return "UNKONW";
		}
	}

	/**
	 * @param lexemeType the new type tag
	 */
	public void setLexemeType(int lexemeType) {
		this.lexemeType = lexemeType;
	}

	/**
	 * Merges an adjacent lexeme into this one. The merge succeeds only when {@code l}
	 * starts exactly where this lexeme ends; this lexeme's length then grows by
	 * {@code l}'s length and its type is replaced. The stored text is not modified.
	 *
	 * @param l          the lexeme to append, may be {@code null}
	 * @param lexemeType the type assigned to the merged lexeme
	 * @return boolean whether the lexemes were merged
	 */
	public boolean append(Lexeme l, int lexemeType) {
		if (l != null && this.getEndPosition() == l.getBeginPosition()) {
			this.length += l.getLength();
			this.lexemeType = lexemeType;
			return true;
		} else {
			return false;
		}
	}

	/**
	 * Returns {@code "<beginPosition>-<endPosition> : <text> : \t<typeLabel>"}.
	 * The raw text field is used, so an unset text is rendered as {@code "null"}.
	 */
	@Override
	public String toString() {
		return String.valueOf(this.getBeginPosition()) + "-" + this.getEndPosition() +
				" : " + this.lexemeText + " : \t" +
				this.getLexemeTypeString();
	}

}