org.wltea.analyzer.core.LexemePath Maven / Gradle / Ivy
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益([email protected])提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.core;
/**
* Lexeme链(路径)
*/
class LexemePath extends QuickSortSet implements Comparable{
//起始位置
private int pathBegin;
//结束
private int pathEnd;
//词元链的有效字符长度
private int payloadLength;
LexemePath(){
this.pathBegin = -1;
this.pathEnd = -1;
this.payloadLength = 0;
}
/**
* 向LexemePath追加相交的Lexeme
* @param lexeme
* @return
*/
boolean addCrossLexeme(Lexeme lexeme){
if(this.isEmpty()){
this.addLexeme(lexeme);
this.pathBegin = lexeme.getBegin();
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
this.payloadLength += lexeme.getLength();
return true;
}else if(this.checkCross(lexeme)){
this.addLexeme(lexeme);
if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
}
this.payloadLength = this.pathEnd - this.pathBegin;
return true;
}else{
return false;
}
}
/**
* 向LexemePath追加不相交的Lexeme
* @param lexeme
* @return
*/
boolean addNotCrossLexeme(Lexeme lexeme){
if(this.isEmpty()){
this.addLexeme(lexeme);
this.pathBegin = lexeme.getBegin();
this.pathEnd = lexeme.getBegin() + lexeme.getLength();
this.payloadLength += lexeme.getLength();
return true;
}else if(this.checkCross(lexeme)){
return false;
}else{
this.addLexeme(lexeme);
this.payloadLength += lexeme.getLength();
Lexeme head = this.peekFirst();
this.pathBegin = head.getBegin();
Lexeme tail = this.peekLast();
this.pathEnd = tail.getBegin() + tail.getLength();
return true;
}
}
/**
* 移除尾部的Lexeme
* @return
*/
Lexeme removeTail(){
Lexeme tail = this.pollLast();
if(this.isEmpty()){
this.pathBegin = -1;
this.pathEnd = -1;
this.payloadLength = 0;
}else{
this.payloadLength -= tail.getLength();
Lexeme newTail = this.peekLast();
this.pathEnd = newTail.getBegin() + newTail.getLength();
}
return tail;
}
/**
* 检测词元位置交叉(有歧义的切分)
* @param lexeme
* @return
*/
boolean checkCross(Lexeme lexeme){
return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
|| (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength());
}
int getPathBegin() {
return pathBegin;
}
int getPathEnd() {
return pathEnd;
}
/**
* 获取Path的有效词长
* @return
*/
int getPayloadLength(){
return this.payloadLength;
}
/**
* 获取LexemePath的路径长度
* @return
*/
int getPathLength(){
return this.pathEnd - this.pathBegin;
}
/**
* X权重(词元长度积)
* @return
*/
int getXWeight(){
int product = 1;
Cell c = this.getHead();
while( c != null && c.getLexeme() != null){
product *= c.getLexeme().getLength();
c = c.getNext();
}
return product;
}
/**
* 词元位置权重
* @return
*/
int getPWeight(){
int pWeight = 0;
int p = 0;
Cell c = this.getHead();
while( c != null && c.getLexeme() != null){
p++;
pWeight += p * c.getLexeme().getLength() ;
c = c.getNext();
}
return pWeight;
}
LexemePath copy(){
LexemePath theCopy = new LexemePath();
theCopy.pathBegin = this.pathBegin;
theCopy.pathEnd = this.pathEnd;
theCopy.payloadLength = this.payloadLength;
Cell c = this.getHead();
while( c != null && c.getLexeme() != null){
theCopy.addLexeme(c.getLexeme());
c = c.getNext();
}
return theCopy;
}
public int compareTo(LexemePath o) {
//比较有效文本长度
if(this.payloadLength > o.payloadLength){
return -1;
}else if(this.payloadLength < o.payloadLength){
return 1;
}else{
//比较词元个数,越少越好
if(this.size() < o.size()){
return -1;
}else if (this.size() > o.size()){
return 1;
}else{
//路径跨度越大越好
if(this.getPathLength() > o.getPathLength()){
return -1;
}else if(this.getPathLength() < o.getPathLength()){
return 1;
}else {
//根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
if(this.pathEnd > o.pathEnd){
return -1;
}else if(pathEnd < o.pathEnd){
return 1;
}else{
//词长越平均越好
if(this.getXWeight() > o.getXWeight()){
return -1;
}else if(this.getXWeight() < o.getXWeight()){
return 1;
}else {
//词元位置权重比较
if(this.getPWeight() > o.getPWeight()){
return -1;
}else if(this.getPWeight() < o.getPWeight()){
return 1;
}
}
}
}
}
}
return 0;
}
public String toString(){
StringBuffer sb = new StringBuffer();
sb.append("pathBegin : ").append(pathBegin).append("\r\n");
sb.append("pathEnd : ").append(pathEnd).append("\r\n");
sb.append("payloadLength : ").append(payloadLength).append("\r\n");
Cell head = this.getHead();
while(head != null){
sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
head = head.getNext();
}
return sb.toString();
}
}