
// com.github.houbb.sentence.segment.api.impl.SentenceSegment (Maven / Gradle / Ivy)
package com.github.houbb.sentence.segment.api.impl;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.constant.CharConst;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sentence.segment.api.ISentenceSegment;
import com.github.houbb.sentence.segment.api.ISentenceSegmentContext;
import com.github.houbb.sentence.segment.api.ISentenceSegmentResult;
import com.github.houbb.sentence.segment.support.data.ISentenceSegmentData;
import java.util.Collections;
import java.util.List;
/**
 * Default {@link ISentenceSegment} implementation: walks the text one {@code char} at a time and
 * closes a sentence after every terminator character reported by the context's segment data.
 *
 * <p>The class declares no fields; all working state lives in locals of
 * {@link #segment(String, ISentenceSegmentContext)}.
 *
 * @author binbin.hou
 * @since 0.0.1
 */
@ThreadSafe
public class SentenceSegment implements ISentenceSegment {

    /**
     * Splits {@code text} into sentences.
     *
     * <p>Every result holds the sentence text (terminator included, whitespace left untouched)
     * and two indexes into {@code text}: {@code endIndex} is the index of the sentence's last
     * character, and {@code startIndex} is the {@code endIndex} of the preceding sentence
     * ({@code 0} for the first sentence).
     *
     * @param text    text to split
     * @param context supplies the terminator set ({@code data()}) and whether a run of identical
     *                terminators is kept in one sentence ({@code repeatable()})
     * @return the {@link SentenceSegmentResult} entries in order of appearance; an empty list
     *         when {@code StringUtil.isEmptyTrim(text)} holds
     */
    @Override
    public List segment(String text, ISentenceSegmentContext context) {
        if (StringUtil.isEmptyTrim(text)) {
            return Collections.emptyList();
        }

        final ISentenceSegmentData segmentData = context.data();
        final boolean repeatable = context.repeatable();
        // NOTE(review): raw List — elements are SentenceSegmentResult; presumably the interface
        // declares a parameterized list. Confirm against ISentenceSegment and restore the
        // type argument.
        final List results = Guavas.newArrayList();

        final char[] chars = text.toCharArray();
        final StringBuilder buffer = new StringBuilder();
        int startIndex = 0;
        int endIndex = 0;

        for (int i = 0; i < chars.length; i++) {
            final char currentChar = chars[i];
            buffer.append(currentChar);

            // Not a sentence boundary: keep accumulating.
            if (!segmentData.contains(currentChar) || !isActuallySplitter(i, chars)) {
                continue;
            }

            if (repeatable) {
                // Swallow the run of identical terminators (e.g. "!!!") into this sentence,
                // never reading past the end of the array.
                while (i < chars.length - 1 && chars[i + 1] == currentChar) {
                    i++;
                    buffer.append(currentChar);
                }
            }

            // The new sentence starts where the previous one ended and ends on the last
            // character consumed. endIndex is set unconditionally so that a terminator that
            // is not repeated still updates it in repeatable mode.
            startIndex = endIndex;
            endIndex = i;
            bufferToList(buffer, startIndex, endIndex, results);
        }

        // Trailing text without a terminator: give it its own indexes instead of re-using
        // those of the last terminated sentence.
        if (buffer.length() > 0) {
            startIndex = endIndex;
            endIndex = chars.length - 1;
            bufferToList(buffer, startIndex, endIndex, results);
        }
        return results;
    }

    /**
     * Special-case check for {@code CharConst.DOT} used as a terminator.
     *
     * <p>Any character other than the dot is accepted as a real terminator. A dot only counts
     * when it is the last character or is followed by a space/whitespace character; otherwise
     * it is taken to be of the form {@code a.b} and does not end the sentence.
     *
     * @param i     index of the candidate terminator
     * @param chars characters of the text being split
     * @return {@code true} if the character at {@code i} ends a sentence
     * @since 0.0.2
     */
    private boolean isActuallySplitter(final int i, final char[] chars) {
        if (CharConst.DOT != chars[i]) {
            return true;
        }
        if (i == chars.length - 1) {
            return true;
        }

        final char nextChar = chars[i + 1];
        // isSpaceChar alone is false for '\n', '\t' and '\r', so a dot at the end of a line
        // would not split; isWhitespace covers those, isSpaceChar keeps non-breaking spaces.
        return Character.isWhitespace(nextChar) || Character.isSpaceChar(nextChar);
    }

    /**
     * Moves the buffered sentence into the result list and clears the buffer.
     * Does nothing when the buffer is empty.
     *
     * @param stringBuilder buffer holding the current sentence
     * @param startIndex    start index to record on the result
     * @param endIndex      end index to record on the result
     * @param results       list receiving the new result
     * @since 0.0.1
     */
    private void bufferToList(final StringBuilder stringBuilder,
                              final int startIndex,
                              final int endIndex,
                              final List results) {
        if (stringBuilder.length() <= 0) {
            return;
        }

        String sentence = stringBuilder.toString();
        SentenceSegmentResult result = SentenceSegmentResult
                .newInstance()
                .startIndex(startIndex)
                .endIndex(endIndex)
                .sentence(sentence);

        // Reset the buffer for the next sentence.
        stringBuilder.setLength(0);
        results.add(result);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy