All downloads are free. Search and download functionality uses the official Maven repository.

com.github.houbb.sentence.segment.api.impl.SentenceSegment Maven / Gradle / Ivy

package com.github.houbb.sentence.segment.api.impl;

import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.constant.CharConst;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sentence.segment.api.ISentenceSegment;
import com.github.houbb.sentence.segment.api.ISentenceSegmentContext;
import com.github.houbb.sentence.segment.api.ISentenceSegmentResult;
import com.github.houbb.sentence.segment.support.data.ISentenceSegmentData;

import java.util.Collections;
import java.util.List;

/**
 * Splits a text into sentences at the splitter characters supplied by the
 * {@link ISentenceSegmentContext}.
 *
 * <p>Every result carries the sentence text together with its position in the
 * input as a half-open range: {@code startIndex} is inclusive and
 * {@code endIndex} is exclusive, so
 * {@code text.substring(startIndex, endIndex)} equals the sentence.</p>
 *
 * <p>The class declares no fields; all working state is local to
 * {@link #segment(String, ISentenceSegmentContext)}.</p>
 *
 * @author binbin.hou
 * @since 0.0.1
 */
@ThreadSafe
public class SentenceSegment implements ISentenceSegment {

    /**
     * Segments {@code text} into sentences.
     *
     * <p>A sentence ends at a character for which {@code context.data()}
     * reports {@code contains(...)} and which passes
     * {@link #isActuallySplitter(int, char[])}. The splitter stays part of the
     * sentence it terminates. When {@code context.repeatable()} is
     * {@code true}, directly following repetitions of the same splitter
     * character are absorbed into that sentence as well. Text remaining after
     * the last splitter is returned as a final sentence.</p>
     *
     * <p>NOTE(review): the raw {@code List} mirrors the signature as it appears
     * in this listing; the elements added are {@code SentenceSegmentResult}
     * instances. Confirm against {@code ISentenceSegment} whether a type
     * argument can be declared here.</p>
     *
     * @param text    the text to split
     * @param context supplies the splitter set and the repeatable flag
     * @return the sentences in input order; an empty list when
     *         {@code StringUtil.isEmptyTrim(text)} is {@code true}
     */
    @Override
    public List segment(String text, ISentenceSegmentContext context) {
        if (StringUtil.isEmptyTrim(text)) {
            return Collections.emptyList();
        }

        final ISentenceSegmentData segmentData = context.data();
        final boolean repeatable = context.repeatable();
        List results = Guavas.newArrayList();

        final char[] chars = text.toCharArray();
        final StringBuilder stringBuilder = new StringBuilder();
        // Inclusive offset of the first character of the sentence being built.
        int startIndex = 0;

        for (int i = 0; i < chars.length; i++) {
            final char currentChar = chars[i];
            stringBuilder.append(currentChar);

            if (!segmentData.contains(currentChar) || !isActuallySplitter(i, chars)) {
                continue;
            }

            if (repeatable) {
                // Absorb directly following repetitions of the same splitter
                // so they stay attached to the sentence they terminate.
                while (i < chars.length - 1 && chars[i + 1] == currentChar) {
                    i++;
                    stringBuilder.append(currentChar);
                }
            }

            // Exclusive end: one past the last character consumed. Computed for
            // both modes, so the index never keeps a value from an earlier
            // sentence.
            final int endIndex = i + 1;
            bufferToList(stringBuilder, startIndex, endIndex, results);
            // The next sentence begins right where this one stopped.
            startIndex = endIndex;
        }

        // Whatever follows the last splitter becomes the final sentence and
        // runs to the end of the text (no-op when the buffer is empty).
        bufferToList(stringBuilder, startIndex, chars.length, results);

        return results;
    }

    /**
     * Special-cases the dot character ({@code CharConst.DOT}) so that the
     * {@code a.b} form does not end a sentence.
     *
     * <p>Any character other than the dot is always accepted. A dot is accepted
     * when it is the last character of the text or when the character after it
     * is white space. White space covers Unicode space separators as well as
     * line breaks and tabs, so a dot at the end of a line ends the sentence.</p>
     *
     * @param i     index of the candidate character
     * @param chars the full text as a character array
     * @return {@code true} when the character at {@code i} ends a sentence
     * @since 0.0.2
     */
    private boolean isActuallySplitter(final int i, final char[] chars) {
        final char currentChar = chars[i];

        if (CharConst.DOT != currentChar) {
            return true;
        }
        if (i == chars.length - 1) {
            return true;
        }

        final char nextChar = chars[i + 1];

        // isSpaceChar alone is false for '\n', '\r' and '\t'; isWhitespace
        // alone is false for non-breaking spaces. Accept either.
        return Character.isSpaceChar(nextChar) || Character.isWhitespace(nextChar);
    }

    /**
     * Moves the buffered sentence into the result list and clears the buffer.
     * Does nothing when the buffer is empty.
     *
     * @param stringBuilder buffer holding the sentence text
     * @param startIndex    inclusive offset of the sentence in the input
     * @param endIndex      exclusive offset of the sentence in the input
     * @param results       list that receives the new result
     * @since 0.0.1
     */
    private void bufferToList(final StringBuilder stringBuilder,
                              final int startIndex,
                              final int endIndex,
                              final List results) {
        if (stringBuilder.length() == 0) {
            return;
        }

        final String sentence = stringBuilder.toString();
        SentenceSegmentResult result = SentenceSegmentResult
                .newInstance()
                .startIndex(startIndex)
                .endIndex(endIndex)
                .sentence(sentence);
        // Reset the buffer so the next sentence starts from scratch.
        stringBuilder.setLength(0);
        results.add(result);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy