com.mayabot.nlp.segment.Lexer Maven / Gradle / Ivy

Go to download
/*
 * Copyright 2018 mayabot.com authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mayabot.nlp.segment;

import com.mayabot.nlp.common.Lists;
import com.mayabot.nlp.segment.pipeline.PipelineLexer;
import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder;

import java.util.List;
import java.util.function.Consumer;

/**
 * MynlpTokenizer分词器接口,只需要作用是把一句话或一小段有限长度的文本分词.
 * 
 * 切词器的实现类是个无状态，所以可以多线程安全，同一个配置只需要一个实例共享使用。
 * 所以一个固定的算法的切分,只需要一个实例。
 *
 * @author jimichan
 * @see PipelineLexer
 * @see LexerBuilder
 * @see PipelineLexerBuilder
 * @since 2.1.0
 */
public interface Lexer {

    /**
     * 对text进行分词
     *
     * @param text     分词的文本
     * @param consumer 对WordTerm对象消费者
     */
    void scan(char[] text, Consumer consumer);


    /**
     * 对text是String字符串的一个便捷包装.
     *
     * @param text     分词文本
     * @param consumer token消费者
     */
    default void scan(String text, Consumer consumer) {
        scan(text.toCharArray(), consumer);
    }

    /**
     * 便捷方法。不适用于超大文本
     *
     * @param text 需要分词的文本
     * @return Sentence 中文句子
     */
    default Sentence scan(String text) {

        if (text == null || text.isEmpty()) {
            return Sentence.of();
        }

        List target = Lists.newArrayListWithExpectedSize(Math.min(text.length() / 2, 4));
        scan(text.toCharArray(), target::add);

        return Sentence.of(target);
    }

    /**
     * 返回分析器对应的LexerReader对象
     *
     * @return LexerReader
     */
    default LexerReader reader() {
        return LexerReader.from(this);
    }

    /**
     * 返回分析器对应的LexerReader对象,这个LexerReader可以提供过滤标点符号和停用词的功能，
     * filter过自动处理posInc的值。
     *
     * @param punctuation
     * @param stopWord
     * @return LexerReader
     */
    default LexerReader filterReader(boolean punctuation, boolean stopWord) {
        return LexerReader.filter(this, punctuation, stopWord);
    }

}