All downloads are FREE. Search and download functionality uses the official Maven repository.

com.mayabot.nlp.common.TokenizerSplitter Maven / Gradle / Ivy

/*
 * Copyright 2018 mayabot.com authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mayabot.nlp.common;

import com.mayabot.nlp.common.utils.Characters;

import java.util.ArrayList;
import java.util.List;

/**
 * 取得文本中的一个个文字段落的from,to
 * 

* 调用next之后=true,才可以方法 from() and to()方法 * 如果英文的话,返回的就是单词 * * @author jimichan */ public class TokenizerSplitter { //包含index private int fromIndex = -1; //(不包含) private int toIndex = -1; private CharSequence sequence; private int length; public static TokenizerSplitter create(CharSequence sequence) { return new TokenizerSplitter(sequence); } private TokenizerSplitter(CharSequence sequence) { rest(sequence); } public void rest(CharSequence sequence) { this.sequence = sequence; this.length = sequence.length(); } private int point = 0; /** * 移动到下一游标 * * @return boolean */ public boolean next() { if (point == length) { //最后一个也是标点 return false; } //找到第一个不是标点的字母 while (point < length && isSpliter(sequence.charAt(point))) { point++; } if (point == length) { //最后一个也是标点 return false; } //记录start位置 this.fromIndex = point; //找到下一个标点符号的位置 while (point < length && !isSpliter(sequence.charAt(point))) { point++; } this.toIndex = point; return true; } public CharSequence group() { return sequence.subSequence(fromIndex, toIndex); } public final int from() { return fromIndex; } public final int to() { return toIndex; } /** * 是否段落切分器 * * @param c * @return */ private boolean isSpliter(char c) { return Characters.isPunctuation(c); } public static List parts(String string) { TokenizerSplitter p = create(string); ArrayList list = new ArrayList<>(); while (p.next()) { list.add((String) p.group()); } return list; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy