org.jpmml.evaluator.TextSplitter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pmml-evaluator Show documentation
Show all versions of pmml-evaluator Show documentation
JPMML class model evaluator
The newest version!
/*
* Copyright (c) 2021 Villu Ruusmann
*
* This file is part of JPMML-Evaluator
*
* JPMML-Evaluator is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* JPMML-Evaluator is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with JPMML-Evaluator. If not, see <https://www.gnu.org/licenses/>.
*/
package org.jpmml.evaluator;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.dmg.pmml.PMMLObject;
import org.dmg.pmml.TextIndex;
import org.jpmml.model.TermUtil;
/**
 * A tokenizer that treats every match of its pattern as a separator:
 * the text between consecutive matches becomes a candidate token.
 *
 * <p>
 * Each candidate is passed through {@link TermUtil#trimPunctuation(String)},
 * and candidates that are empty after trimming are discarded.
 * </p>
 *
 * @see TextIndex#getWordSeparatorCharacterRE()
 */
public class TextSplitter extends TextTokenizer {

	/**
	 * Creates a splitter from a separator regular expression.
	 *
	 * @param wordSeparatorCharacterRE The separator regular expression; it is compiled using <code>RegExUtil.compile</code>.
	 * @param context The PMML object that is handed to <code>RegExUtil.compile</code> together with the expression.
	 */
	public TextSplitter(String wordSeparatorCharacterRE, PMMLObject context){
		this(RegExUtil.compile(wordSeparatorCharacterRE, context));
	}

	/**
	 * Creates a splitter from an already compiled separator pattern.
	 *
	 * @param pattern The separator pattern.
	 */
	public TextSplitter(Pattern pattern){
		super(pattern);
	}

	/**
	 * Splits the string at every match of the separator pattern.
	 *
	 * @param string The text to tokenize.
	 *
	 * @return The trimmed, non-empty tokens in their order of appearance,
	 * or {@link TokenizedString#EMPTY} if the string is empty or if it does not contain a separator and nothing remains after trimming.
	 */
	@Override
	public TokenizedString tokenize(String string){
		Pattern pattern = getPattern();

		if(("").equals(string)){
			return TokenizedString.EMPTY;
		}

		Matcher matcher = pattern.matcher(string);

		// No separator at all: the whole string is the only candidate token
		if(!matcher.find()){
			String token = TermUtil.trimPunctuation(string);

			if(!token.isEmpty()){
				return new TokenizedString(token);
			}

			return TokenizedString.EMPTY;
		}

		// Initial capacity is only a sizing heuristic; the list grows on demand
		List<String> tokens = new ArrayList<>(Math.max(string.length() / 4, 16));

		// Start offset of the text segment that follows the most recent separator
		int index = 0;

		do {
			String token = TermUtil.trimPunctuation(string.substring(index, matcher.start()));

			if(!token.isEmpty()){
				tokens.add(token);
			}

			index = matcher.end();
		} while(matcher.find());

		// The trailing segment after the last separator
		String token = TermUtil.trimPunctuation(string.substring(index));

		if(!token.isEmpty()){
			tokens.add(token);
		}

		return new TokenizedString(tokens);
	}
}