All downloads are free. Search and download functionality uses the official Maven repository.

justhalf.nlp.tokenizer.RegexTokenizer Maven / Gradle / Ivy

package justhalf.nlp.tokenizer;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreLabel;

/**
 * An implementation of {@link Tokenizer} using simple regular expression
* * The regular expression is inspired by the wordpunct_tokenize method in NLTK Python library */ public class RegexTokenizer implements Tokenizer { public static final String DEFAULT_REGEX = "[ \\t\\r\\n]+|" + "((?<=[\\w\\p{IsL}])(?=[^\\w\\p{IsL}]))|" // Previous char is letter, next is non-letter + "((?<=[^\\w\\p{IsL}])(?=[\\w\\p{IsL}]))"; // Previous char is non-letter, next is letter public Pattern pattern; public RegexTokenizer() { pattern = Pattern.compile(DEFAULT_REGEX); } public RegexTokenizer(String regex){ pattern = Pattern.compile(regex); } @Override public String[] tokenizeToString(String sentence) { List words = tokenize(sentence); String[] result = new String[words.size()]; for(int i=0; i tokenize(String sentence) { List result = new ArrayList(); Matcher matcher = pattern.matcher(sentence); int lastEndPos = 0; String lastBetweenText = ""; while(matcher.find()){ int start = matcher.start(); int end = matcher.end(); if(start == lastEndPos && end == lastEndPos){ continue; } String wordText = sentence.substring(lastEndPos, start); String betweenText = sentence.substring(start, end); CoreLabel word = new CoreLabel(); word.setBefore(lastBetweenText); word.setBeginPosition(lastEndPos); word.setEndPosition(start); word.setValue(wordText); word.setWord(wordText); word.setOriginalText(wordText); word.setAfter(betweenText); lastEndPos = end; lastBetweenText = betweenText; result.add(word); } if(lastEndPos != sentence.length()){ int start = sentence.length(); int end = sentence.length(); String wordText = sentence.substring(lastEndPos, start); String betweenText = sentence.substring(start, end); CoreLabel word = new CoreLabel(); word.setBefore(lastBetweenText); word.setBeginPosition(lastEndPos); word.setEndPosition(start); word.setValue(wordText); word.setWord(wordText); word.setOriginalText(wordText); word.setAfter(betweenText); lastEndPos = end; lastBetweenText = betweenText; result.add(word); } return result; } @Override public boolean isThreadSafe(){ return 
true; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy