// hr.fer.zemris.takelab.splitter.TokenSplitter Maven / Gradle / Ivy
package hr.fer.zemris.takelab.splitter;
import java.util.ArrayList;
import java.util.List;
/**
 * A very simple implementation of a token splitter. Splitting is modeled after the
 * splitting done by the CSTLemma lemmatiser.
 *
 * @author Luka Skukan
 */
public class TokenSplitter {

    /**
     * Whitespace characters that separate tokens (vertical tab not included).
     */
    private static final String SPACES = " \t\n\r";

    /**
     * Punctuation characters (according to CSTLemma). Each occurrence is a token of its own.
     */
    private static final String PUNCTUATION = ".,;?!:()";

    /**
     * Takes a string and returns a list of all tokens contained within it.
     * Any non-whitespace run of characters delimited by punctuation or whitespace is
     * considered a token. Likewise, every instance of a punctuation character is also
     * a token. Whitespace itself never appears in the result.
     *
     * @param sentence a string from which tokens are extracted
     * @return list of tokens in the given string, in order of appearance; empty if the
     *         string contains no tokens
     * @throws NullPointerException if {@code sentence} is {@code null}
     */
    public static List<String> getTokens(String sentence) {
        // Token container
        List<String> tokens = new ArrayList<String>();
        // Buffer holding the word token currently being built
        StringBuilder buff = new StringBuilder();

        for (char c : sentence.toCharArray()) {
            if (PUNCTUATION.indexOf(c) != -1) {
                // Punctuation both ends the pending word and IS a token itself
                flush(buff, tokens);
                tokens.add(String.valueOf(c));
            } else if (SPACES.indexOf(c) != -1) {
                // Whitespace only ends the pending word (if any)
                flush(buff, tokens);
            } else {
                // Any other character extends the word being built
                buff.append(c);
            }
        }

        // If a word is still pending at end of input, wrap it up
        flush(buff, tokens);
        return tokens;
    }

    /**
     * Moves the pending word (if non-empty) from the buffer into the token list and
     * clears the buffer.
     */
    private static void flush(StringBuilder buff, List<String> tokens) {
        if (buff.length() > 0) {
            tokens.add(buff.toString());
            buff.setLength(0);
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy