hr.fer.zemris.takelab.splitter.TokenSplitter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
package hr.fer.zemris.takelab.splitter;
import java.util.ArrayList;
import java.util.List;
/**
* A very simple implementation of a token splitter. Splitting is modeled after the splitting done by the CSTLemma lemmatiser.
* @author Luka Skukan
*
*/
public class TokenSplitter {

    /**
     * Whitespace characters that separate tokens (vertical tab not included).
     */
    private static final String SPACES = " \t\n\r";

    /**
     * Punctuation characters (according to CSTLemma). Each occurrence both
     * ends the token being built and is emitted as a token of its own.
     */
    private static final String PUNCTUATION = ".,;?!:()";

    /**
     * Takes a string and returns a list of all tokens contained within it.
     * Any run of characters that are neither whitespace nor punctuation,
     * delimited by punctuation, whitespace or the ends of the string, is a token.
     * Likewise, every single instance of a punctuation character is a token.
     * Whitespace itself never appears in the result.
     *
     * @param sentence A string from which tokens are extracted
     * @return List of tokens in the given string, in order of appearance;
     *         empty if the string contains only whitespace or is empty
     * @throws NullPointerException if {@code sentence} is {@code null}
     */
    public static List<String> getTokens(String sentence) {
        // Token container
        List<String> tokens = new ArrayList<String>();
        // Buffer for the token currently being built
        StringBuilder buff = new StringBuilder();

        for (char c : sentence.toCharArray()) {
            if (PUNCTUATION.indexOf(c) != -1) {
                // Punctuation both delimits a pending token and IS a token
                flush(buff, tokens);
                tokens.add(String.valueOf(c));
            } else if (SPACES.indexOf(c) != -1) {
                // Whitespace only delimits; it is never emitted
                flush(buff, tokens);
            } else {
                // Anything else extends the token currently being built
                buff.append(c);
            }
        }

        // If a token is still left in the buffer, wrap it up
        flush(buff, tokens);
        return tokens;
    }

    /**
     * Appends the buffered token to the list and clears the buffer.
     * Does nothing when the buffer is empty, so no empty tokens are produced.
     *
     * @param buff Buffer holding the token built so far
     * @param tokens List receiving the finished token
     */
    private static void flush(StringBuilder buff, List<String> tokens) {
        if (buff.length() > 0) {
            tokens.add(buff.toString());
            buff.setLength(0);
        }
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy