
eu.fbk.twm.utils.analysis.StandardTokenizer Maven / Gradle / Ivy
The newest version!
package eu.fbk.twm.utils.analysis;
import org.apache.log4j.Logger;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
/**
* Created with IntelliJ IDEA.
* User: giuliano
* Date: 1/15/13
* Time: 2:02 PM
* To change this template use File | Settings | File Templates.
*/
public class StandardTokenizer extends AbstractTokenizer implements Tokenizer {
/** Class-wide log4j logger, registered under this class's fully qualified name. */
static Logger logger = Logger.getLogger(StandardTokenizer.class);
/** Shared instance handed out by {@link #getInstance()}; {@code null} until first requested. */
private static StandardTokenizer ourInstance;
/**
 * Returns the shared {@code StandardTokenizer}, creating it on the first call.
 * The method is {@code synchronized}, so concurrent first calls cannot
 * construct two instances.
 *
 * @return the single shared tokenizer instance
 */
public static synchronized StandardTokenizer getInstance() {
    StandardTokenizer shared = ourInstance;
    if (shared != null) {
        return shared;
    }
    // First request: build the instance and publish it for later callers.
    shared = new StandardTokenizer();
    ourInstance = shared;
    return shared;
}
/**
 * Splits {@code text} into its word forms.
 * <p>
 * Segmentation is delegated to {@link BreakIterator#getWordInstance()}, which
 * applies the word-break rules of the JVM default locale. Every segment whose
 * first character is rejected by {@code isSeparatorChar} is dropped; all other
 * segments are returned in their original order.
 *
 * @param text the text to tokenize; must not be {@code null}
 * @return the retained segments, possibly an empty array
 */
public String[] stringArray(String text) {
    // Typed list: with a raw List, toArray(T[]) erases to Object[] and the
    // return statement below would not compile against String[].
    List<String> forms = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        // Boundaries are strictly increasing, so text.charAt(start) is the
        // first character of the segment; checking it before substring()
        // avoids allocating strings for segments that are discarded.
        // NOTE(review): isSeparatorChar is not defined in the visible part of
        // this file; presumably inherited from AbstractTokenizer.
        if (!isSeparatorChar(text.charAt(start))) {
            forms.add(text.substring(start, end));
        }
    }
    return forms.toArray(new String[forms.size()]);
}
/**
 * Splits {@code text} into {@code Token} objects.
 * <p>
 * Segmentation is delegated to {@link BreakIterator#getWordInstance()}, which
 * applies the word-break rules of the JVM default locale. Every segment whose
 * first character is rejected by {@code isSeparatorChar} is dropped. Each
 * retained segment becomes {@code new Token(start, end, form)}, where
 * {@code form} is {@code text.substring(start, end)} (so {@code start} is
 * inclusive and {@code end} exclusive).
 *
 * @param text the text to tokenize; must not be {@code null}
 * @return the tokens in order of appearance, possibly an empty array
 */
public Token[] tokenArray(String text) {
    // Typed list: with a raw List, toArray(T[]) erases to Object[] and the
    // return statement below would not compile against Token[].
    List<Token> tokens = new ArrayList<Token>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        // Boundaries are strictly increasing, so text.charAt(start) is the
        // first character of the segment; checking it first skips the
        // substring allocation for discarded segments.
        // NOTE(review): isSeparatorChar is not defined in the visible part of
        // this file; presumably inherited from AbstractTokenizer.
        if (!isSeparatorChar(text.charAt(start))) {
            tokens.add(new Token(start, end, text.substring(start, end)));
        }
    }
    return tokens.toArray(new Token[tokens.size()]);
}
/*int indexOfApostrophe(String form)
{
char ch;
for (int i=0;i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy