
com.wcohen.ss.tokens.SerializableSimpleTokenizer Maven / Gradle / Ivy
package com.wcohen.ss.tokens;
import com.wcohen.ss.api.Token;
import com.wcohen.ss.api.Tokenizer;
import java.io.Serializable;
import java.util.*;
public class SerializableSimpleTokenizer implements Tokenizer, Serializable {
/** Shared tokenizer instance, constructed with both flags set to {@code true}.
 * The reference is final but the instance itself is mutable: calling
 * {@link #setIgnorePunctuation} or {@link #setIgnoreCase} on it changes the
 * flags for every user of this shared object. */
public static final SerializableSimpleTokenizer DEFAULT_TOKENIZER = new SerializableSimpleTokenizer(true,true);
// Punctuation flag; the visible constructor always overwrites this initial value.
private boolean ignorePunctuation = true;
// Case flag; the visible constructor always overwrites this initial value.
private boolean ignoreCase = true;
/**
 * Creates a tokenizer with the two configuration flags set explicitly.
 *
 * @param ignorePunctuation initial value of the punctuation flag
 * @param ignoreCase initial value of the case flag
 */
public SerializableSimpleTokenizer(boolean ignorePunctuation, boolean ignoreCase)
{
    // The two assignments are independent of each other; both flags are
    // simply stored for later use by this instance.
    this.ignoreCase = ignoreCase;
    this.ignorePunctuation = ignorePunctuation;
}
// parameter setting
/**
 * Replaces the current value of the punctuation flag.
 * When invoked on {@code DEFAULT_TOKENIZER} the change is visible to every
 * user of that shared instance.
 *
 * @param flag new value of the punctuation flag
 */
public void setIgnorePunctuation(boolean flag)
{
    this.ignorePunctuation = flag;
}
/**
 * Replaces the current value of the case flag.
 * When invoked on {@code DEFAULT_TOKENIZER} the change is visible to every
 * user of that shared instance.
 *
 * @param flag new value of the case flag
 */
public void setIgnoreCase(boolean flag)
{
    this.ignoreCase = flag;
}
/**
 * Describes this tokenizer's configuration.
 *
 * @return a string of the form {@code [SimpleTokenizer <ignorePunctuation>;<ignoreCase>]},
 *         where each flag is rendered as {@code true} or {@code false}
 */
@Override
public String toString()
{
    StringBuilder description = new StringBuilder("[SimpleTokenizer ");
    description.append(ignorePunctuation);
    description.append(';');
    description.append(ignoreCase);
    description.append(']');
    return description.toString();
}
/** Return tokenized version of a string. Tokens are sequences
* of alphanumerics, or any single punctuation character. */
public Token[] tokenize(String input)
{
List tokens = new ArrayList();
int cursor = 0;
while (cursor
© 2015 - 2025 Weber Informatics LLC | Privacy Policy