
cc.mallet.pipe.FixedVocabTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.pipe;
import cc.mallet.types.*;
import java.io.*;
/**
 * A Unicode-aware tokenizer that keeps only tokens present in a fixed vocabulary.
 *
 * <p>The input {@link CharSequence} is scanned one code point at a time. Letters,
 * marks and underscores always extend the current token; dashes and decimal digits
 * extend it only when a token has already been started; separators and punctuation
 * end it. A finished token is kept only if it has at least {@link #minimumLength}
 * code points and is contained in the pipe's data alphabet. The instance data is
 * replaced by a {@link FeatureSequence} of the alphabet indices of the kept tokens.
 *
 * <p>Every call to {@link #pipe(Instance)} reuses the instance-level scratch buffers,
 * so one tokenizer object must not be used by several threads at the same time.
 */
public class FixedVocabTokenizer extends Pipe implements Serializable {

    /** Minimum number of code points a token must contain to be kept. */
    public int minimumLength = 3;

    /** Scratch space for the alphabet indices of the tokens found in one instance. */
    int[] tokenBuffer;

    /** Scratch space for the code points of the token currently being read. */
    int[] characterBuffer;

    /**
     * Creates a tokenizer whose vocabulary is the given alphabet.
     *
     * @param alphabet the data alphabet; only tokens it already contains are emitted
     */
    public FixedVocabTokenizer (Alphabet alphabet) {
        super(alphabet, null);
        tokenBuffer = new int[100000];
        characterBuffer = new int[1000];
    }

    /**
     * Tokenizes the instance's character data and replaces it with a
     * {@link FeatureSequence} over the data alphabet.
     *
     * <p>At most {@code tokenBuffer.length - 1} tokens are produced; when that limit is
     * reached a message is printed to standard error and the rest of the input is ignored.
     *
     * @param instance the instance whose data is a {@link CharSequence}
     * @return the same instance, with its data replaced by the token index sequence
     * @throws IllegalArgumentException if the instance data is null or not a CharSequence
     */
    public Instance pipe(Instance instance) {
        Object data = instance.getData();
        if (!(data instanceof CharSequence)) {
            // Report null data with the same exception type instead of failing on getClass().
            String found = (data == null) ? "null" : data.getClass().toString();
            throw new IllegalArgumentException("Looking for a CharSequence, found a " + found);
        }

        CharSequence characters = (CharSequence) data;
        Alphabet alphabet = this.getAlphabet();

        // Index in characterBuffer of the last code point of the current token; -1 means no token.
        int last = -1;
        int numTokens = 0;

        // Using code points instead of chars allows us to support extended Unicode.
        // Character.codePointAt takes a char index, so advance by the number of chars
        // each code point occupies rather than by one.
        int numChars = characters.length();
        int position = 0;
        while (position < numChars) {
            if (numTokens == tokenBuffer.length - 1) { System.err.println("Overflowed token buffer"); break; }

            int codePoint = Character.codePointAt(characters, position);
            position += Character.charCount(codePoint);
            int codePointType = Character.getType(codePoint);

            // The underscore is CONNECTOR_PUNCTUATION, so it must be tested before delimiters.
            if (codePoint == '_' || isWordType(codePointType)) {
                last = append(last, codePoint);
            }
            else if (codePointType == Character.DASH_PUNCTUATION ||
                     codePointType == Character.DECIMAL_DIGIT_NUMBER) {
                // Add dashes and numbers EXCEPT at the beginning of tokens
                if (last != -1) {
                    last = append(last, codePoint);
                }
            }
            else if (isDelimiterType(codePointType)) {
                // Things that delimit words
                if (last != -1) {
                    numTokens = flushToken(alphabet, last, numTokens);
                    last = -1;
                }
            }
            // Everything else (e.g. Character.CONTROL, Character.MATH_SYMBOL) is skipped
            // without ending the current token.
        }

        // Emit the token that runs up to the end of the input, if any.
        if (last != -1) {
            numTokens = flushToken(alphabet, last, numTokens);
        }

        int[] tokens = new int[numTokens];
        System.arraycopy(tokenBuffer, 0, tokens, 0, numTokens);
        instance.setData(new FeatureSequence(alphabet, tokens));
        return instance;
    }

    /** Returns true for code point types that always extend a token (letters and marks). */
    private static boolean isWordType(int codePointType) {
        switch (codePointType) {
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
            // Obscure things that are technically part of words.
            // Marks are especially useful for Indic scripts.
            case Character.COMBINING_SPACING_MARK:
            case Character.ENCLOSING_MARK:
            case Character.NON_SPACING_MARK:
            case Character.TITLECASE_LETTER:
            case Character.MODIFIER_LETTER:
            case Character.OTHER_LETTER:
                return true;
            default:
                return false;
        }
    }

    /** Returns true for code point types that end the current token. */
    private static boolean isDelimiterType(int codePointType) {
        switch (codePointType) {
            case Character.SPACE_SEPARATOR:
            case Character.LINE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
            case Character.END_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Stores a code point after position {@code last} in characterBuffer, doubling the
     * buffer when it is full, and returns the index that was written.
     */
    private int append(int last, int codePoint) {
        int next = last + 1;
        if (next == characterBuffer.length) {
            int[] larger = new int[characterBuffer.length * 2];
            System.arraycopy(characterBuffer, 0, larger, 0, characterBuffer.length);
            characterBuffer = larger;
        }
        characterBuffer[next] = codePoint;
        return next;
    }

    /**
     * Records the token held in characterBuffer[0..last] if it is long enough and in the
     * alphabet, and returns the updated token count.
     */
    private int flushToken(Alphabet alphabet, int last, int numTokens) {
        // 'last' is an index, so the token holds last + 1 code points.
        int tokenLength = last + 1;
        if (tokenLength < minimumLength) {
            return numTokens;
        }
        String token = new String(characterBuffer, 0, tokenLength);
        if (!alphabet.contains(token)) {
            return numTokens;
        }
        tokenBuffer[numTokens] = alphabet.lookupIndex(token);
        return numTokens + 1;
    }

    static final long serialVersionUID = 1;
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy