
cc.mallet.extract.StringTokenization Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum mccallum@cs.umass.edu
*/
package cc.mallet.extract;
import java.io.ObjectOutputStream;
import java.io.ObjectInputStream;
import java.io.IOException;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.CharSequenceLexer;
/**
 * A token sequence whose tokens are {@link StringSpan}s over a single
 * underlying character sequence (the "document").
 */
public class StringTokenization extends TokenSequence implements Tokenization
{
  /** The character sequence that the spans of this tokenization index into. */
  private final CharSequence document;

  /**
   * Creates an empty StringTokenization over the given document.
   * No tokens are added.
   *
   * @param seq the document; stored as-is and returned by {@link #getDocument()}
   */
  public StringTokenization (CharSequence seq)
  {
    document = seq;
  }

  /**
   * Creates a tokenization of the given string.  Tokens are
   * added from all the matches of the given lexer.
   *
   * @param string the document to tokenize
   * @param lexer  the lexer to run over <code>string</code>; its character
   *               sequence is replaced by <code>string</code>
   */
  public StringTokenization (CharSequence string, CharSequenceLexer lexer)
  {
    super();
    this.document = string;
    lexer.setCharSequence (string);
    while (lexer.hasNext()) {
      // Only the offsets of the match are needed; the value returned by next() is ignored.
      lexer.next ();
      this.add (new StringSpan (string, lexer.getStartOffset(), lexer.getEndOffset()));
    }
  }

  //xxx Refactor into AbstractTokenization
  /**
   * Returns one span of the document that covers a run of tokens.
   * The span starts at the start index of token <code>firstToken</code>.
   * It ends at the end index of token <code>lastToken - 1</code>, or, when
   * <code>lastToken</code> is greater than {@link #size()}, at the end of
   * the document.
   *
   * @param firstToken index of the first token of the run
   * @param lastToken  index one past the last token of the run
   * @return a new {@link StringSpan} over the document
   * @throws ClassCastException if a boundary token is not a {@link StringSpan}
   */
  public Span subspan (int firstToken, int lastToken)
  {
    StringSpan firstSpan = (StringSpan) get(firstToken);
    int startIdx = firstSpan.getStartIdx ();

    int endIdx;
    if (lastToken > size()) {
      // Asked to run past the last token: extend the span to the end of the document.
      endIdx = document.length ();
    } else {
      StringSpan lastSpan = (StringSpan) get(lastToken - 1);
      endIdx = lastSpan.getEndIdx ();
    }

    return new StringSpan (document, startIdx, endIdx);
  }

  /**
   * Returns the token at position <code>i</code>, cast to {@link Span}.
   *
   * @param i index of the token
   * @return the i-th token as a span
   */
  public Span getSpan (int i) { return (Span) get(i); }

  /**
   * Returns the document this tokenization was created over.
   *
   * @return the character sequence passed to the constructor
   */
  public Object getDocument ()
  {
    return document;
  }

  // Serialization support

  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;

  /**
   * Writes the default serialized form followed by the current format version tag.
   */
  private void writeObject (ObjectOutputStream out) throws IOException
  {
    out.defaultWriteObject ();
    out.writeInt (CURRENT_SERIAL_VERSION);
  }

  /**
   * Reads the default serialized form followed by the format version tag
   * written by {@link #writeObject}.
   */
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    in.defaultReadObject ();
    // Consume the version tag so the stream stays aligned with writeObject.
    // The value is not inspected here, so it is read and discarded.
    in.readInt ();
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy