edu.stanford.nlp.process.TokenizerAdapter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.process;
import java.io.IOException;
import java.io.StreamTokenizer;
/**
 * This class adapts between a {@code java.io.StreamTokenizer}
 * and a {@code edu.stanford.nlp.process.Tokenizer}.
 * <p>
 * Every token reported by the wrapped {@code StreamTokenizer} is converted
 * to a {@code String}: words and the bodies of quoted strings are returned
 * unchanged, numbers are rendered with {@code Double.toString}, an
 * end-of-line token is reported as the configurable end-of-line string, and
 * any other single character is returned as a one-character string.
 *
 * @author Christopher Manning
 * @version 2004/04/01
 */
public class TokenizerAdapter extends AbstractTokenizer {

  /** The wrapped tokenizer that tokens are pulled from. */
  protected final StreamTokenizer st;

  /** The String handed out for an end-of-line token; defaults to the empty String. */
  protected String eolString = "";

  /**
   * Create a new {@code TokenizerAdapter}. In general, it is recommended
   * that the passed in {@code StreamTokenizer} should have had
   * {@code resetSyntax()} done to it, so that numbers are returned as
   * entered as tokens of type {@code String}, though this code will cope
   * as best it can.
   *
   * @param st The internal {@code java.io.StreamTokenizer}. It is stored
   *     as given and is dereferenced on every call to {@link #getNext()},
   *     so it should not be {@code null}.
   */
  public TokenizerAdapter(StreamTokenizer st) {
    this.st = st;
  }

  /**
   * Internally fetches the next token.
   * <p>
   * An {@code IOException} raised by the wrapped tokenizer is not
   * propagated: it ends the token stream by returning {@code null}, so a
   * read failure cannot be told apart from a normal end of input.
   *
   * @return the next token in the token stream, or {@code null} if none
   *     exists (end of input, or a read error in the wrapped tokenizer).
   */
  @Override
  public String getNext() {
    final int tokenType;
    try {
      tokenType = st.nextToken();
    } catch (IOException ignored) {
      // Deliberate best-effort: treat a failed read like end of input.
      return null;
    }

    switch (tokenType) {
      case StreamTokenizer.TT_EOL:
        return eolString;
      case StreamTokenizer.TT_EOF:
        return null;
      case StreamTokenizer.TT_WORD:
        return st.sval;
      case StreamTokenizer.TT_NUMBER:
        // The original digits are gone by now; nval is all that is left.
        return Double.toString(st.nval);
      default:
        // For a quoted string, StreamTokenizer sets the token type to the
        // delimiter character and stores the string body in sval. Return
        // the body rather than only the delimiter, which would silently
        // drop the quoted text from the token stream.
        if (st.sval != null) {
          return st.sval;
        }
        // Otherwise the token type is the ordinary character itself.
        return String.valueOf((char) tokenType);
    }
  }

  /**
   * Set the {@code String} returned when the inner tokenizer returns an
   * end-of-line token. This will only happen if the inner tokenizer has
   * been set to {@code eolIsSignificant(true)}.
   *
   * @param eolString The String used to represent eol. It is not allowed
   *     to be {@code null} (which would confuse line ends and file end).
   * @throws IllegalArgumentException if {@code eolString} is {@code null}
   */
  public void setEolString(String eolString) {
    if (eolString == null) {
      throw new IllegalArgumentException("eolString cannot be null");
    }
    this.eolString = eolString;
  }

  /**
   * Say whether the {@code String} is the end-of-line token for this
   * tokenizer.
   *
   * @param str The String being tested
   * @return Whether it is the end-of-line token
   */
  public boolean isEol(String str) {
    return eolString.equals(str);
  }

}