// edu.stanford.nlp.process.WhitespaceTokenizer -- from the stanford-parser artifact (Maven / Gradle / Ivy).
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
// and extracts constituency parse trees.
package edu.stanford.nlp.process;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Iterator;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
/**
 * A WhitespaceTokenizer is a tokenizer that splits on and discards only
 * whitespace characters.
 * <p>
 * Tokens are created by the {@link LexedTokenFactory} handed to the
 * constructor (for example {@link Word} objects via a
 * {@code WordTokenFactory}, or {@link CoreLabel} objects via a
 * {@code CoreLabelTokenFactory}). The tokenizer has a parameter for whether
 * to make EOL a token or whether to treat EOL characters as whitespace.
 * If an EOL is a token, it is returned as a token whose word equals
 * {@code WhitespaceLexer.NEWLINE}.
 * <p>
 * <i>Implementation note:</i> This was rewritten in Apr 2006 to discard the
 * old StreamTokenizer based implementation and to replace it with a
 * Unicode compliant JFlex-based version.
 *
 * @param <T> the type of token produced; must expose its text via
 *            {@link HasWord#word()} so that newline tokens can be recognised
 *
 * @author Joseph Smarr
 * @author Teg Grenager
 * @author Roger Levy
 * @author Christopher Manning
 */
public class WhitespaceTokenizer<T extends HasWord> extends AbstractTokenizer<T> {

  /** The underlying lexer, or {@code null} if this tokenizer was built with a null Reader. */
  private final WhitespaceLexer lexer;

  /** If true, newline tokens are returned to the caller; otherwise they are skipped. */
  private final boolean eolIsSignificant;

  /**
   * A factory which vends WhitespaceTokenizers.
   *
   * @param <T> the type of token produced by the vended tokenizers
   *
   * @author Christopher Manning
   */
  public static class WhitespaceTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {

    /** Name of the boolean option that controls whether newlines become tokens. */
    private static final String TOKENIZE_NLS_OPTION = "tokenizeNLs";

    /** Whether tokenizers vended by this factory return newline tokens. */
    private boolean tokenizeNLs;

    /** Creates the token objects for every tokenizer vended by this factory. */
    private final LexedTokenFactory<T> factory;

    /**
     * Constructs a new TokenizerFactory that returns Word objects and
     * treats carriage returns as normal whitespace.
     * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
     * CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A
     * TokenizerFactory.
     *
     * @return A TokenizerFactory that returns Word objects
     */
    public static TokenizerFactory<Word> newTokenizerFactory() {
      return new WhitespaceTokenizerFactory<Word>(new WordTokenFactory(), false);
    }

    /**
     * Creates a factory whose tokenizers treat newlines as ordinary whitespace.
     *
     * @param factory the token factory used to build each token
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory) {
      this(factory, false);
    }

    /**
     * Creates a factory configured from an option string. Only the
     * {@code tokenizeNLs} boolean option is read; it defaults to false.
     *
     * @param factory the token factory used to build each token
     * @param options option string, parsed with {@code StringUtils.stringToProperties}
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory,
                                      String options) {
      this.factory = factory;
      this.tokenizeNLs = parseTokenizeNLs(options, false);
    }

    /**
     * Creates a factory with an explicit newline policy.
     *
     * @param factory the token factory used to build each token
     * @param tokenizeNLs whether vended tokenizers should return newline tokens
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory,
                                      boolean tokenizeNLs) {
      this.factory = factory;
      this.tokenizeNLs = tokenizeNLs;
    }

    /**
     * Returns a tokenizer over {@code r}, viewed as a plain Iterator.
     *
     * @param r the source of characters
     * @return the same object {@link #getTokenizer(Reader)} would return
     */
    public Iterator<T> getIterator(Reader r) {
      return getTokenizer(r);
    }

    /**
     * Returns a tokenizer over {@code r} using this factory's current newline policy.
     *
     * @param r the source of characters
     * @return a new WhitespaceTokenizer
     */
    public Tokenizer<T> getTokenizer(Reader r) {
      return new WhitespaceTokenizer<T>(factory, r, tokenizeNLs);
    }

    /**
     * Returns a tokenizer over {@code r}, letting {@code extraOptions} override
     * the newline policy for this one tokenizer only; the factory's own
     * setting is left unchanged.
     *
     * @param r the source of characters
     * @param extraOptions option string; {@code tokenizeNLs} falls back to the
     *                     factory's current setting when absent
     * @return a new WhitespaceTokenizer
     */
    public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
      boolean tokenizeNewlines = parseTokenizeNLs(extraOptions, this.tokenizeNLs);
      return new WhitespaceTokenizer<T>(factory, r, tokenizeNewlines);
    }

    /**
     * Updates this factory's newline policy from an option string. If the
     * {@code tokenizeNLs} option is absent the current setting is kept.
     *
     * @param options option string, parsed with {@code StringUtils.stringToProperties}
     */
    public void setOptions(String options) {
      tokenizeNLs = parseTokenizeNLs(options, tokenizeNLs);
    }

    /** Reads the {@code tokenizeNLs} option out of an option string, using the given default. */
    private static boolean parseTokenizeNLs(String options, boolean defaultValue) {
      Properties prop = StringUtils.stringToProperties(options);
      return PropertiesUtils.getBool(prop, TOKENIZE_NLS_OPTION, defaultValue);
    }

  } // end class WhitespaceTokenizerFactory


  /**
   * Returns a factory producing CoreLabel tokens, configured from an option string.
   *
   * @param options option string; only {@code tokenizeNLs} is read
   * @return a new factory
   */
  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
    return new WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), options);
  }

  /**
   * Returns a factory producing CoreLabel tokens that treats newlines as whitespace.
   *
   * @return a new factory
   */
  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
    return new WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory());
  }

  /**
   * Internally fetches the next token.
   *
   * @return the next token in the token stream, or null if none exists
   *         (including when there is no lexer or the lexer fails with an
   *         IOException).
   */
  @SuppressWarnings("unchecked")
  @Override
  protected T getNext() {
    if (lexer == null) {
      return null;
    }
    try {
      // The lexer is not generic; the cast is unchecked and relies on the
      // token factory given at construction producing instances of T.
      T token = (T) lexer.next();
      if ( ! eolIsSignificant) {
        // Newlines are just whitespace in this mode: skip over them.
        while (token != null && WhitespaceLexer.NEWLINE.equals(token.word())) {
          token = (T) lexer.next();
        }
      }
      return token;
    } catch (IOException e) {
      // Deliberate best effort: a read failure is reported to the caller as
      // end of input rather than as an exception.
      return null;
    }
  }

  /**
   * Constructs a new WhitespaceTokenizer.
   *
   * @param factory The factory used by the lexer to create each token. It
   *                should produce instances of {@code T}; this is not checked.
   * @param r The Reader that is its source. May be null, in which case the
   *          tokenizer yields no tokens.
   * @param eolIsSignificant Whether eol tokens should be returned.
   */
  public WhitespaceTokenizer(LexedTokenFactory<?> factory,
                             Reader r, boolean eolIsSignificant) {
    this.eolIsSignificant = eolIsSignificant;
    // NOTE(review): the null check is a historical accommodation. According
    // to the original comment, LexicalizedParser may pass a null Reader while
    // doing type-checking for sentence escaping. It possibly should not be here.
    if (r != null) {
      lexer = new WhitespaceLexer(r, factory);
    } else {
      lexer = null;
    }
  }

  /**
   * Returns a tokenizer over {@code r} that produces CoreLabel tokens and
   * treats newlines as whitespace.
   *
   * @param r the source of characters
   * @return a new tokenizer
   */
  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) {
    return new WhitespaceTokenizer<CoreLabel>(new CoreLabelTokenFactory(), r, false);
  }

  /**
   * Returns a tokenizer over {@code r} that produces CoreLabel tokens.
   *
   * @param r the source of characters
   * @param tokenizeNLs whether newline tokens should be returned
   * @return a new tokenizer
   */
  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r, boolean tokenizeNLs) {
    return new WhitespaceTokenizer<CoreLabel>(new CoreLabelTokenFactory(), r, tokenizeNLs);
  }

  /**
   * Returns a tokenizer over {@code r} that produces Word tokens and treats
   * newlines as whitespace.
   *
   * @param r the source of characters
   * @return a new tokenizer
   */
  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) {
    return newWordWhitespaceTokenizer(r, false);
  }

  /**
   * Returns a tokenizer over {@code r} that produces Word tokens.
   *
   * @param r the source of characters
   * @param eolIsSignificant whether newline tokens should be returned
   * @return a new tokenizer
   */
  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r, boolean eolIsSignificant) {
    return new WhitespaceTokenizer<Word>(new WordTokenFactory(), r, eolIsSignificant);
  }

  /**
   * Returns a factory producing Word tokens that treats newlines as whitespace.
   *
   * @return a new factory
   */
  public static TokenizerFactory<Word> factory() {
    return new WhitespaceTokenizerFactory<Word>(new WordTokenFactory(), false);
  }

  /**
   * Returns a factory producing Word tokens.
   *
   * @param eolIsSignificant whether vended tokenizers should return newline tokens
   * @return a new factory
   */
  public static TokenizerFactory<Word> factory(boolean eolIsSignificant) {
    return new WhitespaceTokenizerFactory<Word>(new WordTokenFactory(), eolIsSignificant);
  }

  /**
   * Reads a file from the argument and prints its tokens one per line.
   * This is mainly as a testing aid, but it can also be quite useful
   * standalone to turn a corpus into a one token per line file of tokens.
   * <p>
   * Usage: <code>java edu.stanford.nlp.process.WhitespaceTokenizer [-cr] [filename]</code>
   * <p>
   * If the first argument is {@code -cr}, newlines are returned as tokens and
   * printed as {@code ***CR***}. The last argument, unless it is {@code -cr},
   * names the UTF-8 input file; with no file name, standard input is read.
   * Output is written to standard output as UTF-8.
   *
   * @param args Command line arguments
   * @throws IOException If can't open files, etc.
   */
  public static void main(String[] args) throws IOException {
    boolean eolIsSignificant = (args.length > 0 && args[0].equals("-cr"));
    boolean readFromFile = (args.length > 0 && ! args[args.length - 1].equals("-cr"));
    Reader reader;
    if (readFromFile) {
      reader = new InputStreamReader(new FileInputStream(args[args.length - 1]), "UTF-8");
    } else {
      reader = new InputStreamReader(System.in, "UTF-8");
    }
    try {
      WhitespaceTokenizer<Word> tokenizer =
          new WhitespaceTokenizer<Word>(new WordTokenFactory(), reader, eolIsSignificant);
      // Autoflush is on, so every println reaches System.out immediately.
      PrintWriter pw =
          new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
      while (tokenizer.hasNext()) {
        Word w = tokenizer.next();
        if (w.value().equals(WhitespaceLexer.NEWLINE)) {
          pw.println("***CR***");
        } else {
          pw.println(w);
        }
      }
    } finally {
      // Only close what we opened; leave System.in alone for any caller.
      if (readFromFile) {
        reader.close();
      }
    }
  }

} // end class WhitespaceTokenizer