All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.process.WhitespaceTokenizer Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.process;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Iterator;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

/**
 * A WhitespaceTokenizer is a tokenizer that splits on and discards only
 * whitespace characters.
 * This implementation returns Word objects. It has a parameter for whether
 * to make EOL a token or whether to treat EOL characters as whitespace.
 * If an EOL is a token, the class returns it as a Word with String value "\n".
 * 

* Implementation note: This was rewritten in Apr 2006 to discard the * old StreamTokenizer based implementation and to replace it with a * Unicode compliant JFlex-based version. * * @author Joseph Smarr ([email protected]) * @author Teg Grenager ([email protected]) * @author Roger Levy * @author Christopher Manning */ public class WhitespaceTokenizer extends AbstractTokenizer { private WhitespaceLexer lexer; private final boolean eolIsSignificant; /** * A factory which vends WhitespaceTokenizers. * * @author Christopher Manning */ public static class WhitespaceTokenizerFactory implements TokenizerFactory { private boolean tokenizeNLs; private LexedTokenFactory factory; /** * Constructs a new TokenizerFactory that returns Word objects and * treats carriage returns as normal whitespace. * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP * CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A * TokenizerFactory. * * @return A TokenizerFactory that returns Word objects */ public static TokenizerFactory newTokenizerFactory() { return new WhitespaceTokenizerFactory(new WordTokenFactory(), false); } public WhitespaceTokenizerFactory(LexedTokenFactory factory) { this(factory, false); } public WhitespaceTokenizerFactory(LexedTokenFactory factory, String options) { this.factory = factory; Properties prop = StringUtils.stringToProperties(options); this.tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", false); } public WhitespaceTokenizerFactory(LexedTokenFactory factory, boolean tokenizeNLs) { this.factory = factory; this.tokenizeNLs = tokenizeNLs; } public Iterator getIterator(Reader r) { return getTokenizer(r); } public Tokenizer getTokenizer(Reader r) { return new WhitespaceTokenizer(factory, r, tokenizeNLs); } public Tokenizer getTokenizer(Reader r, String extraOptions) { Properties prop = StringUtils.stringToProperties(extraOptions); boolean tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs); return new WhitespaceTokenizer(factory, r, tokenizeNewlines); } public void setOptions(String options) { Properties prop = StringUtils.stringToProperties(options); tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", tokenizeNLs); } } // end class WhitespaceTokenizerFactory public static WhitespaceTokenizerFactory newCoreLabelTokenizerFactory(String options) { return new WhitespaceTokenizerFactory(new CoreLabelTokenFactory(), options); } public static WhitespaceTokenizerFactory newCoreLabelTokenizerFactory() { return new WhitespaceTokenizerFactory(new CoreLabelTokenFactory()); } /** * Internally fetches the next token. * * @return the next token in the token stream, or null if none exists. */ @SuppressWarnings("unchecked") @Override protected T getNext() { T token = null; if (lexer == null) { return token; } try { token = (T) lexer.next(); while (token != null && token.word().equals(WhitespaceLexer.NEWLINE)) { if (eolIsSignificant) { return token; } else { token = (T) lexer.next(); } } } catch (IOException e) { // do nothing, return null } return token; } /** * Constructs a new WhitespaceTokenizer * @param r The Reader that is its source. * @param eolIsSignificant Whether eol tokens should be returned. */ public WhitespaceTokenizer(LexedTokenFactory factory, Reader r, boolean eolIsSignificant) { this.eolIsSignificant = eolIsSignificant; // The conditional below is perhaps currently needed in LexicalizedParser, since // it passes in a null arg while doing type-checking for sentence escaping // but StreamTokenizer barfs on that. But maybe shouldn't be here. if (r != null) { lexer = new WhitespaceLexer(r, factory); } } public static WhitespaceTokenizer newCoreLabelWhitespaceTokenizer(Reader r) { return new WhitespaceTokenizer(new CoreLabelTokenFactory(), r, false); } public static WhitespaceTokenizer newCoreLabelWhitespaceTokenizer(Reader r, boolean tokenizeNLs) { return new WhitespaceTokenizer(new CoreLabelTokenFactory(), r, tokenizeNLs); } public static WhitespaceTokenizer newWordWhitespaceTokenizer(Reader r) { return newWordWhitespaceTokenizer(r, false); } public static WhitespaceTokenizer newWordWhitespaceTokenizer(Reader r, boolean eolIsSignificant) { return new WhitespaceTokenizer(new WordTokenFactory(), r, eolIsSignificant); } /* ---- * Sets the source of this Tokenizer to be the Reader r. private void setSource(Reader r) { lexer = new WhitespaceLexer(r); } ---- */ public static TokenizerFactory factory() { return new WhitespaceTokenizerFactory(new WordTokenFactory(), false); } public static TokenizerFactory factory(boolean eolIsSignificant) { return new WhitespaceTokenizerFactory(new WordTokenFactory(), eolIsSignificant); } /** * Reads a file from the argument and prints its tokens one per line. * This is mainly as a testing aid, but it can also be quite useful * standalone to turn a corpus into a one token per line file of tokens. *

* Usage: java edu.stanford.nlp.process.WhitespaceTokenizer filename * * * @param args Command line arguments * @throws IOException If can't open files, etc. */ public static void main(String[] args) throws IOException { boolean eolIsSignificant = (args.length > 0 && args[0].equals("-cr")); Reader reader = ((args.length > 0 && !args[args.length - 1].equals("-cr")) ? new InputStreamReader(new FileInputStream (args[args.length - 1]), "UTF-8") : new InputStreamReader(System.in, "UTF-8")); WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new WordTokenFactory(), reader, eolIsSignificant); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true); while (tokenizer.hasNext()) { Word w = tokenizer.next(); if (w.value().equals(WhitespaceLexer.NEWLINE)) { pw.println("***CR***"); } else { pw.println(w); } } } } // end class WhitespaceTokenizer





© 2015 - 2024 Weber Informatics LLC | Privacy Policy