/**
*
*/
package edu.berkeley.nlp.io;
import java.io.*;
import java.util.*;
import edu.berkeley.nlp.util.StringUtils;
/**
* Tokenizer implementation that conforms to the Penn Treebank tokenization
* conventions.
* This tokenizer is a Java implementation of Professor Chris Manning's Flex
* tokenizer, pgtt-treebank.l. It reads raw text and returns
* tokens as Strings in the Penn Treebank format. It can
* optionally return carriage returns as tokens.
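* <p>
* A minimal usage sketch (the file name is illustrative; tokenize() is
* inherited from AbstractTokenizer):
* <pre>
* Reader r = new BufferedReader(new FileReader("input.txt"));
* PTBTokenizer tokenizer = new PTBTokenizer(r, false); // false: drop carriage returns
* List tokens = tokenizer.tokenize();
* for (int i = 0; i &lt; tokens.size(); i++) {
*   System.out.println(tokens.get(i));
* }
* </pre>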
*
* @author Teg Grenager ([email protected])
*/
public class PTBTokenizer extends AbstractTokenizer {
// whether carriage returns should be returned as tokens
private boolean tokenizeCRs;
// the underlying lexer
PTBLexer lexer;
/**
* Constructs a new PTBTokenizer that treats carriage returns as normal whitespace.
* No source is specified, so hasNext() will return false.
*/
public PTBTokenizer() {
this(false);
}
/**
* Constructs a new PTBTokenizer that optionally returns carriage returns
* as their own tokens. CRs come back as tokens whose text is
* the value of PTBLexer.cr.
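* <p>
* A minimal sketch of deferred-source usage (the reader content is
* illustrative; setSource is defined below and tokenize() is inherited
* from AbstractTokenizer):
* <pre>
* PTBTokenizer t = new PTBTokenizer(true); // keep carriage returns as tokens
* t.setSource(new StringReader("line one\nline two\n"));
* List tokens = t.tokenize();
* </pre>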
*/
public PTBTokenizer(boolean tokenizeCRs) {
this.tokenizeCRs = tokenizeCRs;
}
/**
* Constructs a new PTBTokenizer that treats carriage returns as normal whitespace.
*/
public PTBTokenizer(Reader r) {
this(r, false);
}
/**
* Constructs a new PTBTokenizer that optionally returns carriage returns
* as their own tokens. CRs come back as tokens whose text is
* the value of PTBLexer.cr.
*/
public PTBTokenizer(Reader r, boolean tokenizeCRs) {
this.tokenizeCRs = tokenizeCRs;
setSource(r);
}
/**
* Gets the next token from the lexer if possible, skipping carriage returns
* unless tokenizeCRs is set.
*/
protected Object getNext() {
if (lexer == null) {
return null;
}
Object token = null;
try {
token = lexer.next();
// skip carriage-return tokens unless they were requested
while (!tokenizeCRs && PTBLexer.cr.equals((String) token)) {
token = lexer.next();
}
} catch (Exception e) {
// on a lexer error, clear the buffered nextToken from AbstractTokenizer so
// that iteration stops cleanly
nextToken = null;
}
return token;
}
/** Reads the file named by the last argument and prints its tokens, one per
* line. This is mainly a testing aid, but it can also be quite useful
* standalone for turning a corpus into a file with one token per line.
*
* Usage: java edu.berkeley.nlp.io.PTBTokenizer [-cr] filename
*
* @param args Command line arguments
*/
public static void main(String[] args) throws IOException {
if (args.length < 1) {
System.err.println("usage: java edu.berkeley.nlp.io." +
"PTBTokenizer [-cr] filename");
return;
}
PTBTokenizer tokenizer = new PTBTokenizer(new FileReader(args[args.length - 1]), "-cr".equals(args[0]));
List words = tokenizer.tokenize();
for (int i = 0; i < words.size(); i++)
System.out.println(words.get(i));
}
/**
* Sets the source of this Tokenizer to be the Reader r.
*/
public void setSource(Reader r) {
lexer = new PTBLexer(r);
}
/**
* Returns a presentable version of the given PTB-tokenized text.
* PTB tokenization splits up punctuation and does various other things
* that make simply joining the tokens with spaces look bad. So join
* the tokens with spaces and run the result through this method to produce
* nicer-looking text. It's not perfect, but it works pretty well.
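* <p>
* For example (the exact output depends on PTB2TextLexer; shown approximately):
* <pre>
* String nice = PTBTokenizer.ptb2Text("I 'm not sure , but it works .");
* // nice is roughly: I'm not sure, but it works.
* </pre>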
*/
public static String ptb2Text(String ptbText) {
StringBuffer sb = new StringBuffer(ptbText.length()); // probably an overestimate
PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
String token;
try {
while ((token = lexer.next()) != null) sb.append(token);
} catch (IOException e) {
e.printStackTrace();
}
return (sb.toString());
}
/**
* Returns a presentable version of the given PTB-tokenized words.
* Pass in a List of tokens (for example, Strings); this method joins them
* with spaces, using each element's toString value, and calls
* {@link #ptb2Text(String) } on the result.
*/
public static String ptb2Text(List ptbWords) {
// StringUtils.join renders each element via toString and joins with spaces,
// so no per-element conversion is needed here
return (ptb2Text(StringUtils.join(ptbWords)));
}
/**
* Returns a factory that produces PTBTokenizers which treat carriage returns
* as normal whitespace.
*/
public static TokenizerFactory factory() {
return new PTBTokenizerFactory();
}
/**
* Returns a factory that produces PTBTokenizers with the given
* carriage-return behavior.
*/
public static TokenizerFactory factory(boolean tokenizeCRs) {
return new PTBTokenizerFactory(tokenizeCRs);
}
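// A minimal factory-usage sketch (the input string is illustrative):
//
// TokenizerFactory factory = PTBTokenizer.factory(true);
// Tokenizer tokenizer = factory.getTokenizer(new StringReader("One line .\nTwo ."));
// List tokens = tokenizer.tokenize();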
static class PTBTokenizerFactory implements TokenizerFactory {
protected boolean tokenizeCRs;
/**
* Constructs a new PTBTokenizerFactory that treats carriage returns as
* normal whitespace.
*/
public PTBTokenizerFactory() {
this(false);
}
/**
* Constructs a new PTBTokenizerFactory whose tokenizers optionally return
* carriage returns as their own tokens. CRs come back as tokens whose text
* is the value of PTBLexer.cr.
*/
public PTBTokenizerFactory(boolean tokenizeCRs) {
this.tokenizeCRs = tokenizeCRs;
}
public Iterator getIterator(Reader r) {
// a Tokenizer is itself an Iterator over its tokens, so return one directly
return getTokenizer(r);
}
public Tokenizer getTokenizer(Reader r) {
return new PTBTokenizer(r, tokenizeCRs);
}
}
}