// edu.stanford.nlp.ling.DocumentReader (Maven / Gradle / Ivy)
package edu.stanford.nlp.ling;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import java.io.*;
import java.net.URL;
/**
* Basic mechanism for reading in Documents from various input sources.
* This default implementation can read from strings, files, URLs, and
* InputStreams and can use a given Tokenizer to turn the text into words.
* When working with a new data format, make a new DocumentReader to parse it
* and then use it with the existing Document APIs (rather than having to make
* new Document classes). Use the protected class variables (in, tokenizer,
* keepOriginalText) to read text and create docs appropriately. Subclasses should
* ideally provide similar constructors to this class, though only the constructor
* that takes a Reader is required.
*
* @author Joseph Smarr ([email protected])
* @author Sarah Spikes ([email protected]) - templatized
*
* @param label type
*/
public class DocumentReader {
/**
* Reader used to read in document text. In default implementation, this is
* guaranteed to be a BufferedReader (so cast down) but it's typed as
* Reader in case subclasses don't want it buffered for some reason.
*/
protected BufferedReader in;
/**
* Tokenizer used to chop up document text into words.
*/
protected TokenizerFactory extends HasWord> tokenizerFactory;
/**
* Whether to keep source text in document along with tokenized words.
*/
protected boolean keepOriginalText;
/**
* Constructs a new DocumentReader without an initial input source.
* Must call {@link #setReader} before trying to read any documents.
* Uses a PTBTokenizer and keeps original text.
*/
public DocumentReader() {
this(null);
}
/**
* Constructs a new DocumentReader using a PTBTokenizerFactory and keeps the original text.
*
* @param in The Reader
*/
public DocumentReader(Reader in) {
this(in, PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory(), true);
}
/**
* Constructs a new DocumentReader that will read text from the given
* Reader and tokenize it into words using the given Tokenizer. The default
* implementation will internally buffer the reader if it is not already
* buffered, so there is no need to pre-wrap the reader with a BufferedReader.
* This class provides many getReader methods for conviniently
* reading from many input sources.
*/
public DocumentReader(Reader in, TokenizerFactory extends HasWord> tokenizerFactory, boolean keepOriginalText) {
if (in != null) {
setReader(in);
}
setTokenizerFactory(tokenizerFactory);
this.keepOriginalText = keepOriginalText;
}
/**
* Returns the reader for the text input source of this DocumentReader.
*/
public Reader getReader() {
return in;
}
/**
* Sets the reader from which to read and create documents.
* Default implementation automatically buffers the Reader if it's not
* already buffered. Subclasses that don't want buffering may want to override
* this method to simply set the global in directly.
*/
public void setReader(Reader in) {
this.in = getBufferedReader(in);
}
/**
* Returns the tokenizer used to chop up text into words for the documents.
*/
public TokenizerFactory extends HasWord> getTokenizerFactory() {
return (tokenizerFactory);
}
/**
* Sets the tokenizer used to chop up text into words for the documents.
*/
public void setTokenizerFactory(TokenizerFactory extends HasWord> tokenizerFactory) {
this.tokenizerFactory = tokenizerFactory;
}
/**
* Returns whether created documents will store their source text along with tokenized words.
*/
public boolean getKeepOriginalText() {
return (keepOriginalText);
}
/**
* Sets whether created documents should store their source text along with tokenized words.
*/
public void setKeepOriginalText(boolean keepOriginalText) {
this.keepOriginalText = keepOriginalText;
}
/**
* Reads the next document's worth of text from the reader and turns it into
* a Document. Default implementation calls {@link #readNextDocumentText}
* and passes it to {@link #parseDocumentText} to create the document.
* Subclasses may wish to override either or both of those methods to handle
* custom formats of document collections and individual documents
* respectively. This method can also be overridden in its entirety to
* provide custom reading and construction of documents from input text.
*/
public BasicDocument readDocument() throws IOException {
String text = readNextDocumentText();
if (text == null) {
return (null);
}
return parseDocumentText(text);
}
/**
* Reads the next document's worth of text from the reader. Default
* implementation reads all the text. Subclasses wishing to read multiple
* documents from a single input source should read until the next document
* delimiter and return the text so far. Returns null if there is no more
* text to be read.
*/
protected String readNextDocumentText() throws IOException {
return readText(in);
}
/**
* Creates a new Document for the given text. Default implementation tokenizes
* the text using the tokenizer provided during construction and sticks the words
* in a new BasicDocument. The text is also stored as the original text in
* the BasicDocument if keepOriginalText was set in the constructor. Subclasses
* may wish to extract additional information from the text and/or return another
* document subclass with additional meta-data.
*/
protected BasicDocument parseDocumentText(String text) {
new BasicDocument();
return BasicDocument.init(text, keepOriginalText);
}
/**
* Wraps the given Reader in a BufferedReader or returns it directly if it
* is already a BufferedReader. Subclasses should use this method before
* reading from in for efficiency and/or to read entire lines at
* a time. Note that this should only be done once per reader because when
* you read from a buffered reader, it reads more than necessary and stores
* the rest, so if you then throw that buffered reader out and get a new one
* for the original reader, text will be missing. In the default DocumentReader
* text, the Reader passed in at construction is wrapped in a buffered reader
* so you can just cast in down to a BufferedReader without calling
* this method.
*/
public static BufferedReader getBufferedReader(Reader in) {
if (in == null) {
return (null);
}
if (!(in instanceof BufferedReader)) {
in = new BufferedReader(in);
}
return (BufferedReader) in;
}
/**
* Returns everything that can be read from the given Reader as a String.
* Returns null if the given Reader is null.
*/
public static String readText(Reader in) throws IOException {
// returns null if the reader is null
if (in == null) {
return (null);
}
// ensures the reader is buffered
BufferedReader br = getBufferedReader(in);
// reads all the chars into a buffer
StringBuilder sb = new StringBuilder(16000); // make biggish
int c;
while ((c = br.read()) >= 0) {
sb.append((char) c);
}
return sb.toString();
}
/**
* Returns a Reader that reads in the given text.
*/
public static Reader getReader(String text) {
return (new StringReader(text));
}
/**
* Returns a Reader that reads in the given file.
*/
public static Reader getReader(File file) throws FileNotFoundException {
return (new FileReader(file));
}
/**
* Returns a Reader that reads in the given URL.
*/
public static Reader getReader(URL url) throws IOException {
return (getReader(url.openStream()));
}
/**
* Returns a Reader that reads in the given InputStream.
*/
public static Reader getReader(InputStream in) {
return (new InputStreamReader(in));
}
}