All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.ling.BasicDocument Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.ling; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.util.ErasureUtils;

import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;


/**
 * Basic implementation of Document that should be suitable for most needs.
 * BasicDocument is an ArrayList for storing words and performs tokenization
 * during construction. Override {@link #parse(String)} to provide support
 * for custom
 * document formats or to do a custom job of tokenization. BasicDocument should
 * only be used for documents that are small enough to store in memory.
 *
 * The easiest way to use BasicDocuments is to construct them and call an init
 * method in the same line (we use init methods instead of constructors because
 * they're inherited and allow subclasses to have other more specific constructors).
 * For example, to read in a file file and tokenize it, you can call
 * 
Document doc=new BasicDocument().init(file);
. * * @author Joseph Smarr ([email protected]) * @author Sarah Spikes ([email protected]) (Templatization) * * @param The type of the labels */ public class BasicDocument extends ArrayList implements Document { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(BasicDocument.class); /** * title of this document (never null). */ protected String title = ""; /** * original text of this document (may be null). */ protected String originalText; /** * Label(s) for this document. */ protected final List labels = new ArrayList<>(); /** * TokenizerFactory used to convert the text into words inside * {@link #parse(String)}. */ protected TokenizerFactory tokenizerFactory; /** * Constructs a new (empty) BasicDocument using a {@link PTBTokenizer}. * Call one of the init * methods to populate the document * from a desired source. */ public BasicDocument() { this(PTBTokenizer.factory()); } /** * Constructs a new (empty) BasicDocument using the given tokenizer. * Call one of the init * methods to populate the document * from a desired source. */ public BasicDocument(TokenizerFactory tokenizerFactory) { setTokenizerFactory(tokenizerFactory); } public BasicDocument(Document d) { this((Collection) d); } public BasicDocument(Collection d) { this(); addAll(d); } /** * Inits a new BasicDocument with the given text contents and title. * The text is tokenized using {@link #parse(String)} to populate the list of words * ("" is used if text is null). If specified, a reference to the * original text is also maintained so that the text() method returns the * text given to this constructor. Returns a reference to this * BasicDocument * for convenience (so it's more like a constructor, but inherited). 
*/ public static BasicDocument init(String text, String title, boolean keepOriginalText) { BasicDocument basicDocument = new BasicDocument<>(); // initializes the List of labels and sets the title basicDocument.setTitle(title); // stores the original text as specified if (keepOriginalText) { basicDocument.originalText = text; } else { basicDocument.originalText = null; } // populates the words by parsing the text basicDocument.parse(text == null ? "" : text); return basicDocument; } /** * Calls init(text,title,true) */ public static BasicDocument init(String text, String title) { return init(text, title, true); } /** * Calls init(text,null,keepOriginalText) */ public static BasicDocument init(String text, boolean keepOriginalText) { return init(text, null, keepOriginalText); } /** * Calls init(text,null,true) */ public static BasicDocument init(String text) { return init(text, null, true); } /** * Calls init((String)null,null,true) */ public static BasicDocument init() { return init((String) null, null, true); } /** * Inits a new BasicDocument by reading in the text from the given Reader. * * @see #init(String,String,boolean) */ public static BasicDocument init(Reader textReader, String title, boolean keepOriginalText) throws IOException { return init(DocumentReader.readText(textReader), title, keepOriginalText); } /** * Calls init(textReader,title,true) */ public BasicDocument init(Reader textReader, String title) throws IOException { return init(textReader, title, true); } /** * Calls init(textReader,null,keepOriginalText) */ public BasicDocument init(Reader textReader, boolean keepOriginalText) throws IOException { return init(textReader, null, keepOriginalText); } /** * Calls init(textReader,null,true) */ public BasicDocument init(Reader textReader) throws IOException { return init(textReader, null, true); } /** * Inits a new BasicDocument by reading in the text from the given File. 
* * @see #init(String,String,boolean) */ public BasicDocument init(File textFile, String title, boolean keepOriginalText) throws IOException { Reader in = DocumentReader.getReader(textFile); BasicDocument bd = init(in, title, keepOriginalText); in.close(); return bd; } /** * Calls init(textFile,title,true) */ public BasicDocument init(File textFile, String title) throws FileNotFoundException, IOException { return init(textFile, title, true); } /** * Calls init(textFile,textFile.getCanonicalPath(),keepOriginalText) */ public BasicDocument init(File textFile, boolean keepOriginalText) throws FileNotFoundException, IOException { return init(textFile, textFile.getCanonicalPath(), keepOriginalText); } /** * Calls init(textFile,textFile.getCanonicalPath(),true) */ public BasicDocument init(File textFile) throws FileNotFoundException, IOException { return init(textFile, textFile.getCanonicalPath(), true); } /** * Constructs a new BasicDocument by reading in the text from the given URL. * * @see #init(String,String,boolean) */ public BasicDocument init(URL textURL, String title, boolean keepOriginalText) throws IOException { return init(DocumentReader.getReader(textURL), title, keepOriginalText); } /** * Calls init(textURL,title,true) */ public BasicDocument init(URL textURL, String title) throws IOException { return init(textURL, title, true); } /** * Calls init(textURL,textFile.toExternalForm(),keepOriginalText) */ public BasicDocument init(URL textURL, boolean keepOriginalText) throws IOException { return init(textURL, textURL.toExternalForm(), keepOriginalText); } /** * Calls init(textURL,textURL.toExternalForm(),true) */ public BasicDocument init(URL textURL) throws IOException { return init(textURL, textURL.toExternalForm(), true); } /** * Initializes a new BasicDocument with the given list of words and title. 
*/ public BasicDocument init(List words, String title) { // initializes the List of labels and sets the title setTitle(title); // no original text originalText = null; // adds all of the given words to the list maintained by this document addAll(words); return (this); } /** * Calls init(words,null) */ public BasicDocument init(List words) { return init(words, null); } /** * Tokenizes the given text to populate the list of words this Document * represents. The default implementation uses the current tokenizer and tokenizes * the entirety of the text into words. Subclasses should override this method * to parse documents in non-standard formats, and/or to pull the title of the * document from the text. The given text may be empty ("") but will never * be null. Subclasses may want to do additional processing and then just * call super.parse. * * @see #setTokenizerFactory */ protected void parse(String text) { Tokenizer toke = tokenizerFactory.getTokenizer(new StringReader(text)); addAll(toke.tokenize()); } /** * Returns this (the features are the list of words). */ @Override public Collection asFeatures() { return this; } /** * Returns the first label for this Document, or null if none have been * set. */ @Override public L label() { return (labels.size() > 0) ? labels.get(0) : null; } /** * Returns the complete List of labels for this Document. * This is an empty collection if none have been set. */ @Override public Collection labels() { return labels; } /** * Removes all currently assigned labels for this Document then adds * the given label. * Calling setLabel(null) effectively clears all labels. */ public void setLabel(L label) { labels.clear(); addLabel(label); } /** * Removes all currently assigned labels for this Document then adds all * of the given labels. */ public void setLabels(Collection labels) { this.labels.clear(); if (labels != null) { this.labels.addAll(labels); } } /** * Adds the given label to the List of labels for this Document if it is not null. 
*/ public void addLabel(L label) { if (label != null) { labels.add(label); } } /** * Returns the title of this document. The title may be empty ("") but will * never be null. */ @Override public String title() { return (title); } /** * Sets the title of this Document to the given title. If the given title * is null, sets the title to "". */ public void setTitle(String title) { if (title == null) { this.title = ""; } else { this.title = title; } } /** * Returns the current TokenizerFactory used by {@link #parse(String)}. */ public TokenizerFactory tokenizerFactory() { return (tokenizerFactory); } /** * Sets the tokenizerFactory to be used by {@link #parse(String)}. * Set this tokenizer before calling one of the init methods * because * it will probably call parse. Note that the tokenizer can equivalently be * passed in to the constructor. * * @see #BasicDocument(TokenizerFactory) */ public void setTokenizerFactory(TokenizerFactory tokenizerFactory) { this.tokenizerFactory = tokenizerFactory; } /** * Returns a new empty BasicDocument with the same title, labels, and * tokenizer as this Document. This is useful when you want to make a * new Document that's like the old document but * can be filled with new text (e.g. if you're transforming * the contents non-destructively). * * Subclasses that want to preserve extra state should * override this method and add the extra state to the new document before * returning it. The new BasicDocument is created by calling * getClass().newInstance() so it should be of the correct subclass, * and thus you should be able to cast it down and add extra meta data directly. * Note however that in the event an Exception is thrown on instantiation * (e.g. if your subclass doesn't have a public empty constructor--it should btw!) * then a new BasicDocument is used instead. 
Thus if you want to be paranoid * (or some would say "correct") you should check that your instance is of * the correct sub-type as follows (this example assumes the subclass is called * NumberedDocument and it has the additional numberproperty): *
Document blankDocument=super.blankDocument();
   * if(blankDocument instanceof NumberedDocument) {
   *     ((NumberedDocument)blankDocument).setNumber(getNumber());
*/ @Override public Document blankDocument() { BasicDocument bd; // tries to instantiate by reflection, settles for direct instantiation try { bd = ErasureUtils.>uncheckedCast(getClass().newInstance()); } catch (Exception e) { bd = new BasicDocument<>(); } // copies over basic meta-data bd.setTitle(title()); bd.setLabels(labels()); bd.setTokenizerFactory(tokenizerFactory); // cast to the new output type return ErasureUtils.>uncheckedCast(bd); } /** * Returns the text originally used to construct this document, or null if * there was no original text. */ public String originalText() { return (originalText); } /** * Returns a "pretty" version of the words in this Document suitable for * display. The default implementation returns each of the words in * this Document separated * by spaces. Specifically, each element that implements {@link HasWord} * has its * {@link HasWord#word} printed, and other elements are skipped. * * Subclasses that maintain additional information may which to * override this method.

*/ public String presentableText() { StringBuilder sb = new StringBuilder(); for (Word cur : this) { if (sb.length() > 0) { sb.append(' '); } sb.append(cur.word()); } return (sb.toString()); } /** * For internal debugging purposes only. Creates and tests various instances * of BasicDocument. */ public static void main(String[] args) { try { printState(BasicDocument.init("this is the text", "this is the title [String]", true)); printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true)); File f = File.createTempFile("BasicDocumentTestFile", null); f.deleteOnExit(); PrintWriter out = new PrintWriter(new FileWriter(f)); out.print("this is the text"); out.flush(); out.close(); printState(new BasicDocument().init(f, "this is the title [File]", true)); printState(new BasicDocument().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true)); } catch (Exception e) { e.printStackTrace(); } } /** * For internal debugging purposes only. * Prints the state of the given BasicDocument to stderr. */ public static void printState(BasicDocument bd) throws Exception { log.info("BasicDocument:"); log.info("\tTitle: " + bd.title()); log.info("\tLabels: " + bd.labels()); log.info("\tOriginalText: " + bd.originalText()); log.info("\tWords: " + bd); log.info(); } private static final long serialVersionUID = -24171720584352262L; }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy