// edu.stanford.nlp.trees.package-info Maven / Gradle / Ivy
/**
*
* A package for (NLP) trees, sentences, and similar things.
* This package provides several key abstractions (via abstract classes)
* and a number of further classes for related objects.
* Most of these classes use a Factory pattern to instantiate objects.
*
* A {@code Label} is something that can be the label of a Tree or a
* Constituent. The simplest label is a {@code StringLabel}.
* A {@code Word} or a {@code TaggedWord} is a {@code Label}. They can be
* constructed with a {@code LabelFactory}. A {@code Label} often implements
* various interfaces, such as {@code HasWord}.
*
* A {@code Constituent} object defines a generic edge in a graph. It
* has a start and end, and usually a {@code Label}. A
* {@code ConstituentFactory} builds a {@code Constituent}.
*
* A {@code Tree} object provides generic facilities for manipulating
* NLP trees. A {@code TreeFactory} can build a {@code Tree}.
* A {@code Treebank} provides an interface to a
* collection of parsed sentences (normally found on disk as a corpus).
* A {@code TreeReader} reads trees from an {@code InputStream}.
* A {@code TreeReaderFactory} builds a {@code TreeReader}.
* A {@code TreeNormalizer} canonicalizes a {@code Tree} on
* input from a {@code File}. A {@code HeadFinder} finds the
* head daughter of a {@code Tree}. The {@code TreeProcessor}
* interface is for general sequential processing of trees, and the
* {@code TreeTransformer} interface is for changing them.
*
* A {@code Sentence} is a subclass of {@code ArrayList}.
* A {@code Sentencebank} provides an interface to a large number of
* sentences (normally found on disk as a corpus).
* A {@code SentenceReader} reads sentences from an
* {@code InputStream}. A {@code SentenceReaderFactory}
* builds a {@code SentenceReader}. A {@code SentenceNormalizer}
* canonicalizes a {@code Sentence} on input from a {@code File}.
* The {@code SentenceProcessor} interface is for general sequential
* processing of sentences.
*
* There are also various subclasses of {@code StreamTokenizer}. The class
* {@code PairFinder} should probably be moved to {@code samples}.
*
*
* Design notes: This package is the result of several iterations of
* trying to come up with a reusable and extendable set of tree
* classes. It may still be nonoptimal, but some thought went into
* it! At any rate, there are several things that it is important to
* understand to use the class effectively. One is that a Label has
* a primary value() which is always a String, and this is the only
* thing that matters for fundamental Label operations, such as
* checking equality. While anything else (or nothing) can be stored
* in a Label, all other Label content is regarded as purely
* decorative. All Label implementations should implement a
* labelFactory() method that returns a LabelFactory for the appropriate
* kind of Label. Since this depends on the exact class, this method
* should always be overridden when a Label class is extended. The
* existing Label classes also provide a static factory() method
* which returns the same thing.
*
*
* Illustrations of use of the {@code trees} package
*
* Treebank and Tree
*
* Here is some fairly straightforward code for loading trees from a
* treebank and iterating over the trees contained therein. It builds
* a histogram of sentence lengths.
*
*
*
* import java.util.Iterator;
* import edu.stanford.nlp.trees.*;
* import edu.stanford.nlp.io.NumberRangesFileFilter;
* import edu.stanford.nlp.util.Timing;
*
* /** This class just prints out sentences and their lengths.
* * Use: java SentenceLengths /turing/corpora/Treebank2/combined/wsj/07
* * [fileRange]
* *\/
* public class SentenceLengths {
*
* private static final int maxleng = 100;
* private static int[] lengthCounts = new int[maxleng+1];
* private static int numSents = 0;
*
* public static void main(String[] args) {
* Timing.startTime();
* Treebank treebank = new DiskTreebank(
* new LabeledScoredTreeReaderFactory());
* if (args.length > 1) {
* treebank.loadPath(args[0], new NumberRangesFileFilter(args[1],
* true));
* } else {
* treebank.loadPath(args[0]);
* }
*
* for (Iterator it = treebank.iterator(); it.hasNext(); ) {
* Tree t = (Tree) it.next();
* numSents++;
* int len = t.yield().length();
* if (len <= maxleng) {
* lengthCounts[len]++;
* }
* }
* System.out.print("Files " + args[0] + " ");
* if (args.length > 1) {
* System.out.print(args[1] + " ");
* }
* System.out.println("consists of " + numSents + " sentences");
* for (int i = 0; i <= maxleng; i++) {
* System.out.println(" " + lengthCounts[i] + " of length " + i);
* }
* Timing.endTime("Read/count all trees");
* }
* }
*
*
*
* Treebank, custom TreeReaderFactory, Tree, and Constituent
*
*
* This example illustrates building a Treebank by hand, specifying a
* custom {@code TreeReaderFactory}, and illustrates more of the
* {@code Tree} package, and the notion of a
* {@code Constituent}. A {@code Constituent} has a
* start and end point and a {@code Label}.
*
*
*
*
* import java.io.*;
* import java.util.*;
*
* import edu.stanford.nlp.trees.*;
* import edu.stanford.nlp.util.*;
*
* /** This class counts how often each constituent appears
* * Use: java ConstituentCounter /turing/corpora/Treebank2/combined/wsj/07
* *\/
*
* public class ConstituentCounter {
*
* public static void main(String[] args) {
* Treebank treebank = new DiskTreebank(new TreeReaderFactory() {
* public TreeReader newTreeReader(Reader in) {
* return new TreeReader(in,
* new LabeledScoredTreeFactory(new StringLabelFactory()),
* new BobChrisTreeNormalizer());
* }
* });
*
* treebank.loadPath(args[0]);
* Counter cnt = new Counter();
*
* ConstituentFactory confac = LabeledConstituent.factory();
* for (Iterator it = treebank.iterator(); it.hasNext(); ) {
* Tree t = (Tree) it.next();
* Set constituents = t.constituents(confac);
* for (Iterator it2 = constituents.iterator(); it2.hasNext(); ) {
* Constituent c = (Constituent) it2.next();
* cnt.increment(c);
* }
* }
* SortedSet ss = new TreeSet(cnt.seenSet());
* for (Iterator it = ss.iterator(); it.hasNext(); ) {
* Constituent c = (Constituent) it.next();
* System.out.println(c + " " + cnt.countOf(c));
* }
* }
* }
*
*
*
*
* Tree and Label
*
*
* Dealing with the {@code Tree} and {@code Label} classes is a
* central part of using this package. This code works out the
* set of tags (preterminal labels) used in a Treebank. It
* illustrates writing one's own code to recurse through a Tree, and getting
* a String value for a Label.
*
*
*
*
* import java.util.*;
* import edu.stanford.nlp.trees.*;
* import edu.stanford.nlp.util.Counter;
*
* /** This class prints out trees from strings and counts their preterminals.
* * Use: java TreesFromStrings '(S (NP (DT This)) (VP (VBD was) (JJ good)))'
* *\/
* public class TreesFromStrings {
*
* private static void addTerminals(Tree t, Counter c) {
* if (t.isLeaf()) {
* // do nothing
* } else if (t.isPreTerminal()) {
* c.increment(t.label().value());
* } else {
* // phrasal node
* Tree[] kids = t.children();
* for (int i = 0; i < kids.length; i++) {
* addTerminals(kids[i], c);
* }
* }
* }
*
* public static void main(String[] args) {
* Treebank tb = new MemoryTreebank();
* for (int i = 0; i < args.length; i++) {
* try {
* Tree t = Tree.valueOf(args[i]);
* tb.add(t);
* } catch (Exception e) {
* e.printStackTrace();
* }
* }
* Counter c = new Counter();
* for (Iterator it = tb.iterator(); it.hasNext(); ) {
* Tree t = (Tree) it.next();
* addTerminals(t, c);
* }
* System.out.println(c);
* }
*
* }
*
*
*
* As well as the Treebank classes, there are corresponding Sentencebank
* classes (though they are not quite so extensively developed).
* This final example shows use of a Sentencebank. It also
* illustrates the Visitor pattern for examining sentences in a
* Sentencebank. This was actually the original visitation
* pattern for Treebank and Sentencebank, but these days, it's in
* general easier to use an Iterator. You can also get Sentences
* from a Treebank, by taking the yield() or taggedYield() of
* each Tree.
*
*
*
*
* import java.io.*;
*
* import edu.stanford.nlp.trees.*;
*
* public class SentencePrinter {
*
* /** Loads SentenceBank from first argument and prints it out.
* * Usage: java SentencePrinter sentencebankPath
* * @param args Array of command-line arguments
* *\/
* public static void main(String[] args) {
* SentenceReaderFactory srf = new SentenceReaderFactory() {
* public SentenceReader newSentenceReader(Reader in) {
* return new SentenceReader(in, new TaggedWordFactory(),
* new PennSentenceNormalizer(),
* new PennTagbankStreamTokenizer(in));
* }
* };
* Sentencebank sentencebank = new DiskSentencebank(srf);
* sentencebank.loadPath(args[0]);
*
* sentencebank.apply(new SentenceVisitor() {
* public void visitSentence(final Sentence s) {
* // also print tag as well as word
* System.out.println(s.toString(false));
* }
* });
* }
*
* }
*
*
*
* @since 1.2
* @author Christopher Manning
* @author Dan Klein
*/
package edu.stanford.nlp.trees;