edu.stanford.nlp.parser.lexparser.demo.ParserDemo2 Maven / Gradle / Ivy

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

A newer version of this artifact is available: 3.9.2.
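
For orientation, here is a minimal sketch of the core API that the demo class below exercises. It assumes the englishPCFG model is available on the classpath (the path matches the demo's default), and the class name QuickParse is purely illustrative:

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class QuickParse {
  public static void main(String[] args) {
    // Load the serialized English PCFG grammar (same default path as the demo below).
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // Tokenize and parse a single sentence, then print the constituency tree
    // in Penn Treebank bracket notation.
    Tree tree = lp.parse("This is an easy sentence.");
    tree.pennPrint();
  }
}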
package edu.stanford.nlp.parser.lexparser.demo;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

class ParserDemo2 {

  /** This example shows a few more ways of providing input to a parser.
   *
   *  Usage: ParserDemo2 [grammar [textFile]]
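   *
   *  For example:
   *    java edu.stanford.nlp.parser.lexparser.demo.ParserDemo2 edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz sample.txt
   *  (Here sample.txt stands for any plain-text file; both arguments are optional.)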
   */
  public static void main(String[] args) throws IOException {
    String grammar = args.length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String[] options = { "-maxLength", "80", "-retainTmpSubcategories" };
    LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
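    // The TreebankLanguagePack supplies language-specific resources; its
    // GrammaticalStructureFactory is used below to turn each constituency
    // parse into typed dependencies.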
    TreebankLanguagePack tlp = lp.getOp().langpack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

    Iterable<List<? extends HasWord>> sentences;
    if (args.length > 1) {
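      // DocumentPreprocessor reads the file and splits it into tokenized sentences.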
      DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
      List<List<? extends HasWord>> tmp =
              new ArrayList<>();
      for (List<HasWord> sentence : dp) {
        tmp.add(sentence);
      }
      sentences = tmp;
    } else {
      // Showing tokenization and parsing in code a couple of different ways.
      String[] sent = { "This", "is", "an", "easy", "sentence", "." };
      List<HasWord> sentence = new ArrayList<>();
      for (String word : sent) {
        sentence.add(new Word(word));
      }

      String sent2 = ("This is a slightly longer and more complex " +
                      "sentence requiring tokenization.");
      // Use the default tokenizer for this TreebankLanguagePack
      Tokenizer<? extends HasWord> toke =
        tlp.getTokenizerFactory().getTokenizer(new StringReader(sent2));
      List<? extends HasWord> sentence2 = toke.tokenize();

      String[] sent3 = { "It", "can", "can", "it", "." };
      String[] tag3 = { "PRP", "MD", "VB", "PRP", "." }; // Parser gets second "can" wrong without help
      List<TaggedWord> sentence3 = new ArrayList<>();
      for (int i = 0; i < sent3.length; i++) {
        sentence3.add(new TaggedWord(sent3[i], tag3[i]));
      }
      Tree parse = lp.parse(sentence3);
      parse.pennPrint();

      List<List<? extends HasWord>> tmp =
              new ArrayList<>();
      tmp.add(sentence);
      tmp.add(sentence2);
      tmp.add(sentence3);
      sentences = tmp;
    }

    for (List<? extends HasWord> sentence : sentences) {
      Tree parse = lp.parse(sentence);
      parse.pennPrint();
      System.out.println();
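      // Convert the constituency parse into typed dependencies, with
      // coordination ("CC") processed so that dependencies are propagated
      // across conjuncts.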
      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
      List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
      System.out.println(tdl);
      System.out.println();

      System.out.println("The words of the sentence:");
      for (Label lab : parse.yield()) {
        if (lab instanceof CoreLabel) {
          System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
        } else {
          System.out.println(lab);
        }
      }
      System.out.println();
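      // taggedYield() returns the words of the sentence paired with their POS tags.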
      System.out.println(parse.taggedYield());
      System.out.println();

    }

    // This method turns the String into a single sentence using the
    // default tokenizer for the TreebankLanguagePack.
    String sent3 = "This is one last test!";
    lp.parse(sent3).pennPrint();
  }

  private ParserDemo2() {} // static methods only

}



