// edu.stanford.nlp.parser.lexparser.LexicalizedParserQuery
// (from the stanford-parser artifact, available via Maven / Gradle / Ivy)
//
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
// and extracts constituency parse trees.
// Note: a newer version of this artifact exists (3.9.2).
// Stanford Parser -- a probabilistic lexicalized NL CFG parser
// Copyright (c) 2002 - 2011 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    [email protected]
//    http://nlp.stanford.edu/software/lex-parser.shtml

package edu.stanford.nlp.parser.lexparser;

import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.KBestViterbiParser;
import edu.stanford.nlp.parser.common.NoSuchParseException;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.ScoredObject;
import edu.stanford.nlp.util.DeltaIndex;
import edu.stanford.nlp.util.RuntimeInterruptedException;


public class LexicalizedParserQuery implements ParserQuery {

  /** Parser options, taken from the parser this query was created for. */
  private final Options op;
  /** Turns the binarized trees produced by the parsers back into n-ary trees. */
  private final TreeTransformer debinarizer;
  /** Removes the boundary symbol from dependency parser output. */
  private final TreeTransformer boundaryRemover;

  /** The PCFG parser. */
  private final ExhaustivePCFGParser pparser;
  /** The dependency parser. */
  private final ExhaustiveDependencyParser dparser;
  /** The factored parser that combines the dependency and PCFG parsers. */
  private final KBestViterbiParser bparser;

  /** Whether a PCFG parse may be used when the preferred parser has no result. */
  private final boolean fallbackToPCFG = true;

  /** Strips subcategory annotations from the labels of returned trees. */
  private final TreeTransformer subcategoryStripper;

  /**
   * Whether or not the most complicated model available successfully
   * parsed the input sentence.
   */
  private boolean parseSucceeded;
  /**
   * parseSkipped means that not only did we not succeed at parsing,
   * but for some reason we didn't even try.  Most likely this happens
   * when the sentence is too long or is of length 0.
   */
  private boolean parseSkipped;
  /** In some sense we succeeded, but only because we used a fallback grammar. */
  private boolean parseFallback;
  /** Not enough memory to parse. */
  private boolean parseNoMemory;
  /** Horrible error. */
  private boolean parseUnparsable;
  /** If something ran out of memory, where the error occurred. */
  private String whatFailed;

  /** @return whether the most complicated model available parsed the last input */
  public boolean parseSucceeded() {
    return this.parseSucceeded;
  }

  /** @return whether the last input was not even attempted (e.g. too long or empty) */
  public boolean parseSkipped() {
    return this.parseSkipped;
  }

  /** @return whether the last result only exists because a fallback grammar was used */
  public boolean parseFallback() {
    return this.parseFallback;
  }

  /** @return whether the last parse ran out of memory */
  public boolean parseNoMemory() {
    return this.parseNoMemory;
  }

  /** @return whether the last input could not be parsed at all */
  public boolean parseUnparsable() {
    return this.parseUnparsable;
  }

  /**
   * The token list handed to the most recent sentence parse.  It is
   * {@code null} until a sentence has been parsed and is reset to
   * {@code null} by a lattice parse.  Its tokens are put back onto the
   * leaves of returned trees by {@link #restoreOriginalWords(Tree)}.
   */
  private List<? extends HasWord> originalSentence;

  /**
   * @return the token list handed to the most recent sentence parse, or
   *         {@code null} if there is none
   */
  @Override
  public List<? extends HasWord> originalSentence() { return originalSentence; }

  /**
   * Records whether a sentence final punctuation mark was appended to the
   * sentence being parsed; when it was, the parsed tree has one leaf more
   * than the original sentence has tokens.
   */
  private boolean addedPunct;

  /** Set once the out-of-memory explanation has been printed, so it is printed only once. */
  private boolean saidMemMessage;

  /** @return whether the out-of-memory explanation has already been printed */
  public boolean saidMemMessage() {
    return this.saidMemMessage;
  }


  /**
   * Builds the sub-parsers needed to answer queries against the given parser.
   * <ul>
   * <li>A PCFG parser is created when {@code op.doPCFG} is set (an iterative
   *     CKY variant when {@code op.testOptions.iterativeCKY} is set).</li>
   * <li>An exhaustive dependency parser is created when {@code op.doDep} is
   *     set and {@code op.testOptions.useFastFactored} is not.</li>
   * <li>A factored parser is created when both {@code op.doDep} and
   *     {@code op.doPCFG} are set.</li>
   * </ul>
   * Parsers that are not created are left {@code null}.
   *
   * @param parser The parser supplying options, grammars, lexicon and indices
   */
  LexicalizedParserQuery(LexicalizedParser parser) {
    this.op = parser.getOp();

    BinaryGrammar bg = parser.bg;
    UnaryGrammar ug = parser.ug;
    Lexicon lex = parser.lex;
    DependencyGrammar dg = parser.dg;

    Index<String> stateIndex = parser.stateIndex;
    // NOTE(review): the word index is wrapped in a DeltaIndex, presumably so that
    // words first seen by this query do not modify the parser's shared index -- confirm.
    Index<String> wordIndex = new DeltaIndex<String>(parser.wordIndex);
    Index<String> tagIndex = parser.tagIndex;

    this.debinarizer = new Debinarizer(op.forceCNF);
    this.boundaryRemover = new BoundaryRemover();

    if (op.doPCFG) {
      if (op.testOptions.iterativeCKY) {
        pparser = new IterativeCKYPCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
      } else {
        pparser = new ExhaustivePCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
      }
    } else {
      pparser = null;
    }

    if (op.doDep) {
      dg.setLexicon(lex);
      if (!op.testOptions.useFastFactored) {
        dparser = new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex);
      } else {
        // The fast factored parser works from the dependency grammar directly.
        dparser = null;
      }
    } else {
      dparser = null;
    }

    if (op.doDep && op.doPCFG) {
      if (op.testOptions.useFastFactored) {
        MLEDependencyGrammar mledg = (MLEDependencyGrammar) dg;
        int numToFind = 1;
        if (op.testOptions.printFactoredKGood > 0) {
          numToFind = op.testOptions.printFactoredKGood;
        }
        bparser = new FastFactoredParser(pparser, mledg, op, numToFind, wordIndex, tagIndex);
      } else {
        Scorer scorer = new TwinScorer(pparser, dparser);
        if (op.testOptions.useN5) {
          bparser = new BiLexPCFGParser.N5BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
        } else {
          bparser = new BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
        }
      }
    } else {
      bparser = null;
    }

    subcategoryStripper = op.tlpParams.subcategoryStripper();
  }

  /**
   * Passes the given constraints to the PCFG parser.  The call is a
   * no-op when this query has no PCFG parser; the other parsers are
   * not given the constraints.
   *
   * @param constraints The constraints to hand to the PCFG parser
   */
  @Override
  public void setConstraints(List<ParserConstraint> constraints) {
    if (pparser != null) {
      pparser.setConstraints(constraints);
    }
  }

  /**
   * Parse a sentence represented as a List of tokens.
   * The text must already have been tokenized and
   * normalized into tokens that are appropriate to the treebank
   * which was used to train the parser.  The tokens can be of
   * multiple types, and the list items need not be homogeneous as to type
   * (in particular, only some words might be given tags):
   * <ul>
   * <li>If a token implements HasWord, then the word to be parsed is
   * given by its word() value.</li>
   * <li>If a token implements HasTag and the tag() value is not
   * null or the empty String, then the parser is strongly advised to assign
   * a part of speech tag that begins with this String.</li>
   * </ul>
   * All status flags are reset on entry.  {@code parseSucceeded} is only
   * set when the factored parser accepts the sentence.
   *
   * @param sentence The sentence to parse
   * @return true iff every parser that is switched on accepted the sentence
   * @throws UnsupportedOperationException If the Sentence is too long or
   *                                       of zero length or the parse
   *                                       otherwise fails for resource reasons
   * @throws RuntimeInterruptedException If the thread was interrupted
   *                                     between parsing stages
   */
  private boolean parseInternal(List<? extends HasWord> sentence) {
    parseSucceeded = false;
    parseNoMemory = false;
    parseUnparsable = false;
    parseSkipped = false;
    parseFallback = false;
    whatFailed = null;
    addedPunct = false;
    originalSentence = sentence;
    int length = sentence.size();
    if (length == 0) {
      parseSkipped = true;
      throw new UnsupportedOperationException("Can't parse a zero-length sentence!");
    }

    // sentenceB is the working copy that is actually parsed; the caller's
    // list (and, when a word function is used, its tokens) stay untouched.
    List<HasWord> sentenceB;
    if (op.wordFunction != null) {
      sentenceB = Generics.newArrayList();
      for (HasWord word : sentence) {
        if (word instanceof Label) {
          // Copy the label so that rewriting its word does not alter the original token.
          Label label = (Label) word;
          Label newLabel = label.labelFactory().newLabel(label);
          if (newLabel instanceof HasWord) {
            sentenceB.add((HasWord) newLabel);
          } else {
            throw new AssertionError("This should have been a HasWord");
          }
        } else if (word instanceof HasTag) {
          TaggedWord tw = new TaggedWord(word.word(), ((HasTag) word).tag());
          sentenceB.add(tw);
        } else {
          sentenceB.add(new Word(word.word()));
        }
      }
      for (HasWord word : sentenceB) {
        word.setWord(op.wordFunction.apply(word.word()));
      }
    } else {
      sentenceB = new ArrayList<HasWord>(sentence);
    }

    if (op.testOptions.addMissingFinalPunctuation) {
      addedPunct = addSentenceFinalPunctIfNeeded(sentenceB, length);
    }
    // The length limit applies to the sentence as given, without added punctuation.
    if (length > op.testOptions.maxLength) {
      parseSkipped = true;
      throw new UnsupportedOperationException("Sentence too long: length " + length);
    }
    TreePrint treePrint = getTreePrint();
    PrintWriter pwOut = op.tlpParams.pw();

    //Insert the boundary symbol
    if (sentence.get(0) instanceof CoreLabel) {
      CoreLabel boundary = new CoreLabel();
      boundary.setWord(Lexicon.BOUNDARY);
      boundary.setValue(Lexicon.BOUNDARY);
      boundary.setTag(Lexicon.BOUNDARY_TAG);
      boundary.setIndex(sentence.size() + 1); //1-based indexing used in the parser
      sentenceB.add(boundary);
    } else {
      sentenceB.add(new TaggedWord(Lexicon.BOUNDARY, Lexicon.BOUNDARY_TAG));
    }

    if (Thread.interrupted()) {
      throw new RuntimeInterruptedException();
    }

    if (op.doPCFG) {
      if (!pparser.parse(sentenceB)) {
        return parseSucceeded;
      }
      if (op.testOptions.verbose) {
        pwOut.println("PParser output");
        treePrint.printTree(getBestPCFGParse(false), pwOut); // without scores on nodes
      }
    }
    if (Thread.interrupted()) {
      throw new RuntimeInterruptedException();
    }
    if (op.doDep && !op.testOptions.useFastFactored) {
      if (!dparser.parse(sentenceB)) {
        return parseSucceeded;
      }
      // cdm nov 2006: should move these printing bits to the main printing section,
      // so don't calculate the best parse twice!
      if (op.testOptions.verbose) {
        pwOut.println("DParser output");
        treePrint.printTree(dparser.getBestParse(), pwOut);
      }
    }
    if (Thread.interrupted()) {
      throw new RuntimeInterruptedException();
    }
    if (op.doPCFG && op.doDep) {
      if (!bparser.parse(sentenceB)) {
        return parseSucceeded;
      } else {
        parseSucceeded = true;
      }
    }
    return true;
  }


  /**
   * Replaces the labels on the leaves of the given tree with the tokens of
   * the most recently parsed sentence, in order.  Tokens that are not
   * {@link Label}s leave the corresponding leaf unchanged.  Nothing is done
   * when there is no original sentence or the tree is {@code null}.
   *
   * @param tree The tree whose leaves are relabelled in place
   * @throws IllegalStateException If the number of leaves does not match the
   *                               number of original tokens (plus one when a
   *                               final punctuation mark was added)
   */
  @Override
  public void restoreOriginalWords(Tree tree) {
    if (originalSentence == null || tree == null) {
      return;
    }
    List<Tree> leaves = tree.getLeaves();
    int expectedSize = addedPunct ? originalSentence.size() + 1 : originalSentence.size();
    if (leaves.size() != expectedSize) {
      throw new IllegalStateException("originalWords and sentence of different sizes: " + expectedSize + " vs. " + leaves.size() +
                                      "\n Orig: " + Sentence.listToString(originalSentence) +
                                      "\n Pars: " + Sentence.listToString(leaves));
    }
    // An added punctuation leaf, if any, is last and has no original token,
    // so walking the original tokens leaves it as the parser produced it.
    Iterator<Tree> leafIterator = leaves.iterator();
    for (Object original : originalSentence) {
      Tree leaf = leafIterator.next();
      if (original instanceof Label) {
        leaf.setLabel((Label) original);
      }
    }
  }


  /**
   * Runs the PCFG parser over a (speech) lattice.  The status flags are
   * reset first and the remembered original sentence is cleared.
   *
   * @param lr a lattice to parse
   * @return Whether the lattice could be parsed by the grammar
   * @throws UnsupportedOperationException If the lattice has more states
   *                                       than the maximum length allows
   */
  boolean parse(HTKLatticeReader lr) {
    final TreePrint printer = getTreePrint();
    final PrintWriter out = op.tlpParams.pw();

    // Start from a clean slate: nothing is known about this query yet.
    originalSentence = null;
    whatFailed = null;
    parseFallback = false;
    parseSkipped = false;
    parseUnparsable = false;
    parseNoMemory = false;
    parseSucceeded = false;

    // One state more than maxLength is allowed, for the boundary symbol.
    final boolean tooBig = lr.getNumStates() > op.testOptions.maxLength + 1;
    if (tooBig) {
      parseSkipped = true;
      throw new UnsupportedOperationException("Lattice too big: " + lr.getNumStates());
    }

    if (op.doPCFG) {
      final boolean accepted = pparser.parse(lr);
      if (!accepted) {
        return parseSucceeded;
      }
      if (op.testOptions.verbose) {
        out.println("PParser output");
        printer.printTree(getBestPCFGParse(false), out);
      }
    }

    parseSucceeded = true;
    return true;
  }

  /**
   * Returns the best parse of the most recently parsed sentence, with
   * subcategories stripped.  The factored parser's result is preferred
   * when it was used and succeeded, then the PCFG parser's, then the
   * dependency parser's.
   *
   * @return The best tree, or {@code null} if the last parse was skipped
   * @throws NoSuchParseException If no parser has a parse to offer
   */
  public Tree getBestParse() {
    final boolean stripSubcat = true;
    return getBestParse(stripSubcat);
  }

  /**
   * Returns the best available parse, preferring the factored parser, then
   * the PCFG parser, then the dependency parser.
   *
   * @param stripSubcat Whether to strip subcategories from a factored parse.
   *                    NOTE(review): the PCFG fallback always strips and the
   *                    dependency fallback never does, whatever this flag says.
   * @return The best tree, or {@code null} if the last parse was skipped
   * @throws NoSuchParseException If no parser has a parse to offer
   */
  Tree getBestParse(boolean stripSubcat) {
    if (parseSkipped) {
      return null;
    }

    // First choice: the factored parse.
    if (bparser != null && parseSucceeded) {
      Tree result = debinarizer.transformTree(bparser.getBestParse());
      if (op.nodePrune) {
        result = new NodePruner(pparser, debinarizer).prune(result);
      }
      if (stripSubcat) {
        result = subcategoryStripper.transformTree(result);
      }
      restoreOriginalWords(result);
      return result;
    }

    // Second choice: the PCFG parse.
    if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
      return getBestPCFGParse();
    }

    // Last resort: the dependency parse (subcategories traditionally not stripped).
    if (dparser != null && dparser.hasParse()) {
      return getBestDependencyParse(true);
    }

    throw new NoSuchParseException();
  }

  public List> getBestPCFGParses() {
    return pparser.getBestParses();
  }

  /**
   * @return whether a factored parser exists, the last parse was neither
   *         skipped nor unsuccessful, and the factored parser has a parse
   */
  public boolean hasFactoredParse() {
    return bparser != null
        && !parseSkipped
        && parseSucceeded
        && bparser.hasParse();
  }

  /**
   * Returns the factored parser's best parse as the parser produced it;
   * unlike {@code getBestParse()}, no debinarization, subcategory stripping
   * or word restoration is applied here.  There is no check that a
   * factored parser exists, so this fails with a NullPointerException
   * when the query was built without one.
   *
   * @return The factored parser's best tree
   */
  public Tree getBestFactoredParse() {
    final Tree best = bparser.getBestParse();
    return best;
  }

  public List> getKGoodFactoredParses(int k) {
    if (bparser == null || parseSkipped) {
      return null;
    }
    List> binaryTrees = bparser.getKGoodParses(k);
    if (binaryTrees == null) {
      return null;
    }
    List> trees = new ArrayList>(k);
    for (ScoredObject tp : binaryTrees) {
      Tree t = debinarizer.transformTree(tp.object());
      t = subcategoryStripper.transformTree(t);
      restoreOriginalWords(t);
      trees.add(new ScoredObject(t, tp.score()));
    }
    return trees;
  }

  /**
   * Returns the trees (and scores) corresponding to the
   * k-best derivations of the sentence.  This cannot be
   * a Counter because frequently there will be multiple
   * derivations which lead to the same parse tree.
   *
   * @param k The number of best parses to return
   * @return The list of trees with their scores (log prob).
   */
  public List> getKBestPCFGParses(int k) {
    if (pparser == null) {
      return null;
    }
    List> binaryTrees = pparser.getKBestParses(k);
    if (binaryTrees == null) {
      return null;
    }
    List> trees = new ArrayList>(k);
    for (ScoredObject p : binaryTrees) {
      Tree t = debinarizer.transformTree(p.object());
      t = subcategoryStripper.transformTree(t);
      restoreOriginalWords(t);
      trees.add(new ScoredObject(t, p.score()));
    }
    return trees;
  }


  /**
   * @return the PCFG parser's best parse with subcategories stripped, or
   *         {@code null} when no such parse is available
   */
  public Tree getBestPCFGParse() {
    final boolean stripSubcategories = true;
    return getBestPCFGParse(stripSubcategories);
  }

  /**
   * Returns the PCFG parser's best parse, debinarized and with the
   * original words restored.
   *
   * @param stripSubcategories Whether to strip subcategories from the tree
   * @return The tree, or {@code null} if there is no PCFG parser, the last
   *         parse was skipped or unparsable, or the parser has no best parse
   */
  public Tree getBestPCFGParse(boolean stripSubcategories) {
    final boolean unavailable = pparser == null || parseSkipped || parseUnparsable;
    if (unavailable) {
      return null;
    }

    final Tree binarized = pparser.getBestParse();
    if (binarized == null) {
      return null;
    }

    final Tree debinarized = debinarizer.transformTree(binarized);
    final Tree result = stripSubcategories
        ? subcategoryStripper.transformTree(debinarized)
        : debinarized;
    restoreOriginalWords(result);
    return result;
  }

  /**
   * Asks the PCFG parser for its best score.  There is no check that a
   * PCFG parser exists.
   *
   * @return the best score reported by the PCFG parser
   */
  @Override
  public double getPCFGScore() {
    final double best = pparser.getBestScore();
    return best;
  }

  /**
   * Asks the PCFG parser for its best score for the given goal.  There is
   * no check that a PCFG parser exists.
   *
   * @param goalStr The goal passed through to the PCFG parser
   * @return the best score the PCFG parser reports for that goal
   */
  double getPCFGScore(String goalStr) {
    final double best = pparser.getBestScore(goalStr);
    return best;
  }

  /**
   * Runs only the PCFG parser on the given sentence, after resetting all
   * status state.  The sentence is handed to the parser as is: no word
   * function, punctuation or boundary symbol is applied here.  The
   * parser's accept/reject result is not reported.
   *
   * @param sentence The sentence to parse
   */
  void parsePCFG(List<? extends HasWord> sentence) {
    parseSucceeded = false;
    parseNoMemory = false;
    parseUnparsable = false;
    parseSkipped = false;
    parseFallback = false;
    whatFailed = null;
    // No punctuation is added on this path; a stale flag from an earlier
    // parse would make restoreOriginalWords expect one leaf too many.
    addedPunct = false;
    originalSentence = sentence;
    pparser.parse(sentence);
  }

  /**
   * @return the dependency parser's best parse without debinarization, or
   *         {@code null} when no such parse is available
   */
  public Tree getBestDependencyParse() {
    final boolean debinarize = false;
    return getBestDependencyParse(debinarize);
  }

  /**
   * Returns the dependency parser's best parse with the boundary symbol
   * removed and the original words restored.
   *
   * @param debinarize Whether to debinarize the tree first
   * @return The tree, or {@code null} if there is no dependency parser,
   *         the last parse was skipped or unparsable, or the parser has
   *         no best parse
   */
  @Override
  public Tree getBestDependencyParse(boolean debinarize) {
    final boolean unavailable = dparser == null || parseSkipped || parseUnparsable;
    if (unavailable) {
      return null;
    }

    Tree best = dparser.getBestParse();
    if (best == null) {
      return null;
    }

    if (debinarize) {
      best = debinarizer.transformTree(best);
    }
    // The boundary .$$. is otherwise still there from dparser.
    best = boundaryRemover.transformTree(best);
    restoreOriginalWords(best);
    return best;
  }

  /**
   * Parse a sentence represented as a List of tokens.
   * The text must already have been tokenized and
   * normalized into tokens that are appropriate to the treebank
   * which was used to train the parser.  The tokens can be of
   * multiple types, and the list items need not be homogeneous as to type
   * (in particular, only some words might be given tags):
   * <ul>
   * <li>If a token implements HasWord, then the word to be parsed is
   * given by its word() value.</li>
   * <li>If a token implements HasTag and the tag() value is not
   * null or the empty String, then the parser is strongly advised to assign
   * a part of speech tag that begins with this String.</li>
   * </ul>
   * Sentences that are empty or too long are reported through
   * parseSkipped() rather than by an exception.
   *
   * @param sentence The sentence to parse
   * @return true Iff the sentence was accepted by the grammar.  If
   *              the main grammar fails, but the PCFG succeeds, then
   *              this still returns true, but parseFallback() will
   *              also return true.  getBestParse() will have a valid
   *              result iff this returns true.
   * @throws OutOfMemoryError If memory runs out and a maximum sentence
   *                          length was explicitly requested
   */
  @Override
  public boolean parse(List<? extends HasWord> sentence) {
    try {
      if (parseInternal(sentence)) {
        return true;
      }
      if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
        parseFallback = true;
        return true;
      }
      parseUnparsable = true;
      return false;
    } catch (OutOfMemoryError e) {
      if (op.testOptions.maxLength != -0xDEADBEEF) {
        // this means they explicitly asked for a length they cannot handle.
        // Throw exception.  Avoid string concatenation before throw it.
        System.err.print("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH ");
        System.err.println(op.testOptions.maxLength);
        throw e;
      }
      // pparser is null when PCFG parsing is switched off, so it must be
      // checked here just as in the non-error path above.
      if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
        try {
          // With a PCFG parse in hand, the failure was in the dependency
          // parser unless that parser finished (or does not exist because
          // the fast factored parser is in use), in which case it was in
          // the factored parser.
          whatFailed = "dependency";
          if (dparser == null || dparser.hasParse()) {
            whatFailed = "factored";
          }
          parseFallback = true;
          return true;
        } catch (OutOfMemoryError oome) {
          oome.printStackTrace();
          parseNoMemory = true;
          pparser.nudgeDownArraySize();
          return false;
        }
      } else {
        parseNoMemory = true;
        return false;
      }
    } catch (UnsupportedOperationException uoe) {
      // Raised by parseInternal for zero-length or over-long sentences.
      parseSkipped = true;
      return false;
    }
  }

  /**
   * Implements the same parsing with fallback that parse() does, but
   * also outputs status messages for failed parses to pwErr.
   *
   * @param sentence The sentence to parse
   * @param pwErr Where status messages about fallbacks and failures are written
   * @return The result of {@code parse(sentence)}
   */
  @Override
  public boolean parseAndReport(List<? extends HasWord> sentence, PrintWriter pwErr) {
    boolean result = parse(sentence);
    if (result) {
      if (whatFailed != null) {
        // Something failed, probably because of memory problems.
        // However, we still got a PCFG parse, at least.
        printOutOfMemoryOnce(pwErr);
        pwErr.println("Sentence too long for " + whatFailed + " parser.  Falling back to PCFG parse...");
      } else if (parseFallback) {
        // We had to fall back for some other reason.
        pwErr.println("Sentence couldn't be parsed by grammar.... falling back to PCFG parse.");
      }
    } else if (parseUnparsable) {
      // No parse at all, completely failed.
      pwErr.println("Sentence couldn't be parsed by grammar.");
    } else if (parseNoMemory) {
      // Ran out of memory, either with or without a possible PCFG parse.
      printOutOfMemoryOnce(pwErr);
      // pparser is null when PCFG parsing is switched off.
      if (pparser != null && pparser.hasParse() && fallbackToPCFG) {
        pwErr.println("No memory to gather PCFG parse. Skipping...");
      } else {
        pwErr.println("Sentence has no parse using PCFG grammar (or no PCFG fallback).  Skipping...");
      }
    } else if (parseSkipped) {
      pwErr.println("Sentence too long (or zero words).");
    }
    return result;
  }

  /** Prints the general out-of-memory explanation the first time it is needed, and never again. */
  private void printOutOfMemoryOnce(PrintWriter pwErr) {
    if (!saidMemMessage) {
      ParserUtils.printOutOfMemory(pwErr);
      saidMemMessage = true;
    }
  }


  /**
   * Obtains the tree printer configured by the test options for the
   * current language parameters.
   *
   * @return A TreePrint for formatting parsed output trees.
   */
  public TreePrint getTreePrint() {
    final TreePrint printer = op.testOptions.treePrint(op.tlpParams);
    return printer;
  }

  /** @return the PCFG parser, or {@code null} if this query was built without one */
  @Override
  public KBestViterbiParser getPCFGParser() {
    return this.pparser;
  }

  /** @return the dependency parser, or {@code null} if this query was built without one */
  @Override
  public KBestViterbiParser getDependencyParser() {
    return this.dparser;
  }

  /** @return the factored parser, or {@code null} if this query was built without one */
  @Override
  public KBestViterbiParser getFactoredParser() {
    return this.bparser;
  }

  /** Adds a sentence final punctuation mark to sentences that lack one.
   *  This method adds a period (the first sentence final punctuation word
   *  in a parser language pack) to sentences that don't have one within
   *  the last 3 words (to allow for close parentheses, etc.).  It checks
   *  tags for punctuation, if available, otherwise words.
   *
   *  @param sentence The sentence to check; a token is appended to it when
   *                  punctuation is added
   *  @param length The length of the sentence (just to avoid recomputation)
   *  @return true iff a punctuation token was actually appended, so that
   *          callers can rely on the sentence being exactly one token longer
   */
  private boolean addSentenceFinalPunctIfNeeded(List<HasWord> sentence, int length) {
    int start = Math.max(0, length - 3);
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();
    for (int i = length - 1; i >= start; i--) {
      HasWord item = sentence.get(i);
      // An object (e.g., CoreLabel) can implement HasTag but not actually store
      // a tag so we need to check that there is something there for this case.
      // If there is, use only it, since word tokens can be ambiguous.
      String tag = null;
      if (item instanceof HasTag) {
        tag = ((HasTag) item).tag();
      }
      if (tag != null && !tag.isEmpty()) {
        if (tlp.isSentenceFinalPunctuationTag(tag)) {
          return false;
        }
      } else {
        String str = item.word();
        if (tlp.isPunctuationWord(str)) {
          return false;
        }
      }
    }
    // none found so add one, if the language pack has one to offer.
    String[] sfpWords = tlp.sentenceFinalPunctuationWords();
    if (sfpWords == null || sfpWords.length == 0) {
      // Nothing was appended, so the sentence length is unchanged.
      return false;
    }
    if (op.testOptions.verbose) {
      System.err.println("Adding missing final punctuation to sentence.");
    }
    sentence.add(new Word(sfpWords[0]));
    return true;
  }

}