edu.stanford.nlp.parser.lexparser.AbstractTreebankParserParams Maven / Gradle / Ivy

Go to download
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.parser.metrics.AbstractEval;
import edu.stanford.nlp.parser.tools.PunctEquivalenceClasser;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.stats.EquivalenceClasser;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;

import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;


/**
 * An abstract class providing a common method base from which to
 * complete a {@code TreebankLangParserParams} implementing class.
 * 
 * With some extending classes you'll want to have access to special
 * attributes of the corresponding TreebankLanguagePack while taking
 * advantage of this class's code for making the TreebankLanguagePack
 * accessible.  A good way to do this is to pass a new instance of the
 * appropriate TreebankLanguagePack into this class's constructor,
 * then get it back later on by casting a call to
 * treebankLanguagePack().  See ChineseTreebankParserParams for an
 * example.
 *
 * @author Roger Levy
 */
public abstract class AbstractTreebankParserParams implements TreebankLangParserParams {

  /**
   * If true, then evaluation is over grammatical functions as well as the labels
   * If false, then grammatical functions are stripped for evaluation.  This really
   * only makes sense if you've trained with grammatical functions but want to evaluate without them.
   */
  protected boolean evalGF = true;

  /** The job of this class is to remove subcategorizations from
   *  tag and category nodes, so as to put a tree in a suitable
   *  state for evaluation.  Providing the TreebankLanguagePack
   *  is defined correctly, this should work for any language.
   */
  protected class SubcategoryStripper implements TreeTransformer {

    protected TreeFactory tf = new LabeledScoredTreeFactory();

    @Override
    public Tree transformTree(Tree tree) {
      Label lab = tree.label();
      if (tree.isLeaf()) {
        Tree leaf = tf.newLeaf(lab);
        leaf.setScore(tree.score());
        return leaf;
      }
      String s = lab.value();
      s = treebankLanguagePack().basicCategory(s);
      int numKids = tree.numChildren();
      List children = new ArrayList<>(numKids);
      for (int cNum = 0; cNum < numKids; cNum++) {
        Tree child = tree.getChild(cNum);
        Tree newChild = transformTree(child);
        // cdm 2007: for just subcategory stripping, null shouldn't happen
        // if (newChild != null) {
        children.add(newChild);
        // }
      }
      // if (children.isEmpty()) {
      //   return null;
      // }
      CategoryWordTag newLabel = new CategoryWordTag(lab);
      newLabel.setCategory(s);
      if (lab instanceof HasTag) {
        String tag = ((HasTag) lab).tag();
        tag = treebankLanguagePack().basicCategory(tag);
        newLabel.setTag(tag);
      }
      Tree node = tf.newTreeNode(newLabel, children);
      node.setScore(tree.score());
      return node;
    }

  } // end class SubcategoryStripper

  /** The job of this class is to remove subcategorizations from
   *  tag and category nodes, so as to put a tree in a suitable
   *  state for evaluation.  Providing the TreebankLanguagePack
   *  is defined correctly, this should work for any language.
   *  Very simililar to subcategory stripper, but strips grammatical
   *  functions as well.
   */
  protected class RemoveGFSubcategoryStripper implements TreeTransformer {

    protected TreeFactory tf = new LabeledScoredTreeFactory();

    @Override
    public Tree transformTree(Tree tree) {
      Label lab = tree.label();
      if (tree.isLeaf()) {
        Tree leaf = tf.newLeaf(lab);
        leaf.setScore(tree.score());
        return leaf;
      }
      String s = lab.value();
      s = treebankLanguagePack().basicCategory(s);
      s = treebankLanguagePack().stripGF(s);
      int numKids = tree.numChildren();
      List children = new ArrayList<>(numKids);
      for (int cNum = 0; cNum < numKids; cNum++) {
        Tree child = tree.getChild(cNum);
        Tree newChild = transformTree(child);
        children.add(newChild);
      }
      CategoryWordTag newLabel = new CategoryWordTag(lab);
      newLabel.setCategory(s);
      if (lab instanceof HasTag) {
        String tag = ((HasTag) lab).tag();
        tag = treebankLanguagePack().basicCategory(tag);
        tag = treebankLanguagePack().stripGF(tag);

        newLabel.setTag(tag);
      }
      Tree node = tf.newTreeNode(newLabel, children);
      node.setScore(tree.score());
      return node;
    }
  } // end class RemoveGFSubcategoryStripper

  protected String inputEncoding;
  protected String outputEncoding;
  protected TreebankLanguagePack tlp;
  protected boolean generateOriginalDependencies;


  /**
   * Stores the passed-in TreebankLanguagePack and sets up charset encodings.
   *
   * @param tlp The treebank language pack to use
   */
  protected AbstractTreebankParserParams(TreebankLanguagePack tlp) {
    this.tlp = tlp;
    inputEncoding = tlp.getEncoding();
    outputEncoding = tlp.getEncoding();
    generateOriginalDependencies = false;
  }

  @Override
  public Label processHeadWord(Label headWord) {
    return headWord;
  }

  /**
   * Sets whether to consider grammatical functions in evaluation
   */
  @Override
  public void setEvaluateGrammaticalFunctions(boolean evalGFs) {
    this.evalGF = evalGFs;
  }

  /**
   * Sets the input encoding.
   */
  @Override
  public void setInputEncoding(String encoding) {
    inputEncoding = encoding;
  }

  /**
   * Sets the output encoding.
   */
  @Override
  public void setOutputEncoding(String encoding) {
    outputEncoding = encoding;
  }

  /**
   * Returns the output encoding being used.
   */
  @Override
  public String getOutputEncoding() {
    return outputEncoding;
  }

  /**
   * Returns the input encoding being used.
   */
  @Override
  public String getInputEncoding() {
    return inputEncoding;
  }


  /**
   * Returns a language specific object for evaluating PP attachment
   *
   * @return An object that implements {@link AbstractEval}
   */
  @Override
  public AbstractEval ppAttachmentEval() {
    return null;
  }

  /**
   * returns a MemoryTreebank appropriate to the treebank source
   */
  @Override
  public abstract MemoryTreebank memoryTreebank();

  /**
   * returns a DiskTreebank appropriate to the treebank source
   */
  @Override
  public abstract DiskTreebank diskTreebank();

  /**
   * You can often return the same thing for testMemoryTreebank as
   * for memoryTreebank
   */
  @Override
  public MemoryTreebank testMemoryTreebank() {
    return memoryTreebank();
  }

  /**
   * Implemented as required by TreebankFactory. Use diskTreebank() instead.
   */
  @Override
  public Treebank treebank() {
    return diskTreebank();
  }

  /**
   * The PrintWriter used to print output. It's the responsibility of
   * pw to deal properly with character encodings for the relevant
   * treebank.
   */
  @Override
  public PrintWriter pw() {
    return pw(System.out);
  }

  /**
   * The PrintWriter used to print output. It's the responsibility of
   * pw to deal properly with character encodings for the relevant
   * treebank.
   */
  @Override
  public PrintWriter pw(OutputStream o) {
    String encoding = outputEncoding;
    if (!java.nio.charset.Charset.isSupported(encoding)) {
      System.err.println("Warning: desired encoding " + encoding + " not accepted. ");
      System.err.println("Using UTF-8 to construct PrintWriter");
      encoding = "UTF-8";
    }

    //System.err.println("TreebankParserParams.pw(): encoding is " + encoding);
    try {
      return new PrintWriter(new OutputStreamWriter(o, encoding), true);
    } catch (UnsupportedEncodingException e) {
      System.err.println("Warning: desired encoding " + outputEncoding + " not accepted. " + e);
      try {
        return new PrintWriter(new OutputStreamWriter(o, "UTF-8"), true);
      } catch (UnsupportedEncodingException e1) {
        System.err.println("Something is really wrong.  Your system doesn't even support UTF-8!" + e1);
        return new PrintWriter(o, true);
      }
    }
  }


  /**
   * Returns an appropriate treebankLanguagePack
   */
  @Override
  public TreebankLanguagePack treebankLanguagePack() {
    return tlp;
  }

  /**
   * The HeadFinder to use for your treebank.
   */
  @Override
  public abstract HeadFinder headFinder();

  /**
   * The HeadFinder to use when extracting typed dependencies.
   */
  @Override
  public abstract HeadFinder typedDependencyHeadFinder();

  @Override
  public Lexicon lex(Options op, Index wordIndex, Index tagIndex) {
    return new BaseLexicon(op, wordIndex, tagIndex);
  }

  /**
   * Give the parameters for smoothing in the MLEDependencyGrammar.
   * Defaults are the ones previously hard coded into MLEDependencyGrammar.
   * @return an array of doubles with smooth_aT_hTWd, smooth_aTW_hTWd, smooth_stop, and interp
   */
  @Override
  public double[] MLEDependencyGrammarSmoothingParams() {
    return new double[] { 16.0, 16.0, 4.0, 0.6 };
  }


  /**
   * Takes a Tree and a collinizer and returns a Collection of labeled
   * {@link Constituent}s for PARSEVAL.
   * @param t The tree to extract constituents from
   * @param collinizer The TreeTransformer used to normalize the tree for
   *              evaluation
   * @return The bag of Constituents for PARSEVAL.
   */
  public static Collection parsevalObjectify(Tree t, TreeTransformer collinizer) {
    return parsevalObjectify(t,collinizer,true);
  }

  /**
   * Takes a Tree and a collinizer and returns a Collection of {@link Constituent}s for
   * PARSEVAL evaluation.  Some notes on this particular parseval:
   * 

   *  It is character-based, which allows it to be used on segmentation/parsing combination evaluation.
   * 
 whether it gives you labeled or unlabeled bracketings depends on the value of the {@code labelConstituents}
   * parameter
   * 
   *
   * (Note that I haven't checked this rigorously yet with the PARSEVAL definition
   * -- Roger.)
   */
  public static Collection parsevalObjectify(Tree t, TreeTransformer collinizer, boolean labelConstituents) {
    Collection spans = new ArrayList<>();
    Tree t1 = collinizer.transformTree(t);
    if (t1 == null) {
      return spans;
    }
    for (Tree node : t1) {
      if (node.isLeaf() || node.isPreTerminal() || (node != t1 && node.parent(t1) == null)) {
        continue;
      }
      int leftEdge = t1.leftCharEdge(node);
      int rightEdge = t1.rightCharEdge(node);
      if(labelConstituents)
        spans.add(new LabeledConstituent(leftEdge, rightEdge, node.label()));
      else
        spans.add(new SimpleConstituent(leftEdge, rightEdge));
    }
    return spans;
  }


  /**
   * Returns a collection of untyped word-word dependencies for the tree.
   */
  public static Collection> untypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new UntypedDependencyTyper(hf));
  }

  /**
   * Returns a collection of unordered (but directed!) untyped word-word dependencies for the tree.
   */
  public static Collection> unorderedUntypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new UnorderedUntypedDependencyTyper(hf));
  }

  /**
   * Returns a collection of word-word dependencies typed by mother, head, daughter node syntactic categories.
   */
  public static Collection> typedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new TypedDependencyTyper(hf));
  }

  /**
   * Returns a collection of unordered (but directed!) typed word-word dependencies for the tree.
   */
  public static Collection> unorderedTypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new UnorderedTypedDependencyTyper(hf));
  }

  /**
   * Returns the set of dependencies in a tree, according to some {@link edu.stanford.nlp.trees.DependencyTyper}.
   */
  public static  Collection dependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer, DependencyTyper typer) {
    Collection deps = new ArrayList<>();
    Tree t1 = collinizer.transformTree(t);
    if(t1==null)
      return deps;
    dependencyObjectifyHelper(t1, t1, hf, deps, typer);
    return deps;
  }

  private static  void dependencyObjectifyHelper(Tree t, Tree root, HeadFinder hf, Collection c, DependencyTyper typer) {
    if (t.isLeaf() || t.isPreTerminal()) {
      return;
    }
    Tree headDtr = hf.determineHead(t);
    for (Tree child : t.children()) {
      dependencyObjectifyHelper(child, root, hf, c, typer);
      if (child != headDtr) {
        c.add(typer.makeDependency(headDtr, child, root));
      }
    }
  }


  private static class UntypedDependencyTyper implements DependencyTyper> {
    HeadFinder hf;

    public UntypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }

    @Override
    public List makeDependency(Tree head, Tree dep, Tree root) {
      List result = new ArrayList<>(3);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      boolean headLeft = root.leftCharEdge(headTerm) < root.leftCharEdge(depTerm);
      result.add(headTerm.value());
      result.add(depTerm.value());
      if(headLeft)
        result.add(leftHeaded);
      else
        result.add(rightHeaded);
      return result;
    }

  }

  private static class UnorderedUntypedDependencyTyper implements DependencyTyper> {
    HeadFinder hf;

    public UnorderedUntypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }

    @Override
    public List makeDependency(Tree head, Tree dep, Tree root) {
      List result = new ArrayList<>(3);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      result.add(headTerm.value());
      result.add(depTerm.value());
      return result;
    }

  }

  private static final String leftHeaded = "leftHeaded";
  private static final String rightHeaded = "rightHeaded";

  private static class TypedDependencyTyper implements DependencyTyper> {
    HeadFinder hf;

    public TypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }


    @Override
    public List makeDependency(Tree head, Tree dep, Tree root) {
      List result = new ArrayList<>(6);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      boolean headLeft = root.leftCharEdge(headTerm) < root.leftCharEdge(depTerm);
      result.add(headTerm.value());
      result.add(depTerm.value());
      result.add(head.parent(root).value());
      result.add(head.value());
      result.add(dep.value());
      if(headLeft)
        result.add(leftHeaded);
      else
        result.add(rightHeaded);
      return result;
    }

  }

    private static class UnorderedTypedDependencyTyper implements DependencyTyper> {
    HeadFinder hf;

    public UnorderedTypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }

    @Override
    public List makeDependency(Tree head, Tree dep, Tree root) {
      List result = new ArrayList<>(6);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      result.add(headTerm.value());
      result.add(depTerm.value());
      result.add(head.parent(root).value());
      result.add(head.value());
      result.add(dep.value());
      return result;
    }

  }

  /** Returns an EquivalenceClasser that classes typed dependencies
   *  by the syntactic categories of mother, head and daughter,
   *  plus direction.
   *
   *  @return An Equivalence class for typed dependencies
   */
  public static EquivalenceClasser, String> typedDependencyClasser() {
    return s -> {
      if(s.get(5).equals(leftHeaded))
        return s.get(2) + '(' + s.get(3) + "->" + s.get(4) + ')';
      return s.get(2) + '(' + s.get(4) + "<-" + s.get(3) + ')';
    };
  }


  /**
   * the tree transformer used to produce trees for evaluation.  Will
   * be applied both to the parse output tree and to the gold
   * tree. Should strip punctuation and maybe do some other things.
   */
  @Override
  public abstract TreeTransformer collinizer();

  /**
   * the tree transformer used to produce trees for evaluation.  Will
   * be applied both to the parse output tree and to the gold
   * tree. Should strip punctuation and maybe do some other
   * things. The evalb version should strip some more stuff
   * off. (finish this doc!)
   */
  @Override
  public abstract TreeTransformer collinizerEvalb();


  /**
   * Returns the splitting strings used for selective splits.
   *
   * @return An array containing ancestor-annotated Strings: categories
   *         should be split according to these ancestor annotations.
   */
  @Override
  public abstract String[] sisterSplitters();


  /**
   * Returns a TreeTransformer appropriate to the Treebank which
   * can be used to remove functional tags (such as "-TMP") from
   * categories. Removes GFs if evalGF = false; if GFs were not used
   * in training, results are equivalent.
   */
  @Override
  public TreeTransformer subcategoryStripper() {
    if(evalGF)
      return new SubcategoryStripper();
    return new RemoveGFSubcategoryStripper();
  }

  /**
   * This method does language-specific tree transformations such
   * as annotating particular nodes with language-relevant features.
   * Such parameterizations should be inside the specific
   * TreebankLangParserParams class.  This method is recursively
   * applied to each node in the tree (depth first, left-to-right),
   * so you shouldn't write this method to apply recursively to tree
   * members.  This method is allowed to (and in some cases does)
   * destructively change the input tree {@code t}. It changes both
   * labels and the tree shape.
   *
   * @param t The input tree (with non-language specific annotation already
   *           done, so you need to strip back to basic categories)
   * @param root The root of the current tree (can be null for words)
   * @return The fully annotated tree node (with daughters still as you
   *           want them in the final result)
   */
  @Override
  public abstract Tree transformTree(Tree t, Tree root);

  /**
   * Display (write to stderr) language-specific settings.
   */
  @Override
  public abstract void display();

  /**
   * Set language-specific options according to flags.
   * This routine should process the option starting in args[i] (which
   * might potentially be several arguments long if it takes arguments).
   * It should return the index after the last index it consumed in
   * processing.  In particular, if it cannot process the current option,
   * the return value should be i.
   * 
   * Generic options are processed separately by
   * {@link Options#setOption(String[],int)},
   * and implementations of this method do not have to worry about them.
   * The Options class handles routing options.
   * TreebankParserParams that extend this class should call super when
   * overriding this method.
   */
  @Override
  public int setOptionFlag(String[] args, int i) {
    return i;
  }

  private static final long serialVersionUID = 4299501909017975915L;

  @Override
  public TokenizerFactory treeTokenizerFactory() {
    return new TreeTokenizerFactory(treeReaderFactory());
  }

  @Override
  public Extractor dependencyGrammarExtractor(Options op, Index wordIndex, Index tagIndex) {
    return new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
  }

  public boolean isEvalGF() {
    return evalGF;
  }

  public void setEvalGF(boolean evalGF) {
    this.evalGF = evalGF;
  }

  /**
   * Annotation function for mapping punctuation to PTB-style equivalence classes.
   *
   * @author Spence Green
   *
   */
  protected static class AnnotatePunctuationFunction implements SerializableFunction {
    private final String key;
    private final String annotationMark;

    public AnnotatePunctuationFunction(String annotationMark, String key) {
      this.key = key;
      this.annotationMark = annotationMark;
    }

    @Override
    public String apply(TregexMatcher m) {
      String punc = m.getNode(key).value();
      String punctClass = PunctEquivalenceClasser.getPunctClass(punc);

      return punctClass.equals("") ? "" : annotationMark + punctClass;
    }

    @Override
    public String toString() { return "AnnotatePunctuationFunction"; }

    private static final long serialVersionUID = 1L;
  }

  @Override
  public List
    readGrammaticalStructureFromFile(String filename)
  {
    throw new UnsupportedOperationException("This language does not support GrammaticalStructures or dependencies");
  }

  @Override
  public GrammaticalStructure getGrammaticalStructure(Tree t,
                                                      Predicate filter,
                                                      HeadFinder hf) {
    throw new UnsupportedOperationException("This language does not support GrammaticalStructures or dependencies");
  }

  /**
   * By default, parsers are assumed to not support dependencies.
   * Only English and Chinese do at present.
   */
  @Override
  public boolean supportsBasicDependencies() {
    return false;
  }

  /**
   * For languages that have implementations of the
   * original Stanford dependencies and Universal
   * dependencies, this parameter is used to decide which
   * implementation should be used.
   */
  @Override
  public void setGenerateOriginalDependencies(boolean originalDependencies) {
    this.generateOriginalDependencies = originalDependencies;
    if (this.tlp != null) {
      this.tlp.setGenerateOriginalDependencies(originalDependencies);
    }
  }

  @Override
  public boolean generateOriginalDependencies() {
    return this.generateOriginalDependencies;
  }

  private static final String[] EMPTY_ARGS = new String[0];

  @Override
  public String[] defaultCoreNLPFlags() {
    return EMPTY_ARGS;
  }

}