// edu.stanford.nlp.parser.lexparser.TueBaDZParserParams Maven / Gradle / Ivy
//
// Go to download
package edu.stanford.nlp.parser.lexparser;

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZHeadFinder;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZLanguagePack;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZTreeReaderFactory;
import edu.stanford.nlp.util.Index;


/** TreebankLangParserParams for the German Tuebingen corpus.
 *
 *  The TueBaDZTreeReaderFactory has been changed in order to use a
 *  TueBaDZPennTreeNormalizer.
 *
 *  @author Roger Levy ([email protected])
 *  @author Wolfgang Maier ([email protected])
 */
public class TueBaDZParserParams extends AbstractTreebankParserParams {

  private HeadFinder hf = new TueBaDZHeadFinder();

  /** How to clean up node labels: 0 = do nothing, 1 = keep category and
   *  function, 2 = just category.
   */
  private int nodeCleanup = 0;
  private boolean markKonjParent = false;
  private boolean markContainsV = true;
  private boolean markZu = true;
  private boolean markColons = false;
  private boolean leftPhrasal = false;
  private boolean markHDParent = false;
  private boolean leaveGF = false;


  public TueBaDZParserParams() {
    super(new TueBaDZLanguagePack());
  }

  /** Returns the first sentence of TueBaDZ. */
  @Override
  public List defaultTestSentence() {
    return Sentence.toWordList("Veruntreute", "die", "AWO", "Spendengeld", "?");
  }

  @Override
  public String[] sisterSplitters() {
    return new String[0];
  }

  @Override
  public TreeTransformer collinizer() {
    return new TreeCollinizer(treebankLanguagePack());
  }

  @Override
  public TreeTransformer collinizerEvalb() {
    return new TreeCollinizer(treebankLanguagePack());
  }

  @Override
  public MemoryTreebank memoryTreebank() {
    return new MemoryTreebank(treeReaderFactory());
  }

  @Override
  public DiskTreebank diskTreebank() {
    return new DiskTreebank(treeReaderFactory());
  }

  @Override
  public TreeReaderFactory treeReaderFactory() {
    return new TueBaDZTreeReaderFactory(treebankLanguagePack(), nodeCleanup);
  }

  @Override
  public Lexicon lex(Options op, Index wordIndex, Index tagIndex) {
    if (op.lexOptions.uwModelTrainer == null) {
      op.lexOptions.uwModelTrainer = "edu.stanford.nlp.parser.lexparser.GermanUnknownWordModelTrainer";
    }
    return new BaseLexicon(op, wordIndex, tagIndex);
  }

  /**
   * Set language-specific options according to flags.
   * This routine should process the option starting in args[i] (which
   * might potentially be several arguments long if it takes arguments).
   * It should return the index after the last index it consumed in
   * processing.  In particular, if it cannot process the current option,
   * the return value should be i.
   * 
   * In the TueBaDZ ParserParams, all flags take 1 argument (and so can all
   * be turned on and off).
   */
  @Override
  public int setOptionFlag(String[] args, int i) {
    // [CDM 2008: there are no generic options!] first, see if it's a generic option
    // int j = super.setOptionFlag(args, i);
    // if(i != j) return j;

    //lang. specific options
    if (args[i].equalsIgnoreCase("-nodeCleanup")) {
      nodeCleanup = Integer.parseInt(args[i + 1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-markKonjParent")) {
      markKonjParent = Boolean.parseBoolean(args[i+1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-markContainsV")) {
      markContainsV = Boolean.parseBoolean(args[i+1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-markZu")) {
      markZu = Boolean.parseBoolean(args[i+1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-markColons")) {
      markColons = Boolean.parseBoolean(args[i+1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-leftPhrasal")) {
      leftPhrasal = Boolean.parseBoolean(args[i+1]);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-markHDParent")) {
      markHDParent = Boolean.parseBoolean(args[i+1]);
      i += 2;
    }  else if (args[i].equalsIgnoreCase("-leaveGF")) {
      leaveGF = Boolean.parseBoolean(args[i+1]);
      ((TueBaDZLanguagePack) treebankLanguagePack()).setLeaveGF(leaveGF);
      i += 2;
    } else if (args[i].equalsIgnoreCase("-evalGF")) {
      this.setEvalGF(Boolean.parseBoolean(args[i + 1]));
      i+=2;
    } else if (args[i].equalsIgnoreCase("-limitedGF")) {
      ((TueBaDZLanguagePack) treebankLanguagePack()).setLimitedGF(Boolean.parseBoolean(args[i + 1]));
      i+=2;
    } else if (args[i].equalsIgnoreCase("-gfCharacter")) {
      String gfChar = args[i + 1];
      if (gfChar.length() > 1) {
        System.out.println("Warning! gfCharacter argument ignored; must specify a character, not a String");
      }
      treebankLanguagePack().setGfCharacter(gfChar.charAt(0));
      i+=2;
    }

    return i;
  }

  @Override
  public void display() {
    System.err.println("TueBaDZParserParams nodeCleanup=" + nodeCleanup +
                       " mKonjParent=" + markKonjParent + " mContainsV=" + markContainsV +
                       " mZu=" + markZu + " mColons=" + markColons);
  }

  /** returns a {@link TueBaDZHeadFinder}. */
  @Override
  public HeadFinder headFinder() {
    return hf;
  }

  @Override
  public HeadFinder typedDependencyHeadFinder() {
    return headFinder();
  }


  /** Annotates a tree according to options. */
  @Override
  public Tree transformTree(Tree t, Tree root) {
    if (t == null || t.isLeaf()) {
      return t;
    }

    List annotations = new ArrayList<>();
    Label lab = t.label();
    String word = null;
    if (lab instanceof HasWord) {
      word = ((HasWord) lab).word();
    }
    String tag = null;
    if (lab instanceof HasTag) {
      tag = ((HasTag) lab).tag();
    }
    String cat = lab.value();
    // Tree parent = t.parent(root);

    if (t.isPhrasal()) {

      List childBasicCats = childBasicCats(t);

      // cdm 2008: have form for with and without functional tags since this is a hash
      if (markZu && cat.startsWith("V") && (childBasicCats.contains("PTKZU") || childBasicCats.contains("PTKZU-HD") || childBasicCats.contains("VVIZU") || childBasicCats.contains("VVIZU-HD"))) {
        annotations.add("%ZU");
      }
      if (markContainsV && containsV(t)) {
        annotations.add("%vp");
      }

      if (markKonjParent) {
        // this depends on functional tags being present
        for (String cCat : childBasicCats) {
          if (cCat.contains("-KONJ")) {
            annotations.add("%konjp");
            break;
          }
        }
      }

      if (markHDParent) {
        // this depends on functional tags being present
        for (String cCat : childBasicCats) {
          if (cCat.contains("-HD")) {
            annotations.add("%hdp");
            break;
          }
        }
      }
    } else {
      // t.isPreTerminal() case
//      if (word.equals("%")) {
//        annotations.add("-%");
//      }
//      if(parent != null) {
//        String parentVal = parent.label().value();
//        int cutOffPtD = parentVal.indexOf('-');
//        int cutOffPtC = parentVal.indexOf('^');
//        int curMin = parentVal.length();
//        if(cutOffPtD != -1) {
//          curMin = cutOffPtD;
//        }
//        if(cutOffPtC != -1) {
//          curMin = Math.min(curMin, cutOffPtC);
//        }
//        parentVal = parentVal.substring(0, curMin);
//        annotations.add("^" + parentVal);
//      }
      if (markColons && cat.equals("$.") && word != null && (word.equals(":") || word.equals(";"))) {
        annotations.add("-%colon");
      }

      if(leftPhrasal && leftPhrasal(t)) {
        annotations.add("%LP");
      }


    }
    // put on all the annotations
    StringBuilder catSB = new StringBuilder(cat);
    for (String annotation : annotations) {
      catSB.append(annotation);
    }

    t.setLabel(new CategoryWordTag(catSB.toString(), word, tag));
    return t;
  }

  private static boolean leftPhrasal(Tree t) {
    while (!t.isLeaf()) {
      t = t.lastChild();
      String str = t.label().value();
      if (str.startsWith("NP") || str.startsWith("PP") || str.startsWith("VP") || str.startsWith("S") || str.startsWith("Q") || str.startsWith("A")) {
        return true;
      }
    }
    return false;
  }

  private List childBasicCats(Tree t) {
    Tree[] kids = t.children();
    List l = new ArrayList<>();
    for (Tree kid : kids) {
      l.add(basicCat(kid.label().value()));
    }
    return l;
  }

  private String basicCat(String str) {
    return tlp.basicCategory(str);
  }

  private static boolean containsV(Tree t) {
    String cat = t.label().value();
    if (cat.startsWith("V")) {
      return true;
    } else {
      Tree[] kids = t.children();
      for (Tree kid : kids) {
        if (containsV(kid)) {
          return true;
        }
      }
      return false;
    }
  }


  private static final long serialVersionUID = 7303189408025355170L;

}