All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.parser.lexparser.TueBaDZParserParams Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!
package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZHeadFinder;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZLanguagePack;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZTreeReaderFactory;
import edu.stanford.nlp.util.Index;


/** TreebankLangParserParams for the German Tuebingen corpus.
 *
 *  The TueBaDZTreeReaderFactory has been changed in order to use a
 *  TueBaDZPennTreeNormalizer.
 *
 *  @author Roger Levy ([email protected])
 *  @author Wolfgang Maier ([email protected])
 */
public class TueBaDZParserParams extends AbstractTreebankParserParams  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(TueBaDZParserParams.class);

  private HeadFinder hf = new TueBaDZHeadFinder();

  /** How to clean up node labels: 0 = do nothing, 1 = keep category and
   *  function, 2 = just category.
   */
  private int nodeCleanup = 0;
  private boolean markKonjParent = false;
  private boolean markContainsV = true;
  private boolean markZu = true;
  private boolean markColons = false;
  private boolean leftPhrasal = false;
  private boolean markHDParent = false;
  private boolean leaveGF = false;


  public TueBaDZParserParams() {
    super(new TueBaDZLanguagePack());
  }

  /** Returns the first sentence of TueBaDZ. */
  @Override
  public List defaultTestSentence() {
    return SentenceUtils.toWordList("Veruntreute", "die", "AWO", "Spendengeld", "?");
  }

  @Override
  public String[] sisterSplitters() {
    return new String[0];
  }

  @Override
  public TreeTransformer collinizer() {
    return new TreeCollinizer(treebankLanguagePack());
  }

  @Override
  public TreeTransformer collinizerEvalb() {
    return new TreeCollinizer(treebankLanguagePack());
  }

  @Override
  public MemoryTreebank memoryTreebank() {
    return new MemoryTreebank(treeReaderFactory());
  }

  @Override
  public DiskTreebank diskTreebank() {
    return new DiskTreebank(treeReaderFactory());
  }

  @Override
  public TreeReaderFactory treeReaderFactory() {
    return new TueBaDZTreeReaderFactory(treebankLanguagePack(), nodeCleanup);
  }

  @Override
  public Lexicon lex(Options op, Index wordIndex, Index tagIndex) {
    if (op.lexOptions.uwModelTrainer == null) {
      op.lexOptions.uwModelTrainer = "edu.stanford.nlp.parser.lexparser.GermanUnknownWordModelTrainer";
    }
    return new BaseLexicon(op, wordIndex, tagIndex);
  }

  /**
   * Set language-specific options according to flags.
   * This routine should process the option starting in args[i] (which
   * might potentially be several arguments long if it takes arguments).
   * It should return the index after the last index it consumed in
   * processing.  In particular, if it cannot process the current option,
   * the return value should be i.
   * 

* In the TueBaDZ ParserParams, all flags take 1 argument (and so can all * be turned on and off). */ @Override public int setOptionFlag(String[] args, int i) { // [CDM 2008: there are no generic options!] first, see if it's a generic option // int j = super.setOptionFlag(args, i); // if(i != j) return j; //lang. specific options if (args[i].equalsIgnoreCase("-nodeCleanup")) { nodeCleanup = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-markKonjParent")) { markKonjParent = Boolean.parseBoolean(args[i+1]); i += 2; } else if (args[i].equalsIgnoreCase("-markContainsV")) { markContainsV = Boolean.parseBoolean(args[i+1]); i += 2; } else if (args[i].equalsIgnoreCase("-markZu")) { markZu = Boolean.parseBoolean(args[i+1]); i += 2; } else if (args[i].equalsIgnoreCase("-markColons")) { markColons = Boolean.parseBoolean(args[i+1]); i += 2; } else if (args[i].equalsIgnoreCase("-leftPhrasal")) { leftPhrasal = Boolean.parseBoolean(args[i+1]); i += 2; } else if (args[i].equalsIgnoreCase("-markHDParent")) { markHDParent = Boolean.parseBoolean(args[i+1]); i += 2; } else if (args[i].equalsIgnoreCase("-leaveGF")) { leaveGF = Boolean.parseBoolean(args[i+1]); ((TueBaDZLanguagePack) treebankLanguagePack()).setLeaveGF(leaveGF); i += 2; } else if (args[i].equalsIgnoreCase("-evalGF")) { this.setEvalGF(Boolean.parseBoolean(args[i + 1])); i+=2; } else if (args[i].equalsIgnoreCase("-limitedGF")) { ((TueBaDZLanguagePack) treebankLanguagePack()).setLimitedGF(Boolean.parseBoolean(args[i + 1])); i+=2; } else if (args[i].equalsIgnoreCase("-gfCharacter")) { String gfChar = args[i + 1]; if (gfChar.length() > 1) { System.out.println("Warning! gfCharacter argument ignored; must specify a character, not a String"); } treebankLanguagePack().setGfCharacter(gfChar.charAt(0)); i+=2; } return i; } @Override public void display() { log.info("TueBaDZParserParams nodeCleanup=" + nodeCleanup + " mKonjParent=" + markKonjParent + " mContainsV=" + markContainsV + " mZu=" + markZu + " mColons=" + markColons); } /** returns a {@link TueBaDZHeadFinder}. */ @Override public HeadFinder headFinder() { return hf; } @Override public HeadFinder typedDependencyHeadFinder() { return headFinder(); } /** Annotates a tree according to options. */ @Override public Tree transformTree(Tree t, Tree root) { if (t == null || t.isLeaf()) { return t; } List annotations = new ArrayList<>(); Label lab = t.label(); String word = null; if (lab instanceof HasWord) { word = ((HasWord) lab).word(); } String tag = null; if (lab instanceof HasTag) { tag = ((HasTag) lab).tag(); } String cat = lab.value(); // Tree parent = t.parent(root); if (t.isPhrasal()) { List childBasicCats = childBasicCats(t); // cdm 2008: have form for with and without functional tags since this is a hash if (markZu && cat.startsWith("V") && (childBasicCats.contains("PTKZU") || childBasicCats.contains("PTKZU-HD") || childBasicCats.contains("VVIZU") || childBasicCats.contains("VVIZU-HD"))) { annotations.add("%ZU"); } if (markContainsV && containsV(t)) { annotations.add("%vp"); } if (markKonjParent) { // this depends on functional tags being present for (String cCat : childBasicCats) { if (cCat.contains("-KONJ")) { annotations.add("%konjp"); break; } } } if (markHDParent) { // this depends on functional tags being present for (String cCat : childBasicCats) { if (cCat.contains("-HD")) { annotations.add("%hdp"); break; } } } } else { // t.isPreTerminal() case // if (word.equals("%")) { // annotations.add("-%"); // } // if(parent != null) { // String parentVal = parent.label().value(); // int cutOffPtD = parentVal.indexOf('-'); // int cutOffPtC = parentVal.indexOf('^'); // int curMin = parentVal.length(); // if(cutOffPtD != -1) { // curMin = cutOffPtD; // } // if(cutOffPtC != -1) { // curMin = Math.min(curMin, cutOffPtC); // } // parentVal = parentVal.substring(0, curMin); // annotations.add("^" + parentVal); // } if (markColons && cat.equals("$.") && word != null && (word.equals(":") || word.equals(";"))) { annotations.add("-%colon"); } if(leftPhrasal && leftPhrasal(t)) { annotations.add("%LP"); } } // put on all the annotations StringBuilder catSB = new StringBuilder(cat); for (String annotation : annotations) { catSB.append(annotation); } t.setLabel(new CategoryWordTag(catSB.toString(), word, tag)); return t; } private static boolean leftPhrasal(Tree t) { while (!t.isLeaf()) { t = t.lastChild(); String str = t.label().value(); if (str.startsWith("NP") || str.startsWith("PP") || str.startsWith("VP") || str.startsWith("S") || str.startsWith("Q") || str.startsWith("A")) { return true; } } return false; } private List childBasicCats(Tree t) { Tree[] kids = t.children(); List l = new ArrayList<>(); for (Tree kid : kids) { l.add(basicCat(kid.label().value())); } return l; } private String basicCat(String str) { return tlp.basicCategory(str); } private static boolean containsV(Tree t) { String cat = t.label().value(); if (cat.startsWith("V")) { return true; } else { Tree[] kids = t.children(); for (Tree kid : kids) { if (containsV(kid)) { return true; } } return false; } } private static final long serialVersionUID = 7303189408025355170L; }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy