edu.stanford.nlp.parser.lexparser.TueBaDZParserParams Maven / Gradle / Ivy
package edu.stanford.nlp.parser.lexparser;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZHeadFinder;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZLanguagePack;
import edu.stanford.nlp.trees.international.tuebadz.TueBaDZTreeReaderFactory;
import edu.stanford.nlp.util.Index;
/** TreebankLangParserParams for the German Tuebingen corpus.
*
* The TueBaDZTreeReaderFactory has been changed in order to use a
* TueBaDZPennTreeNormalizer.
*
* @author Roger Levy ([email protected])
* @author Wolfgang Maier ([email protected])
*/
public class TueBaDZParserParams extends AbstractTreebankParserParams {
private HeadFinder hf = new TueBaDZHeadFinder();
/** How to clean up node labels: 0 = do nothing, 1 = keep category and
* function, 2 = just category.
*/
private int nodeCleanup = 0;
private boolean markKonjParent = false;
private boolean markContainsV = true;
private boolean markZu = true;
private boolean markColons = false;
private boolean leftPhrasal = false;
private boolean markHDParent = false;
private boolean leaveGF = false;
public TueBaDZParserParams() {
super(new TueBaDZLanguagePack());
}
/** Returns the first sentence of TueBaDZ. */
@Override
public List extends HasWord> defaultTestSentence() {
return Sentence.toWordList("Veruntreute", "die", "AWO", "Spendengeld", "?");
}
@Override
public String[] sisterSplitters() {
return new String[0];
}
@Override
public TreeTransformer collinizer() {
return new TreeCollinizer(treebankLanguagePack());
}
@Override
public TreeTransformer collinizerEvalb() {
return new TreeCollinizer(treebankLanguagePack());
}
@Override
public MemoryTreebank memoryTreebank() {
return new MemoryTreebank(treeReaderFactory());
}
@Override
public DiskTreebank diskTreebank() {
return new DiskTreebank(treeReaderFactory());
}
@Override
public TreeReaderFactory treeReaderFactory() {
return new TueBaDZTreeReaderFactory(treebankLanguagePack(), nodeCleanup);
}
@Override
public Lexicon lex(Options op, Index wordIndex, Index tagIndex) {
if (op.lexOptions.uwModelTrainer == null) {
op.lexOptions.uwModelTrainer = "edu.stanford.nlp.parser.lexparser.GermanUnknownWordModelTrainer";
}
return new BaseLexicon(op, wordIndex, tagIndex);
}
/**
* Set language-specific options according to flags.
* This routine should process the option starting in args[i] (which
* might potentially be several arguments long if it takes arguments).
* It should return the index after the last index it consumed in
* processing. In particular, if it cannot process the current option,
* the return value should be i.
*
* In the TueBaDZ ParserParams, all flags take 1 argument (and so can all
* be turned on and off).
*/
@Override
public int setOptionFlag(String[] args, int i) {
// [CDM 2008: there are no generic options!] first, see if it's a generic option
// int j = super.setOptionFlag(args, i);
// if(i != j) return j;
//lang. specific options
if (args[i].equalsIgnoreCase("-nodeCleanup")) {
nodeCleanup = Integer.parseInt(args[i + 1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-markKonjParent")) {
markKonjParent = Boolean.parseBoolean(args[i+1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-markContainsV")) {
markContainsV = Boolean.parseBoolean(args[i+1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-markZu")) {
markZu = Boolean.parseBoolean(args[i+1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-markColons")) {
markColons = Boolean.parseBoolean(args[i+1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-leftPhrasal")) {
leftPhrasal = Boolean.parseBoolean(args[i+1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-markHDParent")) {
markHDParent = Boolean.parseBoolean(args[i+1]);
i += 2;
} else if (args[i].equalsIgnoreCase("-leaveGF")) {
leaveGF = Boolean.parseBoolean(args[i+1]);
((TueBaDZLanguagePack) treebankLanguagePack()).setLeaveGF(leaveGF);
i += 2;
} else if (args[i].equalsIgnoreCase("-evalGF")) {
this.setEvalGF(Boolean.parseBoolean(args[i + 1]));
i+=2;
} else if (args[i].equalsIgnoreCase("-limitedGF")) {
((TueBaDZLanguagePack) treebankLanguagePack()).setLimitedGF(Boolean.parseBoolean(args[i + 1]));
i+=2;
} else if (args[i].equalsIgnoreCase("-gfCharacter")) {
String gfChar = args[i + 1];
if (gfChar.length() > 1) {
System.out.println("Warning! gfCharacter argument ignored; must specify a character, not a String");
}
treebankLanguagePack().setGfCharacter(gfChar.charAt(0));
i+=2;
}
return i;
}
@Override
public void display() {
System.err.println("TueBaDZParserParams nodeCleanup=" + nodeCleanup +
" mKonjParent=" + markKonjParent + " mContainsV=" + markContainsV +
" mZu=" + markZu + " mColons=" + markColons);
}
/** returns a {@link TueBaDZHeadFinder}. */
@Override
public HeadFinder headFinder() {
return hf;
}
@Override
public HeadFinder typedDependencyHeadFinder() {
return headFinder();
}
/** Annotates a tree according to options. */
@Override
public Tree transformTree(Tree t, Tree root) {
if (t == null || t.isLeaf()) {
return t;
}
List annotations = new ArrayList<>();
Label lab = t.label();
String word = null;
if (lab instanceof HasWord) {
word = ((HasWord) lab).word();
}
String tag = null;
if (lab instanceof HasTag) {
tag = ((HasTag) lab).tag();
}
String cat = lab.value();
// Tree parent = t.parent(root);
if (t.isPhrasal()) {
List childBasicCats = childBasicCats(t);
// cdm 2008: have form for with and without functional tags since this is a hash
if (markZu && cat.startsWith("V") && (childBasicCats.contains("PTKZU") || childBasicCats.contains("PTKZU-HD") || childBasicCats.contains("VVIZU") || childBasicCats.contains("VVIZU-HD"))) {
annotations.add("%ZU");
}
if (markContainsV && containsV(t)) {
annotations.add("%vp");
}
if (markKonjParent) {
// this depends on functional tags being present
for (String cCat : childBasicCats) {
if (cCat.contains("-KONJ")) {
annotations.add("%konjp");
break;
}
}
}
if (markHDParent) {
// this depends on functional tags being present
for (String cCat : childBasicCats) {
if (cCat.contains("-HD")) {
annotations.add("%hdp");
break;
}
}
}
} else {
// t.isPreTerminal() case
// if (word.equals("%")) {
// annotations.add("-%");
// }
// if(parent != null) {
// String parentVal = parent.label().value();
// int cutOffPtD = parentVal.indexOf('-');
// int cutOffPtC = parentVal.indexOf('^');
// int curMin = parentVal.length();
// if(cutOffPtD != -1) {
// curMin = cutOffPtD;
// }
// if(cutOffPtC != -1) {
// curMin = Math.min(curMin, cutOffPtC);
// }
// parentVal = parentVal.substring(0, curMin);
// annotations.add("^" + parentVal);
// }
if (markColons && cat.equals("$.") && word != null && (word.equals(":") || word.equals(";"))) {
annotations.add("-%colon");
}
if(leftPhrasal && leftPhrasal(t)) {
annotations.add("%LP");
}
}
// put on all the annotations
StringBuilder catSB = new StringBuilder(cat);
for (String annotation : annotations) {
catSB.append(annotation);
}
t.setLabel(new CategoryWordTag(catSB.toString(), word, tag));
return t;
}
private static boolean leftPhrasal(Tree t) {
while (!t.isLeaf()) {
t = t.lastChild();
String str = t.label().value();
if (str.startsWith("NP") || str.startsWith("PP") || str.startsWith("VP") || str.startsWith("S") || str.startsWith("Q") || str.startsWith("A")) {
return true;
}
}
return false;
}
private List childBasicCats(Tree t) {
Tree[] kids = t.children();
List l = new ArrayList<>();
for (Tree kid : kids) {
l.add(basicCat(kid.label().value()));
}
return l;
}
private String basicCat(String str) {
return tlp.basicCategory(str);
}
private static boolean containsV(Tree t) {
String cat = t.label().value();
if (cat.startsWith("V")) {
return true;
} else {
Tree[] kids = t.children();
for (Tree kid : kids) {
if (containsV(kid)) {
return true;
}
}
return false;
}
}
private static final long serialVersionUID = 7303189408025355170L;
}