package edu.stanford.nlp.parser.lexparser;
import java.util.*;
import java.util.regex.*;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.international.morph.MorphoFeatures;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.arabic.*;
import edu.stanford.nlp.trees.tregex.*;
import java.util.function.Function;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
/**
* A {@link TreebankLangParserParams} implementing class for
* the Penn Arabic Treebank. The baseline feature set works with either
* UTF-8 or Buckwalter input, although the behavior of some unused features depends
* on the input encoding.
*
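* <p>A sketch of typical programmatic use, mirroring {@link #main(String[])}
* (the treebank path below is a placeholder):</p>
* <pre>{@code
* ArabicTreebankParserParams tlpp = new ArabicTreebankParserParams();
* tlpp.setOptionFlag(new String[] {"-arabicFactored"}, 0);
* DiskTreebank tb = tlpp.diskTreebank();
* tb.loadPath("/path/to/atb", "txt", false); // hypothetical path
* }</pre>
*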
* @author Roger Levy
* @author Christopher Manning
* @author Spence Green
*/
public class ArabicTreebankParserParams extends AbstractTreebankParserParams {
private static final long serialVersionUID = 8853426784197984653L;
private final StringBuilder optionsString;
private boolean retainNPTmp = false;
private boolean retainNPSbj = false;
private boolean retainPRD = false;
private boolean retainPPClr = false;
private boolean changeNoLabels = false;
private boolean collinizerRetainsPunctuation = false;
private boolean discardX = false;
private HeadFinder headFinder;
private final Map<String,Pair<TregexPattern,Function<TregexMatcher,String>>> annotationPatterns;
private final List<Pair<TregexPattern,Function<TregexMatcher,String>>> activeAnnotations;
private static final String[] EMPTY_STRING_ARRAY = new String[0];
private MorphoFeatureSpecification morphoSpec = null;
public ArabicTreebankParserParams() {
super(new ArabicTreebankLanguagePack());
optionsString = new StringBuilder();
optionsString.append("ArabicTreebankParserParams\n");
annotationPatterns = Generics.newHashMap();
activeAnnotations = new ArrayList<Pair<TregexPattern,Function<TregexMatcher,String>>>();
//Initialize the headFinder here
headFinder = headFinder();
initializeAnnotationPatterns();
}
/**
* Creates an {@link ArabicTreeReaderFactory} with parameters set
* via options passed in from the command line.
*
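* <p>A minimal usage sketch (the bracketed tree is illustrative, not drawn from the ATB):</p>
* <pre>{@code
* TreeReaderFactory trf = new ArabicTreebankParserParams().treeReaderFactory();
* TreeReader tr = trf.newTreeReader(new StringReader(
*     "( (S (NP (PRP هو)) (VP (VBD استنكر) (NP (DTNN الحكومة)))))"));
* Tree tree = tr.readTree(); // may throw IOException; null once the reader is exhausted
* }</pre>
*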
* @return An {@link ArabicTreeReaderFactory}
*/
public TreeReaderFactory treeReaderFactory() {
return new ArabicTreeReaderFactory(retainNPTmp, retainPRD,
changeNoLabels, discardX,
retainNPSbj, false, retainPPClr);
}
//NOTE (WSG): This method is called by main() to load the test treebank
@Override
public MemoryTreebank memoryTreebank() {
return new MemoryTreebank(treeReaderFactory(), inputEncoding);
}
//NOTE (WSG): This method is called to load the training treebank
@Override
public DiskTreebank diskTreebank() {
return new DiskTreebank(treeReaderFactory(), inputEncoding);
}
@Override
public HeadFinder headFinder() {
if(headFinder == null)
headFinder = new ArabicHeadFinder(treebankLanguagePack());
return headFinder;
}
@Override
public HeadFinder typedDependencyHeadFinder() {
return headFinder();
}
/**
* Returns a lexicon for Arabic: a {@link FactoredLexicon} when a morphological feature
* specification has been set (e.g., via the -factlex option), and a {@link BaseLexicon} otherwise.
*
* @param op Lexicon options
* @return A Lexicon
*/
@Override
public Lexicon lex(Options op, Index<String> wordIndex, Index<String> tagIndex) {
if(op.lexOptions.uwModelTrainer == null) {
op.lexOptions.uwModelTrainer = "edu.stanford.nlp.parser.lexparser.ArabicUnknownWordModelTrainer";
}
if(morphoSpec != null) {
return new FactoredLexicon(op, morphoSpec, wordIndex, tagIndex);
}
return new BaseLexicon(op, wordIndex, tagIndex);
}
/**
* Return a default sentence for the language (for testing).
* The example is in UTF-8.
*/
public List<? extends HasWord> defaultTestSentence() {
String[] sent = {"هو","استنكر","الحكومة","يوم","امس","."};
return Sentence.toWordList(sent);
}
protected class ArabicSubcategoryStripper implements TreeTransformer {
protected final TreeFactory tf = new LabeledScoredTreeFactory();
public Tree transformTree(Tree tree) {
Label lab = tree.label();
String s = lab.value();
if (tree.isLeaf()) {
Tree leaf = tf.newLeaf(lab);
leaf.setScore(tree.score());
return leaf;
} else if(tree.isPhrasal()) {
if(retainNPTmp && s.startsWith("NP-TMP")) {
s = "NP-TMP";
} else if(retainNPSbj && s.startsWith("NP-SBJ")) {
s = "NP-SBJ";
} else if(retainPRD && s.matches("VB[^P].*PRD.*")) {
s = tlp.basicCategory(s);
s += "-PRD";
} else {
s = tlp.basicCategory(s);
}
} else if(tree.isPreTerminal()) {
s = tlp.basicCategory(s);
} else {
System.err.printf("Encountered a non-leaf/phrasal/pre-terminal node %s\n",s);
//Normalize by default
s = tlp.basicCategory(s);
}
// Recursively process children depth-first
List<Tree> children = new ArrayList<Tree>(tree.numChildren());
for (Tree child : tree.getChildrenAsList()) {
Tree newChild = transformTree(child);
children.add(newChild);
}
// Make the new parent label
Tree node = tf.newTreeNode(lab, children);
node.setValue(s);
node.setScore(tree.score());
if(node.label() instanceof HasTag)
((HasTag) node.label()).setTag(s);
return node;
}
}
/**
* Returns a TreeTransformer that retains categories
* according to the following options supported by setOptionFlag:
* <ul>
* <li>-retainNPTmp: Retain temporal NP marking on NPs.</li>
* <li>-retainNPSbj: Retain NP subject function tags.</li>
* <li>-markPRDverbs: Retain PRD verbs.</li>
* </ul>
*
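* <p>For example (an illustrative sketch, not a treebank tree): with -retainNPSbj
* active, a node labeled {@code NP-SBJ-1} is rewritten as {@code NP-SBJ}; without
* it, the label is reduced to its basic category {@code NP}.</p>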
*/
//NOTE (WSG): This is applied to both the best parse by getBestParse()
//and to the gold eval tree by testOnTreebank()
@Override
public TreeTransformer subcategoryStripper() {
return new ArabicSubcategoryStripper();
}
/**
* The collinizer eliminates punctuation
*/
@Override
public TreeTransformer collinizer() {
return new TreeCollinizer(tlp, !collinizerRetainsPunctuation, false);
}
/**
* The collinizer used for EVALB-style scoring; identical to {@link #collinizer()}.
*/
@Override
public TreeTransformer collinizerEvalb() {
return collinizer();
}
@Override
public String[] sisterSplitters() {
return EMPTY_STRING_ARRAY;
}
// WSGDEBUG -- Annotate POS tags with nominal (grammatical) gender
private static final MorphoFeatureSpecification tagSpec = new ArabicMorphoFeatureSpecification();
static {
tagSpec.activate(MorphoFeatureType.NGEN);
}
@Override
public Tree transformTree(Tree t, Tree root) {
String baseCat = t.value();
StringBuilder newCategory = new StringBuilder();
//Add manual state splits
for (Pair<TregexPattern,Function<TregexMatcher,String>> e : activeAnnotations) {
TregexMatcher m = e.first().matcher(root);
if (m.matchesAt(t))
newCategory.append(e.second().apply(m));
}
// WSGDEBUG
//Add morphosyntactic features if this is a POS tag
if(t.isPreTerminal() && tagSpec != null) {
if( !(t.firstChild().label() instanceof CoreLabel) || ((CoreLabel) t.firstChild().label()).originalText() == null )
throw new RuntimeException(String.format("%s: Term lacks morpho analysis: %s",this.getClass().getName(),t.toString()));
String morphoStr = ((CoreLabel) t.firstChild().label()).originalText();
MorphoFeatures feats = tagSpec.strToFeatures(morphoStr);
baseCat = feats.getTag(baseCat);
}
//Update the label(s)
String newCat = baseCat + newCategory.toString();
t.setValue(newCat);
if (t.isPreTerminal() && t.label() instanceof HasTag)
((HasTag) t.label()).setTag(newCat);
return t;
}
/**
* These are the annotations included when the user selects the -arabicFactored option.
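* For example, {@code setOptionFlag(new String[] {"-arabicFactored"}, 0)} turns on every feature listed here.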
*/
private final List<String> baselineFeatures = new ArrayList<String>();
{
baselineFeatures.add("-markNounNPargTakers");
baselineFeatures.add("-genitiveMark");
baselineFeatures.add("-splitPUNC");
baselineFeatures.add("-markContainsVerb");
baselineFeatures.add("-markStrictBaseNP");
baselineFeatures.add("-markOneLevelIdafa");
baselineFeatures.add("-splitIN");
baselineFeatures.add("-markMasdarVP");
baselineFeatures.add("-containsSVO");
baselineFeatures.add("-splitCC");
baselineFeatures.add("-markFem");
// Added for MWE experiments
baselineFeatures.add("-mwe");
baselineFeatures.add("-mweContainsVerb");
}
private final List<String> additionalFeatures = new ArrayList<String>();
private void initializeAnnotationPatterns() {
//This doesn't/can't really pick out genitives, but just any NP following an NN head.
//wsg2011: In particular, it doesn't select NP complements of PPs, which are also genitive.
final String genitiveNodeTregexString = "@NP > @NP $- /^N/";
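//Illustrative sketch (not a treebank example): in a fragment like
//  (NP (NN كتاب) (NP (NN الطالب)))
//the inner NP matches: its parent is an NP and its immediate left sister is a
//node whose label starts with N.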
TregexPatternCompiler tregexPatternCompiler =
new TregexPatternCompiler(headFinder());
try {
// ******************
// Baseline features
// ******************
annotationPatterns.put("-genitiveMark",new Pair>(TregexPattern.compile(genitiveNodeTregexString),new SimpleStringFunction("-genitive")));
annotationPatterns.put("-markStrictBaseNP",new Pair>(tregexPatternCompiler.compile("@NP !< (__ < (__ < __))"),new SimpleStringFunction("-base"))); // NP with no phrasal node in it
annotationPatterns.put("-markOneLevelIdafa",new Pair>(tregexPatternCompiler.compile("@NP < (@NP < (__ < __)) !< (/^[^N]/ < (__ < __)) !< (__ < (__ < (__ < __)))"),new SimpleStringFunction("-idafa1")));
annotationPatterns.put("-markNounNPargTakers",new Pair>(tregexPatternCompiler.compile("@NN|NNS|NNP|NNPS|DTNN|DTNNS|DTNNP|DTNNPS ># (@NP < @NP)"),new SimpleStringFunction("-NounNParg")));
annotationPatterns.put("-markContainsVerb",new Pair>(tregexPatternCompiler.compile("__ << (/^[CIP]?V/ < (__ !< __))"),new SimpleStringFunction("-withV")));
annotationPatterns.put("-splitIN",new Pair>(tregexPatternCompiler.compile("@IN < __=word"), new AddRelativeNodeFunction("-","word", false)));
annotationPatterns.put("-splitPUNC",new Pair>(tregexPatternCompiler.compile("@PUNC < __=" + AnnotatePunctuationFunction2.key),new AnnotatePunctuationFunction2()));
annotationPatterns.put("-markMasdarVP", new Pair>(tregexPatternCompiler.compile("@VP|MWVP < /VBG|VN/"), new SimpleStringFunction("-masdar")));
annotationPatterns.put("-containsSVO", new Pair>(tregexPatternCompiler.compile("__ << (@S < (@NP . @VP|MWVP))"), new SimpleStringFunction("-hasSVO")));
annotationPatterns.put("-splitCC",new Pair>(tregexPatternCompiler.compile("@CC|CONJ . __=term , __"),new AddEquivalencedConjNode("-","term")));
annotationPatterns.put("-markFem", new Pair>(tregexPatternCompiler.compile("__ < /ة$/"), new SimpleStringFunction("-fem")));
// Added for MWE experiments
annotationPatterns.put("-mwe",new Pair>(tregexPatternCompiler.compile("__ > /MW/=tag"), new AddRelativeNodeFunction("-","tag", true)));
annotationPatterns.put("-mweContainsVerb",new Pair>(tregexPatternCompiler.compile("__ << @MWVP"),new SimpleStringFunction("-withV")));
//This version, which uses the PTB equivalence classing, results in slightly lower labeled F1
//than the splitPUNC feature above, which was included in the COLING2010 evaluation
annotationPatterns.put("-splitPUNC2",new Pair>(tregexPatternCompiler.compile("@PUNC < __=punc"),new AnnotatePunctuationFunction("-","punc")));
// Label each POS with its parent
annotationPatterns.put("-tagPAar", new Pair>(tregexPatternCompiler.compile("!@PUNC < (__ !< __) > __=parent"),new AddRelativeNodeFunction("-","parent", true)));
//Didn't work
annotationPatterns.put("-splitCC1",new Pair>(tregexPatternCompiler.compile("@CC|CONJ < __=term"),new AddRelativeNodeRegexFunction("-","term", "-*([^-].*)")));
annotationPatterns.put("-splitCC2",new Pair>(tregexPatternCompiler.compile("@CC . __=term , __"),new AddRelativeNodeFunction("-","term", true)));
annotationPatterns.put("-idafaJJ1", new Pair>(tregexPatternCompiler.compile("@NP <, (@NN $+ @NP) <+(@NP) @ADJP"), new SimpleStringFunction("-idafaJJ")));
annotationPatterns.put("-idafaJJ2", new Pair>(tregexPatternCompiler.compile("@NP <, (@NN $+ @NP) <+(@NP) @ADJP !<< @SBAR"), new SimpleStringFunction("-idafaJJ")));
annotationPatterns.put("-properBaseNP", new Pair>(tregexPatternCompiler.compile("@NP !<< @NP < /NNP/ !< @PUNC|CD"), new SimpleStringFunction("-prop")));
annotationPatterns.put("-interrog", new Pair>(tregexPatternCompiler.compile("__ << هل|ماذا|لماذا|اين|متى"), new SimpleStringFunction("-inter")));
annotationPatterns.put("-splitPseudo", new Pair>(tregexPatternCompiler.compile("@NN < مع|بعد|بين"), new SimpleStringFunction("-pseudo")));
annotationPatterns.put("-nPseudo", new Pair>(tregexPatternCompiler.compile("@NP < (@NN < مع|بعد|بين)"), new SimpleStringFunction("-npseudo")));
annotationPatterns.put("-pseudoArg", new Pair>(tregexPatternCompiler.compile("@NP < @NP $, (@NN < مع|بعد|بين)"), new SimpleStringFunction("-pseudoArg")));
annotationPatterns.put("-eqL1", new Pair>(tregexPatternCompiler.compile("__ < (@S !< @VP|S)"), new SimpleStringFunction("-haseq")));
annotationPatterns.put("-eqL1L2", new Pair>(tregexPatternCompiler.compile("__ < (__ < (@S !< @VP|S)) | < (@S !< @VP|S)"), new SimpleStringFunction("-haseq")));
annotationPatterns.put("-fullQuote", new Pair>(tregexPatternCompiler.compile("__ < ((@PUNC < \") $ (@PUNC < \"))"), new SimpleStringFunction("-fq")));
annotationPatterns.put("-brokeQuote", new Pair>(tregexPatternCompiler.compile("__ < ((@PUNC < \") !$ (@PUNC < \"))"), new SimpleStringFunction("-bq")));
annotationPatterns.put("-splitVP", new Pair>(tregexPatternCompiler.compile("@VP <# __=term1"), new AddRelativeNodeFunction("-","term1",true)));
annotationPatterns.put("-markFemP", new Pair>(tregexPatternCompiler.compile("@NP|ADJP < (__ < /ة$/)"), new SimpleStringFunction("-femP")));
annotationPatterns.put("-embedSBAR", new Pair>(tregexPatternCompiler.compile("@NP|PP <+(@NP|PP) @SBAR"), new SimpleStringFunction("-embedSBAR")));
annotationPatterns.put("-complexVP", new Pair>(tregexPatternCompiler.compile("__ << (@VP < (@NP $ @NP)) > __"), new SimpleStringFunction("-complexVP")));
annotationPatterns.put("-containsJJ", new Pair>(tregexPatternCompiler.compile("@NP <+(@NP) /JJ/"), new SimpleStringFunction("-hasJJ")));
annotationPatterns.put("-markMasdarVP2", new Pair>(tregexPatternCompiler.compile("__ << @VN|VBG"), new SimpleStringFunction("-masdar")));
annotationPatterns.put("-coordNP", new Pair>(tregexPatternCompiler.compile("@NP|ADJP <+(@NP|ADJP) (@CC|PUNC $- __ $+ __)"), new SimpleStringFunction("-coordNP")));
annotationPatterns.put("-coordWa", new Pair>(tregexPatternCompiler.compile("__ << (@CC , __ < و-)"), new SimpleStringFunction("-coordWA")));
annotationPatterns.put("-NPhasADJP", new Pair>(tregexPatternCompiler.compile("@NP <+(@NP) @ADJP"), new SimpleStringFunction("-NPhasADJP")));
annotationPatterns.put("-NPADJP", new Pair>(tregexPatternCompiler.compile("@NP < @ADJP"), new SimpleStringFunction("-npadj")));
annotationPatterns.put("-NPJJ", new Pair>(tregexPatternCompiler.compile("@NP < /JJ/"), new SimpleStringFunction("-npjj")));
annotationPatterns.put("-NPCC", new Pair>(tregexPatternCompiler.compile("@NP <+(@NP) @CC"), new SimpleStringFunction("-npcc")));
annotationPatterns.put("-NPCD", new Pair>(tregexPatternCompiler.compile("@NP < @CD"), new SimpleStringFunction("-npcd")));
annotationPatterns.put("-NPNNP", new Pair>(tregexPatternCompiler.compile("@NP < /NNP/"), new SimpleStringFunction("-npnnp")));
annotationPatterns.put("-SVO", new Pair>(tregexPatternCompiler.compile("@S < (@NP . @VP)"), new SimpleStringFunction("-svo")));
annotationPatterns.put("-containsSBAR", new Pair>(tregexPatternCompiler.compile("__ << @SBAR"), new SimpleStringFunction("-hasSBAR")));
//WSGDEBUG - Template
//annotationPatterns.put("", new Pair>(tregexPatternCompiler.compile(""), new SimpleStringFunction("")));
// ************
// Old and unused features (in various states of repair)
// *************
annotationPatterns.put("-markGappedVP",new Pair>(TregexPattern.compile("@VP > @VP $- __ $ /^(?:CC|CONJ)/ !< /^V/"),new SimpleStringFunction("-gappedVP")));
annotationPatterns.put("-markGappedVPConjoiners",new Pair>(TregexPattern.compile("/^(?:CC|CONJ)/ $ (@VP > @VP $- __ !< /^V/)"),new SimpleStringFunction("-gappedVP")));
annotationPatterns.put("-markGenitiveParent",new Pair>(TregexPattern.compile("@NP < (" + genitiveNodeTregexString + ')'),new SimpleStringFunction("-genitiveParent")));
// maSdr: this pattern is just a heuristic classification, which matches on
// various common maSdr pattterns, but probably also matches on a lot of other
// stuff. It marks NPs with possible maSdr.
// Roger's old pattern:
annotationPatterns.put("-maSdrMark",new Pair>(tregexPatternCompiler.compile("/^N/ <<# (/^[t\\u062a].+[y\\u064a].$/ > @NN|NOUN|DTNN)"),new SimpleStringFunction("-maSdr")));
// chris' attempt
annotationPatterns.put("-maSdrMark2",new Pair>(tregexPatternCompiler.compile("/^N/ <<# (/^(?:[t\\u062a].+[y\\u064a].|<.{3,}|A.{3,})$/ > @NN|NOUN|DTNN)"),new SimpleStringFunction("-maSdr")));
annotationPatterns.put("-maSdrMark3",new Pair>(tregexPatternCompiler.compile("/^N/ <<# (/^(?:[t\\u062a @NN|NOUN|DTNN)"),new SimpleStringFunction("-maSdr")));
annotationPatterns.put("-maSdrMark4",new Pair>(tregexPatternCompiler.compile("/^N/ <<# (/^(?:[t\\u062a (@NN|NOUN|DTNN > (@NP < @NP)))"),new SimpleStringFunction("-maSdr")));
annotationPatterns.put("-maSdrMark5",new Pair>(tregexPatternCompiler.compile("/^N/ <<# (__ > (@NN|NOUN|DTNN > (@NP < @NP)))"),new SimpleStringFunction("-maSdr")));
annotationPatterns.put("-mjjMark",new Pair>(tregexPatternCompiler.compile("@JJ|DTJJ < /^m/ $+ @PP ># @ADJP "),new SimpleStringFunction("-mjj")));
//annotationPatterns.put(markPRDverbString,new Pair>(TregexPattern.compile("/^V[^P]/ > VP $ /-PRD$/"),new SimpleStringFunction("-PRDverb"))); // don't need this pattern anymore, the functionality has been moved to ArabicTreeNormalizer
// PUNC is PUNC in either raw or Bies POS encoding
annotationPatterns.put("-markNPwithSdescendant",new Pair>(tregexPatternCompiler.compile("__ !< @S << @S [ >> @NP | == @NP ]"),new SimpleStringFunction("-inNPdominatesS")));
annotationPatterns.put("-markRightRecursiveNP",new Pair>(tregexPatternCompiler.compile("__ <<- @NP [>>- @NP | == @NP]"),new SimpleStringFunction("-rrNP")));
annotationPatterns.put("-markBaseNP",new Pair>(tregexPatternCompiler.compile("@NP !< @NP !< @VP !< @SBAR !< @ADJP !< @ADVP !< @S !< @QP !< @UCP !< @PP"),new SimpleStringFunction("-base")));
// allow only a single level of idafa as Base NP; this version works!
annotationPatterns.put("-markBaseNPplusIdafa",new Pair>(tregexPatternCompiler.compile("@NP !< (/^[^N]/ < (__ < __)) !< (__ < (__ < (__ < __)))"),new SimpleStringFunction("-base")));
annotationPatterns.put("-markTwoLevelIdafa",new Pair>(tregexPatternCompiler.compile("@NP < (@NP < (@NP < (__ < __)) !< (/^[^N]/ < (__ < __))) !< (/^[^N]/ < (__ < __)) !< (__ < (__ < (__ < (__ < __))))"),new SimpleStringFunction("-idafa2")));
annotationPatterns.put("-markDefiniteIdafa",new Pair>(tregexPatternCompiler.compile("@NP < (/^(?:NN|NOUN)/ !$,, /^[^AP]/) <+(/^NP/) (@NP < /^DT/)"), new SimpleStringFunction("-defIdafa")));
annotationPatterns.put("-markDefiniteIdafa1",new Pair>(tregexPatternCompiler.compile("@NP < (/^(?:NN|NOUN)/ !$,, /^[^AP]/) < (@NP < /^DT/) !< (/^[^N]/ < (__ < __)) !< (__ < (__ < (__ < __)))"), new SimpleStringFunction("-defIdafa1")));
annotationPatterns.put("-markContainsSBAR",new Pair>(tregexPatternCompiler.compile("__ << @SBAR"),new SimpleStringFunction("-withSBAR")));
annotationPatterns.put("-markPhrasalNodesDominatedBySBAR",new Pair>(tregexPatternCompiler.compile("__ < (__ < __) >> @SBAR"),new SimpleStringFunction("-domBySBAR")));
annotationPatterns.put("-markCoordinateNPs",new Pair>(tregexPatternCompiler.compile("@NP < @CC|CONJ"),new SimpleStringFunction("-coord")));
//annotationPatterns.put("-markCopularVerbTags",new Pair>(tregexPatternCompiler.compile("/^V/ < " + copularVerbForms),new SimpleStringFunction("-copular")));
//annotationPatterns.put("-markSBARVerbTags",new Pair>(tregexPatternCompiler.compile("/^V/ < " + sbarVerbForms),new SimpleStringFunction("-SBARverb")));
annotationPatterns.put("-markNounAdjVPheads",new Pair>(tregexPatternCompiler.compile("@NN|NNS|NNP|NNPS|JJ|DTJJ|DTNN|DTNNS|DTNNP|DTNNPS ># @VP"),new SimpleStringFunction("-VHead")));
// a better version of the below might only mark clitic pronouns, but
// since most pronouns are clitics, let's try this first....
annotationPatterns.put("-markPronominalNP",new Pair>(tregexPatternCompiler.compile("@NP < @PRP"),new SimpleStringFunction("-PRP")));
// try doing coordination parallelism -- there's a lot of that in Arabic (usually the same, sometimes different CC)
annotationPatterns.put("-markMultiCC", new Pair>(tregexPatternCompiler.compile("__ < (@CC $.. @CC)"), new SimpleStringFunction("-multiCC"))); // this unfortunately didn't seem helpful for capturing CC parallelism; should try again
annotationPatterns.put("-markHasCCdaughter", new Pair>(tregexPatternCompiler.compile("__ < @CC"), new SimpleStringFunction("-CCdtr")));
annotationPatterns.put("-markAcronymNP",new Pair>(tregexPatternCompiler.compile("@NP !< (__ < (__ < __)) < (/^NN/ < /^.$/ $ (/^NN/ < /^.$/)) !< (__ < /../)"), new SimpleStringFunction("-acro")));
annotationPatterns.put("-markAcronymNN",new Pair>(tregexPatternCompiler.compile("/^NN/ < /^.$/ $ (/^NN/ < /^.$/) > (@NP !< (__ < (__ < __)) !< (__ < /../))"), new SimpleStringFunction("-acro")));
//PP Specific patterns
annotationPatterns.put("-markPPwithPPdescendant",new Pair>(tregexPatternCompiler.compile("__ !< @PP << @PP [ >> @PP | == @PP ]"),new SimpleStringFunction("-inPPdominatesPP")));
annotationPatterns.put("-gpAnnotatePrepositions",new Pair>(TregexPattern.compile("/^(?:IN|PREP)$/ > (__ > __=gp)"),new AddRelativeNodeFunction("^^","gp", false)));
annotationPatterns.put("-gpEquivalencePrepositions",new Pair>(TregexPattern.compile("/^(?:IN|PREP)$/ > (@PP >+(/^PP/) __=gp)"),new AddEquivalencedNodeFunction("^^","gp")));
annotationPatterns.put("-gpEquivalencePrepositionsVar",new Pair>(TregexPattern.compile("/^(?:IN|PREP)$/ > (@PP >+(/^PP/) __=gp)"),new AddEquivalencedNodeFunctionVar("^^","gp")));
annotationPatterns.put("-markPPParent", new Pair>(tregexPatternCompiler.compile("@PP=max !< @PP"),new AddRelativeNodeRegexFunction("^^","max","^(\\w)")));
annotationPatterns.put("-whPP", new Pair>(tregexPatternCompiler.compile("@PP <- (@SBAR <, /^WH/)"),new SimpleStringFunction("-whPP")));
// annotationPatterns.put("-markTmpPP", new Pair>(tregexPatternCompiler.compile("@PP !<+(__) @PP"),new LexicalCategoryFunction("-TMP",temporalNouns)));
annotationPatterns.put("-deflateMin", new Pair>(tregexPatternCompiler.compile("__ < (__ < من)"),new SimpleStringFunction("-min")));
annotationPatterns.put("-v2MarkovIN", new Pair>(tregexPatternCompiler.compile("@IN > (@__=p1 > @__=p2)"),new AddRelativeNodeFunction("^","p1","p2", false)));
annotationPatterns.put("-pleonasticMin", new Pair>(tregexPatternCompiler.compile("@PP <, (IN < من) > @S"),new SimpleStringFunction("-pleo")));
annotationPatterns.put("-v2MarkovPP", new Pair>(tregexPatternCompiler.compile("@PP > (@__=p1 > @__=p2)"), new AddRelativeNodeFunction("^","p1","p2", false)));
} catch (TregexParseException e) {
int nth = annotationPatterns.size() + 1;
String nthStr = (nth == 1) ? "1st" : (nth == 2) ? "2nd" : (nth == 3) ? "3rd" : nth + "th";
System.err.println("Parse exception on " + nthStr + " annotation pattern initialization:" + e);
throw e;
}
}
private static class SimpleStringFunction implements SerializableFunction<TregexMatcher,String> {
public SimpleStringFunction(String result) {
this.result = result;
}
private String result;
public String apply(TregexMatcher tregexMatcher) {
return result;
}
@Override
public String toString() { return "SimpleStringFunction[" + result + ']'; }
private static final long serialVersionUID = 1L;
}
private static class AddRelativeNodeFunction implements SerializableFunction<TregexMatcher,String> {
private String annotationMark;
private String key;
private String key2;
private boolean doBasicCat = false;
private static final TreebankLanguagePack tlp = new ArabicTreebankLanguagePack();
public AddRelativeNodeFunction(String annotationMark, String key, boolean basicCategory) {
this.annotationMark = annotationMark;
this.key = key;
this.key2 = null;
doBasicCat = basicCategory;
}
public AddRelativeNodeFunction(String annotationMark, String key1, String key2, boolean basicCategory) {
this(annotationMark,key1,basicCategory);
this.key2 = key2;
}
public String apply(TregexMatcher m) {
if(key2 == null)
return annotationMark + ((doBasicCat) ? tlp.basicCategory(m.getNode(key).label().value()) : m.getNode(key).label().value());
else {
String annot1 = (doBasicCat) ? tlp.basicCategory(m.getNode(key).label().value()) : m.getNode(key).label().value();
String annot2 = (doBasicCat) ? tlp.basicCategory(m.getNode(key2).label().value()) : m.getNode(key2).label().value();
return annotationMark + annot1 + annotationMark + annot2;
}
}
@Override
public String toString() {
if(key2 == null)
return "AddRelativeNodeFunction[" + annotationMark + ',' + key + ']';
else
return "AddRelativeNodeFunction[" + annotationMark + ',' + key + ',' + key2 + ']';
}
private static final long serialVersionUID = 1L;
}
private static class AddRelativeNodeRegexFunction implements SerializableFunction<TregexMatcher,String> {
private String annotationMark;
private String key;
private Pattern pattern;
private String key2 = null;
private Pattern pattern2;
public AddRelativeNodeRegexFunction(String annotationMark, String key, String regex) {
this.annotationMark = annotationMark;
this.key = key;
try {
this.pattern = Pattern.compile(regex);
} catch (PatternSyntaxException pse) {
System.err.println("Bad pattern: " + regex);
pattern = null;
throw new IllegalArgumentException(pse);
}
}
public String apply(TregexMatcher m) {
String val = m.getNode(key).label().value();
if (pattern != null) {
Matcher mat = pattern.matcher(val);
if (mat.find()) {
val = mat.group(1);
}
}
if(key2 != null && pattern2 != null) {
String val2 = m.getNode(key2).label().value();
Matcher mat2 = pattern2.matcher(val2);
if(mat2.find()) {
val = val + annotationMark + mat2.group(1);
} else {
val = val + annotationMark + val2;
}
}
return annotationMark + val;
}
@Override
public String toString() { return "AddRelativeNodeRegexFunction[" + annotationMark + ',' + key + ',' + pattern + ']'; }
private static final long serialVersionUID = 1L;
}
/** This one only distinguishes VP, S and Other (mainly nominal) contexts.
* These seem the crucial distinctions for Arabic true prepositions,
* based on raw counts in data.
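* For example, with annotation mark "^^" (as used by -gpEquivalencePrepositions),
* a grandparent label starting with "S" yields "^^S", one starting with "V"
* yields "^^V", and anything else yields the empty string.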
*/
private static class AddEquivalencedNodeFunction implements SerializableFunction<TregexMatcher,String> {
private String annotationMark;
private String key;
public AddEquivalencedNodeFunction(String annotationMark, String key) {
this.annotationMark = annotationMark;
this.key = key;
}
public String apply(TregexMatcher m) {
String node = m.getNode(key).label().value();
if (node.startsWith("S")) {
return annotationMark + 'S';
} else if (node.startsWith("V")) {
return annotationMark + 'V';
} else {
return "";
}
}
@Override
public String toString() { return "AddEquivalencedNodeFunction[" + annotationMark + ',' + key + ']'; }
private static final long serialVersionUID = 1L;
}
/** This one only distinguishes VP, S*, A* versus other (mainly nominal) contexts. */
private static class AddEquivalencedNodeFunctionVar implements SerializableFunction<TregexMatcher,String> {
private String annotationMark;
private String key;
public AddEquivalencedNodeFunctionVar(String annotationMark, String key) {
this.annotationMark = annotationMark;
this.key = key;
}
public String apply(TregexMatcher m) {
String node = m.getNode(key).label().value();
// We also tried if (node.startsWith("V")) [var2] and if (node.startsWith("V") || node.startsWith("S")) [var3]. Both seemed markedly worse than the basic function or this var form (which seems a bit better than the basic equiv option).
if (node.startsWith("S") || node.startsWith("V") || node.startsWith("A")) {
return annotationMark + "VSA";
} else {
return "";
}
}
@Override
public String toString() { return "AddEquivalencedNodeFunctionVar[" + annotationMark + ',' + key + ']'; }
private static final long serialVersionUID = 1L;
}
private static class AnnotatePunctuationFunction2 implements SerializableFunction<TregexMatcher,String> {
static final String key = "term";
private static final Pattern quote = Pattern.compile("^\"$");
public String apply(TregexMatcher m) {
final String punc = m.getNode(key).value();
if (punc.equals("."))
return "-fs";
else if (punc.equals("?"))
return "-quest";
else if (punc.equals(","))
return "-comma";
else if (punc.equals(":") || punc.equals(";"))
return "-colon";
else if (punc.equals("-LRB-"))
return "-lrb";
else if (punc.equals("-RRB-"))
return "-rrb";
else if (punc.equals("-PLUS-"))
return "-plus";
else if (punc.equals("-"))
return "-dash";
else if (quote.matcher(punc).matches())
return "-quote";
// else if(punc.equals("/"))
// return "-slash";
// else if(punc.equals("%"))
// return "-perc";
// else if(punc.contains(".."))
// return "-ellipses";
return "";
}
@Override
public String toString() { return "AnnotatePunctuationFunction2"; }
private static final long serialVersionUID = 1L;
}
private static class AddEquivalencedConjNode implements SerializableFunction<TregexMatcher,String> {
private String annotationMark;
private String key;
private static final String nnTags = "DTNN DTNNP DTNNPS DTNNS NN NNP NNS NNPS";
private static final Set<String> nnTagClass = Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(nnTags.split("\\s+"))));
private static final String jjTags = "ADJ_NUM DTJJ DTJJR JJ JJR";
private static final Set<String> jjTagClass = Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(jjTags.split("\\s+"))));
private static final String vbTags = "VBD VBP";
private static final Set<String> vbTagClass = Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(vbTags.split("\\s+"))));
private static final TreebankLanguagePack tlp = new ArabicTreebankLanguagePack();
public AddEquivalencedConjNode(String annotationMark, String key) {
this.annotationMark = annotationMark;
this.key = key;
}
public String apply(TregexMatcher m) {
String node = m.getNode(key).value();
String eqClass = tlp.basicCategory(node);
if(nnTagClass.contains(eqClass))
eqClass = "noun";
else if(jjTagClass.contains(eqClass))
eqClass = "adj";
else if(vbTagClass.contains(eqClass))
eqClass = "vb";
return annotationMark + eqClass;
}
@Override
public String toString() { return "AddEquivalencedConjNode[" + annotationMark + ',' + key + ']'; }
private static final long serialVersionUID = 1L;
}
/**
* Reconfigures active features after a change in the default headfinder.
*
* @param hf The new head finder; the annotation patterns are recompiled against it
*/
private void setHeadFinder(HeadFinder hf) {
if(hf == null)
throw new IllegalArgumentException();
headFinder = hf;
// Need to re-initialize all patterns due to the new headFinder
initializeAnnotationPatterns();
activeAnnotations.clear();
for(String key : baselineFeatures) {
Pair<TregexPattern,Function<TregexMatcher,String>> p = annotationPatterns.get(key);
activeAnnotations.add(p);
}
for(String key : additionalFeatures) {
Pair<TregexPattern,Function<TregexMatcher,String>> p = annotationPatterns.get(key);
activeAnnotations.add(p);
}
}
/**
* Configures morpho-syntactic annotations for POS tags.
*
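* <p>For example (a sketch; values must name {@code MorphoFeatureType} constants):
* {@code setupMorphoFeatures("NGEN")} activates nominal gender annotation for the
* factored lexicon.</p>
*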
* @param activeFeats A comma-separated list of feature values with names according
* to MorphoFeatureType.
*
*/
private String setupMorphoFeatures(String activeFeats) {
String[] feats = activeFeats.split(",");
morphoSpec = tlp.morphFeatureSpec();
for(String feat : feats) {
MorphoFeatureType fType = MorphoFeatureType.valueOf(feat.trim());
morphoSpec.activate(fType);
}
return morphoSpec.toString();
}
private void removeBaselineFeature(String featName) {
if(baselineFeatures.contains(featName)) {
baselineFeatures.remove(featName);
Pair<TregexPattern,Function<TregexMatcher,String>> p = annotationPatterns.get(featName);
activeAnnotations.remove(p);
}
}
@Override
public void display() {
System.err.println(optionsString.toString());
}
/** Some options for setOptionFlag:
* <ul>
* <li>-retainNPTmp: Retain temporal NP marking on NPs.</li>
* <li>-retainNPSbj: Retain NP subject function tags.</li>
* <li>-markGappedVP: Mark gapped VPs.</li>
* <li>-collinizerRetainsPunctuation: Does what it says.</li>
* </ul>
*
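* <p>A usage sketch: a recognized flag is consumed and the index after it is returned:</p>
* <pre>{@code
* ArabicTreebankParserParams tlpp = new ArabicTreebankParserParams();
* int next = tlpp.setOptionFlag(new String[] {"-arabicFactored"}, 0); // returns 1
* }</pre>
*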
* @param args Flag arguments (usually from the command line)
* @param i index at which to begin argument processing
* @return Index in args array after the last processed index for option
*/
@Override
public int setOptionFlag(String[] args, int i) {
//System.err.println("Setting option flag: " + args[i]);
//lang. specific options
boolean didSomething = false;
if (annotationPatterns.keySet().contains(args[i])) {
if(!baselineFeatures.contains(args[i])) additionalFeatures.add(args[i]);
Pair<TregexPattern,Function<TregexMatcher,String>> p = annotationPatterns.get(args[i]);
activeAnnotations.add(p);
optionsString.append("Option " + args[i] + " added annotation pattern " + p.first() + " with annotation " + p.second() + '\n');
didSomething = true;
} else if (args[i].equals("-retainNPTmp")) {
optionsString.append("Retaining NP-TMP marking.\n");
retainNPTmp = true;
didSomething = true;
} else if (args[i].equals("-retainNPSbj")) {
optionsString.append("Retaining NP-SBJ dash tag.\n");
retainNPSbj = true;
didSomething = true;
} else if (args[i].equals("-retainPPClr")) {
optionsString.append("Retaining PP-CLR dash tag.\n");
retainPPClr = true;
didSomething = true;
} else if (args[i].equals("-discardX")) {
optionsString.append("Discarding X trees.\n");
discardX = true;
didSomething = true;
} else if (args[i].equals("-changeNoLabels")) {
optionsString.append("Change no labels.\n");
changeNoLabels = true;
didSomething = true;
} else if (args[i].equals("-markPRDverbs")) {
optionsString.append("Mark PRD.\n");
retainPRD = true;
didSomething = true;
} else if (args[i].equals("-collinizerRetainsPunctuation")) {
optionsString.append("Collinizer retains punctuation.\n");
collinizerRetainsPunctuation = true;
didSomething = true;
} else if (args[i].equals("-arabicFactored")) {
for(String annotation : baselineFeatures) {
String[] a = {annotation};
setOptionFlag(a,0);
}
didSomething = true;
} else if (args[i].equalsIgnoreCase("-headFinder") && (i + 1 < args.length)) {
try {
HeadFinder hf = (HeadFinder) Class.forName(args[i + 1]).newInstance();
setHeadFinder(hf);
optionsString.append("HeadFinder: " + args[i + 1] + "\n");
} catch (Exception e) {
System.err.println(e);
System.err.println(this.getClass().getName() +
": Could not load head finder " + args[i + 1]);
}
i++;
didSomething = true;
} else if(args[i].equals("-factlex") && (i + 1 < args.length)) {
String activeFeats = setupMorphoFeatures(args[++i]);
optionsString.append("Factored Lexicon: active features: ").append(activeFeats);
//
// removeBaselineFeature("-markFem");
// optionsString.append(" (removed -markFem)\n");
didSomething = true;
} else if(args[i].equals("-noFeatures")) {
activeAnnotations.clear();
optionsString.append("Removed all manual features.\n");
didSomething = true;
}
//wsg2010: The segmenter does not work, but keep this to remember how it was instantiated.
// else if (args[i].equals("-arabicTokenizerModel")) {
// String modelFile = args[i+1];
// try {
// WordSegmenter aSeg = (WordSegmenter) Class.forName("edu.stanford.nlp.wordseg.ArabicSegmenter").newInstance();
// aSeg.loadSegmenter(modelFile);
// System.out.println("aSeg=" + aSeg);
// TokenizerFactory aTF = WordSegmentingTokenizer.factory(aSeg);
// ((ArabicTreebankLanguagePack) treebankLanguagePack()).setTokenizerFactory(aTF);
// } catch (RuntimeIOException ex) {
// System.err.println("Couldn't load ArabicSegmenter " + modelFile);
// ex.printStackTrace();
// } catch (Exception e) {
// System.err.println("Couldn't instantiate segmenter: edu.stanford.nlp.wordseg.ArabicSegmenter");
// e.printStackTrace();
// }
// i++; // 2 args
// didSomething = true;
// }
if (didSomething) i++;
return i;
}
/**
* Annotates and prints the trees in a treebank using the -arabicFactored feature set.
*
* @param args A single argument: the path to a treebank directory or file (files with extension "txt")
*/
public static void main(String[] args) {
if(args.length != 1) {
System.err.printf("Usage: java %s treebank_path%n", ArabicTreebankParserParams.class.getName());
System.exit(-1);
}
ArabicTreebankParserParams tlpp = new ArabicTreebankParserParams();
String[] options = {"-arabicFactored"};
tlpp.setOptionFlag(options, 0);
DiskTreebank tb = tlpp.diskTreebank();
tb.loadPath(args[0], "txt", false);
for(Tree t : tb) {
for(Tree subtree : t) {
tlpp.transformTree(subtree, t);
}
System.out.println(t.toString());
}
}
}