All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.trees.international.arabic.ArabicHeadFinder Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees.international.arabic; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.regex.Pattern;

import edu.stanford.nlp.trees.AbstractCollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;

/**
 * Find the head of an Arabic tree, using the usual kind of heuristic
 * head finding rules.
 * 

* Implementation notes. * TO DO: make sure that -PRD marked elements are always chosen as heads. * (Has this now been successfully done or not??) *

* Mona: I added the 8 new Nonterm for the merged DT with its following * category as a rule the DT nonterm is right headed, the 8 new nonterm DTs * are: DTCD, DTRB, DTRP, DTJJ, DTNN, DTNNS, DTNNP, DTNNPS. * This was added Dec 7th, 2004. * * @author Roger Levy * @author Mona Diab * @author Christopher Manning (added new stuff for ATBp3v3 */ public class ArabicHeadFinder extends AbstractCollinsHeadFinder { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ArabicHeadFinder.class); private static final long serialVersionUID = 6203368998430280740L; protected TagSet tagSet; /* A work in progress. There may well be a better way to parameterize the HeadFinders via tagset. */ public enum TagSet { BIES_COLLAPSED { @Override String prep() { return "IN"; } @Override String noun() { return "NN"; } // really there should be several here. @Override String det() { return "DT"; } @Override String adj() { return "JJ"; } @Override String detPlusNoun() { return "DTNN"; } // really there should be several here; major point is that the det part is ignored completely @Override TreebankLanguagePack langPack() { return new ArabicTreebankLanguagePack(); } }, ORIGINAL { @Override String prep() { return "PREP"; } @Override String noun() { return "NOUN"; } @Override String det() { return "DET"; } @Override String adj() { return "ADJ"; } @Override String detPlusNoun() { return "DET+NN"; } @Override TreebankLanguagePack langPack() { return new ArabicTreebankLanguagePack(); } }; abstract String prep(); abstract String noun(); abstract String adj(); abstract String det(); abstract String detPlusNoun(); abstract TreebankLanguagePack langPack(); static TagSet tagSet(String str) { switch (str) { case "BIES_COLLAPSED": return BIES_COLLAPSED; case "ORIGINAL": return ORIGINAL; default: throw new IllegalArgumentException("Don't know anything about tagset " + str); } } } public ArabicHeadFinder() { this(new ArabicTreebankLanguagePack()); } /** * Construct an ArabicHeadFinder with a String parameter corresponding to the tagset in use. * @param tagSet Either "ORIGINAL" or "BIES_COLLAPSED" */ public ArabicHeadFinder(String tagSet) { this(TagSet.tagSet(tagSet)); } public ArabicHeadFinder(TagSet tagSet) { this(tagSet.langPack(), tagSet); //this(new ArabicTreebankLanguagePack(), tagSet); } public ArabicHeadFinder(TreebankLanguagePack tlp) { this(tlp,TagSet.BIES_COLLAPSED); } protected ArabicHeadFinder(TreebankLanguagePack tlp, TagSet tagSet) { super(tlp); this.tagSet = tagSet; //log.info("##testing: noun tag is " + tagSet.noun()); nonTerminalInfo = Generics.newHashMap(); nonTerminalInfo.put("NX", new String[][]{{"left", "DT","DTNN","DTNNS","DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT", "MWNP"}}); nonTerminalInfo.put("ADJP", new String[][]{{"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR", "MWADJP"}, {"right", "ADJP", "VN", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}, {"right", "RB", "MWADVP", "CD","DTRB","DTCD"}, {"right", "DT"}}); // sometimes right, sometimes left headed?? nonTerminalInfo.put("MWADJP", new String[][]{{"rightdis", tagSet.adj(), "DTJJ", "ADJ_NUM", "DTADJ_NUM", "JJR", "DTJJR"}, {"right", tagSet.noun(), "MWNP", "NNP", "NNPS", "NNS", "DTNN", "DTNNS","DTNNP","DTNNPS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}, {"right", "RB", "MWADVP", "CD","DTRB","DTCD"}, {"right", "DT"}}); // sometimes right, sometimes left headed?? nonTerminalInfo.put("ADVP", new String[][]{{"left", "WRB", "RB", "MWADVP", "ADVP", "WHADVP","DTRB"}, {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...) nonTerminalInfo.put("MWADVP", new String[][]{{"left", "WRB", "RB", "ADVP", "WHADVP","DTRB"}, {"left", "CD", "RP", tagSet.noun(), "MWNP", "CC", "MWCONJP", tagSet.adj(), "MWADJP", "DTJJ", "ADJ_NUM", "DTADJ_NUM", "IN", "MWPP", "NP", "NNP", "NOFUNC","DTRP","DTNN","DTNNP","DTNNPS","DTNNS","DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); // NNP is a gerund that they called an unknown (=NNP, believe it or not...) nonTerminalInfo.put("CONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}}); nonTerminalInfo.put("MWCONJP", new String[][]{{"right", "IN", "RB", "MWADVP", tagSet.noun(), "MWNP", "NNS","NNP", "NNPS", "DTRB", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}}); nonTerminalInfo.put("FRAG", new String[][]{{"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "VBP"}}); nonTerminalInfo.put("MWFRAG", new String[][]{{"left", tagSet.noun(), "MWNP", "NNPS", "NNP","NNS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "VBP"}}); nonTerminalInfo.put("INTJ", new String[][]{{"left", "RP", "UH", "DTRP"}}); nonTerminalInfo.put("LST", new String[][]{{"left"}}); nonTerminalInfo.put("NAC", new String[][]{{"left", "NP", "SBAR", "PP", "MWP","ADJP", "S", "PRT", "UCP"}, {"left", "ADVP"}}); // note: maybe CC, RB should be the heads? nonTerminalInfo.put("NP", new String[][]{{"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "NP", "PRP", "WHNP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "NOFUNC", "NO_FUNC", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"}, {"right", "CD", "DTCD"}, {"left", "PRP$"}, {"right", "DT"}}); // should the JJ rule be left or right? nonTerminalInfo.put("MWNP", new String[][]{{"left", tagSet.noun(), "MWNP", tagSet.detPlusNoun(), "NNS", "NNP", "NNPS", "PRP", "QP", "WP", "DTNNS", "DTNNPS", "DTNNP", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", tagSet.adj(), "MWADJP", "DTJJ", "JJR", "DTJJR", "ADJ_NUM", "DTADJ_NUM"}, {"right", "CD", "DTCD"}, {"left", "PRP$"}, {"right", "DT"}}); // should the JJ rule be left or right? nonTerminalInfo.put("PP", new String[][]{{"left", tagSet.prep(), "MWPP", "PP", "MWP","PRT", "X"}, {"left", "NNP", "RP", tagSet.noun(), "MWNP"}, {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT nonTerminalInfo.put("MWPP", new String[][]{{"left", tagSet.prep(), "PP", "MWP","PRT", "X"}, {"left", "NNP", "RP", tagSet.noun(), "MWNP"}, {"left", "NP"}}); // NN is for a mistaken "fy", and many wsT nonTerminalInfo.put("PRN", new String[][]{{"left", "NP"}}); // don't get PUNC nonTerminalInfo.put("MWPRN", new String[][]{{"left", "IN"}}); // don't get PUNC nonTerminalInfo.put("PRT", new String[][]{{"left", "RP", "PRT", "IN", "DTRP"}}); nonTerminalInfo.put("QP", new String[][]{{"right", "CD", "DTCD", tagSet.noun(), "MWNP", tagSet.adj(), "MWADJP", "NNS", "NNP", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTJJ", "DTNOUN_QUANT", "NOUN_QUANT"}}); nonTerminalInfo.put("S", new String[][]{{"left", "VP", "MWVP", "S"}, {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here! nonTerminalInfo.put("MWS", new String[][]{{"left", "VP", "MWVP", "S"}, {"right", "PP", "MWP","ADVP", "SBAR", "UCP", "ADJP"}}); // really important to put in -PRD sensitivity here! nonTerminalInfo.put("SQ", new String[][]{{"left", "VP", "MWVP", "PP", "MWP"}}); // to be principled, we need -PRD sensitivity here too. nonTerminalInfo.put("SBAR", new String[][]{{"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}}); nonTerminalInfo.put("MWSBAR", new String[][]{{"left", "WHNP", "WHADVP", "WRB", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X", "DTRB", "DTRP"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS", "DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}}); nonTerminalInfo.put("SBARQ", new String[][]{{"left", "WHNP", "WHADVP", "RP", "IN", "SBAR", "CC", "MWCONJP", "WP", "WHPP", "ADVP", "PRT", "RB", "MWADVP", "X"}, {"left", tagSet.noun(), "MWNP", "NNP", "NNS", "NNPS","DTNN", "DTNNS", "DTNNP", "DTNNPS", "DTNOUN_QUANT", "NOUN_QUANT"}, {"left", "S"}}); // copied from SBAR rule -- look more closely when there's time nonTerminalInfo.put("UCP", new String[][]{{"left"}}); nonTerminalInfo.put("VP", new String[][]{{"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "RB", "MWADVP", "X","VB"}, {"left", "IN"}, {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information? nonTerminalInfo.put("MWVP", new String[][]{{"left", "VBD", "VBN", "VBP", "VBG", "DTVBG", "VN", "DTVN", "VP", "MWVP", "RB", "MWADVP", "X","VB"}, {"left", "IN"}, {"left", "NNP", tagSet.noun(), "MWNP", "DTNN", "DTNNP", "DTNNPS", "DTNNS", "DTNOUN_QUANT", "NOUN_QUANT"}}); // exclude RP because we don't want negation markers as heads -- no useful information? //also, RB is used as gerunds nonTerminalInfo.put("WHADVP", new String[][]{{"left", "WRB", "WP"}, {"right", "CC", "MWCONJP"}, {"left", "IN"}}); nonTerminalInfo.put("WHNP", new String[][]{{"right", "WP"}}); nonTerminalInfo.put("WHPP", new String[][]{{"left", "IN", "MWPP", "RB", "MWADVP"}}); nonTerminalInfo.put("X", new String[][]{{"left"}}); //Added by Mona 12/7/04 for the newly created DT nonterm cat nonTerminalInfo.put("DTNN", new String[][]{{"right"}}); nonTerminalInfo.put("DTNNS", new String[][]{{"right"}}); nonTerminalInfo.put("DTNNP", new String[][]{{"right"}}); nonTerminalInfo.put("DTNNPS", new String[][]{{"right"}}); nonTerminalInfo.put("DTJJ", new String[][]{{"right"}}); nonTerminalInfo.put("DTRP", new String[][]{{"right"}}); nonTerminalInfo.put("DTRB", new String[][]{{"right"}}); nonTerminalInfo.put("DTCD", new String[][]{{"right"}}); nonTerminalInfo.put("DTIN", new String[][]{{"right"}}); // stand-in dependency: nonTerminalInfo.put("EDITED", new String[][]{{"left"}}); nonTerminalInfo.put(tlp.startSymbol(), new String[][]{{"left"}}); // one stray SINV in the training set...garbage head rule here. nonTerminalInfo.put("SINV", new String[][]{{"left","ADJP","VP"}}); } private final Pattern predPattern = Pattern.compile(".*-PRD$"); /** * Predicatively marked elements in a sentence should be noted as heads */ @Override protected Tree findMarkedHead(Tree t) { String cat = t.value(); if (cat.equals("S")) { Tree[] kids = t.children(); for (Tree kid : kids) { if (predPattern.matcher(kid.value()).matches()) { return kid; } } } return null; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy