edu.stanford.nlp.trees.EnglishGrammaticalStructure Maven / Gradle / Ivy

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

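A minimal usage sketch (illustrative, not part of this page): the toy tree literal, the class name DependenciesSketch, and the artifact coordinates in the comment are assumptions. It shows how a constituency parse tree produced by the Stanford parser can be converted into Stanford Dependencies by the class whose source follows.

import java.util.Collection;

import edu.stanford.nlp.trees.EnglishGrammaticalStructure;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TypedDependency;

// Assumed artifact coordinates: edu.stanford.nlp:stanford-parser (any recent version).
public class DependenciesSketch {
  public static void main(String[] args) {
    // A toy constituency parse; in practice the tree comes from the Stanford parser,
    // ideally run with -retainNPTmpSubcategories (see the class Javadoc below).
    Tree tree = Tree.valueOf(
        "(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sits) (PP (IN in) (NP (DT the) (NN hat)))) (. .)))");

    EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(tree);

    // Basic, tree-shaped dependencies.
    Collection<TypedDependency> basic = gs.typedDependencies();

    // Collapsed dependencies with conjunct propagation, e.g. prep(sits, in) and
    // pobj(in, hat) collapsed into prep_in(sits, hat).
    Collection<TypedDependency> collapsed = gs.typedDependenciesCCprocessed();

    System.out.println(basic);
    System.out.println(collapsed);
  }
}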
package edu.stanford.nlp.trees;

import java.io.*;
import java.util.*;
import java.util.function.Predicate;

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.util.*;
import static edu.stanford.nlp.trees.EnglishGrammaticalRelations.*;
import static edu.stanford.nlp.trees.GrammaticalRelation.*;

/**
 * A GrammaticalStructure for English. This is the class that produces Stanford Dependencies.
 *
* For feeding Stanford parser trees into this class, the Stanford parser should be run with the * "-retainNPTmpSubcategories" option for best results! * * @author Bill MacCartney * @author Marie-Catherine de Marneffe * @author Christopher Manning * @author Daniel Cer (CoNLLX format and alternative user selected dependency * printer/reader interface) * @author John Bauer */ public class EnglishGrammaticalStructure extends GrammaticalStructure { private static final long serialVersionUID = -1866362375001969402L; private static final boolean DEBUG = System.getProperty("EnglishGrammaticalStructure", null) != null; /** * Construct a new {@code EnglishGrammaticalStructure} from an existing parse * tree. The new {@code GrammaticalStructure} has the same tree structure * and label values as the given tree (but no shared storage). As part of * construction, the parse tree is analyzed using definitions from * {@link GrammaticalRelation GrammaticalRelation} to populate * the new GrammaticalStructure with as many labeled grammatical * relations as it can. * * @param t Parse tree to make grammatical structure from */ public EnglishGrammaticalStructure(Tree t) { this(t, new PennTreebankLanguagePack().punctuationWordRejectFilter()); } /** * This gets used by GrammaticalStructureFactory (by reflection). DON'T DELETE. * * @param t Parse tree to make grammatical structure from * @param puncFilter Filter to remove punctuation dependencies */ public EnglishGrammaticalStructure(Tree t, Predicate puncFilter) { this(t, puncFilter, new SemanticHeadFinder(true), true); } /** * This gets used by GrammaticalStructureFactory (by reflection). DON'T DELETE. * * @param t Parse tree to make grammatical structure from * @param puncFilter Filter to remove punctuation dependencies * @param hf HeadFinder to use when building it */ public EnglishGrammaticalStructure(Tree t, Predicate puncFilter, HeadFinder hf) { this(t, puncFilter, hf, true); } /** * Construct a new GrammaticalStructure from an existing parse * tree. The new GrammaticalStructure has the same tree structure * and label values as the given tree (but no shared storage). As part of * construction, the parse tree is analyzed using definitions from * {@link GrammaticalRelation GrammaticalRelation} to populate * the new GrammaticalStructure with as many labeled grammatical * relations as it can. * * @param t Parse tree to make grammatical structure from * @param puncFilter Filter for punctuation words * @param hf HeadFinder to use when building it * @param threadSafe Whether or not to support simultaneous instances among multiple * threads */ public EnglishGrammaticalStructure(Tree t, Predicate puncFilter, HeadFinder hf, boolean threadSafe) { // the tree is normalized (for index and functional tag stripping) inside CoordinationTransformer super(t, EnglishGrammaticalRelations.values(threadSafe), threadSafe ? EnglishGrammaticalRelations.valuesLock() : null, new CoordinationTransformer(hf), hf, puncFilter, Filters.acceptFilter()); } /** Used for postprocessing CoNLL X dependencies */ public EnglishGrammaticalStructure(List projectiveDependencies, TreeGraphNode root) { super(projectiveDependencies, root); } /** * Returns a Filter which checks dependencies for usefulness as * extra tree-based dependencies. By default, everything is * accepted. One example of how this can be useful is in the * English dependencies, where the REL dependency is used as an * intermediate and we do not want this to be added when we make a * second pass over the trees for missing dependencies. 
*/ @Override protected Predicate extraTreeDepFilter() { return extraTreeDepFilter; } private static class ExtraTreeDepFilter implements Predicate, Serializable { @Override public boolean test(TypedDependency d) { return d != null && d.reln() != RELATIVE; } private static final long serialVersionUID = 1L; } private static final Predicate extraTreeDepFilter = new ExtraTreeDepFilter(); @Override protected void correctDependencies(List list) { if (DEBUG) { printListSorted("At correctDependencies:", list); } correctSubjPass(list); if (DEBUG) { printListSorted("After correctSubjPass:", list); } removeExactDuplicates(list); if (DEBUG) { printListSorted("After removeExactDuplicates:", list); } } private static void printListSorted(String title, Collection list) { List lis = new ArrayList(list); Collections.sort(lis); if (title != null) { System.err.println(title); } System.err.println(lis); } @Override protected void postProcessDependencies(List list) { if (DEBUG) { printListSorted("At postProcessDependencies:", list); } SemanticGraph sg = new SemanticGraph(list); correctWHAttachment(sg); list.clear(); list.addAll(sg.typedDependencies()); if (DEBUG) { printListSorted("After correcting WH movement", list); } convertRel(list); if (DEBUG) { printListSorted("After converting rel:", list); } } @Override protected void getExtras(List list) { addRef(list); if (DEBUG) { printListSorted("After adding ref:", list); } addExtraNSubj(list); if (DEBUG) { printListSorted("After adding extra nsubj:", list); } addStrandedPobj(list); if (DEBUG) { printListSorted("After adding stranded pobj:", list); } } // Using this makes addStrandedPobj a lot cleaner looking, but it // makes the converter roughly 2% slower. Might not be worth it. // Similar changes could be made to many of the other complicated // collapsing methods. // static final SemgrexPattern strandedPobjSemgrex = SemgrexPattern.compile("{}=head >rcmod ({} [ == {}=prepgov | >xcomp {}=prepgov | >conj {}=prepgov ]) : {}=prepgov >prep ({}=prepdep !>pcomp {} !> pobj {})"); // // Deal with preposition stranding in relative clauses. // // For example, "the only thing I'm rooting for" // // This method will add pobj(for, thing) by connecting using the rcmod and prep // private static void addStrandedPobj(List list) { // SemanticGraph graph = new SemanticGraph(list); // SemgrexMatcher matcher = strandedPobjSemgrex.matcher(graph); // while (matcher.find()) { // IndexedWord gov = matcher.getNode("prepdep"); // IndexedWord dep = matcher.getNode("head"); // TypedDependency newDep = new TypedDependency(PREPOSITIONAL_OBJECT, gov, dep); // newDep.setExtra(); // list.add(newDep); // } // } // Deal with preposition stranding in relative clauses. 
// For example, "the only thing I'm rooting for" // This method will add pobj(for, thing) by connecting using the rcmod and prep private static void addStrandedPobj(List list) { List depNodes = null; List newDeps = null; for (TypedDependency rcmod : list) { if (rcmod.reln() != RELATIVE_CLAUSE_MODIFIER) { continue; } IndexedWord head = rcmod.gov(); if (depNodes == null) { depNodes = Generics.newArrayList(); } else { depNodes.clear(); } depNodes.add(rcmod.dep()); for (TypedDependency connected : list) { if (connected.gov().equals(rcmod.dep()) && (connected.reln() == XCLAUSAL_COMPLEMENT || connected.reln() == CONJUNCT)) { depNodes.add(connected.dep()); } } for (IndexedWord dep : depNodes) { for (TypedDependency prep : list) { if (!prep.gov().equals(dep) || prep.reln() != PREPOSITIONAL_MODIFIER) { continue; } boolean found = false; for (TypedDependency other : list) { if (other.gov().equals(prep.dep()) && (other.reln() == PREPOSITIONAL_COMPLEMENT || other.reln() == PREPOSITIONAL_OBJECT)) { found = true; break; } } if (!found) { if (newDeps == null) { newDeps = Generics.newArrayList(); } TypedDependency newDep = new TypedDependency(PREPOSITIONAL_OBJECT, prep.dep(), head); newDeps.add(newDep); } } } } if (newDeps != null) { list.addAll(newDeps); } } /* Used by correctWHAttachment */ private static SemgrexPattern XCOMP_PATTERN = SemgrexPattern.compile("{}=root >xcomp {}=embedded >/^(dep|dobj)$/ {}=wh ?>/([di]obj)/ {}=obj"); private static Morphology morphology = new Morphology(); /** * Tries to correct complicated cases of WH-movement in * sentences such as "What does Mary seem to have?" in * which "What" should attach to "have" instead of the * control verb. * * @param sg The Semantic graph to operate on. */ private static void correctWHAttachment(SemanticGraph sg) { /* Semgrexes require a graph with a root. */ if (sg.getRoots().isEmpty()) return; SemanticGraph sgCopy = sg.makeSoftCopy(); SemgrexMatcher matcher = XCOMP_PATTERN.matcher(sgCopy); while (matcher.findNextMatchingNode()) { IndexedWord root = matcher.getNode("root"); IndexedWord embeddedVerb = matcher.getNode("embedded"); IndexedWord wh = matcher.getNode("wh"); IndexedWord dobj = matcher.getNode("obj"); /* Check if the object is a WH-word. */ if (wh.tag().startsWith("W")) { boolean reattach = false; /* If the control verb already has an object, then we have to reattach th WH-word to the verb in the embedded clause. */ if (dobj != null) { reattach = true; } else { /* If the control verb can't have an object, we also have to reattach. */ String lemma = morphology.lemma(root.value(), root.tag()); if (lemma.matches(EnglishPatterns.NP_V_S_INF_VERBS_REGEX)) { reattach = true; } } if (reattach) { SemanticGraphEdge edge = sg.getEdge(root, wh); if (edge != null) { sg.removeEdge(edge); sg.addEdge(embeddedVerb, wh, DIRECT_OBJECT, Double.NEGATIVE_INFINITY, false); } } } } } /** * What we do in this method is look for temporary dependencies of * the type "rel". These occur in sentences such as "I saw the man * who you love". In that case, we should produce dobj(love, who). * On the other hand, in the sentence "... which Mr. Bush was * fighting for", we should have pobj(for, which). 
*/ private static void convertRel(List list) { List newDeps = new ArrayList(); for (TypedDependency rel : list) { if (rel.reln() != RELATIVE) { continue; } boolean foundPrep = false; for (TypedDependency prep : list) { // todo: It would also be good to add a rule here to prefer ccomp nsubj over dobj if there is a ccomp with no subj // then we could get right: Which eco-friendly options do you think there will be on the new Lexus? if (prep.reln() != PREPOSITIONAL_MODIFIER) { continue; } if (!prep.gov().equals(rel.gov())) { continue; } // at this point, we have two dependencies as in the Mr. Bush // example. it should be rel(fighting, which) and // prep(fighting, for). We now look to see if there is a // corresponding pobj associated with the dependent of the // prep relation. If not, we will connect the dep of the prep // relation and the head of the rel relation. Otherwise, the // original rel relation will become a dobj. boolean foundPobj = false; for (TypedDependency pobj : list) { if (pobj.reln() != PREPOSITIONAL_OBJECT && pobj.reln() != PREPOSITIONAL_COMPLEMENT) { continue; } if (!pobj.gov().equals(prep.dep())) { continue; } // we did find a pobj/pcomp, so it is not necessary to // change this rel. foundPobj = true; break; } if (!foundPobj) { foundPrep = true; TypedDependency newDep = new TypedDependency(PREPOSITIONAL_OBJECT, prep.dep(), rel.dep()); newDeps.add(newDep); rel.setReln(KILL); // break; // only put it in one place (or do we want to allow across-the-board effects? } } if (!foundPrep) { rel.setReln(DIRECT_OBJECT); } } filterKill(list); for (TypedDependency dep : newDeps) { if (!list.contains(dep)) { list.add(dep); } } } /** * Alters a list in place by removing all the KILL relations */ private static void filterKill(Collection deps) { List filtered = Generics.newArrayList(); for (TypedDependency dep : deps) { if (dep.reln() != KILL) { filtered.add(dep); } } deps.clear(); deps.addAll(filtered); } /** * Destructively modifies this {@code Collection} * by collapsing several types of transitive pairs of dependencies. * If called with a tree of dependencies and both CCprocess and * includeExtras set to false, then the tree structure is preserved. *

* <dl>
* <dt>prepositional object dependencies: pobj</dt>
* <dd>prep(cat, in) and pobj(in, hat) are collapsed to prep_in(cat, hat)</dd>
* <dt>prepositional complement dependencies: pcomp</dt>
* <dd>prep(heard, of) and pcomp(of, attacking) are collapsed to prepc_of(heard, attacking)</dd>
* <dt>conjunct dependencies</dt>
* <dd>cc(investors, and) and conj(investors, regulators) are collapsed to conj_and(investors, regulators)</dd>
* <dt>possessive dependencies: possessive</dt>
* <dd>possessive(Montezuma, 's) will be erased. This is like a collapsing, but due to the flatness of NPs, two dependencies are not actually composed.</dd>
* <dt>For relative clauses, it will collapse referent</dt>
* <dd>ref(man, that) and dobj(love, that) are collapsed to dobj(love, man)</dd>
* </dl>
*
*/ @Override protected void collapseDependencies(List list, boolean CCprocess, Extras includeExtras) { if (DEBUG) { printListSorted("collapseDependencies: CCproc: " + CCprocess + " includeExtras: " + includeExtras, list); } correctDependencies(list); if (DEBUG) { printListSorted("After correctDependencies:", list); } eraseMultiConj(list); if (DEBUG) { printListSorted("After collapse multi conj:", list); } collapse2WP(list); if (DEBUG) { printListSorted("After collapse2WP:", list); } collapseFlatMWP(list); if (DEBUG) { printListSorted("After collapseFlatMWP:", list); } collapse2WPbis(list); if (DEBUG) { printListSorted("After collapse2WPbis:", list); } collapse3WP(list); if (DEBUG) { printListSorted("After collapse3WP:", list); } collapsePrepAndPoss(list); if (DEBUG) { printListSorted("After PrepAndPoss:", list); } collapseConj(list); if (DEBUG) { printListSorted("After conj:", list); } if (includeExtras.doRef) { addRef(list); if (DEBUG) { printListSorted("After adding ref:", list); } if (includeExtras.collapseRef) { collapseReferent(list); if (DEBUG) { printListSorted("After collapse referent:", list); } } } if (CCprocess) { treatCC(list); if (DEBUG) { printListSorted("After treatCC:", list); } } if (includeExtras.doSubj) { addExtraNSubj(list); if (DEBUG) { printListSorted("After adding extra nsubj:", list); } correctSubjPass(list); if (DEBUG) { printListSorted("After correctSubjPass:", list); } } removeDep(list); if (DEBUG) { printListSorted("After remove dep:", list); } Collections.sort(list); if (DEBUG) { printListSorted("After all collapse:", list); } } @Override protected void collapseDependenciesTree(List list) { collapseDependencies(list, false, Extras.NONE); } /** * Does some hard coding to deal with relation in CONJP. For now we deal with: * but not, if not, instead of, rather than, but rather GO TO negcc
* as well as, not to mention, but also, & GO TO and. * * @param conj The head dependency of the conjunction marker * @return A GrammaticalRelation made from a normalized form of that * conjunction. */ private static GrammaticalRelation conjValue(String conj) { String newConj = conj.toLowerCase(); if (newConj.equals("not") || newConj.equals("instead") || newConj.equals("rather")) { newConj = "negcc"; } else if (newConj.equals("mention") || newConj.equals("to") || newConj.equals("also") || newConj.contains("well") || newConj.equals("&")) { newConj = "and"; } return EnglishGrammaticalRelations.getConj(newConj); } private static void treatCC(Collection list) { // Construct a map from tree nodes to the set of typed // dependencies in which the node appears as dependent. Map> map = Generics.newHashMap(); // Construct a map of tree nodes being governor of a subject grammatical // relation to that relation Map subjectMap = Generics.newHashMap(); // Construct a set of TreeGraphNodes with a passive auxiliary on them Set withPassiveAuxiliary = Generics.newHashSet(); // Construct a map of tree nodes being governor of an object grammatical // relation to that relation // Map objectMap = new // HashMap(); List rcmodHeads = Generics.newArrayList(); List prepcDep = Generics.newArrayList(); for (TypedDependency typedDep : list) { if (!map.containsKey(typedDep.dep())) { // NB: Here and in other places below, we use a TreeSet (which extends // SortedSet) to guarantee that results are deterministic) map.put(typedDep.dep(), new TreeSet()); } map.get(typedDep.dep()).add(typedDep); if (typedDep.reln().equals(AUX_PASSIVE_MODIFIER)) { withPassiveAuxiliary.add(typedDep.gov()); } // look for subjects if (typedDep.reln().getParent() == NOMINAL_SUBJECT || typedDep.reln().getParent() == SUBJECT || typedDep.reln().getParent() == CLAUSAL_SUBJECT) { if (!subjectMap.containsKey(typedDep.gov())) { subjectMap.put(typedDep.gov(), typedDep); } } // look for objects // this map was only required by the code commented out below, so comment // it out too // if (typedDep.reln() == DIRECT_OBJECT) { // if (!objectMap.containsKey(typedDep.gov())) { // objectMap.put(typedDep.gov(), typedDep); // } // } // look for rcmod relations if (typedDep.reln() == RELATIVE_CLAUSE_MODIFIER) { rcmodHeads.add(typedDep.gov()); } // look for prepc relations: put the dependent of such a relation in the // list // to avoid wrong propagation of dobj if (typedDep.reln().toString().startsWith("prepc")) { prepcDep.add(typedDep.dep()); } } // System.err.println(map); // if (DEBUG) System.err.println("Subject map: " + subjectMap); // if (DEBUG) System.err.println("Object map: " + objectMap); // System.err.println(rcmodHeads); // create a new list of typed dependencies Collection newTypedDeps = new ArrayList(list); // find typed deps of form conj(gov,dep) for (TypedDependency td : list) { if (EnglishGrammaticalRelations.getConjs().contains(td.reln())) { IndexedWord gov = td.gov(); IndexedWord dep = td.dep(); // look at the dep in the conjunct Set gov_relations = map.get(gov); // System.err.println("gov " + gov); if (gov_relations != null) { for (TypedDependency td1 : gov_relations) { // System.err.println("gov rel " + td1); IndexedWord newGov = td1.gov(); // in the case of errors in the basic dependencies, it // is possible to have overlapping newGov & dep if (newGov.equals(dep)) { continue; } GrammaticalRelation newRel = td1.reln(); if (newRel != ROOT) { if (rcmodHeads.contains(gov) && rcmodHeads.contains(dep)) { // to prevent wrong propagation in the case of 
long dependencies in relative clauses if (newRel != DIRECT_OBJECT && newRel != NOMINAL_SUBJECT) { if (DEBUG) { System.err.println("Adding new " + newRel + " dependency from " + newGov + " to " + dep + " (subj/obj case)"); } newTypedDeps.add(new TypedDependency(newRel, newGov, dep)); } } else { if (DEBUG) { System.err.println("Adding new " + newRel + " dependency from " + newGov + " to " + dep); } newTypedDeps.add(new TypedDependency(newRel, newGov, dep)); } } } } // propagate subjects // look at the gov in the conjunct: if it is has a subject relation, // the dep is a verb and the dep doesn't have a subject relation // then we want to add a subject relation for the dep. // (By testing for the dep to be a verb, we are going to miss subject of // copula verbs! but // is it safe to relax this assumption?? i.e., just test for the subject // part) // CDM 2008: I also added in JJ, since participial verbs are often // tagged JJ String tag = dep.tag(); if (subjectMap.containsKey(gov) && (tag.startsWith("VB") || tag.startsWith("JJ")) && ! subjectMap.containsKey(dep)) { TypedDependency tdsubj = subjectMap.get(gov); // check for wrong nsubjpass: if the new verb is VB or VBZ or VBP or JJ, then // add nsubj (if it is tagged correctly, should do this for VBD too, but we don't) GrammaticalRelation relation = tdsubj.reln(); if (relation == NOMINAL_PASSIVE_SUBJECT) { if (isDefinitelyActive(tag)) { relation = NOMINAL_SUBJECT; } } else if (relation == CLAUSAL_PASSIVE_SUBJECT) { if (isDefinitelyActive(tag)) { relation = CLAUSAL_SUBJECT; } } else if (relation == NOMINAL_SUBJECT) { if (withPassiveAuxiliary.contains(dep)) { relation = NOMINAL_PASSIVE_SUBJECT; } } else if (relation == CLAUSAL_SUBJECT) { if (withPassiveAuxiliary.contains(dep)) { relation = CLAUSAL_PASSIVE_SUBJECT; } } if (DEBUG) { System.err.println("Adding new " + relation + " dependency from " + dep + " to " + tdsubj.dep() + " (subj propagation case)"); } newTypedDeps.add(new TypedDependency(relation, dep, tdsubj.dep())); } // propagate objects // cdm july 2010: This bit of code would copy a dobj from the first // clause to a later conjoined clause if it didn't // contain its own dobj or prepc. But this is too aggressive and wrong // if the later clause is intransitive // (including passivized cases) and so I think we have to not have this // done always, and see no good "sometimes" heuristic. // IF WE WERE TO REINSTATE, SHOULD ALSO NOT ADD OBJ IF THERE IS A ccomp // (SBAR). // if (objectMap.containsKey(gov) && // dep.tag().startsWith("VB") && ! objectMap.containsKey(dep) // && ! prepcDep.contains(gov)) { // TypedDependency tdobj = objectMap.get(gov); // if (DEBUG) { // System.err.println("Adding new " + tdobj.reln() + " dependency from " // + dep + " to " + tdobj.dep() + " (obj propagation case)"); // } // newTypedDeps.add(new TypedDependency(tdobj.reln(), dep, // tdobj.dep())); // } } } list.clear(); list.addAll(newTypedDeps); } private static boolean isDefinitelyActive(String tag) { // we should include VBD, but don't as it is often a tagging mistake. return tag.equals("VB") || tag.equals("VBZ") || tag.equals("VBP") || tag.startsWith("JJ"); } /** * This rewrites the "conj" relation to "conj_word" and deletes cases of the * "cc" relation providing this rewrite has occurred (but not if there is only * something like a clause-initial and). For instance, cc(elected-5, and-9) * conj(elected-5, re-elected-11) becomes conj_and(elected-5, re-elected-11) * * @param list List of dependencies. 
*/ private static void collapseConj(Collection list) { List govs = Generics.newArrayList(); // find typed deps of form cc(gov, dep) for (TypedDependency td : list) { if (td.reln() == COORDINATION) { // i.e. "cc" IndexedWord gov = td.gov(); GrammaticalRelation conj = conjValue(td.dep().value()); if (DEBUG) { System.err.println("Set conj to " + conj + " based on " + td); } // find other deps of that gov having reln "conj" boolean foundOne = false; for (TypedDependency td1 : list) { if (td1.gov().equals(gov)) { if (td1.reln() == CONJUNCT) { // i.e., "conj" // change "conj" to the actual (lexical) conjunction if (DEBUG) { System.err.println("Changing " + td1 + " to have relation " + conj); } td1.setReln(conj); foundOne = true; } else if (td1.reln() == COORDINATION) { conj = conjValue(td1.dep().value()); if (DEBUG) { System.err.println("Set conj to " + conj + " based on " + td1); } } } } // register to remove cc from this governor if (foundOne) { govs.add(gov); } } } // now remove typed dependencies with reln "cc" if we have successfully // collapsed for (Iterator iter = list.iterator(); iter.hasNext();) { TypedDependency td2 = iter.next(); if (td2.reln() == COORDINATION && govs.contains(td2.gov())) { iter.remove(); } } } /** * This method will collapse a referent relation such as follows. e.g.: * "The man that I love … " ref(man, that) dobj(love, that) -> * dobj(love, man) */ private static void collapseReferent(Collection list) { // find typed deps of form ref(gov, dep) // put them in a List for processing; remove them from the set of deps List refs = new ArrayList(); for (Iterator iter = list.iterator(); iter.hasNext();) { TypedDependency td = iter.next(); if (td.reln() == REFERENT) { refs.add(td); iter.remove(); } } // now substitute target of referent where possible for (TypedDependency ref : refs) { IndexedWord dep = ref.dep();// take the relative word IndexedWord ant = ref.gov();// take the antecedent for (TypedDependency td : list) { // the last condition below maybe shouldn't be necessary, but it has // helped stop things going haywire a couple of times (it stops the // creation of a unit cycle that probably leaves something else // disconnected) [cdm Jan 2010] if (td.dep().equals(dep) && td.reln() != REFERENT && !td.gov().equals(ant)) { if (DEBUG) { System.err.print("referent: changing " + td); } td.setDep(ant); td.setExtra(); if (DEBUG) { System.err.println(" to " + td); } } } } } /** * Look for ref rules for a given word. We look through the * children and grandchildren of the rcmod dependency, and if any * children or grandchildren depend on a that/what/which/etc word, * we take the leftmost that/what/which/etc word as the dependent * for the ref TypedDependency. 
*/ private static void addRef(Collection list) { List newDeps = new ArrayList(); for (TypedDependency rcmod : list) { if (rcmod.reln() != RELATIVE_CLAUSE_MODIFIER) { // we only add ref dependencies across relative clauses continue; } IndexedWord head = rcmod.gov(); IndexedWord modifier = rcmod.dep(); TypedDependency leftChild = null; for (TypedDependency child : list) { if (child.gov().equals(modifier) && EnglishPatterns.RELATIVIZING_WORD_PATTERN.matcher(child.dep().value()).matches() && (leftChild == null || child.dep().index() < leftChild.dep().index())) { leftChild = child; } } // TODO: could be made more efficient TypedDependency leftGrandchild = null; for (TypedDependency child : list) { if (!child.gov().equals(modifier)) { continue; } for (TypedDependency grandchild : list) { if (grandchild.gov().equals(child.dep()) && EnglishPatterns.RELATIVIZING_WORD_PATTERN.matcher(grandchild.dep().value()).matches() && (leftGrandchild == null || grandchild.dep().index() < leftGrandchild.dep().index())) { leftGrandchild = grandchild; } } } TypedDependency newDep = null; if (leftGrandchild != null && (leftChild == null || leftGrandchild.dep().index() < leftChild.dep().index())) { newDep = new TypedDependency(REFERENT, head, leftGrandchild.dep()); } else if (leftChild != null) { newDep = new TypedDependency(REFERENT, head, leftChild.dep()); } if (newDep != null) { newDeps.add(newDep); } } for (TypedDependency newDep : newDeps) { if (!list.contains(newDep)) { newDep.setExtra(); list.add(newDep); } } } /** * Add extra nsubj dependencies when collapsing basic dependencies. *
* In the general case, we look for an aux modifier under an xcomp * modifier, and assuming there aren't already associated nsubj * dependencies as daughters of the original xcomp dependency, we * add nsubj dependencies for each nsubj daughter of the aux. *
* There is also a special case for "to" words, in which case we add * a dependency if and only if there is no nsubj associated with the * xcomp and there is no other aux dependency. This accounts for * sentences such as "he decided not to" with no following verb. */ private static void addExtraNSubj(Collection list) { List newDeps = new ArrayList(); for (TypedDependency xcomp : list) { if (xcomp.reln() != XCLAUSAL_COMPLEMENT) { // we only add extra nsubj dependencies to some xcomp dependencies continue; } IndexedWord modifier = xcomp.dep(); IndexedWord head = xcomp.gov(); boolean hasSubjectDaughter = false; boolean hasAux = false; List subjects = Generics.newArrayList(); List objects = Generics.newArrayList(); for (TypedDependency dep : list) { // already have a subject dependency if ((dep.reln() == NOMINAL_SUBJECT || dep.reln() == NOMINAL_PASSIVE_SUBJECT) && dep.gov().equals(modifier)) { hasSubjectDaughter = true; break; } if (dep.reln() == AUX_MODIFIER && dep.gov().equals(modifier)) { hasAux = true; } if ((dep.reln() == NOMINAL_SUBJECT || dep.reln() == NOMINAL_PASSIVE_SUBJECT) && dep.gov().equals(head)) { subjects.add(dep.dep()); } if (dep.reln() == DIRECT_OBJECT && dep.gov().equals(head)) { objects.add(dep.dep()); } } // if we already have an nsubj dependency, no need to add an extra nsubj if (hasSubjectDaughter) { continue; } if ((modifier.value().equalsIgnoreCase("to") && hasAux) || (!modifier.value().equalsIgnoreCase("to") && !hasAux)) { continue; } // In general, we find that the objects of the verb are better // for extra nsubj than the original nsubj of the verb. For example, // "Many investors wrote asking the SEC to require ..." // There is no nsubj of asking, but the dobj, SEC, is the extra nsubj of require. // Similarly, "The law tells them when to do so" // Instead of nsubj(do, law) we want nsubj(do, them) if (objects.size() > 0) { for (IndexedWord object : objects) { TypedDependency newDep = new TypedDependency(NOMINAL_SUBJECT, modifier, object); newDeps.add(newDep); } } else { for (IndexedWord subject : subjects) { TypedDependency newDep = new TypedDependency(NOMINAL_SUBJECT, modifier, subject); newDeps.add(newDep); } } } for (TypedDependency newDep : newDeps) { if (!list.contains(newDep)) { newDep.setExtra(); list.add(newDep); } } } /** * This method corrects subjects of verbs for which we identified an auxpass, * but didn't identify the subject as passive. * * @param list List of typedDependencies to work on */ private static void correctSubjPass(Collection list) { // put in a list verbs having an auxpass List list_auxpass = new ArrayList(); for (TypedDependency td : list) { if (td.reln() == AUX_PASSIVE_MODIFIER) { list_auxpass.add(td.gov()); } } for (TypedDependency td : list) { // correct nsubj if (td.reln() == NOMINAL_SUBJECT && list_auxpass.contains(td.gov())) { // System.err.println("%%% Changing subj to passive: " + td); td.setReln(NOMINAL_PASSIVE_SUBJECT); } if (td.reln() == CLAUSAL_SUBJECT && list_auxpass.contains(td.gov())) { // System.err.println("%%% Changing subj to passive: " + td); td.setReln(CLAUSAL_PASSIVE_SUBJECT); } // correct unretrieved poss: dep relation in which the dependent is a // PRP$ or WP$ // cdm: Now done in basic rules. The only cases that this still matches // are (1) tagging mistakes where PRP in dobj position is mistagged PRP$ // or a couple of parsing errors where the dependency is wrong anyway, so // it's probably okay to keep it a dep. So I'm disabling this. 
// String tag = td.dep().tag(); // if (td.reln() == DEPENDENT && (tag.equals("PRP$") || tag.equals("WP$"))) { // System.err.println("%%% Unrecognized basic possessive pronoun: " + td); // td.setReln(POSSESSION_MODIFIER); // } } } private static boolean inConjDeps(TypedDependency td, List> conjs) { for (Triple trip : conjs) { if (td.equals(trip.first())) { return true; } } return false; } private static void collapsePrepAndPoss(Collection list) { // Man oh man, how gnarly is the logic of this method.... Collection newTypedDeps = new ArrayList(); // Construct a map from tree nodes to the set of typed // dependencies in which the node appears as governor. // cdm: could use CollectionValuedMap here! Map> map = Generics.newHashMap(); List vmod = Generics.newArrayList(); for (TypedDependency typedDep : list) { if (!map.containsKey(typedDep.gov())) { map.put(typedDep.gov(), new TreeSet()); } map.get(typedDep.gov()).add(typedDep); if (typedDep.reln() == VERBAL_MODIFIER) { // look for aux deps which indicate this was a to-be verb boolean foundAux = false; for (TypedDependency auxDep : list) { if (auxDep.reln() != AUX_MODIFIER) { continue; } if (!auxDep.gov().equals(typedDep.dep()) || !auxDep.dep().value().equalsIgnoreCase("to")) { continue; } foundAux = true; break; } if (!foundAux) { vmod.add(typedDep.dep()); } } } // System.err.println("here's the vmod list: " + vmod); // Do preposition conjunction interaction for // governor p NP and p NP case ... a lot of special code cdm jan 2006 for (TypedDependency td1 : list) { if (td1.reln() != PREPOSITIONAL_MODIFIER) { continue; } IndexedWord td1Dep = td1.dep(); SortedSet possibles = map.get(td1Dep); if (possibles == null) { continue; } // look for the "second half" // unique: the head prep and whether it should be pobj Pair prepDep = null; TypedDependency ccDep = null; // treat as unique // list of dep and prepOtherDep and pobj (or pcomp) List> conjs = new ArrayList>(); Set otherDtrs = new TreeSet(); // first look for a conj(prep, prep) (there might be several conj relations!!!) boolean samePrepositionInEachConjunct = true; int conjIndex = -1; for (TypedDependency td2 : possibles) { if (td2.reln() == CONJUNCT) { IndexedWord td2Dep = td2.dep(); String td2DepPOS = td2Dep.tag(); if (td2DepPOS.equals("IN") || td2DepPOS.equals("TO")) { samePrepositionInEachConjunct = samePrepositionInEachConjunct && td2Dep.value().equals(td1Dep.value()); Set possibles2 = map.get(td2Dep); boolean pobj = true;// default of collapsing preposition is prep_ TypedDependency prepOtherDep = null; if (possibles2 != null) { for (TypedDependency td3 : possibles2) { IndexedWord td3Dep = td3.dep(); String td3DepPOS = td3Dep.tag(); // CDM Mar 2006: I put in disjunction here when I added in // PREPOSITIONAL_OBJECT. 
If it catches all cases, we should // be able to delete the DEPENDENT disjunct // maybe better to delete the DEPENDENT disjunct - it creates // problem with multiple prep (mcdm) if ((td3.reln() == PREPOSITIONAL_OBJECT || td3.reln() == PREPOSITIONAL_COMPLEMENT) && (!(td3DepPOS.equals("IN") || td3DepPOS.equals("TO"))) && prepOtherDep == null) { prepOtherDep = td3; if (td3.reln() == PREPOSITIONAL_COMPLEMENT) { pobj = false; } } else { otherDtrs.add(td3); } } } if (conjIndex < td2Dep.index()) { conjIndex = td2Dep.index(); } conjs.add(new Triple(td2, prepOtherDep, pobj)); } } } // end td2:possibles if (conjs.isEmpty()) { continue; } // if we have a conj under a preposition dependency, we look for the other // parts String td1DepPOS = td1Dep.tag(); for (TypedDependency td2 : possibles) { // we look for the cc linked to this conjDep // the cc dep must have an index smaller than the dep of conjDep if (td2.reln() == COORDINATION && td2.dep().index() < conjIndex) { ccDep = td2; } else { IndexedWord td2Dep = td2.dep(); String td2DepPOS = td2Dep.tag(); // System.err.println("prepDep find: td1.reln: " + td1.reln() + // "; td2.reln: " + td2.reln() + "; td1DepPos: " + td1DepPOS + // "; td2DepPos: " + td2DepPOS + "; index " + index + // "; td2.dep().index(): " + td2.dep().index()); if ((td2.reln() == DEPENDENT || td2.reln() == PREPOSITIONAL_OBJECT || td2.reln() == PREPOSITIONAL_COMPLEMENT) && (td1DepPOS.equals("IN") || td1DepPOS.equals("TO") || td1DepPOS.equals("VBG")) && prepDep == null && (!(td2DepPOS.equals("RB") || td2DepPOS.equals("IN") || td2DepPOS.equals("TO")))) { // same index trick, in case we have multiple deps // I deleted this to see if it helped [cdm Jan 2010] && // td2.dep().index() < index) prepDep = new Pair(td2, td2.reln() != PREPOSITIONAL_COMPLEMENT); } else if (!inConjDeps(td2, conjs)) {// don't want to add the conjDep // again! otherDtrs.add(td2); } } } if (prepDep == null || ccDep == null) { continue; // we can't deal with it in the hairy prep/conj interaction case! } if (DEBUG) { // ccDep must be non-null given test above System.err.println("!! Conj and prep case:"); System.err.println(" td1 (prep): " + td1); System.err.println(" Kids of td1 are: " + possibles); System.err.println(" prepDep: " + prepDep); System.err.println(" ccDep: " + ccDep); System.err.println(" conjs: " + conjs); System.err.println(" samePrepositionInEachConjunct: " + samePrepositionInEachConjunct); System.err.println(" otherDtrs: " + otherDtrs); } // check if we have the same prepositions in the conjunction if (samePrepositionInEachConjunct) { // conjDep != null && prepOtherDep != // null && // OK, we have a conjunction over parallel PPs: Fred flew to Greece and // to Serbia. GrammaticalRelation reln = determinePrepRelation(map, vmod, td1, td1, prepDep.second()); TypedDependency tdNew = new TypedDependency(reln, td1.gov(), prepDep.first().dep()); newTypedDeps.add(tdNew); if (DEBUG) { System.err.println("PrepPoss Conj branch (two parallel PPs) adding: " + tdNew); System.err.println(" removing: " + td1 + " " + prepDep + " " + ccDep); } td1.setReln(KILL);// remember these are "used up" prepDep.first().setReln(KILL); ccDep.setReln(KILL); for (Triple trip : conjs) { TypedDependency conjDep = trip.first(); TypedDependency prepOtherDep = trip.second(); if (prepOtherDep == null) { // CDM July 2010: I think this should only ever happen if there is a // misparse, but it has happened in such circumstances. You have // something like (PP in or in (NP Serbia)), with the two // prepositions the same. 
We just clean up the mess. if (DEBUG) { System.err.println(" apparent misparse: same P twice with only one NP object (prepOtherDep is null)"); System.err.println(" removing: " + conjDep); } ccDep.setReln(KILL); } else { TypedDependency tdNew2 = new TypedDependency(conjValue(ccDep.dep().value()), prepDep.first().dep(), prepOtherDep.dep()); newTypedDeps.add(tdNew2); if (DEBUG) { System.err.println(" adding: " + tdNew2); System.err.println(" removing: " + conjDep + " " + prepOtherDep); } prepOtherDep.setReln(KILL); } conjDep.setReln(KILL); } // promote dtrs that would be orphaned for (TypedDependency otd : otherDtrs) { if (DEBUG) { System.err.print("Changed " + otd); } otd.setGov(td1.gov()); if (DEBUG) { System.err.println(" to " + otd); } } // Now we need to see if there are any TDs that will be "orphaned" // by this collapse. Example: if we have: // dep(drew, on) // dep(on, book) // dep(on, right) // the first two will be collapsed to on(drew, book), but then // the third one will be orphaned, since its governor no // longer appears. So, change its governor to 'drew'. // CDM Feb 2010: This used to not move COORDINATION OR CONJUNCT, but now // it does, since they're not automatically deleted // Some things in possibles may have already been changed, so check gov if (DEBUG) { System.err.println("td1: " + td1 + "; possibles: " + possibles); } for (TypedDependency td2 : possibles) { // if (DEBUG) { // System.err.println("[a] td2.reln " + td2.reln() + " td2.gov " + // td2.gov() + " td1.dep " + td1.dep()); // } if (td2.reln() != KILL && td2.gov().equals(td1.dep())) { // && td2.reln() // != COORDINATION // && td2.reln() // != CONJUNCT if (DEBUG) { System.err.println("Changing " + td2 + " to have governor of " + td1 + " [a]"); } td2.setGov(td1.gov()); } } continue; // This one has been dealt with successfully } // end same prepositions // case of "Lufthansa flies to and from Serbia". Make it look like next // case :-) // that is, the prepOtherDep should be the same as prepDep ! 
for (Triple trip : conjs) { if (trip.first() != null && trip.second() == null) { trip.setSecond(new TypedDependency(prepDep.first().reln(), trip.first().dep(), prepDep.first().dep())); trip.setThird(prepDep.second()); } } // we have two different prepositions in the conjunction // in this case we need to add a node // "Bill jumped over the fence and through the hoop" // prep_over(jumped, fence) // conj_and(jumped, jumped) // prep_through(jumped, hoop) // Extra complication: // If "jumped" is already part of a conjunction, we should add the new one off that rather than chaining IndexedWord conjHead = td1.gov(); for (TypedDependency td3 : list) { if (td3.dep().equals(td1.gov()) && td3.reln().equals(CONJUNCT)) { conjHead = td3.gov(); } } GrammaticalRelation reln = determinePrepRelation(map, vmod, td1, td1, prepDep.second()); TypedDependency tdNew = new TypedDependency(reln, td1.gov(), prepDep.first().dep()); newTypedDeps.add(tdNew); if (DEBUG) { System.err.println("ConjPP (different preps) adding: " + tdNew); System.err.println(" deleting: " + td1 + " " + prepDep.first() + " " + ccDep); } td1.setReln(KILL);// remember these are "used up" prepDep.first().setReln(KILL); ccDep.setReln(KILL); // so far we added the first prep grammatical relation int copyNumber = 1; for (Triple trip : conjs) { TypedDependency conjDep = trip.first(); TypedDependency prepOtherDep = trip.second(); boolean pobj = trip.third(); // OK, we have a conjunction over different PPs // we create a new node; // in order to make a distinction between the original node and its copy // we set the "copyCount" variable in the IndexedWord // existence of copyCount > 0 is checked at printing (toString method of // TypedDependency) IndexedWord label = td1.gov().makeSoftCopy(copyNumber); copyNumber++; // now we add the conjunction relation between conjHead (either td1.gov // or what it is itself conjoined with) and the copy // the copy has the same label as td1.gov() but is another TreeGraphNode // todo: Or that's the plan; there are a couple of knock on changes to fix before we can do this! // TypedDependency tdNew2 = new TypedDependency(conjValue(ccDep.dep().value()), conjHead, label); TypedDependency tdNew2 = new TypedDependency(conjValue(ccDep.dep().value()), td1.gov(), label); newTypedDeps.add(tdNew2); // now we still need to add the second prep grammatical relation // between the copy and the dependent of the prepOtherDep node TypedDependency tdNew3; GrammaticalRelation reln2 = determinePrepRelation(map, vmod, conjDep, td1, pobj); tdNew3 = new TypedDependency(reln2, label, prepOtherDep.dep()); newTypedDeps.add(tdNew3); if (DEBUG) { System.err.println(" adding: " + tdNew2 + " " + tdNew3); System.err.println(" deleting: " + conjDep + " " + prepOtherDep); } conjDep.setReln(KILL); prepOtherDep.setReln(KILL); // promote dtrs that would be orphaned for (TypedDependency otd : otherDtrs) { // special treatment for prepositions: the original relation is // likely to be a "dep" and we want this to be a "prep" if (otd.dep().tag().equals("IN")) { otd.setReln(PREPOSITIONAL_MODIFIER); } otd.setGov(td1.gov()); } } // Now we need to see if there are any TDs that will be "orphaned" off // the first preposition // by this collapse. Example: if we have: // dep(drew, on) // dep(on, book) // dep(on, right) // the first two will be collapsed to on(drew, book), but then // the third one will be orphaned, since its governor no // longer appears. So, change its governor to 'drew'. 
// CDM Feb 2010: This used to not move COORDINATION OR CONJUNCT, but now // it does, since they're not automatically deleted for (TypedDependency td2 : possibles) { if (td2.reln() != KILL) { // && td2.reln() != COORDINATION && // td2.reln() != CONJUNCT) { if (DEBUG) { System.err.println("Changing " + td2 + " to have governor of " + td1 + " [b]"); } td2.setGov(td1.gov()); } } // end for different prepositions } // for TypedDependency td1 : list // below here is the single preposition/possessor basic case!! for (TypedDependency td1 : list) { if (td1.reln() == KILL) { continue; } IndexedWord td1Dep = td1.dep(); String td1DepPOS = td1Dep.tag(); // find all other typedDeps having our dep as gov Set possibles = map.get(td1Dep); if (possibles != null && (td1.reln() == PREPOSITIONAL_MODIFIER || td1.reln() == POSSESSION_MODIFIER || td1.reln() == CONJUNCT)) { // look for the "second half" boolean pobj = true;// default for prep relation is prep_ for (TypedDependency td2 : possibles) { if (td2.reln() != COORDINATION && td2.reln() != CONJUNCT) { IndexedWord td2Dep = td2.dep(); String td2DepPOS = td2Dep.tag(); if ((td1.reln() == POSSESSION_MODIFIER || td1.reln() == CONJUNCT)) { if (td2.reln() == POSSESSIVE_MODIFIER) { if ( ! map.containsKey(td2Dep)) { // if 's has no kids of its own (it shouldn't!) td2.setReln(KILL); } } } else if ((td2.reln() == PREPOSITIONAL_OBJECT || td2.reln() == PREPOSITIONAL_COMPLEMENT) && (td1DepPOS.equals("IN") || td1DepPOS.equals("TO") || td1DepPOS.equals("VBG")) && (!(td2DepPOS.equals("RB") || td2DepPOS.equals("IN") || td2DepPOS.equals("TO"))) && !isConjWithNoPrep(td2.gov(), possibles)) { // we don't collapse preposition conjoined with a non-preposition // to avoid disconnected constituents // OK, we have a pair td1, td2 to collapse to td3 if (DEBUG) { System.err.println("(Single prep/poss base case collapsing " + td1 + " and " + td2); } // check whether we are in a pcomp case: if (td2.reln() == PREPOSITIONAL_COMPLEMENT) { pobj = false; } GrammaticalRelation reln = determinePrepRelation(map, vmod, td1, td1, pobj); TypedDependency td3 = new TypedDependency(reln, td1.gov(), td2.dep()); if (DEBUG) { System.err.println("PP adding: " + td3 + " deleting: " + td1 + ' ' + td2); } // add it to map to deal with recursive cases like "achieved this (PP (PP in part) with talent)" map.get(td3.gov()).add(td3); newTypedDeps.add(td3); td1.setReln(KILL);// remember these are "used up" td2.setReln(KILL);// remember these are "used up" } } } // for TypedDependency td2 } // Now we need to see if there are any TDs that will be "orphaned" // by this collapse. Example: if we have: // dep(drew, on) // dep(on, book) // dep(on, right) // the first two will be collapsed to on(drew, book), but then // the third one will be orphaned, since its governor no // longer appears. So, change its governor to 'drew'. // CDM Feb 2010: This used to not move COORDINATION OR CONJUNCT, but now // it does, since they're not automatically deleted if (possibles != null && td1.reln() == KILL) { for (TypedDependency td2 : possibles) { if (td2.reln() != KILL) { // && td2.reln() != COORDINATION && // td2.reln() != CONJUNCT) { if (DEBUG) { System.err.println("Changing " + td2 + " to have governor of " + td1 + " [c]"); } td2.setGov(td1.gov()); } } } } // for TypedDependency td1 // now remove typed dependencies with reln "kill" and add new ones. 
for (Iterator iter = list.iterator(); iter.hasNext();) { TypedDependency td = iter.next(); if (td.reln() == KILL) { if (DEBUG) { System.err.println("Removing dep killed in poss/prep (conj) collapse: " + td); } iter.remove(); } } list.addAll(newTypedDeps); } // end collapsePrepAndPoss() /** Work out prep relation name. pc is the dependency whose dep() is the * preposition to do a name for. topPrep may be the same or different. * Among the daughters of its gov is where to look for an auxpass. */ private static GrammaticalRelation determinePrepRelation(Map> map, List vmod, TypedDependency pc, TypedDependency topPrep, boolean pobj) { // handling the case of an "agent": // the governor of a "by" preposition must have an "auxpass" dependency // or be the dependent of a "vmod" relation // if it is the case, the "agent" variable becomes true boolean agent = false; String preposition = pc.dep().value().toLowerCase(); if (preposition.equals("by")) { // look if we have an auxpass Set aux_pass_poss = map.get(topPrep.gov()); if (aux_pass_poss != null) { for (TypedDependency td_pass : aux_pass_poss) { if (td_pass.reln() == AUX_PASSIVE_MODIFIER) { agent = true; } } } // look if we have a vmod if (!vmod.isEmpty() && vmod.contains(topPrep.gov())) { agent = true; } } GrammaticalRelation reln; if (agent) { reln = AGENT; } else { // for prepositions, use the preposition // for pobj: we collapse into "prep"; for pcomp: we collapse into "prepc" if (pobj) { reln = EnglishGrammaticalRelations.getPrep(preposition); } else { reln = EnglishGrammaticalRelations.getPrepC(preposition); } } return reln; } // used by collapse2WP(), collapseFlatMWP(), collapse2WPbis() KEPT IN // ALPHABETICAL ORDER private static final String[][] MULTIWORD_PREPS = { { "according", "to" }, { "across", "from" }, { "ahead", "of" }, { "along", "with" }, { "alongside", "of" }, { "apart", "from" }, { "as", "for" }, { "as", "from" }, { "as", "of" }, { "as", "per" }, { "as", "to" }, { "aside", "from" }, { "away", "from" }, { "based", "on" }, { "because", "of" }, { "close", "by" }, { "close", "to" }, { "contrary", "to" }, { "compared", "to" }, { "compared", "with" }, { "due", "to" }, { "depending", "on" }, { "except", "for" }, { "exclusive", "of" }, { "far", "from" }, { "followed", "by" }, { "inside", "of" }, { "instead", "of" }, { "irrespective", "of" }, { "next", "to" }, { "near", "to" }, { "off", "of" }, { "out", "of" }, { "outside", "of" }, { "owing", "to" }, { "preliminary", "to" }, { "preparatory", "to" }, { "previous", "to" }, { "prior", "to" }, { "pursuant", "to" }, { "regardless", "of" }, { "subsequent", "to" }, { "such", "as" }, { "thanks", "to" }, { "together", "with" } }; // used by collapse3WP() KEPT IN ALPHABETICAL ORDER private static final String[][] THREEWORD_PREPS = { { "by", "means", "of" }, { "in", "accordance", "with" }, { "in", "addition", "to" }, { "in", "case", "of" }, { "in", "front", "of" }, { "in", "lieu", "of" }, { "in", "place", "of" }, { "in", "spite", "of" }, { "on", "account", "of" }, { "on", "behalf", "of" }, { "on", "top", "of" }, { "with", "regard", "to" }, { "with", "respect", "to" } }; /** * Given a list of typedDependencies, returns true if the node "node" is the * governor of a conj relation with a dependent which is not a preposition * * @param node * A node in this GrammaticalStructure * @param list * A list of typedDependencies * @return true If node is the governor of a conj relation in the list with * the dep not being a preposition */ private static boolean isConjWithNoPrep(IndexedWord node, Collection 
list) { for (TypedDependency td : list) { if (td.gov().equals(node) && td.reln() == CONJUNCT) { // we have a conjunct // check the POS of the dependent String tdDepPOS = td.dep().tag(); if (!(tdDepPOS.equals("IN") || tdDepPOS.equals("TO"))) { return true; } } } return false; } /** * Collapse multiword preposition of the following format: * prep|advmod|dep|amod(gov, mwp[0])
* dep(mwp[0], mwp[1])
* pobj|pcomp(mwp[1], compl) or pobj|pcomp(mwp[0], compl)
* -> prep_mwp[0]_mwp[1](gov, compl)
*
* prep|advmod|dep|amod(gov, mwp[1])
* dep(mwp[1], mwp[0])
* pobj|pcomp(mwp[1], compl) or pobj|pcomp(mwp[0], compl)
* -> prep_mwp[0]_mwp[1](gov, compl)

* * The collapsing has to be done at once in order to know exactly which node * is the gov and the dep of the multiword preposition. Otherwise this can * lead to problems: removing a non-multiword "to" preposition for example!!! * This method replaces the old "collapsedMultiWordPreps" * * @param list * list of typedDependencies to work on */ private static void collapse2WP(Collection list) { Collection newTypedDeps = new ArrayList(); for (String[] mwp : MULTIWORD_PREPS) { // first look for patterns such as: // X(gov, mwp[0]) // Y(mpw[0],mwp[1]) // Z(mwp[1], compl) or Z(mwp[0], compl) // -> prep_mwp[0]_mwp[1](gov, compl) collapseMultiWordPrep(list, newTypedDeps, mwp[0], mwp[1], mwp[0], mwp[1]); // now look for patterns such as: // X(gov, mwp[1]) // Y(mpw[1],mwp[0]) // Z(mwp[1], compl) or Z(mwp[0], compl) // -> prep_mwp[0]_mwp[1](gov, compl) collapseMultiWordPrep(list, newTypedDeps, mwp[0], mwp[1], mwp[1], mwp[0]); } } /** * Collapse multiword preposition of the following format: * prep|advmod|dep|amod(gov, mwp0) dep(mpw0,mwp1) pobj|pcomp(mwp1, compl) or * pobj|pcomp(mwp0, compl) -> prep_mwp0_mwp1(gov, compl) *

* * @param list List of typedDependencies to work on, * @param newTypedDeps List of typedDependencies that we construct * @param str_mwp0 First part of the multiword preposition to construct the collapsed * preposition * @param str_mwp1 Second part of the multiword preposition to construct the * collapsed preposition * @param w_mwp0 First part of the multiword preposition that we look for * @param w_mwp1 Second part of the multiword preposition that we look for */ private static void collapseMultiWordPrep(Collection list, Collection newTypedDeps, String str_mwp0, String str_mwp1, String w_mwp0, String w_mwp1) { // first find the multiword_preposition: dep(mpw[0], mwp[1]) // the two words should be next to another in the sentence (difference of // indexes = 1) IndexedWord mwp0 = null; IndexedWord mwp1 = null; TypedDependency dep = null; for (TypedDependency td : list) { if (td.gov().value().equalsIgnoreCase(w_mwp0) && td.dep().value().equalsIgnoreCase(w_mwp1) && Math.abs(td.gov().index() - td.dep().index()) == 1) { mwp0 = td.gov(); mwp1 = td.dep(); dep = td; } } if (mwp0 == null) { return; } // now search for prep|advmod|dep|amod(gov, mwp0) IndexedWord governor = null; TypedDependency prep = null; for (TypedDependency td1 : list) { if ((td1.reln() == PREPOSITIONAL_MODIFIER || td1.reln() == ADVERBIAL_MODIFIER || td1.reln() == ADJECTIVAL_MODIFIER || td1.reln() == DEPENDENT || td1.reln() == MULTI_WORD_EXPRESSION) && td1.dep().equals(mwp0)) { // we found prep|advmod|dep|amod(gov, mwp0) prep = td1; governor = prep.gov(); } } if (prep == null) { return; } // search for the complement: pobj|pcomp(mwp1,X) // or for pobj|pcomp(mwp0,X) // There may be more than one in weird constructions; if there are several, // take the one with the LOWEST index! TypedDependency pobj = null; TypedDependency newtd = null; for (TypedDependency td2 : list) { if ((td2.reln() == PREPOSITIONAL_OBJECT || td2.reln() == PREPOSITIONAL_COMPLEMENT) && (td2.gov().equals(mwp1) || td2.gov().equals(mwp0))) { if (pobj == null || pobj.dep().index() > td2.dep().index()) { pobj = td2; // create the new gr relation GrammaticalRelation gr; if (td2.reln() == PREPOSITIONAL_COMPLEMENT) { gr = EnglishGrammaticalRelations.getPrepC(str_mwp0 + '_' + str_mwp1); } else { gr = EnglishGrammaticalRelations.getPrep(str_mwp0 + '_' + str_mwp1); } if (governor != null) { newtd = new TypedDependency(gr, governor, pobj.dep()); } } } } if (pobj == null || newtd == null) { return; } // only if we found the three parts, set to KILL and remove // and add the new one // Necessarily from the above: prep != null, dep != null, pobj != null, newtd != null if (DEBUG) { System.err.println("Removing " + prep + ", " + dep + ", and " + pobj); System.err.println(" and adding " + newtd); } prep.setReln(KILL); dep.setReln(KILL); pobj.setReln(KILL); newTypedDeps.add(newtd); // now remove typed dependencies with reln "kill" // and promote possible orphans for (TypedDependency td1 : list) { if (td1.reln() != KILL) { if (td1.gov().equals(mwp0) || td1.gov().equals(mwp1)) { // CDM: Thought of adding this in Jan 2010, but it causes // conflicting relations tmod vs. pobj. Needs more thought // maybe restrict pobj to first NP in PP, and allow tmod for a later // one? 
if (td1.reln() == TEMPORAL_MODIFIER) { // special case when an extra NP-TMP is buried in a PP for // "during the same period last year" td1.setGov(pobj.dep()); } else { td1.setGov(governor); } } if (!newTypedDeps.contains(td1)) { newTypedDeps.add(td1); } } } list.clear(); list.addAll(newTypedDeps); } /** * Collapse multi-words preposition of the following format: advmod|prt(gov, * mwp[0]) prep(gov,mwp[1]) pobj|pcomp(mwp[1], compl) -> * prep_mwp[0]_mwp[1](gov, compl) *

* * @param list * List of typedDependencies to work on */ private static void collapse2WPbis(Collection list) { Collection newTypedDeps = new ArrayList(); for (String[] mwp : MULTIWORD_PREPS) { newTypedDeps.clear(); IndexedWord mwp0 = null; IndexedWord mwp1 = null; IndexedWord governor = null; TypedDependency prep = null; TypedDependency dep = null; TypedDependency pobj = null; TypedDependency newtd = null; // first find the first part of the multi_preposition: advmod|prt(gov, mwp[0]) for (TypedDependency td : list) { if (td.dep().value().equalsIgnoreCase(mwp[0]) && (td.reln() == PHRASAL_VERB_PARTICLE || td.reln() == ADVERBIAL_MODIFIER || td.reln() == DEPENDENT || td.reln() == MULTI_WORD_EXPRESSION)) { // we found advmod(gov, mwp0) or prt(gov, mwp0) governor = td.gov(); mwp0 = td.dep(); dep = td; } } // now search for the second part: prep(gov, mwp1) // the two words in the mwp should be next to another in the sentence // (difference of indexes = 1) if (mwp0 == null || governor == null) { continue; } for (TypedDependency td1 : list) { if (td1.reln() == PREPOSITIONAL_MODIFIER && td1.dep().value().equalsIgnoreCase(mwp[1]) && Math.abs(td1.dep().index() - mwp0.index()) == 1 && td1.gov().equals(governor)) {// we // found // prep(gov, // mwp1) mwp1 = td1.dep(); prep = td1; } } if (mwp1 == null) { continue; } // search for the complement: pobj|pcomp(mwp1,X) for (TypedDependency td2 : list) { if (td2.reln() == PREPOSITIONAL_OBJECT && td2.gov().equals(mwp1)) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrep(mwp[0] + '_' + mwp[1]); newtd = new TypedDependency(gr, governor, pobj.dep()); } if (td2.reln() == PREPOSITIONAL_COMPLEMENT && td2.gov().equals(mwp1)) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrepC(mwp[0] + '_' + mwp[1]); newtd = new TypedDependency(gr, governor, pobj.dep()); } } if (pobj == null) { return; } // only if we found the three parts, set to KILL and remove // and add the new one // now prep != null, pobj != null and newtd != null prep.setReln(KILL); dep.setReln(KILL); pobj.setReln(KILL); newTypedDeps.add(newtd); // now remove typed dependencies with reln "kill" // and promote possible orphans for (TypedDependency td1 : list) { if (td1.reln() != KILL) { if (td1.gov().equals(mwp0) || td1.gov().equals(mwp1)) { td1.setGov(governor); } if (!newTypedDeps.contains(td1)) { newTypedDeps.add(td1); } } } list.clear(); list.addAll(newTypedDeps); } } /** * Collapse 3-word preposition of the following format:
   * This will be the case when the preposition is analyzed as a NP:
   * prep(gov, mwp0)
   * X(mwp0, mwp1)
   * X(mwp1, mwp2)
   * pobj|pcomp(mwp2, compl)
   * -> prep_mwp[0]_mwp[1]_mwp[2](gov, compl)
   *
   * It also takes flat annotation into account:
   * prep(gov, mwp0)
   * X(mwp0, mwp1)
   * X(mwp0, mwp2)
   * pobj|pcomp(mwp0, compl)
   * -> prep_mwp[0]_mwp[1]_mwp[2](gov, compl)
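   * For example, "He stood in front of the door" analyzed NP-style as
   * prep(stood, in), pobj(in, front), prep(front, of) and pobj(of, door)
   * is collapsed here into prep_in_front_of(stood, door).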

* * * @param list * List of typedDependencies to work on */ private static void collapse3WP(Collection list) { Collection newTypedDeps = new ArrayList(); // first, loop over the prepositions for NP annotation for (String[] mwp : THREEWORD_PREPS) { newTypedDeps.clear(); IndexedWord mwp0 = null; IndexedWord mwp1 = null; IndexedWord mwp2 = null; TypedDependency dep1 = null; TypedDependency dep2 = null; // first find the first part of the 3word preposition: dep(mpw[0], mwp[1]) // the two words should be next to another in the sentence (difference of // indexes = 1) for (TypedDependency td : list) { if (td.gov().value().equalsIgnoreCase(mwp[0]) && td.dep().value().equalsIgnoreCase(mwp[1]) && Math.abs(td.gov().index() - td.dep().index()) == 1) { mwp0 = td.gov(); mwp1 = td.dep(); dep1 = td; } } // find the second part of the 3word preposition: dep(mpw[1], mwp[2]) // the two words should be next to another in the sentence (difference of // indexes = 1) for (TypedDependency td : list) { if (td.gov().equals(mwp1) && td.dep().value().equalsIgnoreCase(mwp[2]) && Math.abs(td.gov().index() - td.dep().index()) == 1) { mwp2 = td.dep(); dep2 = td; } } if (dep1 != null && dep2 != null) { // now search for prep(gov, mwp0) IndexedWord governor = null; TypedDependency prep = null; for (TypedDependency td1 : list) { if (td1.reln() == PREPOSITIONAL_MODIFIER && td1.dep().equals(mwp0)) {// we // found // prep(gov, // mwp0) prep = td1; governor = prep.gov(); } } // search for the complement: pobj|pcomp(mwp2,X) TypedDependency pobj = null; TypedDependency newtd = null; for (TypedDependency td2 : list) { if (td2.reln() == PREPOSITIONAL_OBJECT && td2.gov().equals(mwp2)) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrep(mwp[0] + '_' + mwp[1] + '_' + mwp[2]); if (governor != null) { newtd = new TypedDependency(gr, governor, pobj.dep()); } } if (td2.reln() == PREPOSITIONAL_COMPLEMENT && td2.gov().equals(mwp2)) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrepC(mwp[0] + '_' + mwp[1] + '_' + mwp[2]); if (governor != null) { newtd = new TypedDependency(gr, governor, pobj.dep()); } } } // only if we found the governor and complement parts, set to KILL and // remove // and add the new one if (prep != null && pobj != null && newtd != null) { prep.setReln(KILL); dep1.setReln(KILL); dep2.setReln(KILL); pobj.setReln(KILL); newTypedDeps.add(newtd); // now remove typed dependencies with reln "kill" // and promote possible orphans for (TypedDependency td1 : list) { if (td1.reln() != KILL) { if (td1.gov().equals(mwp0) || td1.gov().equals(mwp1) || td1.gov().equals(mwp2)) { td1.setGov(governor); } if (!newTypedDeps.contains(td1)) { newTypedDeps.add(td1); } } } list.clear(); list.addAll(newTypedDeps); } } } // second, loop again looking at flat annotation for (String[] mwp : THREEWORD_PREPS) { newTypedDeps.clear(); IndexedWord mwp0 = null; IndexedWord mwp1 = null; IndexedWord mwp2 = null; TypedDependency dep1 = null; TypedDependency dep2 = null; // first find the first part of the 3word preposition: dep(mpw[0], mwp[1]) // the two words should be next to another in the sentence (difference of // indexes = 1) for (TypedDependency td : list) { if (td.gov().value().equalsIgnoreCase(mwp[0]) && td.dep().value().equalsIgnoreCase(mwp[1]) && Math.abs(td.gov().index() - td.dep().index()) == 1) { mwp0 = td.gov(); mwp1 = td.dep(); dep1 = td; } } // find the second part of the 3word preposition: dep(mpw[0], mwp[2]) // the two words should be 
one word apart in the sentence (difference of // indexes = 2) for (TypedDependency td : list) { if (td.gov().equals(mwp0) && td.dep().value().equalsIgnoreCase(mwp[2]) && Math.abs(td.gov().index() - td.dep().index()) == 2) { mwp2 = td.dep(); dep2 = td; } } if (dep1 != null && dep2 != null) { // now search for prep(gov, mwp0) IndexedWord governor = null; TypedDependency prep = null; for (TypedDependency td1 : list) { if (td1.dep().equals(mwp0) && td1.reln() == PREPOSITIONAL_MODIFIER) {// we // found // prep(gov, // mwp0) prep = td1; governor = prep.gov(); } } // search for the complement: pobj|pcomp(mwp0,X) TypedDependency pobj = null; TypedDependency newtd = null; for (TypedDependency td2 : list) { if (td2.gov().equals(mwp0) && td2.reln() == PREPOSITIONAL_OBJECT) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrep(mwp[0] + '_' + mwp[1] + '_' + mwp[2]); if (governor != null) { newtd = new TypedDependency(gr, governor, pobj.dep()); } } if (td2.gov().equals(mwp0) && td2.reln() == PREPOSITIONAL_COMPLEMENT) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrepC(mwp[0] + '_' + mwp[1] + '_' + mwp[2]); if (governor != null) { newtd = new TypedDependency(gr, governor, pobj.dep()); } } } // only if we found the governor and complement parts, set to KILL and // remove // and add the new one if (prep != null && pobj != null && newtd != null) { prep.setReln(KILL); dep1.setReln(KILL); dep2.setReln(KILL); pobj.setReln(KILL); newTypedDeps.add(newtd); // now remove typed dependencies with reln "kill" // and promote possible orphans for (TypedDependency td1 : list) { if (td1.reln() != KILL) { if (td1.gov().equals(mwp0) || td1.gov().equals(mwp1) || td1.gov().equals(mwp2)) { td1.setGov(governor); } if (!newTypedDeps.contains(td1)) { newTypedDeps.add(td1); } } } list.clear(); list.addAll(newTypedDeps); } } } } /* * * While upgrading, here are some lists of common multiword prepositions which * we might try to cover better. (Also do corpus counts for same?) * * (Prague Dependency Treebank) as per CRIT except for RESTR but for RESTR * apart from RESTR away from RESTR aside from RESTR as from TSIN ahead of * TWHEN back of LOC, DIR3 exclusive of* RESTR instead of SUBS outside of LOC, * DIR3 off of DIR1 upwards of LOC, DIR3 as of TSIN because of CAUS inside of * LOC, DIR3 irrespective of REG out of LOC, DIR1 regardless of REG according * to CRIT due to CAUS next to LOC, RESTR owing to* CAUS preparatory to* TWHEN * prior to* TWHEN subsequent to* TWHEN as to/for REG contrary to* CPR close * to* LOC, EXT (except the case named in the next table) near to LOC, DIR3 * nearer to LOC, DIR3 preliminary to* TWHEN previous to* TWHEN pursuant to* * CRIT thanks to CAUS along with ACMP together with ACMP devoid of* ACMP void * of* ACMP * * http://www.keepandshare.com/doc/view.php?u=13166 * * according to ahead of as far as as well as by means of due to far from in * addition to in case of in front of in place of in spite of inside of * instead of in to (into) near to next to on account of on behalf of on top * of on to (onto) out of outside of owing to prior to with regards to * * www.eslmonkeys.com/book/learner/prepositions.pdf According to Ahead of * Along with Apart from As for As to Aside from Because of But for Contrary * to Except for Instead of Next to Out of Prior to Thanks to */ /** * Collapse multi-words preposition of the following format, which comes from * flat annotation. 
This handles, e.g., "because of" (PP (IN because) (IN of) ...) and "such as" (PP (JJ such) (IN as) ...).
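 * For instance, for "He resigned because of the scandal", a parse yielding
 * prep(resigned, of), dep(of, because) and pobj(of, scandal) is rewritten as
 * prep_because_of(resigned, scandal).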

* prep(gov, mwp[1]) dep(mpw[1], mwp[0]) pobj(mwp[1], compl) -> * prep_mwp[0]_mwp[1](gov, compl) * * @param list List of typedDependencies to work on */ private static void collapseFlatMWP(Collection list) { Collection newTypedDeps = new ArrayList(); for (String[] mwp : MULTIWORD_PREPS) { newTypedDeps.clear(); IndexedWord mwp1 = null; IndexedWord governor = null; TypedDependency prep = null; TypedDependency dep = null; TypedDependency pobj = null; // first find the multi_preposition: dep(mpw[1], mwp[0]) for (TypedDependency td : list) { if (Math.abs(td.gov().index() - td.dep().index()) == 1 && td.gov().value().equalsIgnoreCase(mwp[1]) && td.dep().value().equalsIgnoreCase(mwp[0])) { mwp1 = td.gov(); dep = td; } } if (mwp1 == null) { continue; } // now search for prep(gov, mwp1) for (TypedDependency td1 : list) { if (td1.dep().equals(mwp1) && td1.reln() == PREPOSITIONAL_MODIFIER) { // we found prep(gov, mwp1) prep = td1; governor = prep.gov(); } } if (prep == null) { continue; } // search for the complement: pobj|pcomp(mwp1,X) for (TypedDependency td2 : list) { if (td2.gov().equals(mwp1) && td2.reln() == PREPOSITIONAL_OBJECT) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrep(mwp[0] + '_' + mwp[1]); newTypedDeps.add(new TypedDependency(gr, governor, pobj.dep())); } if (td2.gov().equals(mwp1) && td2.reln() == PREPOSITIONAL_COMPLEMENT) { pobj = td2; // create the new gr relation GrammaticalRelation gr = EnglishGrammaticalRelations.getPrepC(mwp[0] + '_' + mwp[1]); newTypedDeps.add(new TypedDependency(gr, governor, pobj.dep())); } } if (pobj == null) { return; } // only if we found the three parts, set to KILL and remove // we know prep != null && dep != null && dep != null prep.setReln(KILL); dep.setReln(KILL); pobj.setReln(KILL); // now remove typed dependencies with reln "kill" // and promote possible orphans for (TypedDependency td1 : list) { if (td1.reln() != KILL) { if (td1.gov().equals(mwp1)) { td1.setGov(governor); } if (!newTypedDeps.contains(td1)) { newTypedDeps.add(td1); } } } list.clear(); list.addAll(newTypedDeps); } } /** * This method gets rid of multiwords in conjunctions to avoid having them * creating disconnected constituents e.g., * "bread-1 as-2 well-3 as-4 cheese-5" will be turned into conj_and(bread, * cheese) and then dep(well-3, as-2) and dep(well-3, as-4) cannot be attached * to the graph, these dependencies are erased * * @param list List of words to get rid of multiword conjunctions from */ private static void eraseMultiConj(Collection list) { // find typed deps of form cc(gov, x) for (TypedDependency td1 : list) { if (td1.reln() == COORDINATION) { IndexedWord x = td1.dep(); // find typed deps of form dep(x,y) and kill them for (TypedDependency td2 : list) { if (td2.gov().equals(x) && (td2.reln() == DEPENDENT || td2.reln() == MULTI_WORD_EXPRESSION || td2.reln() == COORDINATION || td2.reln() == ADVERBIAL_MODIFIER || td2.reln() == NEGATION_MODIFIER || td2.reln() == AUX_MODIFIER)) { td2.setReln(KILL); } } } } filterKill(list); } /** * Remove duplicate relations: it can happen when collapsing stranded * prepositions. E.g., "What does CPR stand for?" we get dep(stand, what), and * after collapsing we also get prep_for(stand, what). 
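 * In that case the plain dep(stand, what) is the relation removed here, so that
 * only prep_for(stand, what) remains.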
* * @param list A list of typed dependencies to check through */ private static void removeDep(Collection list) { Set prepRels = Generics.newHashSet(EnglishGrammaticalRelations.getPreps()); prepRels.addAll(EnglishGrammaticalRelations.getPrepsC()); for (TypedDependency td1 : list) { if (prepRels.contains(td1.reln())) { // if we have a prep_ relation IndexedWord gov = td1.gov(); IndexedWord dep = td1.dep(); for (TypedDependency td2 : list) { if (td2.reln() == DEPENDENT && td2.gov().equals(gov) && td2.dep().equals(dep)) { td2.setReln(KILL); } } } } // now remove typed dependencies with reln "kill" for (Iterator iter = list.iterator(); iter.hasNext();) { TypedDependency td = iter.next(); if (td.reln() == KILL) { if (DEBUG) { System.err.println("Removing duplicate relation: " + td); } iter.remove(); } } } /** * Find and remove any exact duplicates from a dependency list. * For example, the method that "corrects" nsubj dependencies can * turn them into nsubjpass dependencies. If there is some other * source of nsubjpass dependencies, there may now be multiple * copies of the nsubjpass dependency. If the containing data type * is a List, they may both now be in the List. */ private static void removeExactDuplicates(Collection list) { Set set = new TreeSet(list); list.clear(); list.addAll(set); } public static List readCoNLLXGrammaticalStructureCollection(String fileName) throws IOException { return readCoNLLXGrammaticalStructureCollection(fileName, EnglishGrammaticalRelations.shortNameToGRel, new FromDependenciesFactory()); } public static EnglishGrammaticalStructure buildCoNLLXGrammaticalStructure(List> tokenFields) { return (EnglishGrammaticalStructure) buildCoNLLXGrammaticalStructure(tokenFields, EnglishGrammaticalRelations.shortNameToGRel, new FromDependenciesFactory()); } public static class FromDependenciesFactory implements GrammaticalStructureFromDependenciesFactory { @Override public EnglishGrammaticalStructure build(List tdeps, TreeGraphNode root) { return new EnglishGrammaticalStructure(tdeps, root); } } } // end class EnglishGrammaticalStructure
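The private collapsing steps above are not called directly; they run when collapsed dependencies are requested from a GrammaticalStructure. A minimal sketch of that flow, assuming a LexicalizedParser with the standard englishPCFG model is used to produce the tree (neither appears in this file):

import java.util.Collection;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.EnglishGrammaticalStructure;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TypedDependency;

public class CollapsedDepsDemo {
  public static void main(String[] args) {
    // Assumed model path; any English PCFG model shipped with the parser should work.
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    Tree tree = lp.parse("He stood in front of the door.");

    // The constructor analyzes the tree; typedDependenciesCollapsed() triggers the
    // multiword-preposition collapsing above, so "in front of" should surface as a
    // single prep_in_front_of relation rather than three separate dependencies.
    EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(tree);
    Collection<TypedDependency> deps = gs.typedDependenciesCollapsed();
    for (TypedDependency td : deps) {
      System.out.println(td);
    }
  }
}

For the CoNLL-X entry points at the bottom of the class, a sketch along these lines should work; the file name "deps.conllx" is hypothetical, and the wildcard element type is used because the generic signatures are not visible in this rendering:

import java.io.IOException;
import java.util.List;

import edu.stanford.nlp.trees.EnglishGrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;

public class ConllxReadDemo {
  public static void main(String[] args) throws IOException {
    // Reads every sentence of a CoNLL-X dependency file; relation short names are mapped
    // back to grammatical relations via EnglishGrammaticalRelations.shortNameToGRel.
    List<? extends GrammaticalStructure> sentences =
        EnglishGrammaticalStructure.readCoNLLXGrammaticalStructureCollection("deps.conllx");
    for (GrammaticalStructure gs : sentences) {
      for (TypedDependency td : gs.typedDependencies()) {
        System.out.println(td);
      }
    }
  }
}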




