All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.trees.NPTmpRetainingTreeNormalizer Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.trees; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.LabelFactory;
import java.util.function.Predicate;

import java.io.Reader;
import java.util.regex.Pattern;
import java.util.*;


/**
 * Same TreeNormalizer as BobChrisTreeNormalizer, but optionally provides
 * four extras.  I.e., the class name is now a misnomer.
* 1) retains -TMP labels on NP with the new identification NP-TMP, * and provides various options to percolate that option downwards * to the head noun, and perhaps also to inherit this from a PP-TMP.
* 2) Annotates S nodes which contain a gapped subject: i.e., * S < (/^NP-SBJ/ < -NONE-) --> S-G
* 3) Leave all functional tags on nodes.
* 4) Keeps -ADV labels on NP and marks head tag with &`^ADV *

* Performance note: At one point in time, PCFG labeled F1 results * for the various TEMPORAL options in lexparser were: * 0=86.7, 1=87.49, 2=86.87, 3=87.49, 4=87.48, 5=87.5, 6=87.07. * So, mainly avoid values of 0, 2, and 6. *

* At another point they were: * 0=86.53, 1=87.1, 2=87.14, 3=87.22, 4=87.1, 5=87.13, 6=86.95, 7=87.16 * * @author Christopher Manning * @author Dan Klein */ public class NPTmpRetainingTreeNormalizer extends BobChrisTreeNormalizer { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(NPTmpRetainingTreeNormalizer.class); private static final long serialVersionUID = 7548777133196579107L; public static final int TEMPORAL_NONE = 0; public static final int TEMPORAL_ACL03PCFG = 1; public static final int TEMPORAL_ANY_TMP_PERCOLATED = 2; public static final int TEMPORAL_ALL_TERMINALS = 3; public static final int TEMPORAL_ALL_NP = 4; public static final int TEMPORAL_ALL_NP_AND_PP = 5; public static final int TEMPORAL_NP_AND_PP_WITH_NP_HEAD = 6; public static final int TEMPORAL_ALL_NP_EVEN_UNDER_PP = 7; public static final int TEMPORAL_ALL_NP_PP_ADVP = 8; public static final int TEMPORAL_9 = 9; private static final boolean onlyTagAnnotateNstar = true; private static final Pattern NPTmpPattern = Pattern.compile("NP.*-TMP.*"); private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*"); private static final Pattern ADVPTmpPattern = Pattern.compile("ADVP.*-TMP.*"); private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*"); private static final Pattern NPSbjPattern = Pattern.compile("NP.*-SBJ.*"); private static final Pattern NPAdvPattern = Pattern.compile("NP.*-ADV.*"); private final int temporalAnnotation; private final boolean doSGappedStuff; private final int leaveItAll; private final boolean doAdverbialNP; private final HeadFinder headFinder; public NPTmpRetainingTreeNormalizer() { this(TEMPORAL_ACL03PCFG, false); } public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff) { this(temporalAnnotation, doSGappedStuff, 0, false); } public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff, int leaveItAll, boolean doAdverbialNP) { this(temporalAnnotation, 
doSGappedStuff, leaveItAll, doAdverbialNP, new ModCollinsHeadFinder()); } /** * Create a TreeNormalizer that maintains some functional annotations, * particularly those involving temporal annotation. * * @param temporalAnnotation One of the constants: * TEMPORAL_NONE (no temporal annotation kept on trees), * TEMPORAL_ACL03PCFG (temporal annotation on NPs, and percolated down * to head of constituent until and including POS tag), * TEMPORAL_ANY_TMP_PERCOLATED (temporal annotation on any phrase is * kept and percolated via head chain to and including POS tag), * TEMPORAL_ALL_TERMINALS (temporal annotation is kept on NPs, and * is placed on all POS tag daughters of that NP (but is not * percolated down a head chain through phrasal categories), * TEMPORAL_ALL_NP (temporal annotation on NPs, and it is percolated * down via the head chain, but only through NPs: annotation stops * at either a POS tag (which is annotated) or a non-NP head * (which isn't annotated)), * TEMPORAL_ALL_NP_AND_PP (keeps temporal annotation on NPs and PPs, * and it is percolated down via the head chain, but only through * NPs: annotation stops at either a POS tag (which is annotated) * or a non-NP head (which isn't annotated)). * TEMPORAL_NP_AND_PP_WITH_NP_HEAD (like TEMPORAL_ALL_NP_AND_PP * except an NP is regarded as the head of a PP) * TEMPORAL_ALL_NP_EVEN_UNDER_PP (like TEMPORAL_ALL_NP, but a PP-TMP * annotation above an NP is 'passed down' to annotate that NP * as temporal (but the PP itself isn't marked)) * TEMPORAL_ALL_NP_PP_ADVP (keeps temporal annotation on NPs, PPs, and * ADVPs * and it is percolated down via the head chain, but only through * those categories: annotation stops at either a POS tag * (which is annotated) * or a non-NP/PP/ADVP head (which isn't annotated)), * TEMPORAL_9 (annotates like the previous one but * does all NP inside node, and their children if * pre-pre-terminal rather than only if head). 
* @param doSGappedStuff Leave -SBJ marking on subject NP and then mark * S-G sentences with a gapped subject. * @param leaveItAll 0 means the usual stripping of functional tags and indices; * 1 leaves all functional tags but still strips indices; * 2 leaves everything * @param doAdverbialNP Leave -ADV functional tag on adverbial NPs and * maybe add it to their head * @param headFinder A head finder that is used with some of the * options for temporalAnnotation */ public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff, int leaveItAll, boolean doAdverbialNP, HeadFinder headFinder) { this.temporalAnnotation = temporalAnnotation; this.doSGappedStuff = doSGappedStuff; this.leaveItAll = leaveItAll; this.doAdverbialNP = doAdverbialNP; this.headFinder = headFinder; } /** * Remove things like hyphened functional tags and equals from the * end of a node label. */ @Override protected String cleanUpLabel(String label) { if (label == null) { return "ROOT"; // String constants are always interned } else if (leaveItAll == 1) { return tlp.categoryAndFunction(label); } else if (leaveItAll == 2) { return label; } else { boolean nptemp = NPTmpPattern.matcher(label).matches(); boolean pptemp = PPTmpPattern.matcher(label).matches(); boolean advptemp = ADVPTmpPattern.matcher(label).matches(); boolean anytemp = TmpPattern.matcher(label).matches(); boolean subj = NPSbjPattern.matcher(label).matches(); boolean npadv = NPAdvPattern.matcher(label).matches(); label = tlp.basicCategory(label); if (anytemp && temporalAnnotation == TEMPORAL_ANY_TMP_PERCOLATED) { label += "-TMP"; } else if (pptemp && (temporalAnnotation == TEMPORAL_ALL_NP_AND_PP || temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP || temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP || temporalAnnotation == TEMPORAL_9)) { label = label + "-TMP"; } else if (advptemp && (temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP || temporalAnnotation == 
TEMPORAL_9)) { label = label + "-TMP"; } else if (temporalAnnotation > 0 && nptemp) { label = label + "-TMP"; } if (doAdverbialNP && npadv) { label = label + "-ADV"; } if (doSGappedStuff && subj) { label = label + "-SBJ"; } return label; } } private static boolean includesEmptyNPSubj(Tree t) { if (t == null) { return false; } Tree[] kids = t.children(); if (kids == null) { return false; } boolean foundNullSubj = false; for (Tree kid : kids) { Tree[] kidkids = kid.children(); if (NPSbjPattern.matcher(kid.value()).matches()) { kid.setValue("NP"); if (kidkids != null && kidkids.length == 1 && kidkids[0].value().equals("-NONE-")) { // only set flag, since there are 2 a couple of times (errors) foundNullSubj = true; } } } return foundNullSubj; } /** * Normalize a whole tree -- one can assume that this is the root. * This implementation deletes empty elements (ones with nonterminal * tag label '-NONE-') from the tree. */ @Override public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { TreeTransformer transformer1 = t -> { if (doSGappedStuff) { String lab = t.label().value(); if (lab.equals("S") && includesEmptyNPSubj(t)) { LabelFactory lf = t.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! t.setLabel(lf.newLabel(t.label().value() + "-G")); } } return t; }; Predicate subtreeFilter = new Predicate() { private static final long serialVersionUID = -7250433816896327901L; @Override public boolean test(Tree t) { Tree[] kids = t.children(); Label l = t.label(); // The special Switchboard non-terminals clause. // Note that it deletes IP which other Treebanks might use! 
if ("RS".equals(t.label().value()) || "RM".equals(t.label().value()) || "IP".equals(t.label().value()) || "CODE".equals(t.label().value())) { return t.isLeaf(); //Prevents deletion of the word "IP" } if ((l != null) && l.value() != null && (l.value().equals("-NONE-")) && !t.isLeaf() && kids.length == 1 && kids[0].isLeaf()) { // Delete empty/trace nodes (ones marked '-NONE-') return false; } return true; } }; Predicate nodeFilter = new Predicate() { private static final long serialVersionUID = 9000955019205336311L; @Override public boolean test(Tree t) { if (t.isLeaf() || t.isPreTerminal()) { return true; } // The special switchboard non-terminals clause. Try keeping EDITED for now.... // if ("EDITED".equals(t.label().value())) { // return false; // } if (t.numChildren() != 1) { return true; } if (t.label() != null && t.label().value() != null && t.label().value().equals(t.children()[0].label().value())) { return false; } return true; } }; TreeTransformer transformer2 = t -> { if (temporalAnnotation == TEMPORAL_ANY_TMP_PERCOLATED) { String lab = t.label().value(); if (TmpPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); oldT = ht; } while (!ht.isPreTerminal()); if (lab.startsWith("PP")) { ht = headFinder.determineHead(t); // look to right int j = t.objectIndexOf(ht); int sz = t.children().length; if (j + 1 < sz) { ht = t.getChild(j + 1); } if (ht.label().value().startsWith("NP")) { while (!ht.isLeaf()) { LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! 
ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); ht = headFinder.determineHead(ht); } } } } } else if (temporalAnnotation == TEMPORAL_ALL_TERMINALS) { String lab = t.label().value(); if (NPTmpPattern.matcher(lab).matches()) { Tree ht; ht = headFinder.determineHead(t); if (ht.isPreTerminal()) { // change all tags to -TMP LabelFactory lf = ht.label().labelFactory(); Tree[] kids = t.children(); for (Tree kid : kids) { if (kid.isPreTerminal()) { // Note: this changes the tree label, rather // than creating a new tree node. Beware! kid.setLabel(lf.newLabel(kid.value() + "-TMP")); } } } else { Tree oldT = t; do { ht = headFinder.determineHead(oldT); oldT = ht; } while (!ht.isPreTerminal()); LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); } } } else if (temporalAnnotation == TEMPORAL_ALL_NP) { String lab = t.label().value(); if (NPTmpPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } if (ht.isPreTerminal() || ht.value().startsWith("NP")) { LabelFactory lf = ht.labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); oldT = ht; } } while (ht.value().startsWith("NP")); } } else if (temporalAnnotation == TEMPORAL_ALL_NP_AND_PP || temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP) { // also allow chain to start with PP String lab = t.value(); if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches()) { Tree oldT = t; do { Tree ht = headFinder.determineHead(oldT); // special fix for possessives! 
-- make noun before head if (ht.value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } else if ((temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP) && (ht.value().equals("IN") || ht.value().equals("TO"))) { // change the head to be NP if possible Tree[] kidlets = oldT.children(); for (int k = kidlets.length - 1; k > 0; k--) { if (kidlets[k].value().startsWith("NP")) { ht = kidlets[k]; } } } LabelFactory lf = ht.labelFactory(); // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! if (ht.isPreTerminal() || ht.value().startsWith("NP")) { ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } if (temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP && oldT.value().startsWith("PP")) { oldT.setLabel(lf.newLabel(tlp.basicCategory(oldT.value()))); } oldT = ht; } while (oldT.value().startsWith("NP") || oldT.value().startsWith("PP")); } } else if (temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP) { // also allow chain to start with PP or ADVP String lab = t.value(); if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches() || ADVPTmpPattern.matcher(lab).matches()) { Tree oldT = t; do { Tree ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! 
if (ht.isPreTerminal() || ht.value().startsWith("NP")) { LabelFactory lf = ht.labelFactory(); ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } oldT = ht; } while (oldT.value().startsWith("NP")); } } else if (temporalAnnotation == TEMPORAL_9) { // also allow chain to start with PP or ADVP String lab = t.value(); if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches() || ADVPTmpPattern.matcher(lab).matches()) { // log.info("TMP: Annotating " + t); addTMP9(t); } } else if (temporalAnnotation == TEMPORAL_ACL03PCFG) { String lab = t.label().value(); if (lab != null && NPTmpPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } oldT = ht; } while (!ht.isPreTerminal()); if ( ! onlyTagAnnotateNstar || ht.label().value().startsWith("N")) { LabelFactory lf = ht.label().labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-TMP")); } } } if (doAdverbialNP) { String lab = t.value(); if (NPAdvPattern.matcher(lab).matches()) { Tree oldT = t; Tree ht; do { ht = headFinder.determineHead(oldT); // special fix for possessives! -- make noun before head if (ht.label().value().equals("POS")) { int j = oldT.objectIndexOf(ht); if (j > 0) { ht = oldT.getChild(j - 1); } } if (ht.isPreTerminal() || ht.value().startsWith("NP")) { LabelFactory lf = ht.labelFactory(); // Note: this changes the tree label, rather than // creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.label().value() + "-ADV")); oldT = ht; } } while (ht.value().startsWith("NP")); } } return t; }; // if there wasn't an empty nonterminal at the top, but an S, wrap it. 
if (tree.label().value().equals("S")) { tree = tf.newTreeNode("ROOT", Collections.singletonList(tree)); } // repair for the phrasal VB in Switchboard (PTB version 3) that should be a VP for (Tree subtree : tree) { if (subtree.isPhrasal() && "VB".equals(subtree.label().value())) { subtree.setValue("VP"); } } tree = tree.transform(transformer1); if (tree == null) { return null; } tree = tree.prune(subtreeFilter, tf); if (tree == null) { return null; } tree = tree.spliceOut(nodeFilter, tf); if (tree == null) { return null; } return tree.transform(transformer2, tf); } /** * Add -TMP when not present within an NP * @param tree The tree to add temporal info to. */ private void addTMP9(final Tree tree) { // do the head chain under it Tree ht = headFinder.determineHead(tree); // special fix for possessives! -- make noun before head if (ht.value().equals("POS")) { int j = tree.objectIndexOf(ht); if (j > 0) { ht = tree.getChild(j - 1); } } // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! if (ht.isPreTerminal() || ht.value().startsWith("NP") || ht.value().startsWith("PP") || ht.value().startsWith("ADVP")) { if (!TmpPattern.matcher(ht.value()).matches()) { LabelFactory lf = ht.labelFactory(); // log.info("TMP: Changing " + ht.value() + " to " + // ht.value() + "-TMP"); ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } if (ht.value().startsWith("NP") || ht.value().startsWith("PP") || ht.value().startsWith("ADVP")) { addTMP9(ht); } } // do the NPs under it (which may or may not be the head chain Tree[] kidlets = tree.children(); for (Tree kidlet : kidlets) { ht = kidlet; LabelFactory lf; if (tree.isPrePreTerminal() && !TmpPattern.matcher(ht.value()).matches()) { // log.info("TMP: Changing " + ht.value() + " to " + // ht.value() + "-TMP"); lf = ht.labelFactory(); // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! 
ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } else if (ht.value().startsWith("NP")) { // don't add -TMP twice! if (!TmpPattern.matcher(ht.value()).matches()) { lf = ht.labelFactory(); // log.info("TMP: Changing " + ht.value() + " to " + // ht.value() + "-TMP"); // Note: this next bit changes the tree label, rather // than creating a new tree node. Beware! ht.setLabel(lf.newLabel(ht.value() + "-TMP")); } addTMP9(ht); } } } /** Implementation of TreeReaderFactory, mainly for convenience of * constructing by reflection. */ public static class NPTmpRetainingTreeReaderFactory implements TreeReaderFactory { @Override public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in, new LabeledScoredTreeFactory(), new NPTmpRetainingTreeNormalizer()); } } /** Implementation of TreeReaderFactory, mainly for convenience of * constructing by reflection. This one corresponds to what's currently * used in englishPCFG accurate unlexicalized parser. */ public static class NPTmpAdvRetainingTreeReaderFactory implements TreeReaderFactory { @Override public TreeReader newTreeReader(Reader in) { return new PennTreeReader(in, new LabeledScoredTreeFactory(), new NPTmpRetainingTreeNormalizer(NPTmpRetainingTreeNormalizer.TEMPORAL_ACL03PCFG, false, 0, true)); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy