edu.stanford.nlp.parser.lexparser.AbstractTreebankParserParams

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

package edu.stanford.nlp.parser.lexparser; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ling.CategoryWordTag;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.parser.metrics.AbstractEval;
import edu.stanford.nlp.parser.tools.PunctEquivalenceClasser;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.stats.EquivalenceClasser;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Index;

import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;


/**
 * An abstract class providing a common method base from which to
 * complete a {@code TreebankLangParserParams} implementing class.
 * <p>
 * With some extending classes you'll want to have access to special
 * attributes of the corresponding TreebankLanguagePack while taking
 * advantage of this class's code for making the TreebankLanguagePack
 * accessible.  A good way to do this is to pass a new instance of the
 * appropriate TreebankLanguagePack into this class's constructor,
 * then get it back later on by casting a call to
 * treebankLanguagePack().  See ChineseTreebankParserParams for an
 * example.
 *
 * @author Roger Levy
 */
public abstract class AbstractTreebankParserParams implements TreebankLangParserParams {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(AbstractTreebankParserParams.class);

  /**
   * If true, then evaluation is over grammatical functions as well as the labels.
   * If false, then grammatical functions are stripped for evaluation.  This really
   * only makes sense if you've trained with grammatical functions but want to evaluate without them.
   */
  protected boolean evalGF = true;

  /**
   * The job of this class is to remove subcategorizations from
   * tag and category nodes, so as to put a tree in a suitable
   * state for evaluation.  Provided the TreebankLanguagePack
   * is defined correctly, this should work for any language.
   */
  protected class SubcategoryStripper implements TreeTransformer {

    protected TreeFactory tf = new LabeledScoredTreeFactory();

    @Override
    public Tree transformTree(Tree tree) {
      Label lab = tree.label();
      if (tree.isLeaf()) {
        Tree leaf = tf.newLeaf(lab);
        leaf.setScore(tree.score());
        return leaf;
      }
      String s = lab.value();
      s = treebankLanguagePack().basicCategory(s);
      int numKids = tree.numChildren();
      List<Tree> children = new ArrayList<>(numKids);
      for (int cNum = 0; cNum < numKids; cNum++) {
        Tree child = tree.getChild(cNum);
        Tree newChild = transformTree(child);
        // cdm 2007: for just subcategory stripping, null shouldn't happen
        // if (newChild != null) {
        children.add(newChild);
        // }
      }
      // if (children.isEmpty()) {
      //   return null;
      // }
      CategoryWordTag newLabel = new CategoryWordTag(lab);
      newLabel.setCategory(s);
      if (lab instanceof HasTag) {
        String tag = ((HasTag) lab).tag();
        tag = treebankLanguagePack().basicCategory(tag);
        newLabel.setTag(tag);
      }
      Tree node = tf.newTreeNode(newLabel, children);
      node.setScore(tree.score());
      return node;
    }

  } // end class SubcategoryStripper

  /**
   * The job of this class is to remove subcategorizations from
   * tag and category nodes, so as to put a tree in a suitable
   * state for evaluation.  Provided the TreebankLanguagePack
   * is defined correctly, this should work for any language.
   * Very similar to SubcategoryStripper, but strips grammatical
   * functions as well.
   */
  protected class RemoveGFSubcategoryStripper implements TreeTransformer {

    protected TreeFactory tf = new LabeledScoredTreeFactory();

    @Override
    public Tree transformTree(Tree tree) {
      Label lab = tree.label();
      if (tree.isLeaf()) {
        Tree leaf = tf.newLeaf(lab);
        leaf.setScore(tree.score());
        return leaf;
      }
      String s = lab.value();
      s = treebankLanguagePack().basicCategory(s);
      s = treebankLanguagePack().stripGF(s);
      int numKids = tree.numChildren();
      List<Tree> children = new ArrayList<>(numKids);
      for (int cNum = 0; cNum < numKids; cNum++) {
        Tree child = tree.getChild(cNum);
        Tree newChild = transformTree(child);
        children.add(newChild);
      }
      CategoryWordTag newLabel = new CategoryWordTag(lab);
      newLabel.setCategory(s);
      if (lab instanceof HasTag) {
        String tag = ((HasTag) lab).tag();
        tag = treebankLanguagePack().basicCategory(tag);
        tag = treebankLanguagePack().stripGF(tag);
        newLabel.setTag(tag);
      }
      Tree node = tf.newTreeNode(newLabel, children);
      node.setScore(tree.score());
      return node;
    }

  } // end class RemoveGFSubcategoryStripper

  protected String inputEncoding;
  protected String outputEncoding;
  protected TreebankLanguagePack tlp;
  protected boolean generateOriginalDependencies;

  /**
   * Stores the passed-in TreebankLanguagePack and sets up charset encodings.
   *
   * @param tlp The treebank language pack to use
   */
  protected AbstractTreebankParserParams(TreebankLanguagePack tlp) {
    this.tlp = tlp;
    inputEncoding = tlp.getEncoding();
    outputEncoding = tlp.getEncoding();
    generateOriginalDependencies = false;
  }

  @Override
  public Label processHeadWord(Label headWord) {
    return headWord;
  }

  /**
   * Sets whether to consider grammatical functions in evaluation.
   */
  @Override
  public void setEvaluateGrammaticalFunctions(boolean evalGFs) {
    this.evalGF = evalGFs;
  }

  /**
   * Sets the input encoding.
   */
  @Override
  public void setInputEncoding(String encoding) {
    inputEncoding = encoding;
  }

  /**
   * Sets the output encoding.
   */
  @Override
  public void setOutputEncoding(String encoding) {
    outputEncoding = encoding;
  }

  /**
   * Returns the output encoding being used.
   */
  @Override
  public String getOutputEncoding() {
    return outputEncoding;
  }

  /**
   * Returns the input encoding being used.
   */
  @Override
  public String getInputEncoding() {
    return inputEncoding;
  }

  /**
   * Returns a language-specific object for evaluating PP attachment.
   *
   * @return An object that implements {@link AbstractEval}
   */
  @Override
  public AbstractEval ppAttachmentEval() {
    return null;
  }

  /**
   * Returns a MemoryTreebank appropriate to the treebank source.
   */
  @Override
  public abstract MemoryTreebank memoryTreebank();

  /**
   * Returns a DiskTreebank appropriate to the treebank source.
   */
  @Override
  public abstract DiskTreebank diskTreebank();

  /**
   * You can often return the same thing for testMemoryTreebank as
   * for memoryTreebank.
   */
  @Override
  public MemoryTreebank testMemoryTreebank() {
    return memoryTreebank();
  }

  /**
   * Implemented as required by TreebankFactory. Use diskTreebank() instead.
   */
  @Override
  public Treebank treebank() {
    return diskTreebank();
  }

  /**
   * The PrintWriter used to print output. It's the responsibility of
   * pw to deal properly with character encodings for the relevant
   * treebank.
   */
  @Override
  public PrintWriter pw() {
    return pw(System.out);
  }

  /**
   * The PrintWriter used to print output. It's the responsibility of
   * pw to deal properly with character encodings for the relevant
   * treebank.
   */
  @Override
  public PrintWriter pw(OutputStream o) {
    String encoding = outputEncoding;
    if (!java.nio.charset.Charset.isSupported(encoding)) {
      log.info("Warning: desired encoding " + encoding + " not accepted. ");
      log.info("Using UTF-8 to construct PrintWriter");
      encoding = "UTF-8";
    }
    //log.info("TreebankParserParams.pw(): encoding is " + encoding);
    try {
      return new PrintWriter(new OutputStreamWriter(o, encoding), true);
    } catch (UnsupportedEncodingException e) {
      log.info("Warning: desired encoding " + outputEncoding + " not accepted. " + e);
      try {
        return new PrintWriter(new OutputStreamWriter(o, "UTF-8"), true);
      } catch (UnsupportedEncodingException e1) {
        log.info("Something is really wrong.  Your system doesn't even support UTF-8!" + e1);
        return new PrintWriter(o, true);
      }
    }
  }

  /**
   * Returns an appropriate TreebankLanguagePack.
   */
  @Override
  public TreebankLanguagePack treebankLanguagePack() {
    return tlp;
  }

  /**
   * The HeadFinder to use for your treebank.
   */
  @Override
  public abstract HeadFinder headFinder();

  /**
   * The HeadFinder to use when extracting typed dependencies.
   */
  @Override
  public abstract HeadFinder typedDependencyHeadFinder();

  @Override
  public Lexicon lex(Options op, Index<String> wordIndex, Index<String> tagIndex) {
    return new BaseLexicon(op, wordIndex, tagIndex);
  }

  /**
   * Gives the parameters for smoothing in the MLEDependencyGrammar.
   * Defaults are the ones previously hard coded into MLEDependencyGrammar.
   *
   * @return an array of doubles with smooth_aT_hTWd, smooth_aTW_hTWd, smooth_stop, and interp
   */
  @Override
  public double[] MLEDependencyGrammarSmoothingParams() {
    return new double[] { 16.0, 16.0, 4.0, 0.6 };
  }

  /**
   * Takes a Tree and a collinizer and returns a Collection of labeled
   * {@link Constituent}s for PARSEVAL.
   *
   * @param t The tree to extract constituents from
   * @param collinizer The TreeTransformer used to normalize the tree for
   *                   evaluation
   * @return The bag of Constituents for PARSEVAL.
   */
  public static Collection<Constituent> parsevalObjectify(Tree t, TreeTransformer collinizer) {
    return parsevalObjectify(t, collinizer, true);
  }

  /**
   * Takes a Tree and a collinizer and returns a Collection of {@link Constituent}s for
   * PARSEVAL evaluation.  Some notes on this particular parseval:
   * <ul>
   * <li> It is character-based, which allows it to be used on segmentation/parsing
   *      combination evaluation.
   * <li> Whether it gives you labeled or unlabeled bracketings depends on the value
   *      of the {@code labelConstituents} parameter.
   * </ul>
   *
   * (Note that I haven't checked this rigorously yet with the PARSEVAL definition
   * -- Roger.)
   */
  public static Collection<Constituent> parsevalObjectify(Tree t, TreeTransformer collinizer, boolean labelConstituents) {
    Collection<Constituent> spans = new ArrayList<>();
    Tree t1 = collinizer.transformTree(t);
    if (t1 == null) {
      return spans;
    }
    for (Tree node : t1) {
      if (node.isLeaf() || node.isPreTerminal() || (node != t1 && node.parent(t1) == null)) {
        continue;
      }
      int leftEdge = t1.leftCharEdge(node);
      int rightEdge = t1.rightCharEdge(node);
      if (labelConstituents)
        spans.add(new LabeledConstituent(leftEdge, rightEdge, node.label()));
      else
        spans.add(new SimpleConstituent(leftEdge, rightEdge));
    }
    return spans;
  }

  /**
   * Returns a collection of untyped word-word dependencies for the tree.
   */
  public static Collection<List<String>> untypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new UntypedDependencyTyper(hf));
  }

  /**
   * Returns a collection of unordered (but directed!) untyped word-word dependencies for the tree.
   */
  public static Collection<List<String>> unorderedUntypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new UnorderedUntypedDependencyTyper(hf));
  }

  /**
   * Returns a collection of word-word dependencies typed by mother, head, and daughter node syntactic categories.
   */
  public static Collection<List<String>> typedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new TypedDependencyTyper(hf));
  }

  /**
   * Returns a collection of unordered (but directed!) typed word-word dependencies for the tree.
   */
  public static Collection<List<String>> unorderedTypedDependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer) {
    return dependencyObjectify(t, hf, collinizer, new UnorderedTypedDependencyTyper(hf));
  }

  /**
   * Returns the set of dependencies in a tree, according to some {@link edu.stanford.nlp.trees.DependencyTyper}.
   */
  public static <E> Collection<E> dependencyObjectify(Tree t, HeadFinder hf, TreeTransformer collinizer, DependencyTyper<E> typer) {
    Collection<E> deps = new ArrayList<>();
    Tree t1 = collinizer.transformTree(t);
    if (t1 == null)
      return deps;
    dependencyObjectifyHelper(t1, t1, hf, deps, typer);
    return deps;
  }

  private static <E> void dependencyObjectifyHelper(Tree t, Tree root, HeadFinder hf, Collection<E> c, DependencyTyper<E> typer) {
    if (t.isLeaf() || t.isPreTerminal()) {
      return;
    }
    Tree headDtr = hf.determineHead(t);
    for (Tree child : t.children()) {
      dependencyObjectifyHelper(child, root, hf, c, typer);
      if (child != headDtr) {
        c.add(typer.makeDependency(headDtr, child, root));
      }
    }
  }

  private static class UntypedDependencyTyper implements DependencyTyper<List<String>> {
    HeadFinder hf;

    public UntypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }

    @Override
    public List<String> makeDependency(Tree head, Tree dep, Tree root) {
      List<String> result = new ArrayList<>(3);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      boolean headLeft = root.leftCharEdge(headTerm) < root.leftCharEdge(depTerm);
      result.add(headTerm.value());
      result.add(depTerm.value());
      if (headLeft)
        result.add(leftHeaded);
      else
        result.add(rightHeaded);
      return result;
    }
  }

  private static class UnorderedUntypedDependencyTyper implements DependencyTyper<List<String>> {
    HeadFinder hf;

    public UnorderedUntypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }

    @Override
    public List<String> makeDependency(Tree head, Tree dep, Tree root) {
      List<String> result = new ArrayList<>(3);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      result.add(headTerm.value());
      result.add(depTerm.value());
      return result;
    }
  }

  private static final String leftHeaded = "leftHeaded";
  private static final String rightHeaded = "rightHeaded";

  private static class TypedDependencyTyper implements DependencyTyper<List<String>> {
    HeadFinder hf;

    public TypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }

    @Override
    public List<String> makeDependency(Tree head, Tree dep, Tree root) {
      List<String> result = new ArrayList<>(6);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      boolean headLeft = root.leftCharEdge(headTerm) < root.leftCharEdge(depTerm);
      result.add(headTerm.value());
      result.add(depTerm.value());
      result.add(head.parent(root).value());
      result.add(head.value());
      result.add(dep.value());
      if (headLeft)
        result.add(leftHeaded);
      else
        result.add(rightHeaded);
      return result;
    }
  }

  private static class UnorderedTypedDependencyTyper implements DependencyTyper<List<String>> {
    HeadFinder hf;

    public UnorderedTypedDependencyTyper(HeadFinder hf) {
      this.hf = hf;
    }

    @Override
    public List<String> makeDependency(Tree head, Tree dep, Tree root) {
      List<String> result = new ArrayList<>(6);
      Tree headTerm = head.headTerminal(hf);
      Tree depTerm = dep.headTerminal(hf);
      result.add(headTerm.value());
      result.add(depTerm.value());
      result.add(head.parent(root).value());
      result.add(head.value());
      result.add(dep.value());
      return result;
    }
  }

  /**
   * Returns an EquivalenceClasser that classes typed dependencies
   * by the syntactic categories of mother, head and daughter,
   * plus direction.
   *
   * @return An EquivalenceClasser for typed dependencies
   */
  public static EquivalenceClasser<List<String>, String> typedDependencyClasser() {
    return s -> {
      if (s.get(5).equals(leftHeaded))
        return s.get(2) + '(' + s.get(3) + "->" + s.get(4) + ')';
      return s.get(2) + '(' + s.get(4) + "<-" + s.get(3) + ')';
    };
  }

  /**
   * The tree transformer used to produce trees for evaluation.  Will
   * be applied both to the parse output tree and to the gold
   * tree.  Should strip punctuation and maybe do some other things.
   */
  @Override
  public abstract TreeTransformer collinizer();

  /**
   * The tree transformer used to produce trees for evaluation.  Will
   * be applied both to the parse output tree and to the gold
   * tree.  Should strip punctuation and maybe do some other
   * things.  The evalb version should strip some more stuff
   * off.  (finish this doc!)
   */
  @Override
  public abstract TreeTransformer collinizerEvalb();

  /**
   * Returns the splitting strings used for selective splits.
   *
   * @return An array containing ancestor-annotated Strings: categories
   *         should be split according to these ancestor annotations.
   */
  @Override
  public abstract String[] sisterSplitters();

  /**
   * Returns a TreeTransformer appropriate to the Treebank which
   * can be used to remove functional tags (such as "-TMP") from
   * categories.  Removes GFs if evalGF = false; if GFs were not used
   * in training, results are equivalent.
   */
  @Override
  public TreeTransformer subcategoryStripper() {
    if (evalGF)
      return new SubcategoryStripper();
    return new RemoveGFSubcategoryStripper();
  }

  /**
   * This method does language-specific tree transformations such
   * as annotating particular nodes with language-relevant features.
   * Such parameterizations should be inside the specific
   * TreebankLangParserParams class.  This method is recursively
   * applied to each node in the tree (depth first, left-to-right),
   * so you shouldn't write this method to apply recursively to tree
   * members.  This method is allowed to (and in some cases does)
   * destructively change the input tree {@code t}. It changes both
   * labels and the tree shape.
   *
   * @param t The input tree (with non-language-specific annotation already
   *          done, so you need to strip back to basic categories)
   * @param root The root of the current tree (can be null for words)
   * @return The fully annotated tree node (with daughters still as you
   *         want them in the final result)
   */
  @Override
  public abstract Tree transformTree(Tree t, Tree root);

  /**
   * Display (write to stderr) language-specific settings.
   */
  @Override
  public abstract void display();

  /**
   * Set language-specific options according to flags.
   * This routine should process the option starting in args[i] (which
   * might potentially be several arguments long if it takes arguments).
   * It should return the index after the last index it consumed in
   * processing.  In particular, if it cannot process the current option,
   * the return value should be i.
   * <p>
   * Generic options are processed separately by
   * {@link Options#setOption(String[],int)},
   * and implementations of this method do not have to worry about them.
   * The Options class handles routing options.
   * TreebankParserParams that extend this class should call super when
   * overriding this method.
   */
  @Override
  public int setOptionFlag(String[] args, int i) {
    return i;
  }

  private static final long serialVersionUID = 4299501909017975915L;

  @Override
  public TokenizerFactory<Tree> treeTokenizerFactory() {
    return new TreeTokenizerFactory(treeReaderFactory());
  }

  @Override
  public Extractor<DependencyGrammar> dependencyGrammarExtractor(Options op, Index<String> wordIndex, Index<String> tagIndex) {
    return new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
  }

  public boolean isEvalGF() {
    return evalGF;
  }

  public void setEvalGF(boolean evalGF) {
    this.evalGF = evalGF;
  }

  /**
   * Annotation function for mapping punctuation to PTB-style equivalence classes.
   *
   * @author Spence Green
   */
  protected static class AnnotatePunctuationFunction implements SerializableFunction<TregexMatcher, String> {

    private final String key;
    private final String annotationMark;

    public AnnotatePunctuationFunction(String annotationMark, String key) {
      this.key = key;
      this.annotationMark = annotationMark;
    }

    @Override
    public String apply(TregexMatcher m) {
      String punc = m.getNode(key).value();
      String punctClass = PunctEquivalenceClasser.getPunctClass(punc);
      return punctClass.equals("") ? "" : annotationMark + punctClass;
    }

    @Override
    public String toString() {
      return "AnnotatePunctuationFunction";
    }

    private static final long serialVersionUID = 1L;
  }

  @Override
  public List<GrammaticalStructure> readGrammaticalStructureFromFile(String filename) {
    throw new UnsupportedOperationException("This language does not support GrammaticalStructures or dependencies");
  }

  @Override
  public GrammaticalStructure getGrammaticalStructure(Tree t, Predicate<String> filter, HeadFinder hf) {
    throw new UnsupportedOperationException("This language does not support GrammaticalStructures or dependencies");
  }

  /**
   * By default, parsers are assumed to not support dependencies.
   * Only English and Chinese do at present.
   */
  @Override
  public boolean supportsBasicDependencies() {
    return false;
  }

  /**
   * For languages that have implementations of the original Stanford
   * dependencies and Universal Dependencies, this parameter is used to
   * decide which implementation should be used.
   */
  @Override
  public void setGenerateOriginalDependencies(boolean originalDependencies) {
    this.generateOriginalDependencies = originalDependencies;
    if (this.tlp != null) {
      this.tlp.setGenerateOriginalDependencies(originalDependencies);
    }
  }

  @Override
  public boolean generateOriginalDependencies() {
    return this.generateOriginalDependencies;
  }

  private static final String[] EMPTY_ARGS = new String[0];

  @Override
  public String[] defaultCoreNLPFlags() {
    return EMPTY_ARGS;
  }

}