// edu.stanford.nlp.trees.QPTreeTransformer Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of stanford-parser Show documentation
// Show all versions of stanford-parser Show documentation
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.trees;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.StringUtils;
import java.util.Properties;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
/**
* Transforms an English structure parse tree in order to get the dependencies right:
* Adds an extra structure in QP phrases:
*
* (QP (RB well) (IN over) (CD 9)) becomes
*
* (QP (XS (RB well) (IN over)) (CD 9))
*
* (QP (...) (CC ...) (...)) becomes
*
* (QP (NP ...) (CC ...) (NP ...))
*
*
* @author mcdm
*/
public class QPTreeTransformer implements TreeTransformer {

  /**
   * When {@code true}, the multiword {@code XS} grouping step is skipped
   * in {@link #QPtransform(Tree)}; all other steps still run.
   */
  private final boolean universalDependencies;

  /** Creates a transformer with {@code universalDependencies} set to {@code false}. */
  public QPTreeTransformer() {
    this(false);
  }

  /**
   * Creates a transformer.
   *
   * @param universalDependencies if {@code true}, the step that groups
   *        multiword quantifier modifiers under an {@code XS} node is skipped
   */
  public QPTreeTransformer(boolean universalDependencies) {
    this.universalDependencies = universalDependencies;
  }

  /**
   * Right now (Jan 2013) we only deal with the following QP structures:
   * <ul>
   * <li> NP (QP ...) (QP (CC and/or) ...)
   * <li> QP (RB IN CD|DT ...)   well over, more than
   * <li> QP (JJR IN CD|DT ...)  fewer than
   * <li> QP (IN JJS CD|DT ...)  at least
   * <li> QP (... CC ...)        between 5 and 10
   * </ul>
   *
   * Simply delegates to {@link #QPtransform(Tree)}.
   *
   * @param t tree to be transformed
   * @return The tree t with an extra layer if there was a QP structure matching the ones mentioned above
   */
  @Override
  public Tree transformTree(Tree t) {
    return QPtransform(t);
  }

  // The compiled patterns below are shared by all instances and never
  // reassigned, hence static final.

  // Matches an NP containing a QP immediately followed by a sibling QP that
  // has a CC child.
  private static final TregexPattern flattenNPoverQPTregex =
    TregexPattern.compile("NP < (QP=left $+ (QP=right < CC))");

  // Builds one QP spanning both matched QPs, then excises the two original QP nodes.
  private static final TsurgeonPattern flattenNPoverQPTsurgeon =
    Tsurgeon.parseOperation("[createSubtree QP left right] [excise left left] [excise right right]");

  // NOTE(review): in node regexes such as /^RB|JJ|IN/ the '^' anchor binds only
  // to the first alternative under standard regex precedence, so "JJ" and "IN"
  // may match anywhere in a label if Tregex uses find()-style matching.
  // Presumably /^(RB|JJ|IN)/ was intended -- confirm before changing, since
  // tightening the pattern would change which trees are rewritten.
  private static final TregexPattern multiwordXSTregex =
    // TODO: should add NN and $ to the numeric expressions captured
    //   NN is for words such as "half" which are probably misparsed
    // TODO: <3 (IN < as|than) is to avoid one weird case in PTB,
    // "more than about".  Perhaps there is some way to generalize this
    // TODO: "all but X"
    // TODO: "all but about X"
    TregexPattern.compile("QP <1 /^RB|JJ|IN/=left [ ( <2 /^JJ|IN/=right <3 /^CD|DT/ ) | ( <2 /^JJ|IN/ <3 ( IN=right < /^(?i:as|than)$/ ) <4 /^CD|DT/ ) ] ");

  // Groups the matched modifier words under a new XS node, e.g.
  // (QP (RB well) (IN over) (CD 9)) becomes (QP (XS (RB well) (IN over)) (CD 9)).
  private static final TsurgeonPattern multiwordXSTsurgeon =
    Tsurgeon.parseOperation("createSubtree XS left right");

  // the old style split any flat QP with a CC in the middle
  // TODO: there should be some allowances for phrases such as "or more", "or so", etc
  // NOTE(review): /^[$]|CC$/ has the same anchor-precedence question as above
  // (presumably /^([$]|CC)$/ was intended) -- confirm before changing.
  private static final TregexPattern splitCCTregex =
    TregexPattern.compile("QP < (CC $- __=r1 $+ __=l2 ?$-- /^[$]|CC$/=lnum ?$++ /^[$]|CC$/=rnum) <1 __=l1 <- __=r2 !< (__ < (__ < __))");

  // Wraps each side of the CC in a new node: QP when the optional lnum/rnum
  // node was found on that side, NP otherwise.
  private static final TsurgeonPattern splitCCTsurgeon =
    Tsurgeon.parseOperation("[if exists lnum createSubtree QP l1 r1] [if not exists lnum createSubtree NP l1 r1] " +
                            "[if exists rnum createSubtree QP l2 r2] [if not exists rnum createSubtree NP l2 r2]");

  // Matches a QP containing a "$" node; "left" is the node right after the "$"
  // and "right" is the last child of the QP.
  private static final TregexPattern splitMoneyTregex =
    TregexPattern.compile("QP < (/^[$]$/ !$++ /^(?!([$]|CD)).*$/ !$++ (__ < (__ < __)) $+ __=left) <- __=right");

  // Groups everything from "left" through "right" under a new inner QP.
  private static final TsurgeonPattern splitMoneyTsurgeon =
    Tsurgeon.parseOperation("createSubtree QP left right");

  /**
   * Transforms t if it contains one of the following QP structure:
   * <ul>
   * <li> NP (QP ...) (QP (CC and/or) ...)
   * <li> QP (RB IN CD|DT ...)   well over, more than
   * <li> QP (JJR IN CD|DT ...)  fewer than
   * <li> QP (IN JJS CD|DT ...)  at least
   * <li> QP (... CC ...)        between 5 and 10
   * </ul>
   *
   * The rewrites are applied in a fixed order: NP-over-QP flattening,
   * multiword XS grouping (skipped when {@code universalDependencies} is
   * {@code true}), splitting around a CC, and finally splitting money
   * expressions. Each step operates on the result of the previous one.
   *
   * @param t a tree to be transformed
   * @return t transformed
   */
  public Tree QPtransform(Tree t) {
    t = Tsurgeon.processPattern(flattenNPoverQPTregex, flattenNPoverQPTsurgeon, t);
    if (!universalDependencies) {
      t = Tsurgeon.processPattern(multiwordXSTregex, multiwordXSTsurgeon, t);
    }
    t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
    t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
    return t;
  }

  /**
   * Command-line driver: reads every tree from the file named by the
   * {@code -treeFile} property and prints each tree before and after
   * transformation. If {@code -treeFile} is not given, nothing is read
   * and nothing is printed.
   *
   * @param args command-line arguments, parsed with
   *        {@link StringUtils#argsToProperties(String[])}
   * @throws RuntimeException if the tree file cannot be opened or read;
   *         the underlying {@link IOException} is attached as the cause
   */
  public static void main(String[] args) {
    QPTreeTransformer transformer = new QPTreeTransformer();
    Treebank tb = new MemoryTreebank();
    Properties props = StringUtils.argsToProperties(args);
    String treeFileName = props.getProperty("treeFile");

    if (treeFileName != null) {
      try {
        TreeReader tr = new PennTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFileName))), new LabeledScoredTreeFactory());
        try {
          Tree t;
          while ((t = tr.readTree()) != null) {
            tb.add(t);
          }
        } finally {
          // Always close the reader so the file handle is released even
          // when reading a tree fails part-way through the file.
          tr.close();
        }
      } catch (IOException e) {
        // Same message as before, but keep the original exception as the
        // cause so its stack trace is not lost.
        throw new RuntimeException("File problem: " + e, e);
      }
    }

    for (Tree t : tb) {
      System.out.println("Original tree");
      t.pennPrint();
      System.out.println();
      System.out.println("Tree transformed");
      Tree tree = transformer.transformTree(t);
      tree.pennPrint();
      System.out.println();
      System.out.println("----------------------------");
    }
  }
}