All downloads are free. Search and download functionality uses the official Maven repository.

edu.nyu.jet.format.PTBReader Maven / Gradle / Ivy

Go to download

Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE [Automatic Content Extraction] program. The program is provided under an Apache 2.0 license.

The newest version!
// -*- tab-width: 4 -*-
package edu.nyu.jet.format;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Vector;

import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.parser.ParseTreeNode;
import edu.nyu.jet.tipster.*;
import edu.nyu.jet.util.IOUtils;
import edu.nyu.jet.zoner.SentenceSplitter;
import edu.nyu.jet.zoner.SpecialZoner;
import edu.nyu.jet.parser.StatParser;
import edu.nyu.jet.parser.HeadRule;
import edu.nyu.jet.parser.ParseTreeNode;

/**
 * A reader for the output of a Penn Treebank Parser. The methods read a 
 * Penn Treebank corpus and either annotate an existing Document
 * (addAnnotations methods) with constit annotations representing
 * the trees or build a new Jet.Tipster.Document from the parse trees.
 * 
 * @author Akira ODA
 */
public class PTBReader {
	/**
	 * Splits a tag name into its category (group 1), an optional suffix of
	 * letters and hyphens introduced by '-' (group 2, used as the function
	 * tag by <CODE>readNode</CODE>), and an optional suffix of digits and
	 * hyphens introduced by '-' or '=' (group 3).  The pattern is compiled
	 * with <CODE>Pattern.COMMENTS</CODE>, so the blanks inside the
	 * expression are not significant.
	 */
	static Pattern tagNamePattern = Pattern.compile(
			"([^-=]+) (?: - ([\\-a-zA-Z]+)*)? (?: [-=] ([\\-\\d]+))?", Pattern.COMMENTS);

	/**
	 * Matches tag names which begin and end with a hyphen (for example
	 * <CODE>-none-</CODE>); these are accepted as-is by <CODE>readNode</CODE>.
	 */
	static Pattern specialTagNamePattern = Pattern.compile("-.*-");

	/**
	 * Maps the bracket escapes used in treebank files (e.g. <CODE>-LRB-</CODE>)
	 * to the bracket character they stand for.
	 */
	private static final Map<String, String> TRANSFORM_TABLE;

	/**
	 * Sentence punctuation marks.
	 */
	private static final Set<String> PUNCTUATIONS;

	/**
	 * Words which are not followed by a space when document text is rebuilt.
	 */
	private static final Set<String> NO_FOLLOWING_SPACE;

	/**
	 * Words which attach to the preceding word (the space before them is
	 * removed) when document text is rebuilt.
	 */
	private static final Set<String> DELETE_PREVIOUS_SPACE;

	/**
	 * If true, backslashes are treated as escape character.
	 */
	private boolean backslashAsEscapeChar = true;

	/**
	 * If true, add tokens when read corpus.
	 */
	private boolean isAddingTokens = false;
	
	/**
	 * Head rules used by <CODE>setJetAnnotations</CODE>;  created lazily on
	 * first use.
	 */
	HeadRule hr = null;

	static {
		Map<String, String> transforms = new HashMap<String, String>();
		transforms.put("-LRB-", "(");
		transforms.put("-LCB-", "{");
		transforms.put("-LSB-", "[");

		transforms.put("-RRB-", ")");
		transforms.put("-RCB-", "}");
		transforms.put("-RSB-", "]");
		TRANSFORM_TABLE = transforms;

		Set<String> punctuations = new HashSet<String>();
		punctuations.add(".");
		punctuations.add(",");
		punctuations.add("?");
		punctuations.add("!");
		PUNCTUATIONS = punctuations;

		Set<String> noFollowingSpace = new HashSet<String>();
		noFollowingSpace.add("(");
		noFollowingSpace.add("{");
		noFollowingSpace.add("[");
		NO_FOLLOWING_SPACE = noFollowingSpace;

		Set<String> deletePreviousSpace = new HashSet<String>();
		deletePreviousSpace.add(")");
		deletePreviousSpace.add("}");
		deletePreviousSpace.add("]");
		deletePreviousSpace.add(".");
		deletePreviousSpace.add(",");
		DELETE_PREVIOUS_SPACE = deletePreviousSpace;
	}
	
	/**
	 *  a list of strings which are deleted in preparing text for the Charniak
	 *  parser, and so should be skipped when matching the text and parser output.
	 *  <p>
	 *  NOTE(review): the four SGML entity spellings below were reconstructed.
	 *  The copy of this file they were recovered from showed the already
	 *  decoded characters ("&lt;" and "&gt;" each listed twice), which looks
	 *  like an artifact of HTML extraction -- confirm against the upstream
	 *  sources.
	 */
	 
	private static final String[] skip = 
	  new String[] {"....", "...", "uh,", "Uh,", "um,", "Um,", 
	                "&lt;", "&LT;", "&gt;", "&GT;", "_"};

	/**
	 *  when matching an existing document text against a parse tree,
	 *  each pair of elements represents an allowable match (due to 
	 *  Adam's text-regularization script)
	 */
	 
	private static final String[] match =
	  new String[] {"\"", "``",
	                "\"", "''",
		              """, "``",
	                """, "''",
		              """, "``",
	                """, "''",   
	                "&", "&",
	                "&", "&",
	                "wo", "will",
	                "Wo", "Will",
	                "((", "(",
	                "))", ")"};

	/**
	 * Adds constit annotations to an existing Document <CODE>doc</CODE> to
	 * represent the parse tree structure <CODE>tree</CODE>.
	 * <p>
	 * The terminal nodes of the tree are aligned, left to right, with the
	 * document text inside <CODE>span</CODE>;  each terminal receives the
	 * offsets of the matched text plus any whitespace which follows it.
	 * If some terminal cannot be aligned, a message is written to
	 * <CODE>System.err</CODE> and the method returns without adding any
	 * annotation.
	 * 
	 * @param tree          the parse tree (for a portion of Document doc)
	 * @param doc           the document
	 * @param span          the portion of doc covered by the parse tree
	 * @param jetCategories if true, use Jet categories as terminal categories
	 *                      (if false, use categories read from parse trees)
	 */
	 
	public void addAnnotations(ParseTreeNode tree, Document doc, Span span,
	                           boolean jetCategories) {
		List<ParseTreeNode> terminalNodes = getTerminalNodes(tree);
		String text = doc.text();
		int limit = span.end();
		int offset = span.start();

		for (ParseTreeNode terminal : terminalNodes) {
			offset = skipDocumentWhitespace(text, offset, limit);
			// at most one of the strings removed by the parser
			// preprocessing is skipped in front of each terminal
			for (String skipString : skip) {
				if (text.startsWith(skipString, offset)) {
					offset = skipDocumentWhitespace(text, offset + skipString.length(), limit);
					break;
				}
			}
			// match next terminal node against next word in text
			int matchLength = matchTextToTree (text, offset, terminal.word);
			if (matchLength <= 0) {
				System.err.println ("PTBReader.addAnnotations:  " +
				                    "Cannot determine parse tree offset for word " +
				                    terminal.word);
				System.err.println ("  at document offset " + offset + " in sentence");
				System.err.println ("  " + doc.text(span));
				return;
			}
			// the terminal covers the matched text and its trailing whitespace
			int endOffset = skipDocumentWhitespace(text, offset + matchLength, limit);
			terminal.start = offset;
			terminal.end = endOffset;
			offset = endOffset;
		}

		if (jetCategories) {
			setJetAnnotations (tree, span, doc);
			StatParser.deleteUnusedConstits (doc, span, tree.ann); //<<<
		} else {
			determineNonTerminalSpans(tree, span.start());
			setAnnotations (tree, doc);
		}
	}

	/**
	 * Returns the position of the first non-whitespace character of
	 * <CODE>text</CODE> at or after <CODE>from</CODE>, or <CODE>limit</CODE>
	 * if only whitespace is found before <CODE>limit</CODE>.  If
	 * <CODE>from</CODE> is not less than <CODE>limit</CODE> it is returned
	 * unchanged.
	 */
	private static int skipDocumentWhitespace(String text, int from, int limit) {
		int position = from;
		while (position < limit && Character.isWhitespace(text.charAt(position))) {
			position++;
		}
		return position;
	}
	
	/**
	 *  checks whether the document string <CODE>text</CODE>, read from
	 *  position <CODE>offset</CODE>, corresponds to the terminal
	 *  <CODE>word</CODE> of a Penn TreeBank tree.  The correspondence may be
	 *  literal, or may go through one of the regularizations applied to the
	 *  text before it was given to the PTB parser:
	 *  <ul>
	 *  <li>tree word "can" / "Can" against text "can't" / "Can't" (only the
	 *      first two characters of the text are consumed);</li>
	 *  <li>any (text form, tree form) pair listed in <CODE>match</CODE>;</li>
	 *  <li>a '.' in the text directly before the word.</li>
	 *  </ul>
	 *
	 *  @return  if a successful match, the number of characters in text
	 *           which were matched;  else -1
	 */
	
	private static int matchTextToTree (String text, int offset, String word) {
		boolean lowerCaseContraction = word.equals("can") && text.startsWith("can't", offset);
		boolean upperCaseContraction = word.equals("Can") && text.startsWith("Can't", offset);
		if (lowerCaseContraction || upperCaseContraction) {
			return 2;
		}
		// table of (form in text, form in tree) pairs
		int pair = 0;
		while (pair < match.length) {
			String formInText = match[pair];
			String formInTree = match[pair + 1];
			if (word.equals(formInTree) && text.startsWith(formInText, offset)) {
				return formInText.length();
			}
			pair += 2;
		}
		// literal match
		if (text.startsWith(word, offset)) {
			return word.length();
		}
		// because Adam sometimes deletes '.'s for Charniak
		String dottedWord = "." + word;
		return text.startsWith(dottedWord, offset) ? dottedWord.length() : -1;
	}			

	/**
	 * Adds constit annotations to an existing Document <CODE>doc</CODE> to
	 * represent the parse tree structure of a set of trees <CODE>trees</CODE>.
	 * <p>
	 * The annotations of type <CODE>targetAnnotation</CODE> within
	 * <CODE>span</CODE> are sorted by span and paired, in order, with the
	 * trees;  each paired annotation receives a 'parse' feature holding the
	 * root annotation of its tree.  If the two counts differ, a message is
	 * written to <CODE>System.err</CODE> and only the first
	 * <CODE>min(count of trees, count of annotations)</CODE> pairs are
	 * processed.
	 * 
	 * @param trees
	 *            list of parse trees
	 * @param doc
	 *            document to which annotations should be added
	 * @param targetAnnotation
	 *            name of annotation to determine spans to add parse tree
	 *            annotations.
	 * @param span
	 *            target span.
	 * @param jetCategories
	 *            if false, use lexical categories from Penn Tree Bank;  if
	 *            true, use categories from Jet
	 */
	 
	public void addAnnotations(List<ParseTreeNode> trees, Document doc, String targetAnnotation,
			Span span, boolean jetCategories) {
		List<Annotation> found = (List<Annotation>) doc.annotationsOfType(targetAnnotation,
				span);
		// Work on a private copy:  the lookup may yield null when there is no
		// annotation of the requested type (other callers in this file check
		// for that), and sorting a copy leaves the returned collection untouched.
		List<Annotation> targetList = new ArrayList<Annotation>();
		if (found != null) {
			targetList.addAll(found);
		}
		Collections.sort(targetList, new Comparator<Annotation>() {
			public int compare(Annotation a, Annotation b) {
				return a.span().compareTo(b.span());
			}
		});

		if (trees.size() != targetList.size()) {
			System.err.println ("PTBReader.addAnnotations:  mismatch between number of " +
			                    targetAnnotation + " (" + targetList.size() +
			                    ") and number of trees (" + trees.size() + ")");
		}
		int n = Math.min(trees.size(), targetList.size());
		for (int i = 0; i < n; i++) {
			ParseTreeNode tree = trees.get(i);
			Annotation target = targetList.get(i);
			addAnnotations(tree, doc, target.span(), jetCategories);
			target.put("parse", tree.ann);
		}
	}

	/**
	 * Adds constit annotations to an existing Document <CODE>doc</CODE> to
	 * represent the parse tree structure of a set of trees <CODE>trees</CODE>.
	 * This version is provided for parse tree files which include sentence
	 * offsets.
	 * <p>
	 * The text of tree <I>i</I> is taken to run from its own offset up to the
	 * offset of the next tree which has one, or up to the end of
	 * <CODE>span</CODE> if no later tree has an offset.  A tree whose own
	 * offset is negative (missing) is reported on <CODE>System.err</CODE>
	 * and skipped.  If the two lists differ in size, a message is written
	 * and nothing is annotated.
	 * 
	 * @param trees
	 *            list of parse trees
	 * @param offsets
	 *            list of the starting position (in doc) of the text
	 *            corresponding to each parse tree
	 * @param doc
	 *            document to which annotations should be added
	 * @param targetAnnotation
	 *            name of annotation to get 'parse' feature pointing
	 *            to parse tree
	 * @param span
	 *            target span.
	 * @param jetCategories
	 *            if false, use lexical categories from Penn Tree Bank;  if
	 *            true, use categories from Jet
	 */
	
	public void addAnnotations (List<ParseTreeNode> trees, List<Integer> offsets,
		  Document doc, String targetAnnotation, Span span, boolean jetCategories) {
		if (trees.size() != offsets.size()) {
			System.err.println ("PTBReader.addAnnotations:  mismatch between number of " +
			                    "trees (" + trees.size() + ") and number of offsets (" + 
			                    offsets.size() + ")");
			return;
		}
		for (int i = 0; i < trees.size(); i++) {
			ParseTreeNode tree = trees.get(i);
			int start = offsets.get(i);
			if (start < 0) {
				System.err.println ("PTBReader.addAnnotations:  offset missing for " +
				                    " parse tree " + i);
				continue;
			}
			// The sentence ends where the next tree with a known offset starts.
			// Using offsets.get(i+1) blindly would give an end of -1 whenever
			// the following tree has no offset.
			int end = span.end();
			for (int j = i + 1; j < offsets.size(); j++) {
				int next = offsets.get(j);
				if (next >= 0) {
					end = next;
					break;
				}
			}
			Span sentenceSpan = new Span(start, end);
			addAnnotations(tree, doc, sentenceSpan, jetCategories);
			Vector<Annotation> anns = doc.annotationsAt (start, targetAnnotation);
			if (anns != null && anns.size() > 0) {
				Annotation ann = anns.get(0);
				ann.put("parse", tree.ann);
			}
		}
	}
	
	List offsets;

	/**
	 * Loads parse tree corpus from Penn Treebank corpus.
	 * 

* This method loads the parse trees, but not determine annotation span and not * set annotation. *

* Also sets offsets to a list of the sentence offsets, * if they are encoded as comments preceding each tree. * * @param in the Reader from which the Penn Trees are read * @return a List of parse trees * @throws IOException * @throws InvalidFormatException */ public List loadParseTrees(Reader in) throws IOException, InvalidFormatException { List list = new ArrayList(); offsets = new ArrayList(); PushbackReader input = new PushbackReader(in); while (true) { skipWhitespaceAndComment(input); if (lookAhead(input) == -1) { break; } offsets.add(offset); ParseTreeNode node = readNode(input); list.add(node); } return list; } public List loadParseTrees(File file) throws IOException, InvalidFormatException { Reader in = null; try { in = new BufferedReader(new FileReader(file)); return loadParseTrees(in); } finally { IOUtils.closeQuietly(in); } } public List getOffsets () { return offsets; } /** * Builds Jet.Tipster.Document object from Penn treebank corpus. * * @param in * @return * @throws IOException * @throws InvalidFormatException */ public Treebank load(Reader in) throws IOException, InvalidFormatException { List trees = new ArrayList(); PushbackReader input = new PushbackReader(in); int start = 0; while (true) { skipWhitespace(input); if (lookAhead(input) == -1) { break; } ParseTreeNode tree = readNode(input); trees.add(tree); determineSpans(tree, start); setAnnotations(tree, null); start = tree.end; } String text = buildDocumentString(trees); Document doc = new Document(text); for (ParseTreeNode tree : trees) { doc.annotate("sentence", new Span(tree.start, tree.end), new FeatureSet()); annotate(doc, tree); } return new Treebank(doc, trees); } /** * Builds Document object from Penn treebank corpus. 
* * @param file * @return * @throws IOException * @throws InvalidFormatException */ public Treebank load(File file) throws IOException, InvalidFormatException { Reader in = null; try { in = new BufferedReader(new FileReader(file)); return load(in); } finally { IOUtils.closeQuietly(in); } } /** * Builds Document object from Penn treebank corpus. * * @param file * @param encoding * @return * @throws IOException * @throws InvalidFormatException */ public Treebank load(File file, String encoding) throws IOException, InvalidFormatException { InputStream fin = null; Reader in = null; try { fin = new FileInputStream(file); in = new InputStreamReader(fin, encoding); in = new BufferedReader(in); return load(in); } finally { IOUtils.closeQuietly(in); IOUtils.closeQuietly(fin); } } /** * Sets a backslash is treated as escape character or not. * * @param b */ public void setBackslashAsEscapeCharacter(boolean b) { this.backslashAsEscapeChar = b; } /** * Sets a adding tokens automatically or not. * * @param b */ public void setAddingToken(boolean b) { this.isAddingTokens = b; } /** * Returns if node is null element. */ private static boolean isNullNode(ParseTreeNode node) { return node.category.equals("-none-"); } /** * Remove last whitespace character and modify annotation span. * * @param annotations * @param buffer */ private void modifyAnnotationEnd(List annotations, StringBuilder buffer) { ListIterator it = annotations.listIterator(annotations.size()); if (buffer.length() == 0) { return; } if (!Character.isWhitespace(buffer.charAt(buffer.length() - 1))) { return; } while (it.hasPrevious()) { Annotation a = it.previous(); if (a.end() != buffer.length()) { break; } Span span = new Span(a.start(), a.end() - 1); Annotation replacement = new Annotation(a.type(), span, a.attributes()); it.set(replacement); } buffer.deleteCharAt(buffer.length() - 1); } /** * Reads one node from a stream. 
* * @param in * @return readed node * @throws IOException * @throws InvalidFormatException */ private ParseTreeNode readNode(PushbackReader in) throws IOException, InvalidFormatException { int c = in.read(); if (c != '(') { throw new InvalidFormatException(); } if ((c = lookAhead(in)) == -1) { throw new InvalidFormatException(); } if (Character.isWhitespace(c) || c == '(') { skipWhitespace(in); ParseTreeNode node = readNode(in); skipWhitespace(in); c = (char) in.read(); if (c != ')') { throw new InvalidFormatException(); } return node; } String tag = readTagName(in); String function = null; Matcher m = tagNamePattern.matcher(tag); if (m.matches()) { tag = m.group(1); function = m.group(2); } else if (!specialTagNamePattern.matcher(tag).matches()) { throw new InvalidFormatException(tag + " is invalid format."); } if (skipWhitespace(in) == 0) { return null; } ParseTreeNode node; if (lookAhead(in) == '(') { // has any child node (not terminal node) List children = new ArrayList(); do { ParseTreeNode child = readNode(in); if (!isNullNode(child)) { children.add(child); } skipWhitespace(in); } while (lookAhead(in) != ')'); node = new ParseTreeNode(tag, children.toArray(new ParseTreeNode[0]), 0, 0, 0, function); } else { // terminal node String word = readWord(in); node = new ParseTreeNode(tag, null, 0, 0, null, word, function); } skipWhitespace(in); if (in.read() != ')') { throw new InvalidFormatException(); } return node; } /** * skip whitespace characters * * @param in * @return count of skipped characters. * @throws IOException */ private int skipWhitespace(PushbackReader in) throws IOException { int count = 0; int c; do { c = in.read(); count++; } while (Character.isWhitespace(c) && c != -1); if (c != -1) { in.unread(c); } return count - 1; } private StringBuffer comment = new StringBuffer(); private int offset = -1; /** * skip whitespace characters and comments (characters following a "#" * on a line). 
Also, if a skipped comment consists of a single integer, * sets offset to that integer. * * @param in * @return count of skipped characters. * @throws IOException */ private int skipWhitespaceAndComment(PushbackReader in) throws IOException { int count = 0; boolean inComment = false; offset = -1; int c; do { c = in.read(); count++; if (c == '#' && !inComment) { inComment = true; comment.setLength(0); } else if (c == '\n' && inComment) { try { offset = Integer.parseInt(comment.toString().trim()); } catch (NumberFormatException e) { } inComment = false; } else if (inComment) { comment.append((char) c); } } while ((Character.isWhitespace(c) || inComment) && c != -1); if (c != -1) { in.unread(c); } return count - 1; } /** * Reads a tag name which is after opened parenthesis. * * @param in * @return readed token string * @throws IOException * @throws InvalidFormatException */ private String readTagName(PushbackReader in) throws IOException, InvalidFormatException { StringBuilder buffer = new StringBuilder(); int c; while (true) { c = in.read(); if (c == -1) { throw new InvalidFormatException(); } else if (Character.isWhitespace(c)) { break; } buffer.append((char) c); } in.unread(c); if (buffer.length() == 0) { throw new InvalidFormatException(); } return buffer.toString().toLowerCase().intern(); } /** * Reads annotated token. * * @param in * @return readed token. * @throws IOException * @throws InvalidFormatException */ private String readWord(PushbackReader in) throws IOException, InvalidFormatException { int c; StringBuilder buffer = new StringBuilder(); while (true) { c = in.read(); if (c != -1 && backslashAsEscapeChar && c == '\\') { c = in.read(); } if (c == ')') { break; } else if (c == -1) { throw new InvalidFormatException(); } buffer.append((char) c); } in.unread(c); String word = buffer.toString(); if (TRANSFORM_TABLE.containsKey(word)) { word = TRANSFORM_TABLE.get(word); } return word; } /** * Look ahead next character. 
* * @param in * @return readed character * @throws IOException */ private int lookAhead(PushbackReader in) throws IOException { int c = in.read(); if (c != -1) { in.unread(c); } return c; } /** * converts a set of Penn TreeBank files into text documents. * Invoked by: PTBReader inputDir outputDir. Converts all files with * extension .mrg in inputDir to text documents, and writes them into * outputDir. */ public static void main(String[] args) throws Exception { if (args.length != 2) { System.out.println("usage: java " + PTBReader.class.getName() + " "); System.exit(1); } File inputDir = new File(args[0]); File outputDir = new File(args[1]); PTBReader parser = new PTBReader(); for (File file : getFiles(new File(args[0]), ".mrg")) { String outFilename = removeSuffix(getRelativePath(inputDir, file)); File outFile = new File(outputDir, outFilename); outFile.getParentFile().mkdirs(); Writer out = new FileWriter(outFile); Document doc = parser.load(file).getDocument(); out.write(doc.text()); out.close(); } } /* -- alternative main methods for debugging static final String home = "../"; public static void main(String[] args) throws Exception { String sgmFileName = home + "Ace 05/V4/bc/CNN_CF_20030303.1900.00.sgm"; String PTBFileName = "PTB.txt"; ExternalDocument doc = new ExternalDocument("sgml", sgmFileName); doc.setAllTags (true); doc.open(); // mark sentences List textSegments = (List) doc.annotationsOfType ("TEXT"); if (textSegments == null) { System.out.println ("No in " + doc.fileName() + ", skipped."); } Annotation ann = textSegments.get(0); Span textSpan = ann.span (); SentenceSplitter.split (doc, textSpan); File f = new File(PTBFileName); PTBReader reader = new PTBReader(); List trees = reader.loadParseTrees (f); reader.addAnnotations (trees, doc, "sentence", new Span(0, doc.text().length()), false); new View (doc, 1); } public static void main(String[] args) throws Exception { String sgmFileName = "article.sgm"; String PTBFileName = "article.chout"; 
Jet.Lex.EnglishLex.readLexicon("data/Jet4.dict"); ExternalDocument doc = new ExternalDocument("sgml", sgmFileName); doc.setAllTags (true); doc.open(); // mark sentences SpecialZoner.findSpecialZones (doc); List textSegments = (List) doc.annotationsOfType ("TEXT"); if (textSegments == null) { System.out.println ("No in " + doc.fileName() + ", skipped."); } Annotation ann = textSegments.get(0); Span textSpan = ann.span (); SentenceSplitter.split (doc, textSpan); Vector sentences = doc.annotationsOfType("sentence"); if (sentences != null) { for (Annotation sentence : sentences) { Jet.Lex.Tokenizer.tokenize(doc, sentence.span()); Jet.Lex.Lexicon.annotateWithDefinitions(doc, sentence.span().start(), sentence.span().end()); } } File f = new File(PTBFileName); PTBReader reader = new PTBReader(); List trees = reader.loadParseTrees (f); reader.addAnnotations (trees, doc, "sentence", new Span(0, doc.text().length()), true); new View (doc, 1); } */ private static List getFiles(File dir, String suffix) throws IOException { List list = new ArrayList(); for (File file : dir.listFiles()) { if (file.isFile() && file.getName().endsWith(suffix)) { list.add(file); } else if (file.isDirectory()) { list.addAll(getFiles(file, suffix)); } } return list; } private static String getRelativePath(File base, File file) { return file.getAbsolutePath().substring(base.getAbsolutePath().length()); } private static String removeSuffix(String filename) { int index = filename.lastIndexOf('.'); if (index >= 0) { return filename.substring(0, index); } else { return filename; } } private String buildDocumentString(List trees) { StringBuilder buffer = new StringBuilder(); for (ParseTreeNode tree : trees) { List terminals = getTerminalNodes(tree); for (ParseTreeNode terminal : terminals) { if (terminal.word != null) { buffer.append(terminal.word); while (buffer.length() < terminal.end) { buffer.append(' '); } } } // set last character to newline if (buffer.charAt(buffer.length() - 1) == ' ') { 
buffer.setCharAt(buffer.length() - 1, '\n'); } } return buffer.toString(); } private void determineSpans(ParseTreeNode tree, int offset) { List terminals = getTerminalNodes(tree); determineTerminalSpans(terminals, offset); determineNonTerminalSpans(tree, offset); } private void determineTerminalSpans(List terminals, int offset) { int start = offset; int n = terminals.size(); for (int i = 0; i < n; i++) { ParseTreeNode current = terminals.get(i); ParseTreeNode prev = i > 0 ? terminals.get(i - 1) : null; String word = current.word; int end = start + (word != null ? word.length() + 1 : 0); if (!hasAfterSpace(word)) { end--; } if (hasBeforeSpace(word) && prev != null) { if (hasAfterSpace(prev.word)) { prev.end--; start--; end--; } } current.start = start; current.end = end; start = end; } } private int determineNonTerminalSpans(ParseTreeNode tree, int offset) { if (isTerminalNode(tree)) { return tree.end; } else { ParseTreeNode[] children = tree.children; if (children.length > 0) { for (ParseTreeNode child : children) { offset = determineNonTerminalSpans(child, offset); } tree.start = children[0].start; tree.end = children[children.length - 1].end; } else { tree.start = offset; tree.end = offset; } return tree.end; } } private boolean hasAfterSpace(String word) { if (NO_FOLLOWING_SPACE.contains(word)) { return false; } else { return true; } } private boolean hasBeforeSpace(String word) { if (DELETE_PREVIOUS_SPACE.contains(word)) { return true; } else if (isPartOfShortenedForm(word)) { return true; } else { return false; } } private boolean isPartOfShortenedForm(String word) { if (word != null) { return word.startsWith("'") || word.equals("n't"); } else { return false; } } private void annotate(Document doc, ParseTreeNode node) { doc.addAnnotation(node.ann); if (node.children != null) { Annotation[] children = new Annotation[node.children.length]; for (int i = 0; i < node.children.length; i++) { children[i] = node.children[i].ann; } node.ann.put("children", children); 
for (ParseTreeNode child : node.children) { annotate(doc, child); } } if (node.children == null && isAddingTokens) { // TODO: adds `case' property doc.annotate("token", node.ann.span(), new FeatureSet()); } } /** * Returns termninal node list in the parse tree. * * @param tree * @return */ private List getTerminalNodes(ParseTreeNode tree) { if (tree.children == null || tree.children.length == 0) { // terminal node if (tree.word != null) { return Collections.singletonList(tree); } return Collections.emptyList(); } else { List list = new ArrayList(); // non terminal node for (ParseTreeNode child : tree.children) { list.addAll(getTerminalNodes(child)); } return list; } } /** * Returns if node is terminal node. * * @param node * @return */ private boolean isTerminalNode(ParseTreeNode node) { return node.children == null; } /** * Creates annotations for each node in parse tree node. * These annotations are added to the parse tree; in addition, if * Document doc is non-empty, they are added to the document. *

* Note that this method does not set the "children" attribute. * * @param node * @param doc */ private void setAnnotations(ParseTreeNode node, Document doc) { Span span = new Span(node.start, node.end); FeatureSet attrs = new FeatureSet(); attrs.put("cat", node.category); if (node.head != 0) { attrs.put("head", node.head); } if (node.function != null) { attrs.put("func", node.function); } node.ann = new Annotation("constit", span, attrs); if (doc != null) { doc.addAnnotation(node.ann); } if (node.children != null) { for (ParseTreeNode child : node.children) { setAnnotations(child, doc); } } } /** * Creates annotations for each node in parse tree node. * These annotations are added to the parse tree and to the document * doc. In constrast to setAnnotations, * the categories used for terminal nodes are Jet categories obtained by * Jet tokenization and lexical look-up. This means that hyphenated * items are split, and multi-word names are reduced to a single node. * * @param node the root of the parse tree * @param treeSpan the span of the document matching the parse tree * @param doc the document to which annotations will be added */ private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) { StatParser.buildParserInput (doc, treeSpan.start(), treeSpan.end(), false); StatParser.fixHyphenatedItems (doc); int nameConstitEnd = -1; List terminals = getTerminalNodes(node); for (ParseTreeNode terminal : terminals) { int terminalEnd = terminal.end; // is there a 'name' constituent or 'hyphword' constituent here? 
Vector constits = doc.annotationsAt(terminal.start, "constit"); Annotation constit = null; Annotation nameConstit = null; Annotation hyphword = null; if (constits != null) { for (Annotation c : constits) { if (c.get("cat") == "name") { nameConstit = c; } else if (c.get("cat") == "hyphword") { hyphword = c; } if (constit == null) constit = c; } } if (hyphword != null) { nameConstit = null; constit = hyphword; } // if there is a name which is not part of a hyphword, associate the // name with this (first) terminal node, and mark any remaining terminal // nodes which match tokens in the name as empty if (nameConstit != null) { terminal.end = nameConstit.end(); terminal.ann = nameConstit; nameConstitEnd = nameConstit.end(); } else if (nameConstitEnd >= 0) { terminal.word = null; } else { Span span = new Span(terminal.start, terminal.end); String pennPOS = ((String) terminal.category).toUpperCase().intern(); String word = terminal.word; terminal.ann = StatParser.buildWordDefn (doc, word, span, constit, pennPOS); } if (nameConstitEnd == terminalEnd) nameConstitEnd = -1; } // prune parse tree: remove a node if it has no word or children pruneTree (node); determineNonTerminalSpans(node, treeSpan.start()); // add head links if (hr == null) hr = HeadRule.createDefaultRule(); hr.apply (node); // add annotations for non-terminals: ParseTreeNode.makeParseAnnotations(doc, node); } /** * recursively traverse parse tree node, removing terminal nodes * which are not associated with any word, and any non-terminal nodes all * of whose children have been removed. *

* This method is used by setJetAnnotations to prune a parse * tree after multiple NNP nodes for a multi-word name have been replaced by * a single NAME node. */ private ParseTreeNode pruneTree(ParseTreeNode node) { ParseTreeNode[] children = node.children; if (children != null) { ArrayList newChildren = new ArrayList(); for (ParseTreeNode child : children) { ParseTreeNode c = pruneTree(child); if (c != null) newChildren.add(c); } if (newChildren.isEmpty()) { children = null; } else { children = newChildren.toArray(new ParseTreeNode[0]); } node.children = children; } if (node.word == null && children == null) return null; else return node; } }