edu.nyu.jet.format.PTBReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jet Show documentation
Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE [Automatic Content Extraction] program. The program is provided under an Apache 2.0 license.
The newest version!
// -*- tab-width: 4 -*-
package edu.nyu.jet.format;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Vector;

import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.parser.ParseTreeNode;
import edu.nyu.jet.tipster.*;
import edu.nyu.jet.util.IOUtils;
import edu.nyu.jet.zoner.SentenceSplitter;
import edu.nyu.jet.zoner.SpecialZoner;
import edu.nyu.jet.parser.StatParser;
import edu.nyu.jet.parser.HeadRule;
import edu.nyu.jet.parser.ParseTreeNode;

/**
 * A reader for the output of a Penn Treebank Parser. The methods read a 
 * Penn Treebank corpus and either annotate an existing Document
 * (addAnnotations methods) with constit annotations representing
 * the trees or build a new Jet.Tipster.Document from the parse trees.
 * 
 * @author Akira ODA
 */
public class PTBReader {
	/**
	 * Pattern applied to a tag name read from the treebank.  It is compiled
	 * with Pattern.COMMENTS, so the blanks inside the expression are ignored.
	 * readNode uses group 1 as the category and group 2 as the function tag.
	 */
	static Pattern tagNamePattern = Pattern.compile(
			"([^-=]+) (?: - ([\\-a-zA-Z]+)*)? (?: [-=] ([\\-\\d]+))?", Pattern.COMMENTS);

	/**
	 * Pattern for tag names which begin and end with a hyphen (such as
	 * "-none-") and therefore are not accepted by tagNamePattern.
	 */
	static Pattern specialTagNamePattern = Pattern.compile("-.*-");

	/**
	 * Maps the treebank spelling of a bracket (e.g. "-LRB-") to the bracket
	 * character itself.
	 */
	private static final Map<String, String> TRANSFORM_TABLE;

	private static final Set<String> PUNCTUATIONS;

	/**
	 * Words which are not followed by a blank when text is generated from trees.
	 */
	private static final Set<String> NO_FOLLOWING_SPACE;

	/**
	 * Words which are attached directly to the preceding word when text is
	 * generated from trees.
	 */
	private static final Set<String> DELETE_PREVIOUS_SPACE;

	/**
	 * If true, backslashes are treated as escape character.
	 */
	private boolean backslashAsEscapeChar = true;

	/**
	 * If true, add tokens when read corpus.
	 */
	private boolean isAddingTokens = false;
	
	/**
	 * Head rules used by setJetAnnotations;  created on first use.
	 */
	HeadRule hr = null;

	static {
		TRANSFORM_TABLE = new HashMap<String, String>();
		TRANSFORM_TABLE.put("-LRB-", "(");
		TRANSFORM_TABLE.put("-LCB-", "{");
		TRANSFORM_TABLE.put("-LSB-", "[");

		TRANSFORM_TABLE.put("-RRB-", ")");
		TRANSFORM_TABLE.put("-RCB-", "}");
		TRANSFORM_TABLE.put("-RSB-", "]");

		PUNCTUATIONS = new HashSet<String>();
		PUNCTUATIONS.add(".");
		PUNCTUATIONS.add(",");
		PUNCTUATIONS.add("?");
		PUNCTUATIONS.add("!");

		NO_FOLLOWING_SPACE = new HashSet<String>();
		NO_FOLLOWING_SPACE.add("(");
		NO_FOLLOWING_SPACE.add("{");
		NO_FOLLOWING_SPACE.add("[");

		DELETE_PREVIOUS_SPACE = new HashSet<String>();
		DELETE_PREVIOUS_SPACE.add(")");
		DELETE_PREVIOUS_SPACE.add("}");
		DELETE_PREVIOUS_SPACE.add("]");
		DELETE_PREVIOUS_SPACE.add(".");
		DELETE_PREVIOUS_SPACE.add(",");
	}
	
	/**
	 *  a list of strings which are deleted in preparing text for the Charniak
	 *  parser, and so should be skipped when matching the text and parser output.
	 *  The angle brackets are listed both as SGML entities and as literal
	 *  characters.
	 */
	 
	private static final String[] skip = 
	  new String[] {"....", "...", "uh,", "Uh,", "um,", "Um,", 
	                "&lt;", "<", "&gt;", ">", "_"};

	/**
	 *  when matching an existing document text against a parse tree,
	 *  each pair of elements represents an allowable match (due to 
	 *  Adam's text-regularization script)
	 */
	 
	private static final String[] match =
	  new String[] {"\"", "``",
	                "\"", "''",
		              """, "``",
	                """, "''",
		              """, "``",
	                """, "''",   
	                "&", "&",
	                "&", "&",
	                "wo", "will",
	                "Wo", "Will",
	                "((", "(",
	                "))", ")"};

	/**
	 * Adds constit annotations to an existing Document doc to
	 * represent the parse tree structure tree.
	 * <p>
	 * The terminal nodes of the tree are aligned, left to right, with the
	 * document text inside span:  before each terminal, whitespace and at
	 * most one of the skip strings are passed over, and the
	 * terminal is then matched against the text by matchTextToTree.  If a
	 * terminal cannot be matched, a message is written to System.err and
	 * the method returns without adding any annotation.
	 * 
	 * @param tree          the parse tree (for a portion of Document doc)
	 * @param doc           the document
	 * @param span          the portion of doc covered by the parse tree
	 * @param jetCategories if true, use Jet categories as terminal categories
	 *                      (if false, use categories read from parse trees)
	 */
	 
	public void addAnnotations(ParseTreeNode tree, Document doc, Span span,
	                           boolean jetCategories) {
		List<ParseTreeNode> terminalNodes = getTerminalNodes(tree);
		String text = doc.text();
		int limit = span.end();
		int offset = span.start();

		for (ParseTreeNode terminal : terminalNodes) {
			offset = skipBlanks(text, offset, limit);
			for (String skipString : skip) {
				if (text.startsWith(skipString, offset)) {
					offset = skipBlanks(text, offset + skipString.length(), limit);
					break;
				}
			}
			// match next terminal node against next word in text
			int matchLength = matchTextToTree (text, offset, terminal.word);
			if (matchLength <= 0) {
				System.err.println ("PTBReader.addAnnotations:  " +
				                    "Cannot determine parse tree offset for word " +
				                    terminal.word);
				System.err.println ("  at document offset " + offset + " in sentence");
				System.err.println ("  " + doc.text(span));
				return;
			}
			// a terminal covers its word plus the whitespace which follows it
			int endOffset = skipBlanks(text, offset + matchLength, limit);
			terminal.start = offset;
			terminal.end = endOffset;
			offset = endOffset;
		}

		if (jetCategories) {
			setJetAnnotations (tree, span, doc);
			StatParser.deleteUnusedConstits (doc, span, tree.ann); //<<<
		} else {
			determineNonTerminalSpans(tree, span.start());
			setAnnotations (tree, doc);
		}
	}

	/**
	 * Returns the position of the first non-whitespace character of
	 * text at or after from, or limit if there
	 * is no such character before limit.
	 */
	private static int skipBlanks(String text, int from, int limit) {
		int pos = from;
		while (pos < limit && Character.isWhitespace(text.charAt(pos))) {
			pos++;
		}
		return pos;
	}
	
	/**
	 *  tests whether the document text, starting at position
	 *  offset, corresponds to the terminal word of a
	 *  PennTreeBank tree.  Besides a literal match, the pairs listed in
	 *  match, the contraction "can't" and a single '.' preceding the
	 *  word are accepted.
	 *
	 *  @param text    the document text
	 *  @param offset  position in text at which matching starts
	 *  @param word    the word of a terminal node of the parse tree
	 *  @return  if a successful match, the number of characters in text
	 *           which were matched;  else -1
	 */
	
	private static int matchTextToTree (String text, int offset, String word) {
		// tree word "can" against text "can't" consumes only two characters
		// (presumably the remaining "n't" belongs to the next terminal)
		boolean contraction =
			(word.equals("can") && text.startsWith("can't", offset)) ||
			(word.equals("Can") && text.startsWith("Can't", offset));
		if (contraction) {
			return 2;
		}
		// regularized forms:  match holds (text form, tree form) pairs
		int i = 0;
		while (i < match.length) {
			String inText = match[i++];
			String inTree = match[i++];
			if (text.startsWith(inText, offset) && word.equals(inTree)) {
				return inText.length();
			}
		}
		// literal match
		if (text.startsWith(word, offset)) {
			return word.length();
		}
		// because Adam sometimes deletes '.'s for Charniak
		return text.startsWith("." + word, offset) ? word.length() + 1 : -1;
	}

	/**
 	 * Adds constit annotations to an existing Document doc to
	 * represent the parse tree structure of a set of trees trees.
	 * The annotations of type targetAnnotation within span are sorted by
	 * their spans and paired, in order, with the trees;  each paired
	 * annotation receives a 'parse' feature pointing to the annotation of
	 * the tree root.  If the number of trees and annotations differ, a
	 * message is written to System.err and only the leading pairs are
	 * processed.
	 * 
	 * @param trees
	 *            list of parse trees
	 * @param doc
	 *            document to which annotations should be added
	 * @param targetAnnotation
	 *            name of annotation to determine spans to add parse tree
	 *            annotations.
	 * @param span
	 *            target span.
	 * @param jetCategories
	 *            if false, use lexical categories from Penn Tree Bank;  if
	 *            true, use categories from Jet
	 */
	 
	public void addAnnotations(List<ParseTreeNode> trees, Document doc, String targetAnnotation,
			Span span, boolean jetCategories) {
		List<Annotation> found = (List<Annotation>) doc.annotationsOfType(targetAnnotation,
				span);
		// work on a copy:  the document may report "no annotations" as null,
		// and the list it returns is not reordered behind its back
		List<Annotation> targetList = new ArrayList<Annotation>();
		if (found != null) {
			targetList.addAll(found);
		}
		Comparator<Annotation> cmp = new Comparator<Annotation>() {
			public int compare(Annotation a, Annotation b) {
				return a.span().compareTo(b.span());
			}
		};

		Collections.sort(targetList, cmp);
		if (trees.size() != targetList.size()) {
			System.err.println ("PTBReader.addAnnotations:  mismatch between number of " +
			                    targetAnnotation + " (" + targetList.size() +
			                    ") and number of trees (" + trees.size() + ")");
		}
		int n = Math.min(trees.size(), targetList.size());
		for (int i = 0; i < n; i++) {
			ParseTreeNode tree = trees.get(i);
			Annotation target = targetList.get(i);
			addAnnotations(tree, doc, target.span(), jetCategories);
			target.put("parse", tree.ann);
		}
	}

	/**
 	 * Adds constit annotations to an existing Document doc to
	 * represent the parse tree structure of a set of trees trees.
	 * This version is provided for parse tree files which include sentence
	 * offsets.  A tree whose offset is negative (missing) is skipped with a
	 * message on System.err.  The text of a tree extends to the next
	 * non-negative offset in the list, or to the end of span if there is none.
	 * 
	 * @param trees
	 *            list of parse trees
	 * @param offsets
	 *            list of the starting position (in doc) of the text
	 *            corresponding to each parse tree
	 * @param doc
	 *            document to which annotations should be added
	 * @param targetAnnotation
	 *            name of annotation to get 'parse' feature pointing
	 *            to parse tree
	 * @param span
	 *            target span.
	 * @param jetCategories
	 *            if false, use lexical categories from Penn Tree Bank;  if
	 *            true, use categories from Jet
	 */
	
	public void addAnnotations (List<ParseTreeNode> trees, List<Integer> offsets,
		  Document doc, String targetAnnotation, Span span, boolean jetCategories) {
		if (trees.size() != offsets.size()) {
			System.err.println ("PTBReader.addAnnotations:  mismatch between number of " +
			                    "trees (" + trees.size() + ") and number of offsets (" + 
			                    offsets.size() + ")");
			return;
		}
		for (int i = 0; i < trees.size(); i++) {
			ParseTreeNode tree = trees.get(i);
			int start = offsets.get(i);
			if (start < 0) {
				System.err.println ("PTBReader.addAnnotations:  offset missing for " +
				                    " parse tree " + i);
				continue;
			}
			// the sentence ends where the next tree with a known offset starts;
			// a missing (negative) offset must not be used as an end position
			int end = span.end();
			for (int j = i + 1; j < offsets.size(); j++) {
				int next = offsets.get(j);
				if (next >= 0) {
					end = next;
					break;
				}
			}
			Span sentenceSpan = new Span(start, end);
			addAnnotations(tree, doc, sentenceSpan, jetCategories);
			Vector<Annotation> anns = doc.annotationsAt (start, targetAnnotation);
			if (anns != null && anns.size() > 0) {
				Annotation ann = anns.get(0);
				ann.put("parse", tree.ann);
			}
		}
	}
	
	// sentence offsets collected by the most recent call of loadParseTrees(Reader):
	// one entry per tree, holding the value of the 'offset' field at the time the
	// tree was read (-1 if no integer comment preceded the tree)
	List offsets;

	/**
	 * Reads all parse trees of a Penn Treebank corpus.
	 * 
	 * The trees are only loaded:  no spans are determined and no annotations
	 * are created.
	 * 
	 * As a side effect, offsets is replaced by a new list with one
	 * entry per tree:  the sentence offset given in a comment preceding the
	 * tree, or -1 if there was none.
	 * 
	 * @param in  the Reader from which the Penn Trees are read
	 * @return a List of parse trees
	 * @throws IOException
	 * @throws InvalidFormatException
	 */
	 
	public List loadParseTrees(Reader in) throws IOException, InvalidFormatException {
		List result = new ArrayList();
		offsets = new ArrayList();
		PushbackReader input = new PushbackReader(in);

		skipWhitespaceAndComment(input);
		while (lookAhead(input) != -1) {
			// 'offset' was set while skipping the comments in front of this tree
			offsets.add(offset);
			result.add(readNode(input));
			skipWhitespaceAndComment(input);
		}

		return result;
	}
	
	/**
	 * Reads all parse trees from a file, using loadParseTrees(Reader).
	 * The file is opened with a FileReader and closed before returning.
	 * 
	 * @param file  the file holding the Penn Trees
	 * @return a List of parse trees
	 * @throws IOException
	 * @throws InvalidFormatException
	 */
	public List loadParseTrees(File file) throws IOException, InvalidFormatException {
		Reader reader = null;
		try {
			reader = new BufferedReader(new FileReader(file));
			List trees = loadParseTrees(reader);
			return trees;
		} finally {
			IOUtils.closeQuietly(reader);
		}
	}
	
	/**
	 * Returns the list of sentence offsets stored in the offsets
	 * field (assigned by loadParseTrees).
	 */
	public List getOffsets () {
		return this.offsets;
	}

	/**
	 * Builds Jet.Tipster.Document object from Penn treebank corpus.
	 * The document text is generated from the words of the trees;  each tree
	 * is covered by a 'sentence' annotation, and every node of a tree by a
	 * 'constit' annotation.
	 * 
	 * @param in  the Reader from which the Penn Trees are read
	 * @return a Treebank made of the generated document and the trees
	 * @throws IOException
	 * @throws InvalidFormatException
	 */
	public Treebank load(Reader in) throws IOException, InvalidFormatException {

		List<ParseTreeNode> trees = new ArrayList<ParseTreeNode>();
		PushbackReader input = new PushbackReader(in);

		int start = 0;
		while (true) {
			skipWhitespace(input);
			if (lookAhead(input) == -1) {
				break;
			}

			ParseTreeNode tree = readNode(input);
			trees.add(tree);
			determineSpans(tree, start);
			// no document exists yet:  annotations are only attached to the tree
			setAnnotations(tree, null);
			// the next tree starts where this one ends
			start = tree.end;
		}

		String text = buildDocumentString(trees);
		Document doc = new Document(text);
		for (ParseTreeNode tree : trees) {
			doc.annotate("sentence", new Span(tree.start, tree.end), new FeatureSet());
			annotate(doc, tree);
		}

		return new Treebank(doc, trees);
	}

	/**
	 * Builds a Treebank from a Penn treebank file, using load(Reader).
	 * The file is opened with a FileReader and closed before returning.
	 * 
	 * @param file  the treebank file
	 * @return the result of load(Reader) for the file contents
	 * @throws IOException
	 * @throws InvalidFormatException
	 */
	public Treebank load(File file) throws IOException, InvalidFormatException {
		Reader reader = null;
		try {
			reader = new BufferedReader(new FileReader(file));
			Treebank treebank = load(reader);
			return treebank;
		} finally {
			IOUtils.closeQuietly(reader);
		}
	}

	/**
	 * Builds a Treebank from a Penn treebank file which is decoded with the
	 * given character encoding, using load(Reader).  The file is closed
	 * before returning.
	 * 
	 * @param file      the treebank file
	 * @param encoding  name of the character encoding of the file
	 * @return the result of load(Reader) for the file contents
	 * @throws IOException
	 * @throws InvalidFormatException
	 */
	public Treebank load(File file, String encoding) throws IOException, InvalidFormatException {
		InputStream bytes = null;
		Reader chars = null;

		try {
			bytes = new FileInputStream(file);
			chars = new BufferedReader(new InputStreamReader(bytes, encoding));
			return load(chars);
		} finally {
			// reader first, then the underlying stream
			IOUtils.closeQuietly(chars);
			IOUtils.closeQuietly(bytes);
		}
	}

	/**
	 * Selects whether a backslash inside a word is treated as an escape
	 * character when words are read.
	 * 
	 * @param b  true to treat backslashes as escape characters
	 */
	public void setBackslashAsEscapeCharacter(boolean b) {
		backslashAsEscapeChar = b;
	}

	/**
	 * Selects whether 'token' annotations are added for the terminal nodes
	 * when a document is built from a corpus.
	 * 
	 * @param b  true to add token annotations
	 */
	public void setAddingToken(boolean b) {
		isAddingTokens = b;
	}

	/**
	 * Tests whether node is a null element, i.e. whether its
	 * category is "-none-".
	 */
	private static boolean isNullNode(ParseTreeNode node) {
		final String nullCategory = "-none-";
		return node.category.equals(nullCategory);
	}

	/**
	 * Remove last whitespace character and modify annotation span.
	 * If buffer ends with a whitespace character, that character
	 * is deleted, and the annotations at the tail of annotations
	 * which end at the old end of the buffer are replaced by copies which
	 * are one character shorter.  Otherwise nothing is changed.
	 * 
	 * @param annotations  list of annotations, updated in place
	 * @param buffer       text buffer, updated in place
	 */
	private void modifyAnnotationEnd(List<Annotation> annotations, StringBuilder buffer) {
		int length = buffer.length();
		if (length == 0) {
			return;
		}

		if (!Character.isWhitespace(buffer.charAt(length - 1))) {
			return;
		}

		// walk backwards over the annotations which reach the end of the buffer
		ListIterator<Annotation> it = annotations.listIterator(annotations.size());
		while (it.hasPrevious()) {
			Annotation a = it.previous();
			if (a.end() != length) {
				break;
			}

			Span span = new Span(a.start(), a.end() - 1);
			Annotation replacement = new Annotation(a.type(), span, a.attributes());
			it.set(replacement);
		}

		buffer.deleteCharAt(length - 1);
	}

	/**
	 * Reads one node from a stream.  The stream must be positioned at the
	 * opening parenthesis of the node.  A node is either
	 * <ul>
	 * <li>a wrapper "( node )" without tag, for which the inner node is
	 *     returned,</li>
	 * <li>a non-terminal "(tag node ... )", whose children with category
	 *     "-none-" are dropped, or</li>
	 * <li>a terminal "(tag word)".</li>
	 * </ul>
	 * 
	 * @param in
	 * @return readed node
	 * @throws IOException
	 * @throws InvalidFormatException  if the input does not have this form
	 */
	private ParseTreeNode readNode(PushbackReader in) throws IOException, InvalidFormatException {
		int c = in.read();

		if (c != '(') {
			throw new InvalidFormatException("'(' is expected at start of node.");
		}

		if ((c = lookAhead(in)) == -1) {
			throw new InvalidFormatException("unexpected end of input after '('.");
		}

		if (Character.isWhitespace(c) || c == '(') {
			// node without tag wrapping a single node
			skipWhitespace(in);
			ParseTreeNode node = readNode(in);
			skipWhitespace(in);
			if (in.read() != ')') {
				throw new InvalidFormatException("')' is expected at end of node.");
			}
			return node;
		}

		String tag = readTagName(in);
		String function = null;
		Matcher m = tagNamePattern.matcher(tag);
		if (m.matches()) {
			tag = m.group(1);
			function = m.group(2);
		} else if (!specialTagNamePattern.matcher(tag).matches()) {
			throw new InvalidFormatException(tag + " is invalid format.");
		}

		// readTagName stops in front of a whitespace character, so at least
		// one character is normally skipped here
		if (skipWhitespace(in) == 0) {
			return null;
		}

		ParseTreeNode node;

		if (lookAhead(in) == '(') {
			// has any child node (not terminal node)
			List<ParseTreeNode> children = new ArrayList<ParseTreeNode>();
			do {
				ParseTreeNode child = readNode(in);
				if (!isNullNode(child)) {
					children.add(child);
				}
				skipWhitespace(in);
			} while (lookAhead(in) != ')');

			node = new ParseTreeNode(tag, children.toArray(new ParseTreeNode[0]), 0, 0, 0, function);
		} else {
			// terminal node
			String word = readWord(in);
			node = new ParseTreeNode(tag, null, 0, 0, null, word, function);
		}

		skipWhitespace(in);
		if (in.read() != ')') {
			throw new InvalidFormatException("')' is expected at end of node.");
		}

		return node;
	}

	/**
	 * Consumes whitespace characters up to the next non-whitespace character
	 * (which is pushed back) or the end of input.
	 * 
	 * @param in
	 * @return count of skipped characters.
	 * @throws IOException
	 */
	private int skipWhitespace(PushbackReader in) throws IOException {
		int skipped = 0;
		int c = in.read();
		while (c != -1 && Character.isWhitespace(c)) {
			skipped++;
			c = in.read();
		}

		// give back the character which ended the run of whitespace
		if (c != -1) {
			in.unread(c);
		}

		return skipped;
	}
	
	// text of the comment currently (or most recently) being skipped
	private StringBuffer comment = new StringBuffer();
	// offset found in the comments skipped by the last call of
	// skipWhitespaceAndComment, or -1 if there was none
	private int offset = -1;

	/**
	 *  Consumes whitespace characters and comments.  A comment starts with
	 *  a "#" and extends to the end of the line.  offset is first
	 *  reset to -1;  whenever the text of a skipped comment (ignoring
	 *  surrounding blanks) can be parsed as an integer, offset is
	 *  set to that integer.  The first character which is neither whitespace
	 *  nor part of a comment is pushed back.
	 * 
	 *  @param  in
	 *  @return count of skipped characters.
	 *  @throws IOException
	 */
	private int skipWhitespaceAndComment(PushbackReader in) throws IOException {
		int consumed = 0;
		boolean inComment = false;
		offset = -1;
		int c;
		while (true) {
			c = in.read();
			consumed++;
			if (inComment) {
				if (c == '\n') {
					// end of comment:  keep its value if it is a plain integer
					try {
						offset = Integer.parseInt(comment.toString().trim());
					} catch (NumberFormatException e) {
					}
					inComment = false;
				} else {
					comment.append((char) c);
				}
			} else if (c == '#') {
				inComment = true;
				comment.setLength(0);
			}
			if (c == -1) {
				break;
			}
			if (!inComment && !Character.isWhitespace(c)) {
				break;
			}
		}

		if (c != -1) {
			in.unread(c);
		}

		// the last character read was not skipped
		return consumed - 1;
	}
	
	/**
	 * Reads a tag name which is after opened parenthesis.  The tag name
	 * extends up to the next whitespace character, which is pushed back.
	 * 
	 * @param in
	 * @return readed token string, converted to lower case and interned
	 * @throws IOException
	 * @throws InvalidFormatException  if the input ends before a whitespace
	 *                                 character is found, or the tag name is empty
	 */
	private String readTagName(PushbackReader in) throws IOException, InvalidFormatException {
		StringBuilder buffer = new StringBuilder();
		int c;

		while (true) {
			c = in.read();
			if (c == -1) {
				throw new InvalidFormatException("unexpected end of input in tag name.");
			} else if (Character.isWhitespace(c)) {
				break;
			}

			buffer.append((char) c);
		}

		in.unread(c);

		if (buffer.length() == 0) {
			throw new InvalidFormatException("tag name is missing.");
		}

		// explicit locale:  with the default locale of e.g. a Turkish system
		// "IN" would be lower-cased to a string with a dotless i
		return buffer.toString().toLowerCase(java.util.Locale.ENGLISH).intern();
	}

	/**
	 * Reads annotated token:  all characters up to the closing parenthesis
	 * of the terminal node, which is pushed back.  If backslashes are escape
	 * characters, a backslash is dropped and the character after it is
	 * examined in its place.  Bracket names such as "-LRB-" are replaced
	 * by the bracket character.
	 * 
	 * @param in
	 * @return readed token.
	 * @throws IOException
	 * @throws InvalidFormatException  if the input ends inside the word
	 */
	private String readWord(PushbackReader in) throws IOException, InvalidFormatException {
		int c;
		StringBuilder buffer = new StringBuilder();
		while (true) {
			c = in.read();

			if (backslashAsEscapeChar && c == '\\') {
				// NOTE(review): the character after the backslash is still
				// compared with ')' below, so "\)" ends the word -- confirm
				// that this is intended
				c = in.read();
			}

			if (c == ')') {
				break;
			} else if (c == -1) {
				throw new InvalidFormatException("unexpected end of input in word.");
			}

			buffer.append((char) c);
		}

		in.unread(c);

		String word = buffer.toString();
		// single lookup;  the table never maps to null
		String replacement = (String) TRANSFORM_TABLE.get(word);
		return replacement != null ? replacement : word;
	}

	/**
	 * Returns the next character of the stream without consuming it.
	 * 
	 * @param in
	 * @return the next character, or -1 at the end of input
	 * @throws IOException
	 */
	private int lookAhead(PushbackReader in) throws IOException {
		int next = in.read();
		if (next == -1) {
			return -1;
		}
		in.unread(next);
		return next;
	}

	/**
	 *  converts a set of Penn TreeBank files into text documents.
	 *  Invoked by:  PTBReader inputDir outputDir.  Converts all files with
	 *  extension .mrg in inputDir (and its subdirectories) to text
	 *  documents, and writes them into outputDir under the same relative
	 *  path, without the extension.
	 */
	 
	public static void main(String[] args) throws Exception {
		if (args.length != 2) {
			System.out.println("usage: java " + PTBReader.class.getName() + " <inputDir> <outputDir>");
			System.exit(1);
		}

		File inputDir = new File(args[0]);
		File outputDir = new File(args[1]);
		PTBReader parser = new PTBReader();
		List<File> files = getFiles(inputDir, ".mrg");
		for (File file : files) {
			String outFilename = removeSuffix(getRelativePath(inputDir, file));
			File outFile = new File(outputDir, outFilename);
			outFile.getParentFile().mkdirs();

			// parse first, so that a malformed input leaves no empty output file
			Document doc = parser.load(file).getDocument();
			Writer out = new FileWriter(outFile);
			try {
				out.write(doc.text());
			} finally {
				out.close();
			}
		}
	}
	
/* -- alternative main methods for debugging

	static final String home = "../";
	
	public static void main(String[] args) throws Exception {
		String sgmFileName = home + "Ace 05/V4/bc/CNN_CF_20030303.1900.00.sgm";
		String PTBFileName = "PTB.txt";
		ExternalDocument doc = new ExternalDocument("sgml", sgmFileName);
		doc.setAllTags (true);
		doc.open();
		// mark sentences
		List textSegments = (List) doc.annotationsOfType ("TEXT");
		if (textSegments == null) {
			System.out.println ("No  in " + doc.fileName() + ", skipped.");
		}
		Annotation ann = textSegments.get(0);
		Span textSpan = ann.span ();
		SentenceSplitter.split (doc, textSpan);
		File f = new File(PTBFileName);
		PTBReader reader = new PTBReader();
		List trees = reader.loadParseTrees (f);
		reader.addAnnotations (trees, doc, "sentence",  new Span(0, doc.text().length()), false);
		new View (doc, 1);
	}

	public static void main(String[] args) throws Exception {
		String sgmFileName = "article.sgm";
		String PTBFileName = "article.chout";
		Jet.Lex.EnglishLex.readLexicon("data/Jet4.dict");
		ExternalDocument doc = new ExternalDocument("sgml", sgmFileName);
		doc.setAllTags (true);
		doc.open();
		// mark sentences
		SpecialZoner.findSpecialZones (doc);
		List textSegments = (List) doc.annotationsOfType ("TEXT");
		if (textSegments == null) {
			System.out.println ("No  in " + doc.fileName() + ", skipped.");
		}
		Annotation ann = textSegments.get(0);
		Span textSpan = ann.span ();
		SentenceSplitter.split (doc, textSpan);
		Vector sentences = doc.annotationsOfType("sentence");
		if (sentences != null) {
			for (Annotation sentence : sentences) {
				Jet.Lex.Tokenizer.tokenize(doc, sentence.span());
				Jet.Lex.Lexicon.annotateWithDefinitions(doc, sentence.span().start(), sentence.span().end());
			}
		}
		File f = new File(PTBFileName);
		PTBReader reader = new PTBReader();
		List trees = reader.loadParseTrees (f);
		reader.addAnnotations (trees, doc, "sentence",  new Span(0, doc.text().length()), true);
		new View (doc, 1);
	}
*/
	
	/**
	 * Collects the files in directory dir and, recursively, in
	 * its subdirectories, whose names end with suffix.
	 * 
	 * @param dir     the directory to search
	 * @param suffix  required ending of the file names
	 * @return the list of matching files
	 * @throws IOException  if the entries of a directory cannot be listed
	 */
	private static List<File> getFiles(File dir, String suffix) throws IOException {
		File[] entries = dir.listFiles();
		if (entries == null) {
			// listFiles returns null if dir is not a directory or cannot be read
			throw new IOException("Cannot list directory: " + dir);
		}

		List<File> list = new ArrayList<File>();
		for (File file : entries) {
			if (file.isFile() && file.getName().endsWith(suffix)) {
				list.add(file);
			} else if (file.isDirectory()) {
				list.addAll(getFiles(file, suffix));
			}
		}

		return list;
	}

	/**
	 * Returns the absolute path of file with as many leading
	 * characters removed as the absolute path of base is long.
	 */
	private static String getRelativePath(File base, File file) {
		String fullPath = file.getAbsolutePath();
		int prefixLength = base.getAbsolutePath().length();
		return fullPath.substring(prefixLength);
	}

	/**
	 * Returns filename without its last '.' and everything
	 * after it;  a name without '.' is returned unchanged.
	 */
	private static String removeSuffix(String filename) {
		int dot = filename.lastIndexOf('.');
		return dot < 0 ? filename : filename.substring(0, dot);
	}

	/**
	 * Generates the document text for a list of trees whose terminal spans
	 * have been determined.  Each word is appended and padded with blanks up
	 * to the end position of its terminal node;  a blank at the end of a
	 * tree is replaced by a newline.
	 * 
	 * @param trees  the parse trees
	 * @return the generated text
	 */
	private String buildDocumentString(List<ParseTreeNode> trees) {
		StringBuilder buffer = new StringBuilder();

		for (ParseTreeNode tree : trees) {
			List<ParseTreeNode> terminals = getTerminalNodes(tree);
			for (ParseTreeNode terminal : terminals) {
				if (terminal.word != null) {
					buffer.append(terminal.word);
					while (buffer.length() < terminal.end) {
						buffer.append(' ');
					}
				}
			}

			// set last character to newline (the buffer is still empty if no
			// tree so far contained a word)
			int last = buffer.length() - 1;
			if (last >= 0 && buffer.charAt(last) == ' ') {
				buffer.setCharAt(last, '\n');
			}
		}

		return buffer.toString();
	}

	/**
	 * Sets the start and end of all nodes of tree:  first of the
	 * terminal nodes, beginning at position offset, then of the
	 * non-terminal nodes.
	 */
	private void determineSpans(ParseTreeNode tree, int offset) {
		determineTerminalSpans(getTerminalNodes(tree), offset);
		determineNonTerminalSpans(tree, offset);
	}

	/**
	 * Sets the start and end of the terminal nodes for text which is
	 * generated from the words of the nodes.  A terminal covers its word
	 * and the blank after it, except that no blank follows the words in
	 * NO_FOLLOWING_SPACE, and that the blank of the preceding terminal is
	 * removed in front of the words for which hasBeforeSpace is true.
	 * 
	 * @param terminals  the terminal nodes, in text order
	 * @param offset     start position of the first terminal
	 */
	private void determineTerminalSpans(List<ParseTreeNode> terminals, int offset) {
		int start = offset;
		ParseTreeNode prev = null;

		for (ParseTreeNode current : terminals) {
			String word = current.word;
			int end = start + (word != null ? word.length() + 1 : 0);
			if (!hasAfterSpace(word)) {
				end--;
			}
			if (hasBeforeSpace(word) && prev != null) {
				if (hasAfterSpace(prev.word)) {
					// take back the blank after the previous word
					prev.end--;
					start--;
					end--;
				}
			}

			current.start = start;
			current.end = end;
			start = end;
			prev = current;
		}
	}

	/**
	 * Sets the start and end of the non-terminal nodes of tree
	 * from the spans of its terminal nodes, which must have been set before.
	 * A non-terminal extends from the start of its first child to the end of
	 * its last child;  a non-terminal without children is given the empty
	 * span at offset.
	 * 
	 * @param tree    root of the (sub)tree
	 * @param offset  position used for nodes without children
	 * @return the end position of tree
	 */
	private int determineNonTerminalSpans(ParseTreeNode tree, int offset) {
		if (!isTerminalNode(tree)) {
			ParseTreeNode[] kids = tree.children;
			if (kids.length == 0) {
				tree.start = offset;
				tree.end = offset;
			} else {
				int position = offset;
				for (int i = 0; i < kids.length; i++) {
					position = determineNonTerminalSpans(kids[i], position);
				}
				tree.start = kids[0].start;
				tree.end = kids[kids.length - 1].end;
			}
		}
		return tree.end;
	}

	/**
	 * Returns true unless word is in NO_FOLLOWING_SPACE.
	 */
	private boolean hasAfterSpace(String word) {
		return !NO_FOLLOWING_SPACE.contains(word);
	}

	/**
	 * Returns true if word is in DELETE_PREVIOUS_SPACE or is
	 * part of a shortened form (see isPartOfShortenedForm).
	 */
	private boolean hasBeforeSpace(String word) {
		return DELETE_PREVIOUS_SPACE.contains(word) || isPartOfShortenedForm(word);
	}

	/**
	 * Returns true if word is not null and either starts with
	 * an apostrophe or is "n't".
	 */
	private boolean isPartOfShortenedForm(String word) {
		return word != null && (word.startsWith("'") || word.equals("n't"));
	}

	/**
	 * Adds the annotation of node, and recursively of all nodes
	 * below it, to doc.  The annotation of a node with children
	 * gets a "children" attribute holding the annotations of the children.
	 * For a node without children a 'token' annotation is added as well if
	 * isAddingTokens is set.
	 */
	private void annotate(Document doc, ParseTreeNode node) {
		doc.addAnnotation(node.ann);

		ParseTreeNode[] kids = node.children;
		if (kids == null) {
			if (isAddingTokens) {
				// TODO: adds `case' property
				doc.annotate("token", node.ann.span(), new FeatureSet());
			}
			return;
		}

		Annotation[] kidAnnotations = new Annotation[kids.length];
		for (int i = 0; i < kids.length; i++) {
			kidAnnotations[i] = kids[i].ann;
		}
		node.ann.put("children", kidAnnotations);

		for (int i = 0; i < kids.length; i++) {
			annotate(doc, kids[i]);
		}
	}

	/**
	 * Returns termninal node list in the parse tree, in left-to-right
	 * order.  Nodes without children which have no word are left out.
	 * 
	 * @param tree  root of the parse tree
	 * @return the terminal nodes which have a word
	 */
	private List<ParseTreeNode> getTerminalNodes(ParseTreeNode tree) {
		if (tree.children == null || tree.children.length == 0) {
			// terminal node
			if (tree.word != null) {
				return Collections.singletonList(tree);
			}
			return Collections.emptyList();
		} else {
			List<ParseTreeNode> list = new ArrayList<ParseTreeNode>();
			// non terminal node
			for (ParseTreeNode child : tree.children) {
				list.addAll(getTerminalNodes(child));
			}
			return list;
		}
	}

	/**
	 * Tests whether node is a terminal node, i.e. whether its
	 * array of children is null.
	 * 
	 * @param node
	 * @return true if node has no array of children
	 */
	private boolean isTerminalNode(ParseTreeNode node) {
		ParseTreeNode[] kids = node.children;
		return kids == null;
	}

	/**
	 * Creates a 'constit' annotation for node and, recursively,
	 * for every node below it.  The annotation carries the category of the
	 * node as "cat", its head as "head" (if not 0) and its function as "func"
	 * (if not null).  Each annotation is stored in its node;  if
	 * doc is not null, it is added to the document, too.
	 * 
	 * Note that this method does not set the "children" attribute.
	 * 
	 * @param node
	 * @param doc
	 */
	private void setAnnotations(ParseTreeNode node, Document doc) {
		FeatureSet features = new FeatureSet();
		features.put("cat", node.category);
		if (node.head != 0) {
			features.put("head", node.head);
		}
		if (node.function != null) {
			features.put("func", node.function);
		}

		Annotation constit = new Annotation("constit", new Span(node.start, node.end), features);
		node.ann = constit;
		if (doc != null) {
			doc.addAnnotation(constit);
		}

		ParseTreeNode[] kids = node.children;
		if (kids == null) {
			return;
		}
		for (int i = 0; i < kids.length; i++) {
			setAnnotations(kids[i], doc);
		}
	}
	
	/**
	 * Creates annotations for each node in parse tree node.
	 * These annotations are added to the parse tree and to the document
	 * doc.  In constrast to setAnnotations,
	 * the categories used for terminal nodes are Jet categories obtained by
	 * Jet tokenization and lexical look-up.  This means that hyphenated
	 * items are split, and multi-word names are reduced to a single node.
	 * The spans of the terminal nodes must have been set before the call.
	 * 
	 * @param node      the root of the parse tree
	 * @param treeSpan  the span of the document matching the parse tree
	 * @param doc       the document to which annotations will be added
	 */

	private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) {
		StatParser.buildParserInput (doc, treeSpan.start(), treeSpan.end(), false);
		StatParser.fixHyphenatedItems (doc);
		// end of the name currently being skipped, or -1 if not inside a name
		int nameConstitEnd = -1;
		List<ParseTreeNode> terminals = getTerminalNodes(node);
		for (ParseTreeNode terminal : terminals) {
			// remember the original end:  terminal.end may be extended below
			int terminalEnd = terminal.end;
			// is there a 'name' constituent or 'hyphword' constituent here?
			Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit");
			Annotation constit = null;
			Annotation nameConstit = null;
			Annotation hyphword = null;
			if (constits != null) {
				for (Annotation c : constits) {
					// compare by value:  identity comparison only works for
					// interned strings
					Object cat = c.get("cat");
					if ("name".equals(cat)) {
						nameConstit = c;
					} else if ("hyphword".equals(cat)) {
						hyphword = c;
					}
					if (constit == null)
						constit = c;
				}
			}
			if (hyphword != null) {
				nameConstit = null;
				constit = hyphword;
			}
			// if there is a name which is not part of a hyphword, associate the
			// name with this (first) terminal node, and mark any remaining terminal
			// nodes which match tokens in the name as empty
			if (nameConstit != null) {
				terminal.end = nameConstit.end();
				terminal.ann = nameConstit;
				nameConstitEnd = nameConstit.end();
			} else if (nameConstitEnd >= 0) {
				terminal.word = null;
			} else {
				Span span = new Span(terminal.start, terminal.end);
				// explicit locale keeps the tag independent of the default locale
				String pennPOS = ((String) terminal.category)
						.toUpperCase(java.util.Locale.ENGLISH).intern();
				String word = terminal.word;
				terminal.ann = StatParser.buildWordDefn (doc, word, span, constit, pennPOS);
			}
			if (nameConstitEnd == terminalEnd)
				nameConstitEnd = -1;
		}
		// prune parse tree:  remove a node if it has no word or children
		pruneTree (node);
		determineNonTerminalSpans(node, treeSpan.start());
		// add head links
		if (hr == null)
			hr = HeadRule.createDefaultRule();
		hr.apply (node);
		// add annotations for non-terminals:
		ParseTreeNode.makeParseAnnotations(doc, node);
	}
	
	/**
	 *  recursively traverse parse tree node, removing terminal nodes
	 *  which are not associated with any word, and any non-terminal nodes all
	 *  of whose children have been removed.
	 *  
	 *  This method is used by setJetAnnotations to prune a parse
	 *  tree after multiple NNP nodes for a multi-word name have been replaced by
	 *  a single NAME node.
	 *
	 *  @param node  root of the (sub)tree to be pruned;  its array of
	 *               children is updated in place
	 *  @return node, or null if the node itself is to be removed
	 */
	
	private ParseTreeNode pruneTree(ParseTreeNode node) {
		ParseTreeNode[] children = node.children;
		if (children != null) {
			List<ParseTreeNode> kept = new ArrayList<ParseTreeNode>();
			for (ParseTreeNode child : children) {
				ParseTreeNode c = pruneTree(child);
				if (c != null) kept.add(c);
			}
			if (kept.isEmpty()) {
				children = null;
			} else {
				children = kept.toArray(new ParseTreeNode[0]);
			}
			node.children = children;
		}
		if (node.word == null && children == null)
			return null;
		else
			return node;
	}
	
}