edu.nyu.jet.ne.NameAnnotator Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of jet Show documentation

Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE [Automatic Content Extraction] program. The program is provided under an Apache 2.0 license.

The newest version!

// -*- tab-width: 4 -*-
package edu.nyu.jet.ne;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.tipster.Span;

/**
 * NameAnnotator provides methods for (extended) named entity annotation
 * based on a dictionary and rules.
 *
 * @author Akira Oda
 */

public class NameAnnotator {
	/** system name passed to NamedEntityUtil.packNamedEntity. */
	private static final String systemName = "ENE";

	private final DictionaryTagger dictTagger;
	/** transformation rules; null until one of the loadRules methods is called. */
	private TransformRules rules;
	/** optional word class tagger; null until loadClassDictionary is called. */
	private ClassAnnotator classAnnotator;
	/** optional class hierarchy; null until loadClassHierarchy is called. */
	private ClassHierarchyResolver hierarchyResolver;
	/** optional translation table for the TYPE feature; null means "no translation". */
	private Map<String, String> aliasTable;

	/**
	 *  create a new NameAnnotator with an empty DictionaryTagger.  No rules,
	 *  class dictionary, class hierarchy or alias map are set initially.
	 */

	public NameAnnotator() {
		dictTagger = new DictionaryTagger();
	}

	/**
	 *  specify the dictionary to be used by the ENE tagger.
	 *
	 *  @param dict  the dictionary handed to the underlying DictionaryTagger
	 */

	public void setDictionary(Dictionary dict) {
		dictTagger.setDictionary(dict);
	}

	/**
	 *  load the dictionary to be used by the word class tagger from
	 *  Reader 'in', replacing any previously loaded class dictionary.
	 *
	 *  @param in  source of the class dictionary
	 *  @throws IOException  if the dictionary cannot be read
	 */

	public void loadClassDictionary(Reader in) throws IOException {
		classAnnotator = new ClassAnnotator(in);
	}

	/**
	 *  load the dictionary to be used by the word class tagger from
	 *  'file', replacing any previously loaded class dictionary.
	 *
	 *  @param file  file containing the class dictionary
	 *  @throws IOException  if the dictionary cannot be read
	 */

	public void loadClassDictionary(File file) throws IOException {
		classAnnotator = new ClassAnnotator(file);
	}

	/**
	 *  load the rules to be used by the ENE tagger using Reader 'in'.
	 *  If a class hierarchy has already been loaded, it is attached to
	 *  the new rules.  This method does not itself close 'in'.
	 *
	 *  @param in  source of the rule definitions
	 *  @throws IOException          if the rules cannot be read
	 *  @throws RuleFormatException  if the rules are malformed
	 */

	public void loadRules(Reader in) throws IOException, RuleFormatException {
		rules = TransformRules.load(in);
		if (hierarchyResolver != null) {
			rules.setClassHierarchyResolver(hierarchyResolver);
		}
	}

	/**
	 *  load the rules to be used by the ENE tagger from 'file'.  The file
	 *  is opened with a FileReader and is closed again before this method
	 *  returns, whether or not loading succeeds.
	 *
	 *  @param file  file containing the rule definitions
	 *  @throws IOException          if the file cannot be opened or read
	 *  @throws RuleFormatException  if the rules are malformed
	 */

	public void loadRules(File file) throws IOException, RuleFormatException {
		Reader in = null;
		try {
			in = new FileReader(file);
			loadRules(in);
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// A failure to close a reader we have finished with is
					// harmless, and must not mask an exception thrown by
					// loadRules itself.
				}
			}
		}
	}

	/**
	 *  annotate the text in span with named entity (ENAMEX)
	 *  annotations using the dictionary and rules of the ENE tagger.  If
	 *  an alias map has been specified by 'setAliasMap', the tags are
	 *  translated using this map; ENAMEX annotations whose TYPE has no
	 *  entry in the map are removed from the document.
	 *  <p>
	 *  The word class tagger and the rules are optional:  each step is
	 *  skipped if the corresponding resource has not been loaded.
	 *
	 *  @param doc   the document to be annotated
	 *  @param span  the portion of the document to be annotated
	 */

	public void annotate(Document doc, Span span) {
		NamedEntityUtil.splitToNamedEntity(doc, span);
		dictTagger.annotate(doc, span);
		if (classAnnotator != null) {
			classAnnotator.annotate(doc, span);
		}
		// Guarded like classAnnotator:  without rules we still produce the
		// dictionary-based annotations rather than failing half-way through
		// with a NullPointerException.
		if (rules != null) {
			rules.apply(doc, span);
		}
		NamedEntityUtil.packNamedEntity(doc, span, systemName);

		if (aliasTable != null) {
			translateTypes(doc, span);
		}
	}

	/**
	 *  rewrite the TYPE feature of every ENAMEX annotation in span using
	 *  the alias map, removing annotations whose TYPE has no alias.
	 */

	private void translateTypes(Document doc, Span span) {
		List<Annotation> found = doc.annotationsOfType("ENAMEX", span);
		// NOTE(review): defensive -- Document is not visible from this file,
		// so a null result for "no annotations" cannot be ruled out.
		if (found == null) {
			return;
		}
		// Iterate over a snapshot because annotations are removed from the
		// document inside the loop, and the returned list might be backed by
		// the document's own storage.
		List<Annotation> neList = new ArrayList<Annotation>(found);
		for (Annotation name : neList) {
			String type = (String) name.get("TYPE");
			String alias = aliasTable.get(type);
			if (alias != null) {
				name.put("TYPE", alias);
			} else {
				doc.removeAnnotation(name);
			}
		}
	}

	/**
	 *  annotate document doc with named entity (ENAMEX)
	 *  annotations using the dictionary and rules of the ENE tagger.  If
	 *  an alias map has been specified by 'setAliasMap', the tags are
	 *  translated using this map.  Equivalent to annotating the full span
	 *  of the document.
	 *
	 *  @param doc  the document to be annotated
	 */

	public void annotate(Document doc) {
		annotate(doc, doc.fullSpan());
	}

	/**
	 *  specify a mapping to be used to translate the TYPE feature of ENAMEX
	 *  annotations as a final step in tagging.  The map is stored by
	 *  reference, not copied.
	 *
	 *  @param map  mapping from original TYPE value to translated TYPE
	 *              value, or null to disable translation
	 */

	public void setAliasMap(Map<String, String> map) {
		this.aliasTable = map;
	}

	/**
	 *  load the class hierarchy from 'file'.  If rules have already been
	 *  loaded, the hierarchy is attached to them immediately;  rules loaded
	 *  later pick it up in 'loadRules'.
	 *
	 *  @param file  file containing the class hierarchy
	 *  @throws IOException  if the hierarchy cannot be read
	 */

	public void loadClassHierarchy(File file) throws IOException {
		hierarchyResolver = SimpleClassHierarchyResolver.getInstance(file);
		if (this.rules != null) {
			rules.setClassHierarchyResolver(hierarchyResolver);
		}
	}
}