All downloads are free. Search and download functionality uses the official Maven repository.

edu.nyu.jet.ne.NameAnnotator Maven / Gradle / Ivy

Go to download

Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE [Automatic Content Extraction] program. The program is provided under an Apache 2.0 license.

The newest version!
// -*- tab-width: 4 -*-
package edu.nyu.jet.ne;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.Map;

import edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.tipster.Span;

/**
 * NameAnnotator provides methods for (extended) named entity annotation
 * based on a dictionary and rules.
 *
 * @author Akira Oda
 */

public class NameAnnotator {
	/** value passed to NamedEntityUtil.packNamedEntity to identify this tagger. */
	private static final String systemName = "ENE";

	/** dictionary-based tagger; always present (created by the constructor). */
	private DictionaryTagger dictTagger;
	/** transformation rules; null until one of the loadRules methods succeeds. */
	private TransformRules rules;
	/** optional word class tagger; null until loadClassDictionary is called. */
	private ClassAnnotator classAnnotator;
	/** optional class hierarchy; null until loadClassHierarchy is called. */
	private ClassHierarchyResolver hierarchyResolver;
	/** optional map from ENAMEX TYPE value to replacement TYPE value. */
	private Map<String, String> aliasTable;

	/**
	 *  create a new NameAnnotator with an empty dictionary tagger.  A
	 *  dictionary and rules must be supplied separately (see
	 *  'setDictionary' and 'loadRules').
	 */

	public NameAnnotator() {
		dictTagger = new DictionaryTagger();
	}

	/**
	 *  specify the dictionary to be used by the ENE tagger.
	 *
	 *  @param dict  the dictionary handed to the dictionary tagger
	 */

	public void setDictionary(Dictionary dict) {
		dictTagger.setDictionary(dict);
	}

	/**
	 *  load the dictionary to be used by the word class tagger from
	 *  Reader 'in'.  Any previously loaded class dictionary is replaced.
	 *
	 *  @param in  source of the class dictionary
	 *  @throws IOException  if the dictionary cannot be read
	 */

	public void loadClassDictionary(Reader in) throws IOException {
		classAnnotator = new ClassAnnotator(in);
	}

	/**
	 *  load the dictionary to be used by the word class tagger from
	 *  'file'.  Any previously loaded class dictionary is replaced.
	 *
	 *  @param file  file containing the class dictionary
	 *  @throws IOException  if the dictionary cannot be read
	 */

	public void loadClassDictionary(File file) throws IOException {
		classAnnotator = new ClassAnnotator(file);
	}

	/**
	 *  load the rules to be used by the ENE tagger using Reader 'in'.
	 *  If a class hierarchy has already been loaded, it is attached to
	 *  the new rules.  The reader is not closed by this method.
	 *
	 *  @param in  source of the rule definitions
	 *  @throws IOException  if the rules cannot be read
	 *  @throws RuleFormatException  if the rules are rejected by
	 *                               TransformRules.load
	 */

	public void loadRules(Reader in) throws IOException, RuleFormatException {
		rules = TransformRules.load(in);
		if (hierarchyResolver != null) {
			rules.setClassHierarchyResolver(hierarchyResolver);
		}
	}

	/**
	 *  load the rules to be used by the ENE tagger from 'file'.  The
	 *  file is opened with a FileReader (platform default character
	 *  encoding) and is always closed before this method returns.
	 *
	 *  @param file  file containing the rule definitions
	 *  @throws IOException  if the file cannot be opened or read
	 *  @throws RuleFormatException  if the rules are rejected by
	 *                               TransformRules.load
	 */

	public void loadRules(File file) throws IOException, RuleFormatException {
		Reader in = null;
		try {
			in = new FileReader(file);
			loadRules(in);
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Deliberately ignored: the reader was only read from, and
					// a failure to close must not mask an exception thrown
					// while loading the rules.
				}
			}
		}
	}

	/**
	 *  annotate the text in span with named entity (ENAMEX)
	 *  annotations using the dictionary and rules of the ENE tagger.  If
	 *  an alias map has been specified by 'setAliasMap', the tags are
	 *  translated using this map;  ENAMEX annotations whose TYPE has no
	 *  entry in the map are removed from the document.
	 *  <p>
	 *  Rules must have been loaded with 'loadRules' before this method
	 *  is called;  the word class tagger is applied only if a class
	 *  dictionary has been loaded.
	 *
	 *  @param doc   the document to be annotated
	 *  @param span  the portion of the document to be annotated
	 */

	public void annotate(Document doc, Span span) {
		NamedEntityUtil.splitToNamedEntity(doc, span);
		dictTagger.annotate(doc, span);
		if (classAnnotator != null) {
			classAnnotator.annotate(doc, span);
		}
		rules.apply(doc, span);
		NamedEntityUtil.packNamedEntity(doc, span, systemName);

		if (aliasTable != null) {
			translateTypes(doc, span);
		}
	}

	/**
	 *  rewrite the TYPE feature of every ENAMEX annotation within span
	 *  using the alias map, removing annotations whose TYPE is not mapped.
	 */

	private void translateTypes(Document doc, Span span) {
		List<Annotation> neList = doc.annotationsOfType("ENAMEX", span);
		// NOTE(review): Document is not visible here; guard in case it
		// reports "no annotations of this type" as null -- confirm.
		if (neList == null) {
			return;
		}
		// Iterate over a snapshot:  the loop removes annotations from the
		// document, and the returned list may be backed by the document.
		Annotation[] names = neList.toArray(new Annotation[neList.size()]);
		for (Annotation name : names) {
			String type = (String) name.get("TYPE");
			String alias = aliasTable.get(type);
			if (alias != null) {
				name.put("TYPE", alias);
			} else {
				doc.removeAnnotation(name);
			}
		}
	}

	/**
	 *  annotate document doc with named entity (ENAMEX)
	 *  annotations using the dictionary and rules of the ENE tagger.  If
	 *  an alias map has been specified by 'setAliasMap', the tags are
	 *  translated using this map.  Equivalent to calling
	 *  'annotate(doc, doc.fullSpan())'.
	 *
	 *  @param doc  the document to be annotated
	 */

	public void annotate(Document doc) {
		annotate(doc, doc.fullSpan());
	}

	/**
	 *  specify a mapping to be used to translate the TYPE feature of ENAMEX
	 *  annotations as a final step in tagging.  The map is stored by
	 *  reference, not copied.
	 *
	 *  @param map  mapping from original TYPE value to replacement TYPE
	 *              value, or null to disable translation
	 */

	public void setAliasMap(Map<String, String> map) {
		this.aliasTable = map;
	}

	/**
	 *  load the class hierarchy from 'file'.  If rules have already been
	 *  loaded, the hierarchy is attached to them immediately;  otherwise
	 *  it is attached when 'loadRules' is next called.
	 *
	 *  @param file  file describing the class hierarchy
	 *  @throws IOException  if the hierarchy cannot be read
	 */

	public void loadClassHierarchy(File file) throws IOException {
		hierarchyResolver = SimpleClassHierarchyResolver.getInstance(file);
		if (this.rules != null) {
			rules.setClassHierarchyResolver(hierarchyResolver);
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy