
edu.nyu.jet.lex.Lexicon Maven / Gradle / Ivy


Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language-processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE (Automatic Content Extraction) program. JET is distributed under the Apache 2.0 license.

// -*- tab-width: 4 -*-
//Title:        JET
//Version:      1.00
//Copyright:    Copyright (c) 2000
//Author:       Ralph Grishman
//Description:  A Java-based Information Extraction Tool

package edu.nyu.jet.lex;

import java.util.*;
import edu.nyu.jet.lisp.*;
import edu.nyu.jet.tipster.*;

/**
 *  Provides (static) data structures for storing and looking up
 *  word definitions.  The lexicon is shared by all callers.
 */

public class Lexicon {

  /* lexicon organization:
      the lexicon consists of a set of LexicalEntry objects;
      lexiconIndex is a mapping from strings (the lower case of the first
        word of an entry) to a Vector of LexicalEntry objects
  */

  static Hashtable lexiconIndex = new Hashtable();

  /**
   *  Clears the entire lexicon (removes all entries).
   */

  public static void clear () {
    lexiconIndex.clear ();
  }

  /**
   *  Removes the entry (if any) for the lexical item words, along
   *  with all of its definitions.
   *  @return true if an entry was found and removed
   */

  public static boolean clearEntry (String words[]) {
    String key = words[0].toLowerCase();
    if (lexiconIndex.containsKey(key)) {
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        if (entry.matches(words)) {
          entries.remove(i);
          return true;
        }
      }
    }
    return false;
  }

  /**
   *  Adds fs to the lexicon as a definition of the lexical item
   *  words, with the default annotation type "constit".
   */

  public static void addEntry (String words[], FeatureSet fs) {
    addEntry (words, fs, "constit");
  }
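
  /**
   *  Adds fs to the lexicon as a definition of the lexical item
   *  words, recording type as the annotation type to be used when
   *  the item is annotated in a document.  If an entry for words
   *  already exists, fs is added as an additional definition.
   */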

  public static void addEntry (String words[], FeatureSet fs, String type) {
    String key = words[0].toLowerCase();
    if (lexiconIndex.containsKey(key)) {
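      // an entry for this word sequence may already exist;
      // if so, add fs as another definition of that entry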
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        if (entry.matches(words)) {
          entry.addDefinition (fs);
          return;
        }
      }
      entries.addElement(new LexicalEntry(words, fs, type));
    } else {
      Vector entries = new Vector();
      entries.addElement(new LexicalEntry(words, fs, type));
      lexiconIndex.put(key,entries);
    }
  }

  /**
   *  Returns an array of the definitions (FeatureSets) associated
   *  with the lexical item words, or null if there are no
   *  definitions associated with this lexical item.
   */

  public static FeatureSet[] lookUp (String words[]) {
    String key = words[0].toLowerCase();
    if (lexiconIndex.containsKey(key)) {
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        if (entry.matches(words))
          return entry.getDefinition();
      }
    }
    return null;
  }

  /** annotateWithDefinitions looks for the longest defined lexical item
    * consisting of the tokens starting at position posn;  if such
    * an item is found, then for each definition of this item, an
    * annotation of the entry's type (constit by default) is added to
    * the item, with the item's definition as its attributes.
    * @return the end position of this lexical item, or 0 if no defined
    *         lexical item begins at posn
    */

  public static int annotateWithDefinitions (Document doc, int posn) {
    int furthest = 0;
    FeatureSet[] definition = null;
    String type = null;
    Annotation ann = doc.tokenAt(posn);
    if (ann == null) return 0;
    String key = doc.text(ann).trim().toLowerCase();
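    // scan all entries indexed under this token, keeping the longest match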
    if (lexiconIndex.containsKey(key)) {
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        int newposn = entry.matches(doc,posn);
        if (newposn > 0) {
          if (newposn > furthest) {
            furthest = newposn;
            definition = entry.getDefinition();
            type = entry.type;
          }
        }
      }
    }
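    // annotate the longest match once for each of its definitions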
    if (definition != null) {
      for (int i = 0; i < definition.length; i++) {
        doc.annotate(type, new Span (posn,furthest),
                                new FeatureSet (definition[i]));
      }
    }
    return furthest;
  }
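
  /** annotates all the defined lexical items in document doc which
    * begin at or after position start and before position end.
    */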

  public static void annotateWithDefinitions (Document doc, int start, int end) {
    int posn = start;
    int newposn;
    //  advance 'posn' to start of first token
    while (doc.tokenAt(posn) == null) {
      posn++;
      if (posn >= end) return;
    }
    while (posn < end) {
      newposn = annotateWithDefinitions (doc, posn);
      if (newposn == 0) {
        Annotation ann = doc.tokenAt(posn);
        if (ann == null) return;
        posn = ann.span().end();
      } else {
        posn = newposn;
      }
      // while ((posn < doc.length()) && Character.isWhitespace(doc.charAt(posn))) posn++;
    }
  }
}
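
For orientation, here is a minimal usage sketch of the static Lexicon API above: defining a multi-word lexical item, looking it up, and removing it. It is illustrative only; in particular, the FeatureSet constructor taking a feature name and value is an assumption about edu.nyu.jet.lisp.FeatureSet (only the copy constructor FeatureSet(FeatureSet) appears in this file).

import edu.nyu.jet.lex.Lexicon;
import edu.nyu.jet.lisp.FeatureSet;

public class LexiconDemo {

  public static void main (String[] args) {
    Lexicon.clear();                         // start from an empty lexicon

    // define the two-token item "New York"; the index key will be "new",
    // the lower-cased first word
    String[] newYork = {"New", "York"};
    Lexicon.addEntry (newYork, new FeatureSet ("cat", "np"));  // assumed constructor

    // lookUp returns all definitions recorded for the item, or null
    FeatureSet[] defs = Lexicon.lookUp (newYork);
    if (defs != null)
      System.out.println ("definitions found: " + defs.length);

    // clearEntry returns true exactly when a matching entry was removed
    System.out.println ("removed: " + Lexicon.clearEntry (newYork));
  }
}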



