
edu.nyu.jet.lex.Lexicon Maven / Gradle / Ivy


Information extraction is the process of identifying specified classes of entities, relations, and events in natural language text – creating structured data from unstructured input. JET, the Java Extraction Toolkit, developed at New York University over the past fifteen years, provides a rich set of tools for research and education in information extraction from English text. These include standard language-processing tools such as a tokenizer, sentence segmenter, part-of-speech tagger, name tagger, regular-expression pattern matcher, and dependency parser. Also provided are relation and event extractors based on the specifications of the U.S. Government's ACE (Automatic Content Extraction) program. JET is distributed under the Apache 2.0 license.

// -*- tab-width: 4 -*-
//Title:        JET
//Version:      1.00
//Copyright:    Copyright (c) 2000
//Author:       Ralph Grishman
//Description:  A Java-based Information Extraction Tool

package edu.nyu.jet.lex;

import java.util.*;
import edu.nyu.jet.lisp.*;
import edu.nyu.jet.tipster.*;

/**
 *  Provides (static) data structures for storing and looking up
 *  word definitions.  The lexicon is shared by all callers.
 */

public class Lexicon {

  /* lexicon organization:
      the lexicon consists of a set of LexicalEntry objects;
      lexiconIndex is a mapping from strings (the lower case of the first
        word of an entry) to a Vector of LexicalEntry objects
  */

  static Hashtable lexiconIndex = new Hashtable();

  /**
   *  Clears the entire lexicon (removes all entries).
   */

  public static void clear () {
    lexiconIndex.clear ();
  }

  /**
   *  Removes the entry (if any) for the lexical item words, along
   *  with all of its definitions.
   *  @return true if an entry was found and removed
   */

  public static boolean clearEntry (String words[]) {
    String key = words[0].toLowerCase();
    if (lexiconIndex.containsKey(key)) {
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        if (entry.matches(words)) {
          entries.remove(i);
          return true;
        }
      }
    }
    return false;
  }

  /**
   *  Adds fs to the lexicon as a definition of the lexical item
   *  words, with the default annotation type "constit".
   */

  public static void addEntry (String words[], FeatureSet fs) {
    addEntry (words, fs, "constit");
  }
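
  /**
   *  Adds fs to the lexicon as a definition of the lexical item
   *  words, recording type as the annotation type to be used when
   *  the item is annotated in a document.  If an entry for words
   *  already exists, fs is added as an additional definition.
   */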

  public static void addEntry (String words[], FeatureSet fs, String type) {
    String key = words[0].toLowerCase();
    if (lexiconIndex.containsKey(key)) {
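      // an entry for this word sequence may already exist;
      // if so, add fs as another definition of that entry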
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        if (entry.matches(words)) {
          entry.addDefinition (fs);
          return;
        }
      }
      entries.addElement(new LexicalEntry(words, fs, type));
    } else {
      Vector entries = new Vector();
      entries.addElement(new LexicalEntry(words, fs, type));
      lexiconIndex.put(key,entries);
    }
  }

  /**
   *  Returns an array of the definitions (FeatureSets) associated
   *  with the lexical item words, or null if there are no
   *  definitions associated with this lexical item.
   */

  public static FeatureSet[] lookUp (String words[]) {
    String key = words[0].toLowerCase();
    if (lexiconIndex.containsKey(key)) {
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        if (entry.matches(words))
          return entry.getDefinition();
      }
    }
    return null;
  }

  /** annotateWithDefinitions looks for the longest defined lexical item
    * consisting of the tokens starting at position posn;  if such
    * an item is found, then for each definition of this item, an
    * annotation of the entry's type (constit by default) is added to
    * the item, with the item's definition as its attributes.
    * @return the end position of this lexical item, or 0 if no defined
    *         lexical item begins at posn
    */

  public static int annotateWithDefinitions (Document doc, int posn) {
    int furthest = 0;
    FeatureSet[] definition = null;
    String type = null;
    Annotation ann = doc.tokenAt(posn);
    if (ann == null) return 0;
    String key = doc.text(ann).trim().toLowerCase();
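    // scan all entries indexed under this token, keeping the longest match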
    if (lexiconIndex.containsKey(key)) {
      Vector entries = (Vector) lexiconIndex.get(key);
      for (int i = 0; i < entries.size(); i++) {
        LexicalEntry entry = (LexicalEntry) entries.get(i);
        int newposn = entry.matches(doc,posn);
        if (newposn > 0) {
          if (newposn > furthest) {
            furthest = newposn;
            definition = entry.getDefinition();
            type = entry.type;
          }
        }
      }
    }
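    // annotate the longest match once for each of its definitions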
    if (definition != null) {
      for (int i = 0; i < definition.length; i++) {
        doc.annotate(type, new Span (posn,furthest),
                                new FeatureSet (definition[i]));
      }
    }
    return furthest;
  }
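
  /** annotates all the defined lexical items in document doc which
    * begin at or after position start and before position end.
    */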

  public static void annotateWithDefinitions (Document doc, int start, int end) {
    int posn = start;
    int newposn;
    //  advance 'posn' to start of first token
    while (doc.tokenAt(posn) == null) {
      posn++;
      if (posn >= end) return;
    }
    while (posn < end) {
      newposn = annotateWithDefinitions (doc, posn);
      if (newposn == 0) {
        Annotation ann = doc.tokenAt(posn);
        if (ann == null) return;
        posn = ann.span().end();
      } else {
        posn = newposn;
      }
      // while ((posn < doc.length()) && Character.isWhitespace(doc.charAt(posn))) posn++;
    }
  }
}
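
For orientation, here is a minimal usage sketch of the static Lexicon API above: defining a multi-word lexical item, looking it up, and removing it. It is illustrative only; in particular, the FeatureSet constructor taking a feature name and value is an assumption about edu.nyu.jet.lisp.FeatureSet (only the copy constructor FeatureSet(FeatureSet) appears in this file).

import edu.nyu.jet.lex.Lexicon;
import edu.nyu.jet.lisp.FeatureSet;

public class LexiconDemo {

  public static void main (String[] args) {
    Lexicon.clear();                         // start from an empty lexicon

    // define the two-token item "New York"; the index key will be "new",
    // the lower-cased first word
    String[] newYork = {"New", "York"};
    Lexicon.addEntry (newYork, new FeatureSet ("cat", "np"));  // assumed constructor

    // lookUp returns all definitions recorded for the item, or null
    FeatureSet[] defs = Lexicon.lookUp (newYork);
    if (defs != null)
      System.out.println ("definitions found: " + defs.length);

    // clearEntry returns true exactly when a matching entry was removed
    System.out.println ("removed: " + Lexicon.clearEntry (newYork));
  }
}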



