edu.stanford.nlp.tagger.maxent.Dictionary Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
/**
 * Title:        StanfordMaxEnt
 * Description:  A Maximum Entropy Toolkit

 * Copyright:    Copyright (c) Kristina Toutanova

 * Company:      Stanford University
 */
package edu.stanford.nlp.tagger.maxent; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.stats.IntCounter;
import edu.stanford.nlp.util.Generics;

import java.io.IOException;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.util.Map;


/** Maintains a map from words to tags and their counts.
 *
 *  @author Kristina Toutanova
 *  @version 1.0
 */
public class Dictionary  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(Dictionary.class);

  private final Map dict = Generics.newHashMap();
  private final Map partTakingVerbs = Generics.newHashMap();
  private static final String naWord = "NA";
  private static final boolean VERBOSE = false;

  public Dictionary() {
  }

  void fillWordTagCounts(Map> wordTagCounts) {
    for (String word : wordTagCounts.keySet()) {
      TagCount count = new TagCount(wordTagCounts.get(word));
      dict.put(word, count);
    }
  }

  /*
  public void release() {
    dict.clear();
  }

  public void addVPTaking(String verb, String tag, String partWord) {
    int h = verb.hashCode();
    Integer i = Integer.valueOf(h);
    if (tag.startsWith("RP")) {
      if (this.partTakingVerbs.containsKey(i)) {
        this.partTakingVerbs.get(i).incPart(partWord);
      } else {
        this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 0, 0, 0));
        this.partTakingVerbs.get(i).incPart(partWord);
      }
    } else if (tag.startsWith("RB")) {
      if (this.partTakingVerbs.containsKey(i)) {
        this.partTakingVerbs.get(i).incRB(partWord);
      } else {
        this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 0, 0, 0));
        this.partTakingVerbs.get(i).incRB(partWord);
      }
    } else if (tag.startsWith("IN")) {
      if (this.partTakingVerbs.containsKey(i)) {
        this.partTakingVerbs.get(i).incIn(partWord);
      } else {
        this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 0, 0, 0));
        this.partTakingVerbs.get(i).incIn(partWord);
      }
    }
  }
  */

  protected void addVThatTaking(String verb) {
    int i = verb.hashCode();
    if (this.partTakingVerbs.containsKey(i)) {
      this.partTakingVerbs.get(i).incThat();
    } else {
      this.partTakingVerbs.put(i, new CountWrapper(verb, 0, 1, 0, 0));
    }
  }

  protected int getCountPart(String verb) {
    int i = verb.hashCode();
    if (this.partTakingVerbs.containsKey(i)) {
      return this.partTakingVerbs.get(i).getCountPart();
    }
    return 0;
  }


  protected int getCountThat(String verb) {
    int i = verb.hashCode();
    if (this.partTakingVerbs.containsKey(i)) {
      return this.partTakingVerbs.get(i).getCountThat();
    }
    return 0;
  }


  protected int getCountIn(String verb) {
    int i = verb.hashCode();
    if (this.partTakingVerbs.containsKey(i)) {
      return this.partTakingVerbs.get(i).getCountIn();
    }
    return 0;
  }


  protected int getCountRB(String verb) {
    int i = verb.hashCode();
    if (this.partTakingVerbs.containsKey(i)) {
      return this.partTakingVerbs.get(i).getCountRB();
    }
    return 0;
  }


  protected int getCount(String word, String tag) {
    TagCount count = dict.get(word);
    if (count == null) {
      return 0;
    } else {
      return count.get(tag);
    }
  }


  protected String[] getTags(String word) {
    TagCount count = get(word);
    if (count == null) {
      return null;
    }
    return count.getTags();
  }


  protected TagCount get(String word) {
    return dict.get(word);
  }


  String getFirstTag(String word) {
    TagCount count = dict.get(word);
    if (count != null) {
      return count.getFirstTag();
    }
    return null;
  }


  protected int sum(String word) {
    TagCount count = dict.get(word);
    if (count != null) {
      return count.sum();
    }
    return 0;
  }

  boolean isUnknown(String word) {
    return ! dict.containsKey(word);
  }


  /*
  public void save(String filename) {
    try {
      DataOutputStream rf = IOUtils.getDataOutputStream(filename);
      save(rf);
      rf.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  */

  void save(DataOutputStream file) {
    String[] arr = dict.keySet().toArray(new String[dict.keySet().size()]);
    try {
      file.writeInt(arr.length);
      log.info("Saving dictionary of " + arr.length + " words ...");
      for (String word : arr) {
        TagCount count = get(word);
        file.writeUTF(word);
        count.save(file);
      }
      Integer[] arrverbs = this.partTakingVerbs.keySet().toArray(new Integer[partTakingVerbs.keySet().size()]);
      file.writeInt(arrverbs.length);
      for (Integer iO : arrverbs) {
        CountWrapper tC = this.partTakingVerbs.get(iO);
        file.writeInt(iO.intValue());
        tC.save(file);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  private void read(DataInputStream rf, String filename) throws IOException {
    // Object[] arr=dict.keySet().toArray();

    int maxNumTags = 0;
    int len = rf.readInt();
    if (VERBOSE) {
      log.info("Reading Dictionary of " + len + " words from " + filename + '.');
    }

    for (int i = 0; i < len; i++) {
      String word = rf.readUTF();
      TagCount count = TagCount.readTagCount(rf);
      int numTags = count.numTags();
      if (numTags > maxNumTags) {
        maxNumTags = numTags;
      }
      this.dict.put(word, count);
      if (VERBOSE) {
        log.info("  " + word + " [idx=" + i + "]: " + count);
      }
    }
    if (VERBOSE) {
      log.info("Read dictionary of " + len + " words; max tags for word was " + maxNumTags + '.');
    }
  }

  private void readTags(DataInputStream rf) throws IOException {
    // Object[] arr=dict.keySet().toArray();

    int maxNumTags = 0;
    int len = rf.readInt();
    if (VERBOSE) {
      log.info("Reading Dictionary of " + len + " words.");
    }

    for (int i = 0; i < len; i++) {
      String word = rf.readUTF();
      TagCount count = TagCount.readTagCount(rf);
      int numTags = count.numTags();
      if (numTags > maxNumTags) {
        maxNumTags = numTags;
      }
      this.dict.put(word, count);
      if (VERBOSE) {
        log.info("  " + word + " [idx=" + i + "]: " + count);
      }
    }
    if (VERBOSE) {
      log.info("Read dictionary of " + len + " words; max tags for word was " + maxNumTags + '.');
    }
  }

  protected void read(String filename) {
    try {
      DataInputStream rf = IOUtils.getDataInputStream(filename);
      read(rf, filename);

      int len1 = rf.readInt();
      for (int i = 0; i < len1; i++) {
        int iO = rf.readInt();
        CountWrapper tC = new CountWrapper();
        tC.read(rf);

        this.partTakingVerbs.put(iO, tC);
      }
      rf.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  protected void read(DataInputStream file) {
    try {
      readTags(file);

      int len1 = file.readInt();
      for (int i = 0; i < len1; i++) {
        int iO = file.readInt();
        CountWrapper tC = new CountWrapper();
        tC.read(file);

        this.partTakingVerbs.put(iO, tC);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /*
  public void printAmbiguous() {
    String[] arr = dict.keySet().toArray(new String[dict.keySet().size()]);
    try {
      int countAmbiguous = 0;
      int countUnAmbiguous = 0;
      int countAmbDisamb = 0;
      for (String word : arr) {
        if (word.indexOf('|') == -1) {
          continue;
        }
        TagCount count = get(word);
        if (count.numTags() > 1) {
          System.out.print(word);
          countAmbiguous++;
          tC.print();
          System.out.println();
        } else {
          String wordA = word.substring(0, word.indexOf('|'));
          if (get(wordA).numTags() > 1) {
            System.out.print(word);
            countAmbDisamb++;
            countUnAmbiguous++;
            tC.print();
            System.out.println();
          } else {
            countUnAmbiguous++;
          }
        }// else
      }
      System.out.println(" ambg " + countAmbiguous + " unambg " + countUnAmbiguous + " disamb " + countAmbDisamb);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  */

  /**
   * This makes ambiguity classes from all words in the dictionary and remembers
   * their classes in the TagCounts
   */
  protected void setAmbClasses(AmbiguityClasses ambClasses, int veryCommonWordThresh, TTags ttags) {
    for (Map.Entry entry : dict.entrySet()) {
      String w = entry.getKey();
      TagCount count = entry.getValue();
      int ambClassId = ambClasses.getClass(w, this, veryCommonWordThresh, ttags);
      count.setAmbClassId(ambClassId);
    }
  }

  protected int getAmbClass(String word) {
    if (word.equals(naWord)) {
      return -2;
    }
    if (get(word) == null) {
      return -1;
    }
    return get(word).getAmbClassId();
  }

  public static void main(String[] args) {
    String s = "word";
    String tag = "tag";
    Dictionary d = new Dictionary();

    System.out.println(d.getCount(s, tag));
    System.out.println(d.getFirstTag(s));
  }

}