edu.stanford.nlp.parser.lexparser.UnknownGTTrainer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2

Show newest version

package edu.stanford.nlp.parser.lexparser;

import java.util.Collection;
import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
 * This class trains a Good-Turing model for unknown words from a
 * collection of trees.  It builds up a map of statistics which can be
 * used by any UnknownWordModel which wants to use the GT model.
 *
 * Authors:
 *
 * @author Roger Levy
 * @author Greg Donaker (corrections and modeling improvements)
 * @author Christopher Manning (generalized and improved what Greg did)
 * @author Anna Rafferty
 * @author John Bauer (refactored into a separate training class)
 */
public class UnknownGTTrainer {
  ClassicCounter> wtCount = new ClassicCounter>();
  ClassicCounter tagCount = new ClassicCounter();
  ClassicCounter r1 = new ClassicCounter(); // for each tag, # of words seen once
  ClassicCounter r0 = new ClassicCounter(); // for each tag, # of words not seen
  Set seenWords = Generics.newHashSet();

  double tokens = 0;

  Map unknownGT = Generics.newHashMap();

  public void train(Collection trees) {
    train(trees, 1.0);
  }

  public void train(Collection trees, double weight) {
    for (Tree t : trees) {
      train(t, weight);
    }
  }


  public void train(Tree tree, double weight) {
    /* get TaggedWord and total tag counts, and get set of all
     * words attested in training
     */
    for (TaggedWord word : tree.taggedYield()) {
      train(word, weight);
    }
  }

  public void train(TaggedWord tw, double weight) {
    tokens = tokens + weight;
    String word = tw.word();
    String tag = tw.tag();

    // TaggedWord has crummy equality conditions
    Pair wt = new Pair(word, tag);
    wtCount.incrementCount(wt, weight);

    tagCount.incrementCount(tag, weight);
    seenWords.add(word);
  }

  public void finishTraining() {
    // testing: get some stats here
    System.err.println("Total tokens: " + tokens);
    System.err.println("Total WordTag types: " + wtCount.keySet().size());
    System.err.println("Total tag types: " + tagCount.keySet().size());
    System.err.println("Total word types: " + seenWords.size());

    /* find # of once-seen words for each tag */
    for (Pair wt : wtCount.keySet()) {
      if (wtCount.getCount(wt) == 1) {
        r1.incrementCount(wt.second());
      }
    }

    /* find # of unseen words for each tag */
    for (String tag : tagCount.keySet()) {
      for (String word : seenWords) {
        Pair wt = new Pair(word, tag);
        if (!(wtCount.keySet().contains(wt))) {
          r0.incrementCount(tag);
        }
      }
    }

    /* set unseen word probability for each tag */
    for (String tag : tagCount.keySet()) {
      float logprob = (float) Math.log(r1.getCount(tag) / (tagCount.getCount(tag) * r0.getCount(tag)));
      unknownGT.put(tag, Float.valueOf(logprob));
    }

  }

}