edu.stanford.nlp.stats.SimpleGoodTuring Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.stats;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Simple Good-Turing smoothing, based on code from Sampson, available at:
 * ftp://ftp.informatics.susx.ac.uk/pub/users/grs2/SGT.c
 *
 * See also http://www.grsampson.net/RGoodTur.html
 *
 * <p>Given the counts observed in a collection and, for each count, the number
 * of types having that count, an instance computes a smoothed probability for
 * a type of each observed count plus the total probability mass reserved for
 * unseen types.  All computation happens in the constructor; afterwards the
 * instance is only read.
 *
 * @author Bill MacCartney
 */
public class SimpleGoodTuring {

  /** Minimum number of frequency buckets accepted by the constructor. */
  private static final int MIN_INPUT = 5;

  /**
   * Multiplier on the estimated standard deviation used when deciding whether
   * the raw Turing estimate and the log-linear estimate differ significantly
   * (1.96 is the two-sided 95% quantile of the standard normal distribution).
   */
  private static final double CONFID_FACTOR = 1.96;

  /** Maximum allowed deviation from 1.0 of the total probability mass. */
  private static final double TOLERANCE = 1e-12;

  private final int[] r;         // for each bucket, a frequency
  private final int[] n;         // for each bucket, number of items w that frequency
  private final int rows;        // number of frequency buckets

  private long bigN = 0;         // total count of all items (long: r*n can exceed int)
  private double pZero;          // probability of unseen items
  private double bigNPrime;      // sum over buckets of n[j] * rStar[j]
  private double slope;          // slope of the least-squares line through (logR, logZ)
  private double intercept;      // intercept of that line
  private double[] z;            // n[j] averaged over the gap between neighbouring frequencies
  private double[] logR;         // log(r[j])
  private double[] logZ;         // log(z[j])
  private double[] rStar;        // adjusted frequency for each bucket
  private double[] p;            // smoothed probability for a type in each bucket

  /**
   * Each instance of this class encapsulates the computation of the smoothing
   * for one probability distribution.  The constructor takes two arguments
   * which are two parallel arrays.  The first is an array of counts, which must
   * be positive and in strictly ascending order.  The second is an array of
   * corresponding counts of counts; that is, for each i, n[i] represents the
   * number of types which occurred with count r[i] in the underlying
   * collection, and must be positive.  See the documentation for main() for a
   * concrete example.
   *
   * @param r the distinct observed counts, positive and strictly ascending
   * @param n for each i, the (positive) number of types with count r[i]
   * @throws IllegalArgumentException if either array is null, the arrays
   *         differ in length, there are fewer than 5 entries, r is not
   *         positive and strictly ascending, or n contains a non-positive value
   * @throws IllegalStateException if the computed probabilities do not sum to 1
   */
  public SimpleGoodTuring(int[] r, int[] n) {
    if (r == null) throw new IllegalArgumentException("r must not be null!");
    if (n == null) throw new IllegalArgumentException("n must not be null!");
    if (r.length != n.length) throw new IllegalArgumentException("r and n must have same size!");
    if (r.length < MIN_INPUT) throw new IllegalArgumentException("r must have size >= " + MIN_INPUT + "!");
    this.r = r.clone(); // defensive copy
    this.n = n.clone(); // defensive copy
    this.rows = this.r.length;
    checkCounts(this.r, this.n); // check the copies, which the caller cannot change
    compute();
    validate(TOLERANCE);
  }

  /**
   * Returns the probability allocated to types not seen in the underlying
   * collection.
   */
  public double getProbabilityForUnseen() {
    return pZero;
  }

  /**
   * Returns the probabilities allocated to each type, according to their count
   * in the underlying collection.  The returned array parallels the arrays
   * passed in to the constructor.  If the returned array is designated p, then
   * for all i, p[i] represents the smoothed probability assigned to types which
   * occurred r[i] times in the underlying collection (where r is the first
   * argument to the constructor).
   *
   * @return a fresh copy of the probabilities; modifying it does not affect
   *         this object
   */
  public double[] getProbabilities() {
    return p.clone();
  }

  /**
   * Rejects inputs on which the computation would take the logarithm of a
   * non-positive number or divide by zero.
   *
   * @throws IllegalArgumentException if r is not positive and strictly
   *         ascending, or if any n[i] is not positive
   */
  private static void checkCounts(int[] r, int[] n) {
    int previous = 0;
    for (int i = 0; i < r.length; i++) {
      if (r[i] <= previous) {
        throw new IllegalArgumentException(
            "r must be positive and strictly ascending, but r[" + i + "] = " + r[i] + "!");
      }
      if (n[i] <= 0) {
        throw new IllegalArgumentException(
            "n must be positive, but n[" + i + "] = " + n[i] + "!");
      }
      previous = r[i];
    }
  }

  /**
   * Runs the whole estimation: totals the items, sets pZero, fits the
   * log-linear model, chooses an adjusted frequency for each bucket and
   * normalizes the result into probabilities.
   */
  private void compute() {
    z = new double[rows];
    logR = new double[rows];
    logZ = new double[rows];
    rStar = new double[rows];
    p = new double[rows];

    bigN = 0;
    for (int j = 0; j < rows; ++j) bigN += (long) r[j] * n[j]; // count all items

    // Mass for unseen items is (number of types seen once) / (total items).
    int rowOfOne = row(1);
    pZero = (rowOfOne < 0) ? 0 : n[rowOfOne] / (double) bigN;

    for (int j = 0; j < rows; ++j) {
      // lower/upper are the neighbouring frequencies; the first bucket uses 0
      // below it and the last bucket mirrors its lower gap above it.
      double lower = (j == 0 ? 0 : r[j - 1]);
      double upper = (j == rows - 1) ? 2.0 * r[j] - lower : r[j + 1];
      z[j] = 2.0 * n[j] / (upper - lower);
      logR[j] = Math.log(r[j]);
      logZ[j] = Math.log(z[j]);
    }
    findBestFit();

    // Once the two estimates stop differing significantly (or the next
    // frequency has no bucket), the smoothed estimate is used for this and
    // every later bucket.
    boolean indiffValsSeen = false;
    for (int j = 0; j < rows; ++j) {
      double next = r[j] + 1.0; // double arithmetic: no int overflow
      double y = next * smoothed(next) / smoothed(r[j]);
      int nextRow = row(r[j] + 1);
      if (nextRow < 0)
        indiffValsSeen = true;
      if (!indiffValsSeen) {
        double nextN = n[nextRow];
        double x = next * nextN / n[j];
        double limit = CONFID_FACTOR * Math.sqrt(sq(next)
                                                 * nextN / sq((double) n[j])
                                                 * (1 + nextN / n[j]));
        if (Math.abs(x - y) <= limit)
          indiffValsSeen = true;
        else
          rStar[j] = x;
      }
      if (indiffValsSeen)
        rStar[j] = y;
    }

    bigNPrime = 0.0;
    for (int j = 0; j < rows; ++j)
      bigNPrime += n[j] * rStar[j];
    for (int j = 0; j < rows; ++j)
      p[j] = (1 - pZero) * rStar[j] / bigNPrime;
  }

  /**
   * Returns the index of the bucket having the given frequency, or else -1 if no
   * bucket has the given frequency.
   */
  private int row(int freq) {
    int i = 0;
    while (i < rows && r[i] < freq) i++;
    return ((i < rows && r[i] == freq) ? i : -1);
  }

  /**
   * Fits a least-squares line to the points (logR[i], logZ[i]) and stores the
   * result in slope and intercept.
   */
  private void findBestFit() {
    double meanX = 0.0;
    double meanY = 0.0;
    for (int i = 0; i < rows; ++i) {
      meanX += logR[i];
      meanY += logZ[i];
    }
    meanX /= rows;
    meanY /= rows;

    double xys = 0.0;
    double xSquares = 0.0;
    for (int i = 0; i < rows; ++i) {
      xys += (logR[i] - meanX) * (logZ[i] - meanY);
      xSquares += sq(logR[i] - meanX);
    }
    slope = xys / xSquares;
    intercept = meanY - slope * meanX;
  }

  /**
   * Returns the value of the fitted log-linear model at frequency i, that is
   * exp(intercept + slope * log(i)).
   */
  private double smoothed(double i) {
    return Math.exp(intercept + slope * Math.log(i));
  }

  private static double sq(double x) {
    return (x * x);
  }

  /**
   * Writes a table to STDOUT with one row for unseen items followed by one row
   * per bucket, showing r, n, the unsmoothed estimate r / bigN and the smoothed
   * probability.
   */
  private void print() {
    System.out.printf("%6s %6s %8s %8s%n", "r", "n", "p", "p*");
    System.out.printf("%6s %6s %8s %8s%n", "----", "----", "----", "----");
    System.out.printf("%6d %6d %8.4g %8.4g%n", 0, 0, 0.0, pZero);
    for (int i = 0; i < rows; ++i)
      System.out.printf("%6d %6d %8.4g %8.4g%n", r[i], n[i], 1.0 * r[i] / bigN, p[i]);
  }

  /**
   * Ensures that we have a proper probability distribution.
   *
   * @throws IllegalStateException if the total mass differs from 1 by more
   *         than the given tolerance, or is NaN
   */
  private void validate(double tolerance) {
    double sum = pZero;
    for (int i = 0; i < rows; i++) {
      sum += (n[i] * p[i]);
    }
    double err = 1.0 - sum;
    // Written as a negated <= so that a NaN sum is rejected as well.
    if (!(Math.abs(err) <= tolerance)) {
      throw new IllegalStateException("ERROR: the probability distribution sums to " + sum);
    }
  }


  // static methods -------------------------------------------------------------

  /**
   * Reads from STDIN a sequence of lines, each containing two integers,
   * separated by whitespace.  Returns a pair of int arrays containing the
   * values read.  Lines containing only whitespace are ignored.
   *
   * @return a two-element array: the first column at index 0, the second
   *         column at index 1
   * @throws IOException if reading from STDIN fails
   * @throws IllegalArgumentException if a non-blank line does not consist of
   *         exactly two integers
   */
  private static int[][] readInput() throws IOException {
    List<Integer> rVals = new ArrayList<Integer>();
    List<Integer> nVals = new ArrayList<Integer>();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    try {
      String line;
      while ((line = in.readLine()) != null) {
        String trimmed = line.trim();
        if (trimmed.length() == 0) continue; // tolerate blank / trailing empty lines
        String[] tokens = trimmed.split("\\s+");
        if (tokens.length != 2)
          throw new IllegalArgumentException("Line doesn't contain two tokens: " + line);
        rVals.add(Integer.valueOf(tokens[0]));
        nVals.add(Integer.valueOf(tokens[1]));
      }
    } finally {
      in.close(); // also runs when a malformed line aborts the loop
    }
    int[][] result = new int[2][];
    result[0] = integerList2IntArray(rVals);
    result[1] = integerList2IntArray(nVals);
    return result;
  }

  /**
   * Helper to readInput().  Unboxes the given list into an int array of the
   * same length and order.
   */
  private static int[] integerList2IntArray(List<Integer> integers) {
    int[] ints = new int[integers.size()];
    int i = 0;
    for (Integer integer : integers) {
      ints[i++] = integer;
    }
    return ints;
  }


  // main =======================================================================

  /**
   * Like Sampson's SGT program, reads data from STDIN and writes results to
   * STDOUT.  The input should contain two integers on each line, separated by
   * whitespace.  The first integer is a count; the second is a count for that
   * count.  The input must be sorted in ascending order, and should not contain
   * 0s.  For example, valid input is:
   *
   * <pre>
   *   1 10
   *   2 6
   *   3 4
   *   5 2
   *   8 1
   * </pre>
   *
   * This represents a collection in which 10 types occur once each, 6 types
   * occur twice each, 4 types occur 3 times each, 2 types occur 5 times each,
   * and one type occurs 8 times, for a total count of 52.  This input will
   * produce the following output:
   *
   * <pre>
   *      r      n        p       p*
   *   ----   ----     ----     ----
   *      0      0    0.000   0.1923
   *      1     10  0.01923  0.01203
   *      2      6  0.03846  0.02951
   *      3      4  0.05769  0.04814
   *      5      2  0.09615  0.08647
   *      8      1   0.1538   0.1448
   * </pre>
   *
   * The last column represents the smoothed probabilities, and the first item
   * in this column represents the probability assigned to unseen items.
   */
  public static void main(String[] args) throws Exception {
    int[][] input = readInput();
    SimpleGoodTuring sgt = new SimpleGoodTuring(input[0], input[1]);
    sgt.print();
  }

}