edu.stanford.nlp.stats.SimpleGoodTuring Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation
Show all versions of stanford-parser Show documentation
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.stats;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
/**
 * Simple Good-Turing smoothing, based on code from Sampson, available at:
 * ftp://ftp.informatics.susx.ac.uk/pub/users/grs2/SGT.c
 *
 * <p>See also http://www.grsampson.net/RGoodTur.html
 *
 * <p>An instance is built from a table of (count, count-of-count) pairs and
 * exposes the smoothed probability for each count plus the probability mass
 * reserved for unseen types.
 *
 * @author Bill MacCartney
 */
public class SimpleGoodTuring {

  /** Minimum number of (count, count-of-count) rows accepted by the constructor. */
  private static final int MIN_INPUT = 5;

  /**
   * Multiplier applied to the estimated standard deviation when deciding
   * whether the two estimates for a count differ significantly
   * (1.96 presumably corresponds to a 95% confidence interval).
   */
  private static final double CONFID_FACTOR = 1.96;

  /** Maximum allowed deviation from 1.0 of the total smoothed probability. */
  private static final double TOLERANCE = 1e-12;

  private final int[] r;    // for each bucket, a frequency
  private final int[] n;    // for each bucket, number of items w that frequency
  private final int rows;   // number of frequency buckets
  private long bigN = 0;    // total count of all items (long: r * n can exceed int range)
  private double pZero;     // probability of unseen items
  private double bigNPrime; // sum over buckets of n[j] * rStar[j]
  private double slope;     // slope of the log-log regression line
  private double intercept; // intercept of the log-log regression line
  private double[] z;       // averaged counts of counts, one per bucket
  private double[] logR;    // log(r[j])
  private double[] logZ;    // log(z[j])
  private double[] rStar;   // adjusted frequency for each bucket
  private double[] p;       // smoothed probability for each bucket

  /**
   * Each instance of this class encapsulates the computation of the smoothing
   * for one probability distribution. The constructor takes two arguments
   * which are two parallel arrays. The first is an array of counts, which must
   * be positive and in strictly ascending order. The second is an array of
   * corresponding counts of counts; that is, for each i, n[i] represents the
   * number of types which occurred with count r[i] in the underlying
   * collection. See the documentation for main() for a concrete example.
   *
   * @param r the distinct counts, positive and strictly ascending
   * @param n for each i, the (positive) number of types having count r[i]
   * @throws IllegalArgumentException if either array is null, the arrays
   *         differ in length, there are fewer than 5 rows, r is not positive
   *         and strictly ascending, or n contains a non-positive value
   * @throws IllegalStateException if the resulting probabilities do not sum
   *         to one (within a small tolerance)
   */
  public SimpleGoodTuring(int[] r, int[] n) {
    if (r == null) throw new IllegalArgumentException("r must not be null!");
    if (n == null) throw new IllegalArgumentException("n must not be null!");
    if (r.length != n.length) throw new IllegalArgumentException("r and n must have same size!");
    if (r.length < MIN_INPUT) throw new IllegalArgumentException("r must have size >= " + MIN_INPUT + "!");
    // The computation takes logarithms of r and of values derived from n and
    // looks buckets up by scanning r in order, so reject input that would
    // otherwise yield NaN or meaningless results.
    for (int i = 0; i < r.length; i++) {
      if (r[i] <= 0) {
        throw new IllegalArgumentException("r must contain only positive counts, but r[" + i + "] = " + r[i]);
      }
      if (i > 0 && r[i] <= r[i - 1]) {
        throw new IllegalArgumentException("r must be in strictly ascending order, but r[" + (i - 1) + "] = "
            + r[i - 1] + " and r[" + i + "] = " + r[i]);
      }
      if (n[i] <= 0) {
        throw new IllegalArgumentException("n must contain only positive counts, but n[" + i + "] = " + n[i]);
      }
    }
    this.r = new int[r.length];
    this.n = new int[n.length];
    System.arraycopy(r, 0, this.r, 0, r.length); // defensive copy
    System.arraycopy(n, 0, this.n, 0, n.length); // defensive copy
    this.rows = r.length;
    compute();
    validate(TOLERANCE);
  }

  /**
   * Returns the probability allocated to types not seen in the underlying
   * collection.
   */
  public double getProbabilityForUnseen() {
    return pZero;
  }

  /**
   * Returns the probabilities allocated to each type, according to their count
   * in the underlying collection. The returned array parallels the arrays
   * passed in to the constructor. If the returned array is designated p, then
   * for all i, p[i] represents the smoothed probability assigned to types which
   * occurred r[i] times in the underlying collection (where r is the first
   * argument to the constructor).
   *
   * @return a fresh copy of the probabilities; modifying it does not affect
   *         this object
   */
  public double[] getProbabilities() {
    return p.clone();
  }

  /**
   * Runs the whole smoothing computation: totals the counts, sets the
   * probability of unseen items, fits a line to (log r, log z), chooses an
   * adjusted frequency rStar for every bucket, and normalizes the result
   * into the probabilities p.
   */
  private void compute() {
    boolean indiffValsSeen = false;
    z = new double[rows];
    logR = new double[rows];
    logZ = new double[rows];
    rStar = new double[rows];
    p = new double[rows];

    // count all items; widen before multiplying so large inputs cannot overflow
    for (int j = 0; j < rows; ++j) bigN += (long) r[j] * n[j];

    // mass for unseen items is (number of types seen once) / (total count),
    // or 0 when no bucket has frequency 1
    int nextN = row(1);
    pZero = (nextN < 0) ? 0 : n[nextN] / (double) bigN;

    // z[j] spreads n[j] over the gap between the neighbouring frequencies;
    // the last bucket mirrors the distance to its predecessor
    for (int j = 0; j < rows; ++j) {
      int i = (j == 0 ? 0 : r[j - 1]);
      double k;
      if (j == rows - 1)
        k = 2.0 * r[j] - i;
      else
        k = (double) r[j + 1];
      z[j] = 2.0 * n[j] / (k - i);
      logR[j] = Math.log(r[j]);
      logZ[j] = Math.log(z[j]);
    }

    findBestFit();

    for (int j = 0; j < rows; ++j) {
      double next = r[j] + 1.0;
      // y: estimate based on the fitted regression line
      double y = next * smoothed(next) / smoothed(r[j]);
      if (row(r[j] + 1) < 0)
        indiffValsSeen = true; // no bucket for r + 1, so x cannot be computed
      if (!indiffValsSeen) {
        // x: estimate based directly on the observed counts of counts
        nextN = n[row(r[j] + 1)];
        double x = next * nextN / (double) n[j];
        if (Math.abs(x - y) <= CONFID_FACTOR * Math.sqrt(sq(next)
            * nextN / (sq((double) n[j]))
            * (1 + nextN / (double) n[j])))
          indiffValsSeen = true; // difference not significant: use y from here on
        else
          rStar[j] = x;
      }
      // once the switch has happened, every remaining bucket uses y
      if (indiffValsSeen)
        rStar[j] = y;
    }

    bigNPrime = 0.0;
    for (int j = 0; j < rows; ++j)
      bigNPrime += n[j] * rStar[j];
    // scale so that the seen types share the mass (1 - pZero)
    for (int j = 0; j < rows; ++j)
      p[j] = (1 - pZero) * rStar[j] / bigNPrime;
  }

  /**
   * Returns the index of the bucket having the given frequency, or else -1 if no
   * bucket has the given frequency.
   */
  private int row(int freq) {
    int i = 0;
    while (i < rows && r[i] < freq) i++;
    return ((i < rows && r[i] == freq) ? i : -1);
  }

  /**
   * Fits a least-squares line to the points (logR[i], logZ[i]) and stores the
   * result in slope and intercept.
   */
  private void findBestFit() {
    double xys = 0.0;
    double xSquares = 0.0;
    double meanX = 0.0;
    double meanY = 0.0;
    for (int i = 0; i < rows; ++i) {
      meanX += logR[i];
      meanY += logZ[i];
    }
    meanX /= rows;
    meanY /= rows;
    for (int i = 0; i < rows; ++i) {
      xys += (logR[i] - meanX) * (logZ[i] - meanY);
      xSquares += sq(logR[i] - meanX);
    }
    slope = xys / xSquares;
    intercept = meanY - slope * meanX;
  }

  /**
   * Returns the value predicted by the fitted log-log line for frequency i,
   * that is exp(intercept + slope * log(i)).
   */
  private double smoothed(double i) {
    return (Math.exp(intercept + slope * Math.log(i)));
  }

  /** Returns x squared. */
  private static double sq(double x) {
    return (x * x);
  }

  /**
   * Writes a table to STDOUT with one row for unseen items followed by one row
   * per bucket: frequency, count of count, unsmoothed probability (r / total
   * count) and smoothed probability.
   */
  private void print() {
    System.out.printf("%6s %6s %8s %8s%n", "r", "n", "p", "p*");
    System.out.printf("%6s %6s %8s %8s%n", "----", "----", "----", "----");
    System.out.printf("%6d %6d %8.4g %8.4g%n", 0, 0, 0.0, pZero);
    for (int i = 0; i < rows; ++i)
      System.out.printf("%6d %6d %8.4g %8.4g%n", r[i], n[i], 1.0 * r[i] / bigN, p[i]);
  }

  /**
   * Ensures that we have a proper probability distribution.
   *
   * @param tolerance maximum allowed absolute difference between the total
   *        probability and 1.0
   * @throws IllegalStateException if the total differs from 1.0 by more than
   *         the tolerance, or is not a number
   */
  private void validate(double tolerance) {
    double sum = pZero;
    for (int i = 0; i < n.length; i++) {
      sum += (n[i] * p[i]);
    }
    double err = 1.0 - sum;
    // written as a negated <= so that a NaN sum is rejected as well
    if (!(Math.abs(err) <= tolerance)) {
      throw new IllegalStateException("ERROR: the probability distribution sums to " + sum);
    }
  }

  // static methods -------------------------------------------------------------

  /**
   * Reads from STDIN a sequence of lines, each containing two integers,
   * separated by whitespace. Blank lines are ignored. Returns a pair of int
   * arrays containing the values read: element 0 holds the first column,
   * element 1 the second.
   *
   * @throws IllegalArgumentException if a non-blank line does not consist of
   *         exactly two integers
   */
  private static int[][] readInput() throws Exception {
    List<Integer> rVals = new ArrayList<Integer>();
    List<Integer> nVals = new ArrayList<Integer>();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    try {
      String line;
      while ((line = in.readLine()) != null) {
        String trimmed = line.trim();
        if (trimmed.length() == 0) continue; // tolerate blank / trailing empty lines
        String[] tokens = trimmed.split("\\s+");
        if (tokens.length != 2)
          throw new IllegalArgumentException("Line doesn't contain two tokens: " + line);
        rVals.add(Integer.valueOf(tokens[0]));
        nVals.add(Integer.valueOf(tokens[1]));
      }
    } finally {
      in.close(); // close even when a line fails to parse
    }
    int[][] result = new int[2][];
    result[0] = integerList2IntArray(rVals);
    result[1] = integerList2IntArray(nVals);
    return result;
  }

  /**
   * Helper to readInput(): unboxes a list of Integers into an int array.
   */
  private static int[] integerList2IntArray(List<Integer> integers) {
    int[] ints = new int[integers.size()];
    int i = 0;
    for (Integer integer : integers) {
      ints[i++] = integer;
    }
    return ints;
  }

  // main =======================================================================

  /**
   * Like Sampson's SGT program, reads data from STDIN and writes results to
   * STDOUT. The input should contain two integers on each line, separated by
   * whitespace. The first integer is a count; the second is a count for that
   * count. The input must be sorted in ascending order, and should not contain
   * 0s. For example, valid input is:
   *
   * <pre>
   *   1 10
   *   2 6
   *   3 4
   *   5 2
   *   8 1
   * </pre>
   *
   * This represents a collection in which 10 types occur once each, 6 types
   * occur twice each, 4 types occur 3 times each, 2 types occur 5 times each,
   * and one type occurs 8 times, for a total count of 52. This input will
   * produce the following output:
   *
   * <pre>
   *      r      n        p       p*
   *   ----   ----     ----     ----
   *      0      0    0.000   0.1923
   *      1     10  0.01923  0.01203
   *      2      6  0.03846  0.02951
   *      3      4  0.05769  0.04814
   *      5      2  0.09615  0.08647
   *      8      1   0.1538   0.1448
   * </pre>
   *
   * The last column represents the smoothed probabilities, and the first item
   * in this column represents the probability assigned to unseen items.
   */
  public static void main(String[] args) throws Exception {
    int[][] input = readInput();
    SimpleGoodTuring sgt = new SimpleGoodTuring(input[0], input[1]);
    sgt.print();
  }
}