All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.stat.GoodTuring Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Smile.  If not, see .
 ******************************************************************************/

package smile.stat;

import smile.math.MathEx;

/**
 * Good–Turing frequency estimation. This technique is for estimating the
 * probability of encountering an object of a hitherto unseen species,
 * given a set of past observations of objects from different species.
 * In drawing balls from an urn, the 'objects' would be balls and the
 * 'species' would be the distinct colors of the balls (finite but
 * unknown in number). After drawing R_red red balls, R_black black
 * balls and , R_green green balls, we would ask what is the probability
 * of drawing a red ball, a black ball, a green ball or one of a
 * previously unseen color.
 * 

* This method takes a set of (frequency, frequency-of-frequency) pairs and * estimate the probabilities corresponding to the observed frequencies, * and P0, the joint probability of all unobserved species. * * @author Haifeng Li */ public class GoodTuring { /** The probabilities corresponding to the observed frequencies. */ public final double[] p; /** The joint probability of all unobserved species. */ public final double p0; /** * Private constructor. */ private GoodTuring(double[] p, double p0) { this.p = p; this.p0 = p0; } /** * Returns the fitted linear function value y = intercept + slope * log(x). */ private static double smoothed(int x, double slope, double intercept) { return Math.exp(intercept + slope * Math.log(x)); } /** * Returns the index of given frequency. * @param r the frequency list. * @param f the given frequency. * @return the index of given frequency or -1 if it doesn't exist in the list. */ private static int row(int[] r, int f) { int i = 0; while (i < r.length && r[i] < f) { ++i; } return ((i < r.length && r[i] == f) ? i : -1); } /** * Good–Turing frequency estimation. * * @param r the frequency in ascending order. * @param Nr the frequency of frequencies. * @return the estimation object. */ public static GoodTuring of(int[] r, int[] Nr) { final double CONFID_FACTOR = 1.96; if (r.length != Nr.length) { throw new IllegalArgumentException("The sizes of r and Nr are not same."); } int len = r.length; double[] p = new double[len]; double[] logR = new double[len]; double[] logZ = new double[len]; double[] Z = new double[len]; int N = 0; for (int j = 0; j < len; ++j) { N += r[j] * Nr[j]; } int n1 = (r[0] != 1) ? 0 : Nr[0]; double p0 = n1 / (double) N; for (int j = 0; j < len; ++j) { int q = j == 0 ? 0 : r[j - 1]; int t = j == len - 1 ? 2 * r[j] - q : r[j + 1]; Z[j] = 2.0 * Nr[j] / (t - q); logR[j] = Math.log(r[j]); logZ[j] = Math.log(Z[j]); } // Simple linear regression. double XYs = 0.0, Xsquares = 0.0, meanX = 0.0, meanY = 0.0; for (int i = 0; i < len; ++i) { meanX += logR[i]; meanY += logZ[i]; } meanX /= len; meanY /= len; for (int i = 0; i < len; ++i) { XYs += (logR[i] - meanX) * (logZ[i] - meanY); Xsquares += MathEx.sqr(logR[i] - meanX); } double slope = XYs / Xsquares; double intercept = meanY - slope * meanX; boolean indiffValsSeen = false; for (int j = 0; j < len; ++j) { double y = (r[j] + 1) * smoothed(r[j] + 1, slope, intercept) / smoothed(r[j], slope, intercept); if (row(r, r[j] + 1) < 0) { indiffValsSeen = true; } if (!indiffValsSeen) { int n = Nr[row(r, r[j] + 1)]; double x = (r[j] + 1) * n / (double) Nr[j]; if (Math.abs(x - y) <= CONFID_FACTOR * Math.sqrt(MathEx.sqr(r[j] + 1.0) * n / MathEx.sqr(Nr[j]) * (1 + n / (double) Nr[j]))) { indiffValsSeen = true; } else { p[j] = x; } } if (indiffValsSeen) { p[j] = y; } } double Nprime = 0.0; for (int j = 0; j < len; ++j) { Nprime += Nr[j] * p[j]; } for (int j = 0; j < len; ++j) { p[j] = (1 - p0) * p[j] / Nprime; } return new GoodTuring(p, p0); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy