org.monarchinitiative.phenol.stats.Hypergeometric Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phenol-analysis Show documentation
Show all versions of phenol-analysis Show documentation
phenol-analysis implements some algorithms for working with ontology data
package org.monarchinitiative.phenol.stats;
import java.util.Vector;
/**
* Class with static methods to calculate probabilities according to the
* hypergeometric distribution.
*
* @author Peter N. Robinson, Sebastian Bauer
*/
public class Hypergeometric {
/**
* Initialize the object lfactorial, which will act as a cache for log
* factorial calculations.
*/
public Hypergeometric() {
lfactorial = new Vector<>();
lfactorial.add(0, 0.0); /* 0! = 1, therefore let log(0)=0 */
lfactorial.add(1, 0.0);
}
/**
* This Vector contains log factorials for each index value and acts as a
* cache.
*/
private final Vector lfactorial;
/**
*
* For the hypergeometric distribution note the following.
*
*
* - We set up the problem as sampling a set of genes (study genes, for
* instance, the set of upregulated genes in some microarray experiment)
* from a larger set of genes (say, the set of all genes of a species). The
* sampling is done without replacement.
* - For each GO term, we can conceive of the population as being divided
* into genes annotated to this term and those genes that are not annotated
* to the term.
* - The probability of having a certain number of terms annotated to the
* term in the study set can then be calculated by the hypergeometric
* distribution. See GeneMerge by Castillo-Davis et al (Bioinformatics).
*
* - To do this, we need to divide the population into two groups, genes
* with and without annotation. The arguments to the function supply us with
* n, the total number of genes in the population group, and p
* , the proportion of genes with annotation to the term in question.
* We can then calculate the number of genes in the population annotated to
* the term by round(n*p), and the number of genes not annotated to
* the term by round(n*(1-p)).
*
*
* @param n Number of population genes
* @param p Proportion of population genes
* @param k Number of study genes
* @param r Number of study genes in group
*/
public double phypergeometric(int n, double p, int k, int r) {
/*
* Study group cannot be larger than population. If this happens there
* is probably something wrong with the input data, but returning 1.0
* prevents confusing and wrong output.
*/
if (k >= n)
return 1.0;
if (r < 1) {
return 1.0; // Not valid for r < 2, less than 2 study genes.
}
double q = 1.0 - p;
int np = (int) Math.round(n * p); // Round to nearest int
int nq = (int) Math.round(n * q);
double log_n_choose_k = lNchooseK(n, k);
int top = k;
if (np < k) {
top = np;
}
double lfoo = lNchooseK(np, top) + lNchooseK(nq, k - top);
double sum = 0.0;
for (int i = top; i >= r; --i) {
sum += Math.exp(lfoo - log_n_choose_k);
if (i > r) {
lfoo = lfoo
+ Math.log((double) i / (double) (np - i + 1))
+ Math.log((double) (nq - k + i)
/ (double) (k - i + 1));
}
}
return sum;
}
/**
* @param m Number of population genes
* @param m_t Number of population genes annotated to Term t
* @param n Number of study genes
* @param n_t Number of study genes annotated to Term t
* @return Fisher exact test probability of this degree of overrepresentation of Term t
*/
public double phypergeometric(int m, int m_t, int n, int n_t) {
/*
* Study group cannot be larger than population. If this happens there
* is probably something wrong with the input data, but returning 1.0
* prevents confusing and wrong output.
*/
if (n >= m)
return 1.0;
if (n_t < 1) {
return 1.0; // Not valid for r < 2, less than 2 study genes.
}
if (m_t>m) {
return 1.0; // Not valid, probably something wrong with input data
}
int nq = m - m_t;
double log_n_choose_k = lNchooseK(m, n);
int top = n;
if (m_t < n) {
top = m_t;
}
double lfoo = lNchooseK(m_t, top) + lNchooseK(nq, n - top);
double sum = 0.0;
for (int i = top; i >= n_t; --i) {
sum += Math.exp(lfoo - log_n_choose_k);
if (i > n_t) {
lfoo = lfoo
+ Math.log((double) i / (double) (m_t - i + 1))
+ Math.log((double) (nq - n + i)
/ (double) (n - i + 1));
}
}
return sum;
}
/**
* Calculates the probability that if you draw n balls from
* an urn without replacement containing N balls where M among
* them are white (and so N-M are black) you will get x white
* balls.
*
* @param x number of white balls drawn without replacement
* @param N number of balls in the urn
* @param M number of white balls in the urn
* @param n number of balls drawn from the urn
* @return the probability
*/
public double dhyper(int x, int N, int M, int n) {
/* It is not possible to draw more white balls
* from an urn containing M white balls. Hence
* the probability is 0.
*/
if (x > M) return 0;
/* Of course it is also not possible to draw
* more white balls than the number of drawings.
* The probability is 0.
*/
if (x > n) return 0;
/* Last but not least, it is also not possible
* to draw more black balls than there are within
* the urn.
*/
if (n - x > N - M) return 0;
return Math.exp(lNchooseK(M, x) + lNchooseK(N - M, n - x) - lNchooseK(N, n));
}
/**
* Calculates P(X > x) where X is the hypergeometric distribution
* with indices N,M,n. If lowerTail is set to true, then P(X <= x)
* is calculated.
*
* @param x number of white balls drawn without replacement
* @param N number of balls in the urn
* @param M number of white balls in the urn
* @param n number of balls drawn from the urn
* @param lowerTail defines if the lower tail should be calculated, i.e., if the
* parameter is set to true then P(X <= x) is calculated, otherwise P(X > x) is
* calculated.
* @return the probability
*/
public double phyper(int x, int N, int M, int n, boolean lowerTail) {
int i;
int up;
double p;
up = Math.min(n, M);
p = 0;
if (x < up / 2) {
for (i = x; i >= 0; i--)
p += dhyper(i, N, M, n);
if (lowerTail) return p;
else return 1 - p;
} else {
for (i = x + 1; i <= up; i++)
p += dhyper(i, N, M, n);
if (lowerTail) return 1 - p;
else return p;
}
}
public double lNchooseK(int n, int k) {
double ans;
ans = logfact(n) - logfact(k) - logfact(n - k);
return ans;
}
/**
* return the log factorial of i. Use a cache to avoid repeatedly
* calculating this. If we have a cache miss, fill up all values from the
* last valid cache value to the value we currently need.
*/
public double logfact(int i) {
/*
* Make sure value is already in lfactorial. If not, calculate all
* values up to that for i
*/
if (i > (lfactorial.size() - 1)) {
for (int j = lfactorial.size(); j <= i; j++) {
double lf = lfactorial.get(j - 1)
+ Math.log(j);
lfactorial.add(j, lf);
}
}
return lfactorial.get(i);
}
}