picard.fingerprint.HaplotypeProbabilities Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.fingerprint;
import java.util.Arrays;
import static java.lang.Math.log10;
import static picard.util.MathUtil.multiply;
import static picard.util.MathUtil.pNormalizeVector;
/**
* Abstract class for storing and calculating various likelihoods and probabilities
* for haplotype alleles given evidence.
*
* @author Tim Fennell
*/
public abstract class HaplotypeProbabilities {
private final HaplotypeBlock haplotypeBlock;
protected HaplotypeProbabilities(final HaplotypeBlock haplotypeBlock) {
this.haplotypeBlock = haplotypeBlock;
}
/** Returns the haplotype for which the probabilities apply. */
public HaplotypeBlock getHaplotype() {
return this.haplotypeBlock;
}
public double [] getPriorProbablities(){
return getHaplotype().getHaplotypeFrequencies();
}
/** Returns the probabilities, in order, of the AA, Aa and aa haplotypes.
* Mathematically, this is P(H | D, F) where and H is the vector of possible haplotypes {AA,Aa,aa}.
* D is the data seen by the class, and
* F is the population frequency of each genotype.
*/
/** Returns the posterior probabilities using the population frequency as a prior. */
public double[] getPosteriorProbabilities() {
return pNormalizeVector(multiply(getLikelihoods(), getPriorProbablities()));}
/**
* Returns the likelihoods, in order, of the AA, Aa and aa haplotypes given the evidence
*
* Mathematically this is P(evidence | haplotype) where haplotype={AA,Aa,aa}.
*/
public abstract double[] getLikelihoods();
public double[] getLogLikelihoods() {
final double[] likelihoods = getLikelihoods();
final double[] lLikelihoods = new double [likelihoods.length];
for (int i = 0; i < likelihoods.length; ++i) {
lLikelihoods[i] = Math.log10(likelihoods[i]);
}
return lLikelihoods;
}
/**
* Returns a representative SNP for this haplotype. Different subclasses may implement this in
* different ways, but should do so in a deterministic/repeatable fashion.
*/
public abstract Snp getRepresentativeSnp();
/**
* Returns the number of observations of alleles supporting the first/major haplotype allele.
* Strictly this doesn't make sense for all subclasses, but it's nice to have it part of the API so
* a default implementation is provided here.
* @return int
*/
public int getObsAllele1() { return 0; }
/**
* Returns the number of observations of alleles supporting the second/minor haplotype allele.
* Strictly this doesn't make sense for all subclasses, but it's nice to have it part of the API so
* a default implementation is provided here.
* @return int
*/
public int getObsAllele2() { return 0; }
/**
* Returns the total number of observations of any allele.
* Strictly this doesn't make sense for all subclasses, but it's nice to have it part of the API so
* a default implementation is provided here.
* @return int
*/
public int getTotalObs() { return 0; }
/** Returns true if evidence has been added, false if the probabilities are just the priors. */
public boolean hasEvidence() {
return true;
}
/** Merges in the likelihood information from the supplied haplotype probabilities object. */
public abstract void merge(final HaplotypeProbabilities other);
/**
* Returns the index of the highest probability which can then be used to look up
* DiploidHaplotypes or DiploidGenotypes as appropriate.
*/
int getMostLikelyIndex() {
final double[] probs = getPosteriorProbabilities();
if (probs[0] > probs[1] && probs[0] > probs[2]) return 0;
else if (probs[1] > probs[2]) return 1;
else return 2;
}
/** Gets the most likely haplotype given the probabilities. */
public DiploidHaplotype getMostLikelyHaplotype() {
return DiploidHaplotype.values()[getMostLikelyIndex()];
}
/** Gets the genotype for this Snp given the most likely haplotype. */
public DiploidGenotype getMostLikelyGenotype(final Snp snp) {
assertSnpPartOfHaplotype(snp);
return snp.getGenotype(getMostLikelyHaplotype());
}
/** Throws an exception if the passed SNP is not part of this haplotype. */
void assertSnpPartOfHaplotype(final Snp snp) {
if (!this.haplotypeBlock.getSnps().contains(snp)) {
throw new IllegalArgumentException("Snp " + snp + " does not belong to haplotype " + this.haplotypeBlock);
}
}
/** This function returns the scaled probability of the evidence collected
* given a vector of priors on the haplotype using the internal likelihood, which may be
* scaled by an unknown factor. This factor causes the result to be scaled, hence the name.
*
* Mathematically:
*
* P(Evidence| P(h_i)=F_i) = \sum_i P(Evidence | h_i) P(h_i)
* = \sum_i P(Evidence | h_i) F_i
* = c * \sum_i Likelihood_i * F_i
*
* Here, h_i are the three possible haplotypes, F_i are the given priors, and Likelihood_i
* are the stored likelihoods which are scaled from the actually likelihoods by an unknown
* factor, c. Note that the calculation ignores the internal haplotype probabilities (i.e. priors)
*
* @param genotypeFrequencies vector of (possibly scaled) probabilities of the three haplotypes
* @return P(evidence | P_h)) / c
*/
public double scaledEvidenceProbabilityUsingGenotypeFrequencies(final double[] genotypeFrequencies) {
final double[] likelihoods = getLikelihoods();
assert (genotypeFrequencies.length == likelihoods.length);
double result = 0;
for (int i = 0; i < likelihoods.length; ++i) {
result += likelihoods[i] * genotypeFrequencies[i];
}
return result;
}
public double shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(final double[] genotypeFrequencies) {
return log10(scaledEvidenceProbabilityUsingGenotypeFrequencies(genotypeFrequencies));
}
/**
* returns the log-probability the evidence, using as priors the posteriors of another object
*
* @param otherHp an additional HaplotypeProbabilities object representing the same underlying HaplotypeBlock
* @return log10( P(evidence| P(h_i)=P(h_i|otherHp) ) + c where c is an unknown constant
*/
public double shiftedLogEvidenceProbabilityGivenOtherEvidence(final HaplotypeProbabilities otherHp) {
if (!this.haplotypeBlock.equals(otherHp.getHaplotype())) {
throw new IllegalArgumentException("Haplotypes are from different HaplotypeBlocks!");
}
/** Get the posterior from the other otherHp. Use this posterior as the prior to calculate probability.
*
* P(hap|x,y) = P(x|hap,y) P(hap|y) / P(x|y)
* = P(x | hap) * P(hap | y) / P(x)
* likelihood * other.posterior
*
* = P(x|hap) P(y|hap) P(hap)/P(x)P(y)
* = A P(x| hap) P(y| hap) P(hap) # where A is an unknown scaling factor
*/
return shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(otherHp.getPosteriorProbabilities());
}
/**
* Returns log (p(evidence)) + c assuming that the prior on haplotypes is given by
* the internal haplotypeFrequencies
*/
public double shiftedLogEvidenceProbability() {
return shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(getPriorProbablities());
}
/** Returns the LOD score between the most probable haplotype and the second most probable. */
public double getLodMostProbableGenotype() {
final double[] probs = getPosteriorProbabilities();
final double[] logs = new double[probs.length];
for (int i = 0; i < probs.length; ++i) {
logs[i] = log10(probs[i]);
}
Arrays.sort(logs);
return logs[2] - logs[1];
}
/** Log10(P(evidence| haplotype)) for the 3 different possible haplotypes
* {aa, ab, bb}
*/
//an enum whose only role in life is to help iterate over the 3 possible diploid genotypes
protected enum Genotype {
HOM_ALLELE1(0),
HET_ALLELE12(1),
HOM_ALLELE2(2);
int v; //value is the number of chromosomes in the genotype that have ALLELE2.
Genotype(final int v) {
this.v = v;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy