org.biojava.nbio.phylo.DistanceMatrixCalculator Maven / Gradle / Ivy
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.phylo;
import java.io.IOException;
import java.util.List;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.sequence.MultipleSequenceAlignment;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.forester.evoinference.distance.PairwiseDistanceCalculator;
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.matrix.distance.DistanceMatrix;
import org.forester.msa.Msa;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The DistanceMatrixCalculator methods generate a {@link DistanceMatrix} from a
* {@link MultipleSequenceAlignment} or other indirect distance infomation (RMSD).
*
* @author Aleix Lafita
* @since 4.1.1
*
*/
public class DistanceMatrixCalculator {
private static final Logger logger = LoggerFactory
.getLogger(DistanceMatrixCalculator.class);
/** Prevent instantiation */
private DistanceMatrixCalculator() {}
/**
* The fractional dissimilarity (D) is defined as the percentage of sites
* that differ between two aligned sequences. The percentage of identity
* (PID) is the fraction of identical sites between two aligned sequences.
*
*
* D = 1 - PID
*
*
* The gapped positons in the alignment are ignored in the calculation. This
* method is a wrapper to the forester implementation of the calculation:
* {@link PairwiseDistanceCalculator#calcFractionalDissimilarities(Msa)}
*
* @param msa
* MultipleSequenceAlignment
* @return DistanceMatrix
* @throws Exception
*/
public static , D extends Compound> DistanceMatrix fractionalDissimilarity(
MultipleSequenceAlignment msa) throws IOException {
Msa fMsa = ForesterWrapper.convert(msa);
DistanceMatrix DM = PairwiseDistanceCalculator
.calcFractionalDissimilarities(fMsa);
return DM;
}
/**
* The Poisson (correction) evolutionary distance (d) is a function of the
* fractional dissimilarity (D), given by:
*
*
* d = -log(1 - D)
*
*
* The gapped positons in the alignment are ignored in the calculation. This
* method is a wrapper to the forester implementation of the calculation:
* {@link PairwiseDistanceCalculator#calcPoissonDistances(Msa)}
*
* @param msa
* MultipleSequenceAlignment
* @return DistanceMatrix
* @throws IOException
*/
public static , D extends Compound> DistanceMatrix poissonDistance(
MultipleSequenceAlignment msa) throws IOException {
Msa fMsa = ForesterWrapper.convert(msa);
DistanceMatrix DM = PairwiseDistanceCalculator
.calcPoissonDistances(fMsa);
return DM;
}
/**
* The Kimura evolutionary distance (d) is a correction of the fractional
* dissimilarity (D) specially needed for large evolutionary distances. It
* is given by:
*
*
* d = -log(1 - D - 0.2 * D2)
*
*
* The equation is derived by fitting the relationship between the
* evolutionary distance (d) and the fractional dissimilarity (D) according
* to the PAM model of evolution (it is an empirical approximation for the
* method {@link #pamDistance(MultipleSequenceAlignment}). The gapped
* positons in the alignment are ignored in the calculation. This method is
* a wrapper to the forester implementation of the calculation:
* {@link PairwiseDistanceCalculator#calcKimuraDistances(Msa)}.
*
* @param msa
* MultipleSequenceAlignment
* @return DistanceMatrix
* @throws IOException
*/
public static , D extends Compound> DistanceMatrix kimuraDistance(
MultipleSequenceAlignment msa) throws IOException {
Msa fMsa = ForesterWrapper.convert(msa);
DistanceMatrix DM = PairwiseDistanceCalculator
.calcPoissonDistances(fMsa);
return DM;
}
/**
* BioJava implementation for percentage of identity (PID). Although the
* name of the method is percentage of identity, the DistanceMatrix contains
* the fractional dissimilarity (D), computed as D = 1 - PID.
*
* It is recommended to use the method
* {@link DistanceMatrixCalculator#fractionalDissimilarity(MultipleSequenceAlignment)}
* instead of this one.
*
* @param msa
* MultipleSequenceAlignment
* @return DistanceMatrix
*/
public static , D extends Compound> DistanceMatrix percentageIdentity(
MultipleSequenceAlignment msa) {
logger.info("{}:{}", "Determing Distances", 0);
int n = msa.getSize();
String[] sequenceString = new String[n];
for (int i = 0; i < n; i++) {
sequenceString[i] = msa.getAlignedSequence(i + 1)
.getSequenceAsString();
}
DistanceMatrix distance = new BasicSymmetricalDistanceMatrix(n);
int totalloopcount = (n / 2) * (n + 1);
int loopcount = 0;
for (int i = 0; i < (n - 1); i++) {
logger.info("{}:{}", "Determining Distances", (loopcount * 100)
/ totalloopcount);
distance.setIdentifier(i, msa.getAlignedSequence(i + 1)
.getAccession().getID());
for (int j = i; j < n; j++) {
loopcount++;
if (j == i) {
distance.setValue(i, j, 0);
} else {
distance.setValue(i, j, 100 - Comparison.PID(
sequenceString[i], sequenceString[j]));
distance.setValue(j, i, distance.getValue(i, j));
}
}
}
logger.info("{}:{}", "Determining Distances", 100);
return distance;
}
/**
* The fractional dissimilarity score (Ds) is a relative measure of the
* dissimilarity between two aligned sequences. It is calculated as:
*
*
* Ds = sum( max(M) - Mai,bi ) / (max(M)-min(M)) ) / L
*
*
* Where the sum through i runs for all the alignment positions, ai and bi
* are the AA at position i in the first and second aligned sequences,
* respectively, and L is the total length of the alignment (normalization).
*
* The fractional dissimilarity score (Ds) is in the interval [0, 1], where
* 0 means that the sequences are identical and 1 that the sequences are
* completely different.
*
* Gaps do not have a contribution to the similarity score calculation (gap
* penalty = 0)
*
* @param msa
* MultipleSequenceAlignment
* @param M
* SubstitutionMatrix for similarity scoring
* @return DistanceMatrix
*/
public static , D extends Compound> DistanceMatrix fractionalDissimilarityScore(
MultipleSequenceAlignment msa, SubstitutionMatrix M) {
// Calculate the similarity scores using the alignment package
logger.info("{}:{}", "Determing Distances", 0);
int n = msa.getSize();
DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n);
int totalloopcount = (n / 2) * (n + 1);
int end = msa.getLength();
String[] sequenceString = new String[n];
for (int i = 0; i < n; i++) {
sequenceString[i] = msa.getAlignedSequence(i + 1)
.getSequenceAsString();
}
List seqs = msa.getAlignedSequences();
int loopcount = 0;
for (int i = 0; i < (n - 1); i++) {
logger.info("{}:{}", "Determining Distances", (loopcount * 100)
/ totalloopcount);
// Obtain the similarity scores
for (int j = i; j < n; j++) {
double score = 0;
loopcount++;
for (int k = 0; k < end; k++) {
if (Comparison.isGap(sequenceString[i].charAt(k))
|| Comparison.isGap(sequenceString[j].charAt(k)))
continue;
score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs
.get(j).getCompoundAt(k + 1));
}
if (i == j)
DM.setValue(i, j, 0.0);
else {
double dS = (M.getMaxValue() - score / msa.getLength())
/ (M.getMaxValue() - M.getMinValue());
DM.setValue(i, j, dS);
DM.setValue(j, i, dS);
}
}
}
return DM;
}
/**
* The dissimilarity score is the additive inverse of the similarity score
* (sum of scores) between two aligned sequences using a substitution model
* (Substitution Matrix). The maximum dissimilarity score is taken to be the
* maximum similarity score between self-alignments (each sequence against
* itself). Calculation of the score is as follows:
*
*
* Ds = maxScore - sumi(Mai,bi)
*
*
* It is recommended to use the method
* {@link #fractionalDissimilarityScore(MultipleSequenceAlignment, SubstitutionMatrix)}
* , since the maximum similarity score is not relative to the data set, but
* relative to the Substitution Matrix, and the score is normalized by the
* alignment length (fractional).
*
* Gaps do not have a contribution to the similarity score calculation (gap
* penalty = 0).
*
* @param msa
* MultipleSequenceAlignment
* @param M
* SubstitutionMatrix for similarity scoring
* @return DistanceMatrix
*/
public static , D extends Compound> DistanceMatrix dissimilarityScore(
MultipleSequenceAlignment msa, SubstitutionMatrix M) {
logger.info("{}:{}", "Determing Distances", 0);
int n = msa.getSize();
String[] sequenceString = new String[n];
for (int i = 0; i < n; i++) {
sequenceString[i] = msa.getAlignedSequence(i + 1)
.getSequenceAsString();
}
List seqs = msa.getAlignedSequences();
DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n);
int totalloopcount = (n / 2) * (n + 1);
double maxscore = 0;
int end = msa.getLength();
int loopcount = 0;
for (int i = 0; i < (n - 1); i++) {
logger.info("{}:{}", "Determining Distances", (loopcount * 100)
/ totalloopcount);
// Obtain the similarity scores
for (int j = i; j < n; j++) {
double score = 0;
loopcount++;
for (int k = 0; k < end; k++) {
if (Comparison.isGap(sequenceString[i].charAt(k))
|| Comparison.isGap(sequenceString[j].charAt(k)))
continue;
score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs
.get(j).getCompoundAt(k + 1));
}
if (i != j){
score = Math.max(score, 0.0);
DM.setValue(i, j, score);
}
if (score > maxscore)
maxscore = score;
}
}
for (int i = 0; i < n; i++) {
DM.setIdentifier(i, msa.getAlignedSequence(i + 1).getAccession()
.getID());
for (int j = i; j < n; j++) {
if (i == j)
DM.setValue(i, j, 0.0);
else {
double dS = Math.max(maxscore - DM.getValue(i, j), 0);
DM.setValue(i, j, dS);
DM.setValue(j, i, dS);
}
}
}
logger.info("{}:{}", "Determining Distances", 100);
return DM;
}
/**
* The PAM (Point Accepted Mutations) distance is a measure of evolutionary
* distance in protein sequences. The PAM unit represents an average
* substitution rate of 1% per site. The fractional dissimilarity (D) of two
* aligned sequences is related with the PAM distance (d) by the equation:
*
*
* D = sum(fi * (1 - Miid))
*
*
* Where the sum is for all 20 AA, fi denotes the natural fraction of the
* given AA and M is the substitution matrix (in this case the PAM1 matrix).
*
* To calculate the PAM distance between two aligned sequences the maximum
* likelihood (ML) approach is used, which consists in finding d that
* maximazies the function:
*
*
* L(d) = product(fai * (1 - Mai,bid))
*
*
* Where the product is for every position i in the alignment, and ai and bi
* are the AA at position i in the first and second aligned sequences,
* respectively.
*
* @param msa
* MultipleSequenceAlignment
* @return
*/
public static , D extends Compound> DistanceMatrix pamMLdistance(
MultipleSequenceAlignment msa) {
// Need to import PAM1 matrix to biojava TODO
//SubstitutionMatrix PAM1 = SubstitutionMatrixHelper.getPAM250();
throw new IllegalStateException("PAM ML distance calculation not implemented!");
}
/**
* The structural distance (dS) uses the structural similarity
* (or dissimilarity) from a the structural alignment of two protein
* strutures. It is based on the diffusive model for protein fold evolution
* (Grishin 1995). The structural deviations are captured as RMS deviations.
*
*
* dSij = (rmsdmax2 / alpha2) *
* ln( (rmsdmax2 - rmsd02) /
* (rmsdmax2 - (rmsdij2) )
*
*
* @param rmsdMat
* RMSD matrix for all structure pairs (symmetric matrix)
* @param alpha
* change in CA positions introduced by a single AA substitution
* (Grishin 1995: 1 A)
* @param rmsdMax
* estimated RMSD between proteins of the same fold when the
* percentage of identity is infinitely low (the maximum allowed
* RMSD of proteins with the same fold). (Grishin 1995: 5 A)
* @param rmsd0
* arithmetical mean of squares of the RMSD for identical
* proteins (Grishin 1995: 0.4 A)
* @return DistanceMatrix
*/
public static , D extends Compound> DistanceMatrix structuralDistance(
double[][] rmsdMat, double alpha, double rmsdMax, double rmsd0) {
int n = rmsdMat.length;
DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n);
// Transform raw RMSD into distances and create matrix
for (int i = 0; i < n; i++) {
for (int j = i; j < n; j++) {
if (i == j)
DM.setValue(i, j, 0.0);
else {
double d = (rmsdMax * rmsdMax)
/ (alpha * alpha)
* Math.log((rmsdMax * rmsdMax - rmsd0 * rmsd0)
/ (rmsdMax * rmsdMax - rmsdMat[i][j]
* rmsdMat[i][j]));
d = Math.max(d, 0.0);
DM.setValue(i, j, d);
DM.setValue(j, i, d);
}
}
}
return DM;
}
}