All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.phylo.DistanceMatrixCalculator Maven / Gradle / Ivy

The newest version!
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.phylo;

import java.io.IOException;
import java.util.List;
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
import org.biojava.nbio.core.sequence.MultipleSequenceAlignment;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.forester.evoinference.distance.PairwiseDistanceCalculator;
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
import org.forester.evoinference.matrix.distance.DistanceMatrix;
import org.forester.msa.Msa;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The DistanceMatrixCalculator methods generate a {@link DistanceMatrix} from a
 * {@link MultipleSequenceAlignment} or other indirect distance infomation (RMSD).
 *
 * @author Aleix Lafita
 * @since 4.1.1
 *
 */
public class DistanceMatrixCalculator {

	private static final Logger logger = LoggerFactory
			.getLogger(DistanceMatrixCalculator.class);

	/** Prevent instantiation */
	private DistanceMatrixCalculator() {}

	/**
	 * The fractional dissimilarity (D) is defined as the percentage of sites
	 * that differ between two aligned sequences. The percentage of identity
	 * (PID) is the fraction of identical sites between two aligned sequences.
	 *
	 * 
	 * D = 1 - PID
	 * 
* * The gapped positons in the alignment are ignored in the calculation. This * method is a wrapper to the forester implementation of the calculation: * {@link PairwiseDistanceCalculator#calcFractionalDissimilarities(Msa)} * * @param msa * MultipleSequenceAlignment * @return DistanceMatrix * @throws Exception */ public static , D extends Compound> DistanceMatrix fractionalDissimilarity( MultipleSequenceAlignment msa) throws IOException { Msa fMsa = ForesterWrapper.convert(msa); DistanceMatrix DM = PairwiseDistanceCalculator .calcFractionalDissimilarities(fMsa); return DM; } /** * The Poisson (correction) evolutionary distance (d) is a function of the * fractional dissimilarity (D), given by: * *
	 * d = -log(1 - D)
	 * 
* * The gapped positons in the alignment are ignored in the calculation. This * method is a wrapper to the forester implementation of the calculation: * {@link PairwiseDistanceCalculator#calcPoissonDistances(Msa)} * * @param msa * MultipleSequenceAlignment * @return DistanceMatrix * @throws IOException */ public static , D extends Compound> DistanceMatrix poissonDistance( MultipleSequenceAlignment msa) throws IOException { Msa fMsa = ForesterWrapper.convert(msa); DistanceMatrix DM = PairwiseDistanceCalculator .calcPoissonDistances(fMsa); return DM; } /** * The Kimura evolutionary distance (d) is a correction of the fractional * dissimilarity (D) specially needed for large evolutionary distances. It * is given by: * *
	 * d = -log(1 - D - 0.2 * D2)
	 * 
* * The equation is derived by fitting the relationship between the * evolutionary distance (d) and the fractional dissimilarity (D) according * to the PAM model of evolution (it is an empirical approximation for the * method {@link #pamDistance(MultipleSequenceAlignment}). The gapped * positons in the alignment are ignored in the calculation. This method is * a wrapper to the forester implementation of the calculation: * {@link PairwiseDistanceCalculator#calcKimuraDistances(Msa)}. * * @param msa * MultipleSequenceAlignment * @return DistanceMatrix * @throws IOException */ public static , D extends Compound> DistanceMatrix kimuraDistance( MultipleSequenceAlignment msa) throws IOException { Msa fMsa = ForesterWrapper.convert(msa); DistanceMatrix DM = PairwiseDistanceCalculator .calcPoissonDistances(fMsa); return DM; } /** * BioJava implementation for percentage of identity (PID). Although the * name of the method is percentage of identity, the DistanceMatrix contains * the fractional dissimilarity (D), computed as D = 1 - PID. *

* It is recommended to use the method * {@link DistanceMatrixCalculator#fractionalDissimilarity(MultipleSequenceAlignment)} * instead of this one. * * @param msa * MultipleSequenceAlignment * @return DistanceMatrix */ public static , D extends Compound> DistanceMatrix percentageIdentity( MultipleSequenceAlignment msa) { logger.info("{}:{}", "Determing Distances", 0); int n = msa.getSize(); String[] sequenceString = new String[n]; for (int i = 0; i < n; i++) { sequenceString[i] = msa.getAlignedSequence(i + 1) .getSequenceAsString(); } DistanceMatrix distance = new BasicSymmetricalDistanceMatrix(n); int totalloopcount = (n / 2) * (n + 1); int loopcount = 0; for (int i = 0; i < (n - 1); i++) { logger.info("{}:{}", "Determining Distances", (loopcount * 100) / totalloopcount); distance.setIdentifier(i, msa.getAlignedSequence(i + 1) .getAccession().getID()); for (int j = i; j < n; j++) { loopcount++; if (j == i) { distance.setValue(i, j, 0); } else { distance.setValue(i, j, 100 - Comparison.PID( sequenceString[i], sequenceString[j])); distance.setValue(j, i, distance.getValue(i, j)); } } } logger.info("{}:{}", "Determining Distances", 100); return distance; } /** * The fractional dissimilarity score (Ds) is a relative measure of the * dissimilarity between two aligned sequences. It is calculated as: * *

	 * Ds = sum( max(M) - Mai,bi ) / (max(M)-min(M)) ) / L
	 * 
* * Where the sum through i runs for all the alignment positions, ai and bi * are the AA at position i in the first and second aligned sequences, * respectively, and L is the total length of the alignment (normalization). *

* The fractional dissimilarity score (Ds) is in the interval [0, 1], where * 0 means that the sequences are identical and 1 that the sequences are * completely different. *

* Gaps do not have a contribution to the similarity score calculation (gap * penalty = 0) * * @param msa * MultipleSequenceAlignment * @param M * SubstitutionMatrix for similarity scoring * @return DistanceMatrix */ public static , D extends Compound> DistanceMatrix fractionalDissimilarityScore( MultipleSequenceAlignment msa, SubstitutionMatrix M) { // Calculate the similarity scores using the alignment package logger.info("{}:{}", "Determing Distances", 0); int n = msa.getSize(); DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n); int totalloopcount = (n / 2) * (n + 1); int end = msa.getLength(); String[] sequenceString = new String[n]; for (int i = 0; i < n; i++) { sequenceString[i] = msa.getAlignedSequence(i + 1) .getSequenceAsString(); } List seqs = msa.getAlignedSequences(); int loopcount = 0; for (int i = 0; i < (n - 1); i++) { logger.info("{}:{}", "Determining Distances", (loopcount * 100) / totalloopcount); // Obtain the similarity scores for (int j = i; j < n; j++) { double score = 0; loopcount++; for (int k = 0; k < end; k++) { if (Comparison.isGap(sequenceString[i].charAt(k)) || Comparison.isGap(sequenceString[j].charAt(k))) continue; score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs .get(j).getCompoundAt(k + 1)); } if (i == j) DM.setValue(i, j, 0.0); else { double dS = (M.getMaxValue() - score / msa.getLength()) / (M.getMaxValue() - M.getMinValue()); DM.setValue(i, j, dS); DM.setValue(j, i, dS); } } } return DM; } /** * The dissimilarity score is the additive inverse of the similarity score * (sum of scores) between two aligned sequences using a substitution model * (Substitution Matrix). The maximum dissimilarity score is taken to be the * maximum similarity score between self-alignments (each sequence against * itself). Calculation of the score is as follows: * *

	 * Ds = maxScore - sumi(Mai,bi)
	 * 
* * It is recommended to use the method * {@link #fractionalDissimilarityScore(MultipleSequenceAlignment, SubstitutionMatrix)} * , since the maximum similarity score is not relative to the data set, but * relative to the Substitution Matrix, and the score is normalized by the * alignment length (fractional). *

* Gaps do not have a contribution to the similarity score calculation (gap * penalty = 0). * * @param msa * MultipleSequenceAlignment * @param M * SubstitutionMatrix for similarity scoring * @return DistanceMatrix */ public static , D extends Compound> DistanceMatrix dissimilarityScore( MultipleSequenceAlignment msa, SubstitutionMatrix M) { logger.info("{}:{}", "Determing Distances", 0); int n = msa.getSize(); String[] sequenceString = new String[n]; for (int i = 0; i < n; i++) { sequenceString[i] = msa.getAlignedSequence(i + 1) .getSequenceAsString(); } List seqs = msa.getAlignedSequences(); DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n); int totalloopcount = (n / 2) * (n + 1); double maxscore = 0; int end = msa.getLength(); int loopcount = 0; for (int i = 0; i < (n - 1); i++) { logger.info("{}:{}", "Determining Distances", (loopcount * 100) / totalloopcount); // Obtain the similarity scores for (int j = i; j < n; j++) { double score = 0; loopcount++; for (int k = 0; k < end; k++) { if (Comparison.isGap(sequenceString[i].charAt(k)) || Comparison.isGap(sequenceString[j].charAt(k))) continue; score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs .get(j).getCompoundAt(k + 1)); } if (i != j){ score = Math.max(score, 0.0); DM.setValue(i, j, score); } if (score > maxscore) maxscore = score; } } for (int i = 0; i < n; i++) { DM.setIdentifier(i, msa.getAlignedSequence(i + 1).getAccession() .getID()); for (int j = i; j < n; j++) { if (i == j) DM.setValue(i, j, 0.0); else { double dS = Math.max(maxscore - DM.getValue(i, j), 0); DM.setValue(i, j, dS); DM.setValue(j, i, dS); } } } logger.info("{}:{}", "Determining Distances", 100); return DM; } /** * The PAM (Point Accepted Mutations) distance is a measure of evolutionary * distance in protein sequences. The PAM unit represents an average * substitution rate of 1% per site. The fractional dissimilarity (D) of two * aligned sequences is related with the PAM distance (d) by the equation: * *

	 * D = sum(fi * (1 - Miid))
	 * 
* * Where the sum is for all 20 AA, fi denotes the natural fraction of the * given AA and M is the substitution matrix (in this case the PAM1 matrix). *

* To calculate the PAM distance between two aligned sequences the maximum * likelihood (ML) approach is used, which consists in finding d that * maximazies the function: * *

	 * L(d) = product(fai * (1 - Mai,bid))
	 * 
* * Where the product is for every position i in the alignment, and ai and bi * are the AA at position i in the first and second aligned sequences, * respectively. * * @param msa * MultipleSequenceAlignment * @return */ public static , D extends Compound> DistanceMatrix pamMLdistance( MultipleSequenceAlignment msa) { // Need to import PAM1 matrix to biojava TODO //SubstitutionMatrix PAM1 = SubstitutionMatrixHelper.getPAM250(); throw new IllegalStateException("PAM ML distance calculation not implemented!"); } /** * The structural distance (dS) uses the structural similarity * (or dissimilarity) from a the structural alignment of two protein * strutures. It is based on the diffusive model for protein fold evolution * (Grishin 1995). The structural deviations are captured as RMS deviations. * *
	 * dSij = (rmsdmax2 / alpha2) *
	 *        ln( (rmsdmax2 - rmsd02) /
	 *        (rmsdmax2 - (rmsdij2) )
	 * 
* * @param rmsdMat * RMSD matrix for all structure pairs (symmetric matrix) * @param alpha * change in CA positions introduced by a single AA substitution * (Grishin 1995: 1 A) * @param rmsdMax * estimated RMSD between proteins of the same fold when the * percentage of identity is infinitely low (the maximum allowed * RMSD of proteins with the same fold). (Grishin 1995: 5 A) * @param rmsd0 * arithmetical mean of squares of the RMSD for identical * proteins (Grishin 1995: 0.4 A) * @return DistanceMatrix */ public static , D extends Compound> DistanceMatrix structuralDistance( double[][] rmsdMat, double alpha, double rmsdMax, double rmsd0) { int n = rmsdMat.length; DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n); // Transform raw RMSD into distances and create matrix for (int i = 0; i < n; i++) { for (int j = i; j < n; j++) { if (i == j) DM.setValue(i, j, 0.0); else { double d = (rmsdMax * rmsdMax) / (alpha * alpha) * Math.log((rmsdMax * rmsdMax - rmsd0 * rmsd0) / (rmsdMax * rmsdMax - rmsdMat[i][j] * rmsdMat[i][j])); d = Math.max(d, 0.0); DM.setValue(i, j, d); DM.setValue(j, i, d); } } } return DM; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy