net.maizegenetics.taxa.distance.DistanceMatrixUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
The newest version!
// DistanceMatrixUtils.java
//
// (c) 1999-2001 PAL Development Core Team
//
// This package may be distributed under the
// terms of the Lesser GNU General Public License (LGPL)
package net.maizegenetics.taxa.distance;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.BitUtil;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Utility functions for distance matrices
*
* @author Alexei Drummond
* @author Terry Casstevens
*/
public class DistanceMatrixUtils {
private DistanceMatrixUtils() {
// utility class
}
/**
* Get Genetic Relationship Matrix (grm) file names.
*
* @param base filename base
*
* @return array of file names. Index 0 is the id filename (.grm.id); Index 1 is the binary matrix (.grm.bin); Index
* 2 is the binary counts (.grm.N.bin); Index 3 is the raw (text) matrix (.grm.raw); Index 4 is the text matrix
* (.txt)
*/
public static String[] getGRMFilenames(String base) {
String[] result = new String[5];
String filename = base.toLowerCase();
if (filename.endsWith(".txt")) {
int txtIndex = filename.lastIndexOf(".txt");
String temp = base.substring(0, txtIndex);
result[0] = temp + ".grm.id";
result[1] = temp + ".grm.bin";
result[2] = temp + ".grm.N.bin";
result[3] = temp + ".grm.raw";
result[4] = filename;
return result;
}
int grmIndex = filename.lastIndexOf(".grm");
if (grmIndex == -1) {
result[0] = base + ".grm.id";
result[1] = base + ".grm.bin";
result[2] = base + ".grm.N.bin";
result[3] = base + ".grm.raw";
result[4] = base + ".txt";
} else {
String temp = base.substring(0, grmIndex);
result[0] = temp + ".grm.id";
result[1] = temp + ".grm.bin";
result[2] = temp + ".grm.N.bin";
result[3] = temp + ".grm.raw";
result[4] = temp + ".txt";
}
return result;
}
/**
* Get DARwin file names.
*
* @param base filename base
*
* @return array of file names. Index 0 is the id filename (.don); Index 1 is the dissimilarity matrix (.dis)
*/
public static String[] getDARwinFilenames(String base) {
String[] result = new String[2];
int index = base.lastIndexOf(".");
if (index == -1) {
result[0] = base + ".don";
result[1] = base + ".dis";
return result;
} else {
String temp = base.substring(0, index);
result[0] = temp + ".don";
result[1] = temp + ".dis";
return result;
}
}
/**
* compute squared distance to second distance matrix. If both matrices have
* the same size it is assumed that the order of the taxa is identical.
*/
public static double squaredDistance(DistanceMatrix mat1, DistanceMatrix mat2, boolean weighted) {
boolean aliasNeeded = false;
if (mat1.getSize() != mat2.getSize()) {
aliasNeeded = true;
}
int[] alias = null;
if (aliasNeeded) {
if (mat1.getSize() > mat2.getSize()) {
//swap so mat1 is the smaller of the two
DistanceMatrix temp = mat2;
mat2 = mat1;
mat1 = temp;
}
alias = new int[mat1.getSize()];
for (int i = 0; i < alias.length; i++) {
alias[i] = mat2.whichIdNumber(mat1.getTaxon(i).getName());
}
} else {
alias = new int[mat1.getSize()];
for (int i = 0; i < alias.length; i++) {
alias[i] = i;
}
}
double sum = 0;
int ai;
final double[][] mat1Distance = mat1.getDistances();
final double[][] mat2Distance = mat2.getDistances();
for (int i = 0; i < mat1.getSize() - 1; i++) {
ai = alias[i];
for (int j = i + 1; j < mat1.getSize(); j++) {
double diff = mat1Distance[i][j] - mat2Distance[ai][alias[j]];
double weight;
if (weighted) {
// Fitch-Margoliash weight
// (variances proportional to distances)
weight = 1.0 / (mat1Distance[i][j] * mat2Distance[ai][alias[j]]);
} else {
// Cavalli-Sforza-Edwards weight
// (homogeneity of variances)
weight = 1.0;
}
sum += weight * diff * diff;
}
}
return 2.0 * sum; // we counted only half the matrix
}
/**
* Returns a distance matrix with the specified taxa removed.
*/
public static DistanceMatrix minus(DistanceMatrix parent, int taxaToRemove) {
int size = parent.numberOfTaxa() - 1;
double[][] distances = new double[size][size];
Taxon[] ids = new Taxon[size];
int counti = 0, countj = 0;
for (int i = 0; i < size; i++) {
if (counti == taxaToRemove) {
counti += 1;
}
ids[i] = parent.getTaxon(counti);
countj = 0;
final double[][] parentDistance = parent.getDistances();
for (int j = 0; j < size; j++) {
if (countj == taxaToRemove) {
countj += 1;
}
distances[i][j] = parentDistance[counti][countj];
countj += 1;
}
counti += 1;
}
TaxaList tl = new TaxaListBuilder().addAll(ids).build();
DistanceMatrix smaller = new DistanceMatrix(distances, tl);
return smaller;
}
/**
* @param parent the DistanceMatrix from which to extract a subset
* @param taxaToKeep an index of the taxa to keep
*
* @return A DistanceMatrix with all the taxa that are in both parent and taxaToKeep in the same order as taxaToKeep
*/
public static DistanceMatrix keepTaxa(DistanceMatrix parent, int[] taxaToKeep) {
int ntaxa = taxaToKeep.length;
double[][] newDistances = new double[ntaxa][ntaxa];
for (int r = 0; r < ntaxa; r++) {
for (int c = 0; c < ntaxa; c++) {
newDistances[r][c] = parent.getDistance(taxaToKeep[r], taxaToKeep[c]);
}
}
TaxaListBuilder taxaBuilder = new TaxaListBuilder();
for (int ndx : taxaToKeep) {
taxaBuilder.add(parent.getTaxon(ndx));
}
TaxaList taxaListToKeep = taxaBuilder.build();
return new DistanceMatrix(newDistances, taxaListToKeep);
}
/**
* @param parent the DistanceMatrix from which to extract a subset
* @param taxaToKeep a TaxaList of taxa to be kept
*
* @return a DistanceMatrix that contains only the taxa that are in both taxaToKeep and parent. The taxa will be in
* the same order as taxaToKeep.
*/
public static DistanceMatrix keepTaxa(DistanceMatrix parent, TaxaList taxaToKeep) {
int[] keepIndex = taxaToKeep.stream()
.mapToInt(t -> parent.whichIdNumber(t))
.filter(i -> i > -1)
.toArray();
return keepTaxa(parent, keepIndex);
}
public static DistanceMatrix clusterBySmallestDistance(DistanceMatrix orig) {
TaxaList taxa = orig.getTaxaList();
int numTaxa = taxa.numberOfTaxa();
TaxaPairLowestDistance[] lowValues = new TaxaPairLowestDistance[numTaxa];
for (int t = 0; t < numTaxa; t++) {
lowValues[t] = new TaxaPairLowestDistance(t);
}
for (int x = 0; x < numTaxa; x++) {
for (int y = x + 1; y < numTaxa; y++) {
float value = orig.getDistance(x, y);
if (!Float.isNaN(value)) {
if (lowValues[x].myLowValue > value) {
lowValues[x].myLowValue = value;
lowValues[x].myTaxon2 = y;
}
if (lowValues[y].myLowValue > value) {
lowValues[y].myLowValue = value;
lowValues[y].myTaxon2 = x;
}
}
}
}
Arrays.sort(lowValues);
List> clusters = new ArrayList<>();
List[] whichCluster = new ArrayList[numTaxa];
List unknownList = new ArrayList<>();
for (int t = 0; t < numTaxa; t++) {
int taxon1 = lowValues[t].myTaxon1;
int taxon2 = lowValues[t].myTaxon2;
if (taxon2 == -1) {
unknownList.add(taxon1);
} else if (whichCluster[taxon1] == null && whichCluster[taxon2] == null) {
List temp = new ArrayList<>();
temp.add(taxon1);
temp.add(taxon2);
clusters.add(temp);
whichCluster[taxon1] = temp;
whichCluster[taxon2] = temp;
} else if (whichCluster[taxon1] == null) {
whichCluster[taxon1] = whichCluster[taxon2];
whichCluster[taxon1].add(taxon1);
} else if (whichCluster[taxon2] == null) {
whichCluster[taxon2] = whichCluster[taxon1];
whichCluster[taxon2].add(taxon2);
}
}
clusters.add(unknownList);
DistanceMatrixBuilder builder = DistanceMatrixBuilder.getInstance(numTaxa);
int count = 0;
for (List current : clusters) {
int currentNumTaxa = current.size();
for (int taxon = 0; taxon < currentNumTaxa; taxon++) {
builder.addTaxon(taxa.get(current.get(taxon)));
for (int x = taxon; x < currentNumTaxa; x++) {
builder.set(x + count, taxon + count, orig.getDistance(current.get(taxon), current.get(x)));
}
}
count += currentNumTaxa;
}
return builder.build();
}
private static class TaxaPairLowestDistance implements Comparable {
private final int myTaxon1;
private int myTaxon2;
private float myLowValue = Float.POSITIVE_INFINITY;
public TaxaPairLowestDistance(int taxon1) {
myTaxon1 = taxon1;
myTaxon2 = -1;
myLowValue = Float.POSITIVE_INFINITY;
}
@Override
public int compareTo(TaxaPairLowestDistance o) {
if (myLowValue < o.myLowValue) {
return -1;
} else if (myLowValue > o.myLowValue) {
return 1;
} else {
return 0;
}
}
}
/**
* Calculates the IBS distance between two taxa with bitsets for for major
* and minor allele
*
* @param iMajor
* @param iMinor
* @param jMajor
* @param jMinor
*
* @return
*/
public static double getIBSDistance(long[] iMajor, long[] iMinor, long[] jMajor, long[] jMinor) {
int sameCnt = 0, diffCnt = 0, hetCnt = 0;
for (int x = 0; x < iMajor.length; x++) {
long same = (iMajor[x] & jMajor[x]) | (iMinor[x] & jMinor[x]);
long diff = (iMajor[x] & jMinor[x]) | (iMinor[x] & jMajor[x]);
long hets = same & diff;
sameCnt += BitUtil.pop(same);
diffCnt += BitUtil.pop(diff);
hetCnt += BitUtil.pop(hets);
}
double identity = (double) (sameCnt + (hetCnt / 2)) / (double) (sameCnt + diffCnt + hetCnt);
double dist = 1 - identity;
return dist;
}
public static double getIBSDistance(BitSet iMajor, BitSet iMinor, BitSet jMajor, BitSet jMinor) {
return getIBSDistance(iMajor.getBits(), iMinor.getBits(), jMajor.getBits(), jMinor.getBits());
}
}