All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.taxa.distance.DistanceMatrixUtils Maven / Gradle / Ivy

// DistanceMatrixUtils.java
//
// (c) 1999-2001 PAL Development Core Team
//
// This package may be distributed under the
// terms of the Lesser GNU General Public License (LGPL)
package net.maizegenetics.taxa.distance;

import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.BitUtil;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Utility functions for distance matrices
 *
 * @author Alexei Drummond
 * @author Terry Casstevens
 */
public class DistanceMatrixUtils {

    private DistanceMatrixUtils() {
        // utility class
    }

    /**
     * Get Genetic Relationship Matrix (grm) file names.
     *
     * @param base filename base
     *
     * @return array of file names. Index 0 is the id filename (.grm.id); Index 1 is the binary matrix (.grm.bin); Index
     * 2 is the binary counts (.grm.N.bin); Index 3 is the raw (text) matrix (.grm.raw); Index 4 is the text matrix
     * (.txt)
     */
    public static String[] getGRMFilenames(String base) {

        String[] result = new String[5];
        String filename = base.toLowerCase();

        if (filename.endsWith(".txt")) {
            int txtIndex = filename.lastIndexOf(".txt");
            String temp = base.substring(0, txtIndex);
            result[0] = temp + ".grm.id";
            result[1] = temp + ".grm.bin";
            result[2] = temp + ".grm.N.bin";
            result[3] = temp + ".grm.raw";
            result[4] = filename;
            return result;
        }

        int grmIndex = filename.lastIndexOf(".grm");
        if (grmIndex == -1) {
            result[0] = base + ".grm.id";
            result[1] = base + ".grm.bin";
            result[2] = base + ".grm.N.bin";
            result[3] = base + ".grm.raw";
            result[4] = base + ".txt";
        } else {
            String temp = base.substring(0, grmIndex);
            result[0] = temp + ".grm.id";
            result[1] = temp + ".grm.bin";
            result[2] = temp + ".grm.N.bin";
            result[3] = temp + ".grm.raw";
            result[4] = temp + ".txt";
        }

        return result;

    }

    /**
     * Get DARwin file names.
     *
     * @param base filename base
     *
     * @return array of file names. Index 0 is the id filename (.don); Index 1 is the dissimilarity matrix (.dis)
     */
    public static String[] getDARwinFilenames(String base) {

        String[] result = new String[2];
        int index = base.lastIndexOf(".");
        if (index == -1) {
            result[0] = base + ".don";
            result[1] = base + ".dis";
            return result;
        } else {
            String temp = base.substring(0, index);
            result[0] = temp + ".don";
            result[1] = temp + ".dis";
            return result;
        }

    }

    /**
     * compute squared distance to second distance matrix. If both matrices have
     * the same size it is assumed that the order of the taxa is identical.
     */
    public static double squaredDistance(DistanceMatrix mat1, DistanceMatrix mat2, boolean weighted) {

        boolean aliasNeeded = false;
        if (mat1.getSize() != mat2.getSize()) {
            aliasNeeded = true;
        }

        int[] alias = null;

        if (aliasNeeded) {
            if (mat1.getSize() > mat2.getSize()) {
                //swap so mat1 is the smaller of the two
                DistanceMatrix temp = mat2;
                mat2 = mat1;
                mat1 = temp;
            }
            alias = new int[mat1.getSize()];
            for (int i = 0; i < alias.length; i++) {
                alias[i] = mat2.whichIdNumber(mat1.getTaxon(i).getName());
            }
        } else {
            alias = new int[mat1.getSize()];
            for (int i = 0; i < alias.length; i++) {
                alias[i] = i;
            }
        }

        double sum = 0;
        int ai;
        final double[][] mat1Distance = mat1.getDistances();
        final double[][] mat2Distance = mat2.getDistances();
        for (int i = 0; i < mat1.getSize() - 1; i++) {
            ai = alias[i];

            for (int j = i + 1; j < mat1.getSize(); j++) {
                double diff = mat1Distance[i][j] - mat2Distance[ai][alias[j]];
                double weight;
                if (weighted) {
                    // Fitch-Margoliash weight
                    // (variances proportional to distances)
                    weight = 1.0 / (mat1Distance[i][j] * mat2Distance[ai][alias[j]]);
                } else {
                    // Cavalli-Sforza-Edwards weight
                    // (homogeneity of variances)
                    weight = 1.0;
                }
                sum += weight * diff * diff;
            }
        }

        return 2.0 * sum; // we counted only half the matrix
    }

    /**
     * Returns a distance matrix with the specified taxa removed.
     */
    public static DistanceMatrix minus(DistanceMatrix parent, int taxaToRemove) {

        int size = parent.numberOfTaxa() - 1;

        double[][] distances = new double[size][size];
        Taxon[] ids = new Taxon[size];
        int counti = 0, countj = 0;
        for (int i = 0; i < size; i++) {
            if (counti == taxaToRemove) {
                counti += 1;
            }
            ids[i] = parent.getTaxon(counti);

            countj = 0;
            final double[][] parentDistance = parent.getDistances();
            for (int j = 0; j < size; j++) {
                if (countj == taxaToRemove) {
                    countj += 1;
                }
                distances[i][j] = parentDistance[counti][countj];
                countj += 1;
            }
            counti += 1;
        }
        TaxaList tl = new TaxaListBuilder().addAll(ids).build();
        DistanceMatrix smaller = new DistanceMatrix(distances, tl);

        return smaller;
    }

    /**
     * @param parent the DistanceMatrix from which to extract a subset
     * @param taxaToKeep an index of the taxa to keep
     *
     * @return A DistanceMatrix with all the taxa that are in both parent and taxaToKeep in the same order as taxaToKeep
     */
    public static DistanceMatrix keepTaxa(DistanceMatrix parent, int[] taxaToKeep) {
        int ntaxa = taxaToKeep.length;
        double[][] newDistances = new double[ntaxa][ntaxa];
        for (int r = 0; r < ntaxa; r++) {
            for (int c = 0; c < ntaxa; c++) {
                newDistances[r][c] = parent.getDistance(taxaToKeep[r], taxaToKeep[c]);
            }
        }

        TaxaListBuilder taxaBuilder = new TaxaListBuilder();
        for (int ndx : taxaToKeep) {
            taxaBuilder.add(parent.getTaxon(ndx));
        }
        TaxaList taxaListToKeep = taxaBuilder.build();

        return new DistanceMatrix(newDistances, taxaListToKeep);
    }

    /**
     * @param parent the DistanceMatrix from which to extract a subset
     * @param taxaToKeep a TaxaList of taxa to be kept
     *
     * @return a DistanceMatrix that contains only the taxa that are in both taxaToKeep and parent. The taxa will be in
     * the same order as taxaToKeep.
     */
    public static DistanceMatrix keepTaxa(DistanceMatrix parent, TaxaList taxaToKeep) {
        int[] keepIndex = taxaToKeep.stream()
                .mapToInt(t -> parent.whichIdNumber(t))
                .filter(i -> i > -1)
                .toArray();
        return keepTaxa(parent, keepIndex);
    }

    public static DistanceMatrix clusterBySmallestDistance(DistanceMatrix orig) {

        TaxaList taxa = orig.getTaxaList();
        int numTaxa = taxa.numberOfTaxa();

        TaxaPairLowestDistance[] lowValues = new TaxaPairLowestDistance[numTaxa];
        for (int t = 0; t < numTaxa; t++) {
            lowValues[t] = new TaxaPairLowestDistance(t);
        }

        for (int x = 0; x < numTaxa; x++) {
            for (int y = x + 1; y < numTaxa; y++) {

                float value = orig.getDistance(x, y);

                if (!Float.isNaN(value)) {

                    if (lowValues[x].myLowValue > value) {
                        lowValues[x].myLowValue = value;
                        lowValues[x].myTaxon2 = y;
                    }

                    if (lowValues[y].myLowValue > value) {
                        lowValues[y].myLowValue = value;
                        lowValues[y].myTaxon2 = x;
                    }

                }

            }
        }

        Arrays.sort(lowValues);

        List> clusters = new ArrayList<>();
        List[] whichCluster = new ArrayList[numTaxa];
        List unknownList = new ArrayList<>();
        for (int t = 0; t < numTaxa; t++) {

            int taxon1 = lowValues[t].myTaxon1;
            int taxon2 = lowValues[t].myTaxon2;

            if (taxon2 == -1) {
                unknownList.add(taxon1);
            } else if (whichCluster[taxon1] == null && whichCluster[taxon2] == null) {
                List temp = new ArrayList<>();
                temp.add(taxon1);
                temp.add(taxon2);
                clusters.add(temp);
                whichCluster[taxon1] = temp;
                whichCluster[taxon2] = temp;
            } else if (whichCluster[taxon1] == null) {
                whichCluster[taxon1] = whichCluster[taxon2];
                whichCluster[taxon1].add(taxon1);
            } else if (whichCluster[taxon2] == null) {
                whichCluster[taxon2] = whichCluster[taxon1];
                whichCluster[taxon2].add(taxon2);
            }

        }

        clusters.add(unknownList);

        DistanceMatrixBuilder builder = DistanceMatrixBuilder.getInstance(numTaxa);
        int count = 0;
        for (List current : clusters) {
            int currentNumTaxa = current.size();
            for (int taxon = 0; taxon < currentNumTaxa; taxon++) {
                builder.addTaxon(taxa.get(current.get(taxon)));
                for (int x = taxon; x < currentNumTaxa; x++) {
                    builder.set(x + count, taxon + count, orig.getDistance(current.get(taxon), current.get(x)));
                }
            }
            count += currentNumTaxa;
        }

        return builder.build();

    }

    private static class TaxaPairLowestDistance implements Comparable {

        private final int myTaxon1;
        private int myTaxon2;
        private float myLowValue = Float.POSITIVE_INFINITY;

        public TaxaPairLowestDistance(int taxon1) {
            myTaxon1 = taxon1;
            myTaxon2 = -1;
            myLowValue = Float.POSITIVE_INFINITY;
        }

        @Override
        public int compareTo(TaxaPairLowestDistance o) {
            if (myLowValue < o.myLowValue) {
                return -1;
            } else if (myLowValue > o.myLowValue) {
                return 1;
            } else {
                return 0;
            }
        }
    }

    /**
     * Calculates the IBS distance between two taxa with bitsets for for major
     * and minor allele
     *
     * @param iMajor
     * @param iMinor
     * @param jMajor
     * @param jMinor
     *
     * @return
     */
    public static double getIBSDistance(long[] iMajor, long[] iMinor, long[] jMajor, long[] jMinor) {
        int sameCnt = 0, diffCnt = 0, hetCnt = 0;
        for (int x = 0; x < iMajor.length; x++) {
            long same = (iMajor[x] & jMajor[x]) | (iMinor[x] & jMinor[x]);
            long diff = (iMajor[x] & jMinor[x]) | (iMinor[x] & jMajor[x]);
            long hets = same & diff;
            sameCnt += BitUtil.pop(same);
            diffCnt += BitUtil.pop(diff);
            hetCnt += BitUtil.pop(hets);
        }
        double identity = (double) (sameCnt + (hetCnt / 2)) / (double) (sameCnt + diffCnt + hetCnt);
        double dist = 1 - identity;
        return dist;
    }

    public static double getIBSDistance(BitSet iMajor, BitSet iMinor, BitSet jMajor, BitSet jMinor) {
        return getIBSDistance(iMajor.getBits(), iMinor.getBits(), jMajor.getBits(), jMinor.getBits());
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy