All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.distance.IBSDistanceMatrix Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

The newest version!
/*
 * TASSEL - Trait Analysis by a aSSociation Evolution & Linkage
 * Copyright (C) 2003 Ed Buckler
 *
 * This software evaluates linkage disequilibrium nucletide diversity and 
 * associations. For more information visit http://www.maizegenetics.net
 *
 * This software is distributed under GNU general public license and without
 * any warranty ot technical support.
 *
 * You can redistribute and/or modify it under the terms of GNU General 
 * public license. 
 *
 */
package net.maizegenetics.analysis.distance;

import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.BitUtil;
import net.maizegenetics.util.ProgressListener;

import java.util.stream.IntStream;

import static net.maizegenetics.dna.WHICH_ALLELE.Major;
import static net.maizegenetics.dna.WHICH_ALLELE.Minor;
import static net.maizegenetics.dna.WHICH_ALLELE.Minor2;
import net.maizegenetics.util.Tuple;

/**
 * This class calculates an identity by state matrix. It is scaled so only
 * non-missing comparison are used. It conducts bit level calculations of IBS
 * for genotypes. Only the two most common alleles are used in the distance
 * calculations.
 * 

* Please note that when heterozygous genotypes are used, Het to Het distance is * 0.5 NOT 0.0. The default along the identity diagonal is 0 (isTrueIBS = * false), but changing isTrueIBS = true will calculate the identity. *

* The distance estimates become wildly inaccurate when too few sites are used * to calculate distance. The minSiteComp parameter can be used to control the * minimum number of sites used for a calculation. If there are insufficient * sites in the estimate, then Double.NaN is returned. * * @author Ed Buckler * @version 1.0 */ public class IBSDistanceMatrix { private IBSDistanceMatrix() { // utility } /** * Compute observed distances for all taxa. Missing sites are ignored. * * @param theAlignment Alignment used to computed distances */ public static DistanceMatrix getInstance(GenotypeTable theAlignment) { return getInstance(theAlignment, null); } /** * Compute observed distances for all taxa. Missing sites are ignored. * * @param theAlignment Alignment used to computed distances * @param listener Listener to track progress in calculations */ public static DistanceMatrix getInstance(GenotypeTable theAlignment, ProgressListener listener) { return getInstance(theAlignment, 0, listener); } /** * Compute observed distances for all taxa. Missing sites are ignored. * * @param theAlignment Alignment used to computed distances * @param minSiteComp Minimum number of sites needed to estimate distance * @param listener Listener to track progress in calculations */ public static DistanceMatrix getInstance(GenotypeTable theAlignment, int minSiteComp, ProgressListener listener) { return getInstance(theAlignment, minSiteComp, false, listener, true); } /** * Compute observed distances for all taxa. Missing sites are ignored. * * @param theAlignment Alignment used to computed distances * @param minSiteComp Minimum number of sites needed to estimate distance * @param trueIBS estimate diagonal distance based IBS (default = false, * i=i=0.0) * @param listener Listener to track progress in calculations * @param useThirdState */ public static DistanceMatrix getInstance(GenotypeTable theAlignment, int minSiteComp, boolean trueIBS, ProgressListener listener, boolean useThirdState) { if (useThirdState) { return IBSDistanceMatrix3Alleles.getInstance(theAlignment, minSiteComp, trueIBS, listener); } else { return IBSDistanceMatrix2Alleles.getInstance(theAlignment, minSiteComp, trueIBS, listener); } } public static double[] computeHetDistances(byte[] first, byte[] second, int minSitesComp) { setupHetBitDistancesIncrementors(); long counts = 0; for (int i = 0; i < first.length; i++) { int key1 = (first[i] & 0x70) >>> 1 | (first[i] & 0x7); int key2 = (second[i] & 0x70) >>> 1 | (second[i] & 0x7); counts += INCREMENT_FUNCTIONS[key1][key2]; } int sameCount = (int) (counts & 0x1FFFFFl); int diffCount = (int) ((counts >>> 21) & 0x1FFFFFl); int hetCount = (int) ((counts >>> 42) & 0x1FFFFFl); long sites = sameCount + diffCount - hetCount; double identity = ((double) (sameCount) - 0.5 * hetCount) / (double) (sites); double dist = 1 - identity; if (sites < minSitesComp) { dist = Double.NaN; } return new double[]{dist, sites}; } /** * This is a cleanest, fastest and most accurate way to calculate distance. */ private static Tuple computeHetBitDistances(boolean useThirdState, ProgressListener listener, GenotypeTable theTBA, boolean isTrueIBS, int minSitesComp) { int numSeqs = theTBA.numberOfTaxa(); double avgTotalSites = 0; //LongAdder count = new LongAdder(); //note this distance object is modified by a parallel stream, but each element is only touched once double[][] distance = new double[numSeqs][numSeqs]; long numberOfTests = numSeqs * (numSeqs - 1) / 2; long time = System.currentTimeMillis(); IntStream.range(0, numSeqs).parallel().forEach(i -> { long[] iMj = theTBA.allelePresenceForAllSites(i, Major).getBits(); long[] iMn = theTBA.allelePresenceForAllSites(i, Minor).getBits(); long[] iMn2 = null; if (useThirdState) { iMn2 = theTBA.allelePresenceForAllSites(i, Minor2).getBits(); } for (int j = i; j < numSeqs; j++) { if (j == i && !isTrueIBS) { distance[i][i] = 0; } else { long[] jMj = theTBA.allelePresenceForAllSites(j, Major).getBits(); long[] jMn = theTBA.allelePresenceForAllSites(j, Minor).getBits(); double[] result; if (useThirdState) { long[] jMn2 = theTBA.allelePresenceForAllSites(j, Minor2).getBits(); result = computeHetBitDistancesThirdState(iMj, iMn, iMn2, jMj, jMn, jMn2, minSitesComp); } else { result = computeHetBitDistances(iMj, iMn, jMj, jMn, minSitesComp); } distance[i][j] = distance[j][i] = result[0]; //avgTotalSites += result[1]; //this assumes not hets //count.increment(); } } //fireProgress((int) ((double) (i + 1) / (double) numSeqs * 100.0), listener); }); //avgTotalSites /= (double) count.longValue(); System.out.println("computeHetBitDistances time = " + (System.currentTimeMillis() - time) / 1000 + " seconds"); return new Tuple<>(distance, avgTotalSites); } /** * Compute distance for a pair of taxa. * * @param theTBA input alignment * @param taxon1 index of taxon 1 * @param taxon2 index of taxon 2 * @return array of {distance, number of sites used in comparison} */ public static double[] computeHetBitDistances(GenotypeTable theTBA, int taxon1, int taxon2) { return computeHetBitDistances(theTBA, taxon1, taxon2, 0); } /** * Compute distance for a pair of taxa. * * @param theTBA input alignment * @param taxon1 index of taxon 1 * @param taxon2 index of taxon 2 * @param minSitesCompared Minimum number of sites needed to estimate * distance * * @return array of {distance, number of sites used in comparison} */ public static double[] computeHetBitDistances(GenotypeTable theTBA, int taxon1, int taxon2, int minSitesCompared) { long[] iMj = theTBA.allelePresenceForAllSites(taxon1, Major).getBits(); long[] iMn = theTBA.allelePresenceForAllSites(taxon1, Minor).getBits(); long[] iMn2 = theTBA.allelePresenceForAllSites(taxon1, Minor2).getBits(); long[] jMj = theTBA.allelePresenceForAllSites(taxon2, Major).getBits(); long[] jMn = theTBA.allelePresenceForAllSites(taxon2, Minor).getBits(); long[] jMn2 = theTBA.allelePresenceForAllSites(taxon2, Minor2).getBits(); return computeHetBitDistancesThirdState(iMj, iMn, iMn2, jMj, jMn, jMn2, minSitesCompared, 0, iMj.length - 1); } /** * Compute distance for a pair of taxa. Optimized for calculations sites * within a certain range of underlying word (64 sites chunks) in the TBit * array * * @param theTBA input alignment * @param taxon1 index of taxon 1 * @param taxon2 index of taxon 2 * @param minSitesCompared Minimum number of sites needed to estimate * distance * @param firstWord starting word for calculating distance * site=(firstWord*64) * @param lastWord ending word for calculating distance inclusive * site=(lastWord*64+63) * @param maskBadSet Optional mask for sites (those set to 1 are kept) * * @return array of {distance, number of sites used in comparison} */ public static double[] computeHetBitDistances(GenotypeTable theTBA, int taxon1, int taxon2, int minSitesCompared, int firstWord, int lastWord, BitSet maskBadSet) { long[] iMj = theTBA.allelePresenceForAllSites(taxon1, Major).getBits(); long[] iMn = theTBA.allelePresenceForAllSites(taxon1, Minor).getBits(); long[] iMn2 = theTBA.allelePresenceForAllSites(taxon1, Minor2).getBits(); if (maskBadSet != null) { long[] maskBad = maskBadSet.getBits(); for (int i = 0; i < iMj.length; i++) { iMj[i] = iMj[i] & maskBad[i]; } for (int i = 0; i < iMn.length; i++) { iMn[i] = iMn[i] & maskBad[i]; } for (int i = 0; i < iMn2.length; i++) { iMn2[i] = iMn2[i] & maskBad[i]; } } long[] jMj = theTBA.allelePresenceForAllSites(taxon2, Major).getBits(); long[] jMn = theTBA.allelePresenceForAllSites(taxon2, Minor).getBits(); long[] jMn2 = theTBA.allelePresenceForAllSites(taxon2, Minor2).getBits(); return computeHetBitDistancesThirdState(iMj, iMn, iMn2, jMj, jMn, jMn2, minSitesCompared, firstWord, lastWord); } /** * Calculation of distance using the bit vector of major and minor alleles. * * @param iMj Vector of major alleles for taxon i * @param iMn Vector of minor alleles for taxon i * @param jMj Vector of major alleles for taxon j * @param jMn Vector of minor alleles for taxon j * @param minSitesCompared Minimum number of sites needed to estimate * distance * @return array of {distance, number of sites used in comparison} */ public static double[] computeHetBitDistances(long[] iMj, long[] iMn, long[] jMj, long[] jMn, int minSitesCompared) { return computeHetBitDistances(iMj, iMn, jMj, jMn, minSitesCompared, 0, iMj.length - 1); } /** * Calculation of distance using the bit vector of the first three alleles. * * @param iMj Vector of major alleles for taxon i * @param iMn Vector of minor alleles for taxon i * @param jMj Vector of major alleles for taxon j * @param jMn Vector of minor alleles for taxon j * @param minSitesCompared Minimum number of sites needed to estimate * distance * @return array of {distance, number of sites used in comparison} */ public static double[] computeHetBitDistancesThirdState(long[] iMj, long[] iMn, long[] iMn2, long[] jMj, long[] jMn, long[] jMn2, int minSitesCompared) { return computeHetBitDistancesThirdState(iMj, iMn, iMn2, jMj, jMn, jMn2, minSitesCompared, 0, iMj.length - 1); } /** * Calculation of distance using the bit vector of major and minor alleles. * * @param iMj Vector of major alleles for taxon i * @param iMn Vector of minor alleles for taxon i * @param jMj Vector of major alleles for taxon j * @param jMn Vector of minor alleles for taxon j * @param minSitesCompared Minimum number of sites needed to estimate * distance * @param firstWord first world for calculating distance * @param lastWord last word for calculating distance inclusive * site=(endWord*64+63) * @return array of {distance, number of sites used in comparison} */ public static double[] computeHetBitDistances(long[] iMj, long[] iMn, long[] jMj, long[] jMn, int minSitesCompared, int firstWord, int lastWord) { int sameCnt = 0, diffCnt = 0, hetCnt = 0; for (int x = firstWord; x <= lastWord; x++) { long same = (iMj[x] & jMj[x]) | (iMn[x] & jMn[x]); long diff = (iMj[x] & jMn[x]) | (iMn[x] & jMj[x]); long hets = same & diff; sameCnt += BitUtil.pop(same); diffCnt += BitUtil.pop(diff); hetCnt += BitUtil.pop(hets); } int sites = sameCnt + diffCnt - hetCnt; double identity = ((double) (sameCnt) - 0.5 * hetCnt) / (double) (sites); double dist = 1 - identity; if (sites > minSitesCompared) { return new double[]{dist, (double) sites}; } else { return new double[]{Double.NaN, (double) sites}; } } /** * Calculation of distance using the bit vectors of the first three alleles. * * @param iMj Vector of major alleles for taxon i * @param iMn Vector of minor alleles for taxon i * @param iMn2 Vector of second minor alleles for taxon i * @param jMj Vector of major alleles for taxon j * @param jMn Vector of minor alleles for taxon j * @param jMn2 Vector of second minor alleles for taxon j * @param minSitesCompared Minimum number of sites needed to estimate * distance * @param firstWord first world for calculating distance * @param lastWord last word for calculating distance inclusive * site=(endWord*64+63) * @return array of {distance, number of sites used in comparison} */ public static double[] computeHetBitDistancesThirdState(long[] iMj, long[] iMn, long[] iMn2, long[] jMj, long[] jMn, long[] jMn2, int minSitesCompared, int firstWord, int lastWord) { int sameCnt = 0, diffCnt = 0, hetCnt = 0; for (int x = firstWord; x <= lastWord; x++) { long same = (iMj[x] & jMj[x]) | (iMn[x] & jMn[x]) | (iMn2[x] & jMn2[x]); long diff = (iMj[x] & jMn[x]) | (iMn[x] & jMj[x]) | (iMj[x] & jMn2[x]) | (iMn2[x] & jMj[x]) | (iMn[x] & jMn2[x]) | (iMn2[x] & jMn[x]); long hets = same & diff; sameCnt += BitUtil.pop(same); diffCnt += BitUtil.pop(diff); hetCnt += BitUtil.pop(hets); } int sites = sameCnt + diffCnt - hetCnt; double identity = ((double) (sameCnt) - 0.5 * hetCnt) / (double) (sites); double dist = 1 - identity; if (sites > minSitesCompared) { return new double[]{dist, (double) sites}; } else { return new double[]{Double.NaN, (double) sites}; } } private static final long TRUE_TRUE_LONG = 0x40000200001l; private static final long TRUE_FALSE_LONG = 0x1l; private static final long FALSE_TRUE_LONG = 0x200000l; private static final long FALSE_FALSE_LONG = 0x0l; private static long[][] INCREMENT_FUNCTIONS = null; private static void setupHetBitDistancesIncrementors() { if (INCREMENT_FUNCTIONS != null) { return; } INCREMENT_FUNCTIONS = new long[64][64]; int[] possibleValues = new int[]{0, 1, 2, 3, 4, 5, 15}; for (int a = 0; a < 7; a++) { for (int b = 0; b < 7; b++) { for (int c = 0; c < 7; c++) { for (int d = 0; d < 7; d++) { int key1 = (possibleValues[a] & 0x7) << 3 | (possibleValues[b] & 0x7); int key2 = (possibleValues[c] & 0x7) << 3 | (possibleValues[d] & 0x7); byte[] target = new byte[2]; target[0] = (byte) possibleValues[a]; target[1] = (byte) possibleValues[b]; byte[] match = new byte[2]; match[0] = (byte) possibleValues[c]; match[1] = (byte) possibleValues[d]; int[] result = new int[3]; result[0] = 0; result[1] = 0; if (target[0] != GenotypeTable.UNKNOWN_ALLELE) { if ((target[0] == match[0]) || (target[0] == match[1])) { result[0] = 1; } if ((match[0] != GenotypeTable.UNKNOWN_ALLELE) && (match[0] != target[0])) { result[1] = 1; } else if ((match[1] != GenotypeTable.UNKNOWN_ALLELE) && (match[1] != target[0])) { result[1] = 1; } } if ((result[0] == 1) && (result[1] == 1)) { INCREMENT_FUNCTIONS[key1][key2] = TRUE_TRUE_LONG; continue; } if (target[1] != GenotypeTable.UNKNOWN_ALLELE) { if ((target[1] == match[0]) || (target[1] == match[1])) { result[0] = 1; } if ((match[0] != GenotypeTable.UNKNOWN_ALLELE) && (match[0] != target[1])) { result[1] = 1; } else if ((match[1] != GenotypeTable.UNKNOWN_ALLELE) && (match[1] != target[1])) { result[1] = 1; } } if ((result[0] == 1) && (result[1] == 1)) { INCREMENT_FUNCTIONS[key1][key2] = TRUE_TRUE_LONG; continue; } if (result[0] == 1) { INCREMENT_FUNCTIONS[key1][key2] = TRUE_FALSE_LONG; } else if (result[1] == 1) { INCREMENT_FUNCTIONS[key1][key2] = FALSE_TRUE_LONG; } else { INCREMENT_FUNCTIONS[key1][key2] = FALSE_FALSE_LONG; } } } } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy