All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.popgen.LinkageDisequilibrium Maven / Gradle / Ivy

// LinkageDisequilibrium.java
//
// (c) 1999-2001 PAL Development Core Team
//
// This package may be distributed under the
// terms of the Lesser GNU General Public License (LGPL)
package net.maizegenetics.analysis.popgen;

import cern.colt.map.OpenLongObjectHashMap;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.stats.statistics.FisherExact;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.OpenBitSet;
import net.maizegenetics.util.ProgressListener;
import net.maizegenetics.util.TableReport;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.Serializable;
import java.io.StringWriter;
import java.util.Arrays;

/**
 * This class calculates D' and r^2 estimates of linkage disequilibrium. It also
 * calculates the significance of the LD by either Fisher Exact or the
 * multinomial permutation test. This class can work with either normal
 * alignments of annotated alignments. The alignments should be stripped of
 * invariable numSites.
 * 

{@link testDesign} sets matrix design for LD calculation. Either all by * all, sliding window, site by all, or site list. *

* There are multiple approaches for dealing with heterozygous sites. * {@link HetTreatment} sets the way these are treated. Haplotype assumes fully * phased heterozygous sites (any hets are double counted). This is the best * approach for speed when things are fully phased. Homozygous converted all * hets to missing. Genotype does a 3x3 genotype analysis (to be implemented) *

* 2 state estimates of D' and r^2 can be found reviewed and discussed in Weir * 1996 *

* Multi-state loci (>=3) require an averaging approach. In TASSEL 3 in 2010, * Buckler removed these approach as the relative magnitudes and meaningfulness * of these approaches has never been clear. Additionally with the moving away * from SSR to SNPs these methods are less relevant. Researchers should convert * to biallelic - either by ignoring rarer classes or collapsing rarer states. *

* TODO: Add 3x3 (genotype) mode. * * @version $Id: LinkageDisequilibrium.java,v 2 * * @author Ed Buckler */ public class LinkageDisequilibrium extends Thread implements Serializable, TableReport { private static final long serialVersionUID=-123423421342l; /** * Design of test matrix. */ public static enum testDesign { /** * Test All by All site */ All, /** * Sliding of LD comparisons */ SlidingWindow, /** * Test on site versus all others */ SiteByAll, /** * Test all sites with the site list with all others */ SiteList }; /** * Approaches for dealing with heterozygous sites. Haplotype assumes fully * phased heterozygous sites (any hets are double counted). This is the best * approach for speed when things are fully phased. Homozygous converted all * hets to missing. Genotype does a 3x3 genotype analysis (to be * implemented) */ public static enum HetTreatment { Haplotype, Homozygous, Genotype }; private static final Logger myLogger = LogManager.getLogger(LinkageDisequilibrium.class); private GenotypeTable myAlignment; // private Alignment mySBitAlignment; private int myMinTaxaForEstimate = 20; private int myWindowSize = 50; private int myTestSite = -1; // this is only set when one versus all numSites is calculated. private long myTotalTests = 0; private testDesign myCurrDesign = testDesign.SlidingWindow; /** * HashMap of results Key = (site1*siteNum + site2), Value = float[rsqr,d', * pvalue, sampleSize] */ private OpenLongObjectHashMap myMapResults; private ProgressListener myListener = null; private FisherExact myFisherExact; private boolean myIsAccumulativeReport = false; private int myNumAccumulativeBins = 100; private float myAccumulativeInterval; private int[] myAccumulativeRValueBins; private int[] mySiteList; private static String NotImplemented = "NotImplemented"; private static String NA = "N/A"; private static Integer IntegerTwo = Integer.valueOf(2); private HetTreatment myHetTreatment = HetTreatment.Homozygous; /** * Constructor for doing LD analysis * * @param alignment Input alignment with segregating sites * @param windowSize Size of sliding window * @param LDType * @param testSite * @param listener * @param isAccumulativeReport * @param numAccumulateIntervals * @param sitesList * @param hetTreatment */ public LinkageDisequilibrium(GenotypeTable alignment, int windowSize, testDesign LDType, int testSite, ProgressListener listener, boolean isAccumulativeReport, int numAccumulateIntervals, int[] sitesList, HetTreatment hetTreatment) { myAlignment = alignment; myFisherExact = FisherExact.getInstance((2 * myAlignment.numberOfTaxa()) + 10); myWindowSize = windowSize; myCurrDesign = LDType; myTestSite = testSite; myListener = listener; myIsAccumulativeReport = isAccumulativeReport; if (myIsAccumulativeReport) { myNumAccumulativeBins = numAccumulateIntervals; } mySiteList = sitesList; if (mySiteList != null) { Arrays.sort(mySiteList); } myHetTreatment = hetTreatment; } /** * starts the thread to calculate LD */ public void run() { initMatrices(); switch (myHetTreatment) { case Haplotype: calculateBitLDForHaplotype(false); break; case Homozygous: calculateBitLDForHaplotype(true); break; case Genotype: calculateBitLDWithHets(); break; default: myLogger.error("Unknown LD analysis type selected for heterozygotes; skipping"); break; } } private void initMatrices() { long numSites = myAlignment.numberOfSites(); if (myCurrDesign == testDesign.All) { myTotalTests = numSites * (numSites - 1l) / 2l; } else if (myCurrDesign == testDesign.SlidingWindow) { long n = Math.min(numSites - 1l, myWindowSize); myTotalTests = ((n * (n + 1l)) / 2l) + (numSites - n - 1l) * n; } else if (myCurrDesign == testDesign.SiteByAll) { myTotalTests = numSites - 1l; } else if (myCurrDesign == testDesign.SiteList) { long n = mySiteList.length; myTotalTests = ((n * (n + 1l)) / 2l) + (numSites - n - 1l) * n; } if (myIsAccumulativeReport) { myAccumulativeInterval = 1.0f / (float) myNumAccumulativeBins; myAccumulativeRValueBins = new int[myNumAccumulativeBins + 1]; } else { myMapResults = new OpenLongObjectHashMap((int)numSites); } } private long getMapKey(int r, int c) { return (c < r) ? (((long) c * myAlignment.numberOfSites()) + r) : (((long) r * myAlignment.numberOfSites()) + c); } public static LDResult calculateBitLDForHaplotype(boolean ignoreHets, int minTaxaForEstimate, GenotypeTable alignment, int site1, int site2) { FisherExact fisherExact = FisherExact.getInstance((2 * alignment.numberOfTaxa()) + 10); BitSet rMj = alignment.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Major); BitSet rMn = alignment.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Minor); BitSet cMj = alignment.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Major); BitSet cMn = alignment.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Minor); return getLDForSitePair(rMj, rMn, cMj, cMn, 2, minTaxaForEstimate, -1.0f, fisherExact, site1, site2); } public static LDResult calculateBitLDForHaplotype(int minTaxaForEstimate, int minorCnt, GenotypeTable alignment, int site1, int site2) { FisherExact fisherExact = FisherExact.getInstance((2 * alignment.numberOfTaxa()) + 10); BitSet rMj = alignment.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Major); BitSet rMn = alignment.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Minor); BitSet cMj = alignment.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Major); BitSet cMn = alignment.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Minor); return getLDForSitePair(rMj, rMn, cMj, cMn, minorCnt, minTaxaForEstimate, -1.0f, fisherExact, site1, site2); } private void calculateBitLDForHaplotype(boolean ignoreHets) { //It will ignore hets, make a new Alignment and set all het calls to missing. Otherwise set the pointer to the old alignment GenotypeTable workingAlignment; if (ignoreHets) { workingAlignment =GenotypeTableBuilder.getHomozygousInstance(myAlignment); } else { workingAlignment = myAlignment; } for (long currTest = 0; currTest < myTotalTests; currTest++) { int r = getRowFromIndex(currTest); int c = getColFromIndex(currTest); int currentProgress = (int) (100.0 * ((double) currTest / (double) myTotalTests)); fireProgress(currentProgress); BitSet rMj = workingAlignment.allelePresenceForAllTaxa(r, WHICH_ALLELE.Major); BitSet rMn = workingAlignment.allelePresenceForAllTaxa(r, WHICH_ALLELE.Minor); BitSet cMj = workingAlignment.allelePresenceForAllTaxa(c, WHICH_ALLELE.Major); BitSet cMn = workingAlignment.allelePresenceForAllTaxa(c, WHICH_ALLELE.Minor); LDResult ldr = getLDForSitePair(rMj, rMn, cMj, cMn, 2, myMinTaxaForEstimate, -1.0f, myFisherExact,r,c); if (myIsAccumulativeReport) { if (Float.isNaN(ldr.r2())) { myAccumulativeRValueBins[myNumAccumulativeBins]++; } else if (ldr.r2() == 1.0f) { myAccumulativeRValueBins[myNumAccumulativeBins - 1]++; } else { int index = (int) Math.floor(ldr.r2() / myAccumulativeInterval); myAccumulativeRValueBins[index]++; } } else { long key = getMapKey(r, c); myMapResults.put(key, ldr); } } //end of currTest if (myMapResults != null) myMapResults.trimToSize(); } private void calculateBitLDWithHets() { //Do nothing; not implemented yet myLogger.error("Calculating LD with hets as a third state is not implemented yet; skipping"); throw new IllegalStateException("LinkageDisequilibrium: calculateBitLDWithHets: Treating hets as a third state is not yet implemented"); } public static double calculateDPrime(int countAB, int countAb, int countaB, int countab, int minTaxaForEstimate) { //this is the normalized D' is Weir Genetic Data Analysis II 1986 p120 double freqR, freqC, freq, countR, countC, nonmissingSampleSize; nonmissingSampleSize = countAB + countAb + countaB + countab; if (nonmissingSampleSize < minTaxaForEstimate) { return Double.NaN; } countR = countab + countAb; countC = countab + countaB; freqR = (nonmissingSampleSize - countR) / nonmissingSampleSize; freqC = (nonmissingSampleSize - countC) / nonmissingSampleSize; // if((freqR==0)||(freqC==0)||(freqR==1)||(freqC==1)) return -999; //changed by ed 8-13-2004 if ((freqR == 0) || (freqC == 0) || (freqR == 1) || (freqC == 1)) { return Double.NaN; } freq = ((double) countAB / nonmissingSampleSize) - (freqR * freqC); if (freq < 0) { return freq / Math.max(-freqR * freqC, -(1 - freqR) * (1 - freqC)); } else { return freq / Math.min((1 - freqR) * freqC, (1 - freqC) * freqR); } //check these equations } public static double calculateRSqr(int countAB, int countAb, int countaB, int countab, int minTaxaForEstimate) { //this is the Hill & Robertson measure as used in Awadella Science 1999 286:2524 double freqA, freqB, rsqr, nonmissingSampleSize; nonmissingSampleSize = countAB + countAb + countaB + countab; if (nonmissingSampleSize < minTaxaForEstimate) { return Double.NaN; } freqA = (double) (countAB + countAb) / nonmissingSampleSize; freqB = (double) (countAB + countaB) / nonmissingSampleSize; //Through missing data & incomplete datasets some alleles can be fixed this returns missing value if ((freqA == 0) || (freqB == 0) || (freqA == 1) || (freqB == 1)) { return Double.NaN; } rsqr = ((double) countAB / nonmissingSampleSize) * ((double) countab / nonmissingSampleSize); rsqr -= ((double) countaB / nonmissingSampleSize) * ((double) countAb / nonmissingSampleSize); rsqr *= rsqr; rsqr /= freqA * (1 - freqA) * freqB * (1 - freqB); return rsqr; } /** * Method for estimating LD between a pair of bit sets. Since there can be tremendous missing data, minimum minor and * minimum site counts ensure that meaningful results are estimated. Site indices are merely there for annotating the LDResult. * @param rMj site 1 major alleles * @param rMn site 1 minor alleles * @param cMj site 2 major alleles * @param cMn site 2 minor alleles * @param minMinorCnt minimum minor allele count after intersection * @param minCnt minimum count after intersection * @param minR2 results below this r2 are ignored for p-value calculation (save times) * @param myFisherExact * @param site1Index annotation of LDresult with sites indices * @param site2Index annotation of LDresult with sites indices * @return */ public static LDResult getLDForSitePair(BitSet rMj, BitSet rMn, BitSet cMj, BitSet cMn, int minMinorCnt, int minCnt, float minR2, FisherExact myFisherExact, int site1Index, int site2Index) { // float[] results = {Float.NaN, Float.NaN, Float.NaN, Float.NaN}; if(myFisherExact==null) myFisherExact=FisherExact.getInstance((2 * (int)rMj.size()) + 10); LDResult.Builder results = new LDResult.Builder(site1Index,site2Index); int n = 0; int[][] contig = new int[2][2]; n += contig[1][1] = (int) OpenBitSet.intersectionCount(rMn, cMn); n += contig[1][0] = (int) OpenBitSet.intersectionCount(rMn, cMj); if (contig[1][0] + contig[1][1] < minMinorCnt) { return results.build(); } n += contig[0][1] = (int) OpenBitSet.intersectionCount(rMj, cMn); if (contig[0][1] + contig[1][1] < minMinorCnt) { return results.build(); } n += contig[0][0] = (int) OpenBitSet.intersectionCount(rMj, cMj); results.n(n); if (n < minCnt) { return results.build(); } double rValue = LinkageDisequilibrium.calculateRSqr(contig[0][0], contig[1][0], contig[0][1], contig[1][1], minCnt); results.r2((float)rValue); if (Double.isNaN(rValue)) { return results.build(); } results.dprime((float) LinkageDisequilibrium.calculateDPrime(contig[0][0], contig[1][0], contig[0][1], contig[1][1], minCnt)); if (rValue < minR2) { return results.build(); } double pValue = myFisherExact.getTwoTailedP(contig[0][0], contig[1][0], contig[0][1], contig[1][1]); results.p((float) pValue); return results.build(); } private int getRowFromIndex(long index) { int row = 0; int n = myAlignment.numberOfSites(); int w = myWindowSize; if (myCurrDesign == testDesign.SlidingWindow && n > w + 1 && index >= w * (w + 1) / (double) 2) { row = (int) Math.ceil(((double) index + 1 - w * (w + 1) / 2 + w * w) / w); } else if (myCurrDesign == testDesign.SiteByAll) { if (index < myTestSite) { row = myTestSite; } else { row = (int) index + 1; } } else if (myCurrDesign == testDesign.SiteList) { int k = (int) Math.ceil((n - 1.5) - Math.sqrt((n - 1.5) * (n - 1.5) + 2 * (n - index - 2))); int m = n * (k + 1) - ((k + 1) * (k + 2) / 2) - 1; if (m - index > n - mySiteList[k] - 1) { row = mySiteList[k]; } else { row = n - 1 - m + (int) index; } } else { row = (int) Math.ceil((Math.sqrt(8 * (index + 1) + 1) - 1) / 2); } return row; } private int getColFromIndex(long index) { int row = getRowFromIndex(index); int col = 0; int n = myAlignment.numberOfSites(); int w = myWindowSize; if (myCurrDesign == testDesign.SlidingWindow && n > w + 1 && index >= w * (w + 1) / (double) 2) { col = (int) ((row - 1 - (double) w * (w + 1) / 2 - w * (row - w) + 1 + index)); } else if (myCurrDesign == testDesign.SiteByAll) { if (index < myTestSite) { col = (int) index; } else { col = myTestSite; } } else if (myCurrDesign == testDesign.SiteList) { int k = (int) Math.ceil((n - 1.5) - Math.sqrt((n - 1.5) * (n - 1.5) + 2 * (n - index - 2))); int m = n * (k + 1) - ((k + 1) * (k + 2) / 2) - 1; if (row != mySiteList[k]) { col = mySiteList[k]; } else { col = n - m + (int) index - 2; int yy = Arrays.binarySearch(mySiteList, row); int y = Arrays.binarySearch(mySiteList, col); while (yy + (y + 1) != 0) { if (y < 0) { y = -(y + 1); } col = col - (yy - y); yy = y; y = Arrays.binarySearch(mySiteList, col); } } } else { col = row - (row * (row + 1) / 2 - (int) index); } return col; } /** * Returns P-value estimate for a given pair of numSites. If there were only * 2 alleles at each locus, then the Fisher Exact P-value (one-tail) is * returned. If more states then the permuted Monte Carlo test is used. * * @param r is site 1 * @param c is site 2 * @return P-value */ public double getPVal(int r, int c) { long key = getMapKey(r, c); LDResult result = (LDResult) myMapResults.get(key); if (result == null) { return Float.NaN; } return result.p(); } /** * Get number of gametes included in LD calculations (after missing data was * excluded) * * @param r is site 1 * @param c is site 2 * @return number of gametes */ public int getSampleSize(int r, int c) { long key = getMapKey(r, c); LDResult result = (LDResult) myMapResults.get(key); if (result == null) { return 0; } return result.n(); } /** * Returns D' estimate for a given pair of numSites * * @param r is site 1 * @param c is site 2 * @return D' */ public float getDPrime(int r, int c) { long key = getMapKey(r, c); LDResult result = (LDResult) myMapResults.get(key); if (result == null) { return Float.NaN; } return result.dPrime(); } /** * Returns r^2 estimate for a given pair of numSites * * @param r is site 1 * @param c is site 2 * @return r^2 */ public float getRSqr(int r, int c) { long key = getMapKey(r, c); LDResult result = (LDResult) myMapResults.get(key); if (result == null) { return Float.NaN; } return result.r2(); } public int getX(int row) { return getColFromIndex(row); } public int getY(int row) { return getRowFromIndex(row); } /** * Returns the counts of the numSites in the alignment */ public int getSiteCount() { return myAlignment.numberOfSites(); } /** * Returns an annotated aligment if one was used for this LD this could be * used to access information of locus position */ public GenotypeTable getAlignment() { return myAlignment; } /** * Returns representation of the LD results as a string */ public String toString() { String delimit = "\t"; StringWriter sw = new StringWriter(); Object[] colNames = getTableColumnNames(); for (int j = 0; j < colNames.length; j++) { sw.write(colNames[j].toString()); sw.write(delimit); } sw.write("\n"); for (long r = 0; r < myTotalTests; r++) { Object[] theRow = getRow(r); for (int i = 0; i < theRow.length; i++) { sw.write(theRow[i].toString()); sw.write(delimit); } } return sw.toString(); } @Override public Object[] getTableColumnNames() { String[] annotatedLabels = null; if (myIsAccumulativeReport) { annotatedLabels = new String[]{"R2BinMin", "R2BinMax", "Count"}; } else { annotatedLabels = new String[]{"Locus1", "Position1", "Site1", "NumberOfStates1", "States1", "Frequency1", "Locus2", "Position2", "Site2", "NumberOfStates2", "States2", "Frequency2", "Dist_bp", "R^2", "DPrime", "pDiseq", "N"}; } return annotatedLabels; } @Override public Object[] getRow(long row) { if (myIsAccumulativeReport) { Object[] data = new Object[3]; if (row == myNumAccumulativeBins) { data[0] = Double.NaN; data[1] = Double.NaN; data[2] = Integer.valueOf(myAccumulativeRValueBins[(int) row]); } else { double start = myAccumulativeInterval * (double) row; data[0] = Double.valueOf(start); data[1] = Double.valueOf(start + myAccumulativeInterval); data[2] = Integer.valueOf(myAccumulativeRValueBins[(int) row]); } return data; } else { int labelOffset = 0; Object[] data = new Object[17]; int r = getRowFromIndex(row); int c = getColFromIndex(row); String rState = myAlignment.majorAlleleAsString(r) + ":" + myAlignment.minorAlleleAsString(r); Integer rStr = Integer.valueOf(r); String cState = myAlignment.majorAlleleAsString(c) + ":" + myAlignment.minorAlleleAsString(c); Integer cStr = Integer.valueOf(c); data[labelOffset++] = myAlignment.chromosomeName(r); data[labelOffset++] = Integer.valueOf(myAlignment.chromosomalPosition(r)); data[labelOffset++] = rStr; data[labelOffset++] = IntegerTwo; data[labelOffset++] = rState; data[labelOffset++] = NotImplemented; data[labelOffset++] = myAlignment.chromosomeName(c); data[labelOffset++] = Integer.valueOf(myAlignment.chromosomalPosition(c)); data[labelOffset++] = cStr; data[labelOffset++] = IntegerTwo; data[labelOffset++] = cState; data[labelOffset++] = NotImplemented; if (myAlignment.chromosomeName(r).equals(myAlignment.chromosomeName(c))) { data[labelOffset++] = Integer.valueOf(Math.abs(myAlignment.chromosomalPosition(r) - myAlignment.chromosomalPosition(c))); } else { data[labelOffset++] = NA; } data[labelOffset++] = getRSqr(r, c); data[labelOffset++] = getDPrime(r, c); data[labelOffset++] = getPVal(r, c); data[labelOffset++] = getSampleSize(r, c); return data; } } @Override public String getTableTitle() { return "Linkage Disequilibrium"; } @Override public int getColumnCount() { return getTableColumnNames().length; } @Override public long getRowCount() { if (myIsAccumulativeReport) { return myNumAccumulativeBins + 1; } else { return myTotalTests; } } @Override public long getElementCount() { return getRowCount() * getColumnCount(); } @Override public Object getValueAt(long row, int col) { return getRow(row)[col]; } protected void fireProgress(int percent) { if (percent < 0) { percent = -percent; } if (myListener != null) { myListener.progress(percent, null); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy