net.maizegenetics.analysis.imputation.NucleotideImputationUtils Maven / Gradle / Ivy
package net.maizegenetics.analysis.imputation;
import net.maizegenetics.analysis.clustering.Haplotype;
import net.maizegenetics.analysis.clustering.HaplotypeCluster;
import net.maizegenetics.analysis.clustering.HaplotypeClusterer;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.FilterGenotypeTable;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import net.maizegenetics.stats.math.GammaFunction;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.BitUtil;
import net.maizegenetics.util.OpenBitSet;
import org.apache.commons.math3.stat.inference.TestUtils;
import org.apache.log4j.Logger;
import java.util.*;
public class NucleotideImputationUtils {
private static final Logger myLogger = Logger.getLogger(NucleotideImputationUtils.class);
static final byte AA = NucleotideAlignmentConstants.getNucleotideDiploidByte("AA");
static final byte CC = NucleotideAlignmentConstants.getNucleotideDiploidByte("CC");
static final byte GG = NucleotideAlignmentConstants.getNucleotideDiploidByte("GG");
static final byte TT = NucleotideAlignmentConstants.getNucleotideDiploidByte("TT");
static final byte AC = NucleotideAlignmentConstants.getNucleotideDiploidByte("AC");
static final byte AG = NucleotideAlignmentConstants.getNucleotideDiploidByte("AG");
static final byte AT = NucleotideAlignmentConstants.getNucleotideDiploidByte("AT");
static final byte CG = NucleotideAlignmentConstants.getNucleotideDiploidByte("CG");
static final byte CT = NucleotideAlignmentConstants.getNucleotideDiploidByte("CT");
static final byte GT = NucleotideAlignmentConstants.getNucleotideDiploidByte("GT");
static final byte NN = NucleotideAlignmentConstants.getNucleotideDiploidByte("NN");
static final byte CA = NucleotideAlignmentConstants.getNucleotideDiploidByte("CA");
static final byte[] byteval = new byte[] {AA,CC,GG,TT,AC};
final static HashMap genotypeMap = new HashMap();
static{
genotypeMap.put(AA, 0);
genotypeMap.put(CC, 1);
genotypeMap.put(GG, 2);
genotypeMap.put(TT, 3);
genotypeMap.put(AC, 4);
genotypeMap.put(CA, 4);
}
static final byte[][] genoval = new byte[][]{{AA,AC,AG,AT},{AC,CC,CG,CT},{AG,CG,GG,GT},{AT,CT,GT,TT}};
//prevents instantiation of this class
private NucleotideImputationUtils() {}
public static void callParentAllelesByWindow(PopulationData popdata, double maxMissing, double minMaf, int windowSize, double minR) {
byte N = GenotypeTable.UNKNOWN_ALLELE;
BitSet polybits;
double segratio = popdata.contribution1;
if (minMaf < 0 && (segratio == 0.5 || segratio == 0.25 || segratio == 0.75)) {
polybits = whichSitesSegregateCorrectly(popdata.original, maxMissing, segratio);
} else {
if (minMaf < 0) minMaf = 0;
polybits = whichSitesArePolymorphic(popdata.original, maxMissing, minMaf);
}
myLogger.info("polybits cardinality = " + polybits.cardinality());
OpenBitSet filteredBits = whichSnpsAreFromSameTag(popdata.original, polybits);
myLogger.info("filteredBits.cardinality = " + filteredBits.cardinality());
BitSet ldFilteredBits;
if (minR > 0) {
int halfWindow = windowSize / 2;
ldFilteredBits = ldfilter(popdata.original, halfWindow, minR, filteredBits);
} else {
ldFilteredBits = filteredBits;
}
myLogger.info("ldFilteredBits.cardinality = " + ldFilteredBits.cardinality());
int nsites = popdata.original.numberOfSites();
int ntaxa = popdata.original.numberOfTaxa();
popdata.alleleA = new byte[nsites];
popdata.alleleC = new byte[nsites];
popdata.snpIndex = new OpenBitSet(nsites);
for (int s = 0; s < nsites; s++) {
popdata.alleleA[s] = N;
popdata.alleleC[s] = N;
}
int[] parentIndex = new int[2];
parentIndex[0] = popdata.original.taxa().indexOf(popdata.parent1);
parentIndex[1] = popdata.original.taxa().indexOf(popdata.parent2);
//iterate through windows
GenotypeTable[] prevAlignment = null;
int[][] snpIndices = getWindows(ldFilteredBits, windowSize);
boolean append = false;
int nWindows = snpIndices.length;
for (int w = 0; w < nWindows; w++) {
int[] snpIndex;
if (append) {
int n1 = snpIndices[w-1].length;
int n2 = snpIndices[w].length;
snpIndex = new int[n1 + n2];
System.arraycopy(snpIndices[w-1], 0, snpIndex, 0, n1);
System.arraycopy(snpIndices[w], 0, snpIndex, n1, n2);
append = false;
} else {
snpIndex = snpIndices[w];
}
//mask the hets
// MutableNucleotideAlignment mna = MutableNucleotideAlignment.getInstance(FilterAlignment.getInstance(popdata.original, snpIndex));
// int n = snpIndex.length;
// byte missing = NucleotideAlignmentConstants.getNucleotideDiploidByte('N');
// for (int i = 0; i < n; i++) {
// for (int t = 0; t < ntaxa; t++) {
// if (hetMask[t].fastGet(snpIndex[i])) mna.setBase(t, i, missing);
// }
// }
// mna.clean();
GenotypeTable windowAlignment = FilterGenotypeTable.getInstance(popdata.original, snpIndex);
LinkedList snpList = new LinkedList(); //snpList is a list of snps (indices) in this window
for (int s:snpIndex) snpList.add(s);
GenotypeTable[] taxaAlignments = getTaxaGroupAlignments(windowAlignment, parentIndex, snpList);
if (taxaAlignments == null) {
append = true;
} else {
//are groups in this alignment correlated with groups in the previous alignment
double r = 0;
if (prevAlignment != null) {
r = getIdCorrelation(new TaxaList[][] {{prevAlignment[0].taxa(), prevAlignment[1].taxa()},{taxaAlignments[0].taxa(), taxaAlignments[1].taxa()}});
myLogger.info("For " + popdata.name + " the window starting at " + popdata.original.siteName(snpIndex[0]) + ", r = " + r + " , # of snps in alignment = " + snpList.size());
} else {
myLogger.info("For " + popdata.name + " the window starting at " + popdata.original.siteName(snpIndex[0]) + ", # of snps in alignment = " + snpList.size());
}
checkAlignmentOrderIgnoringParents(taxaAlignments, popdata, r); //if r is negative switch alignment order (which will result in a positive r)
//debug -check upgma tree
// int[] selectSnps = new int[snpList.size()];
// int cnt = 0;
// for (Integer s : snpList) selectSnps[cnt++] = s;
// SBitAlignment sba = SBitAlignment.getInstance(FilterAlignment.getInstance(popdata.original, selectSnps));
// IBSDistanceMatrix dm = new IBSDistanceMatrix(sba);
// estimateMissingDistances(dm);
// Tree myTree = new UPGMATree(dm);
// TreeDisplayPlugin tdp = new TreeDisplayPlugin(null, true);
// tdp.performFunction(new DataSet(new Datum("Snp Tree", myTree, "Snp Tree"), null));
prevAlignment = taxaAlignments;
callParentAllelesUsingTaxaGroups(popdata, taxaAlignments, snpList);
}
}
myLogger.info("number of called snps = " + popdata.snpIndex.cardinality());
//create the imputed array with A/C calls
int nsnps = (int) popdata.snpIndex.cardinality();
ntaxa = popdata.original.numberOfTaxa();
nsites = popdata.original.numberOfSites();
int[] snpIndex = new int[nsnps];
int snpcount = 0;
PositionListBuilder posBuilder = new PositionListBuilder();
for (int s = 0; s < nsites; s++) {
if (popdata.snpIndex.fastGet(s)) {
snpIndex[snpcount++] = s;
posBuilder.add(popdata.original.positions().get(s));
}
}
snpIndex = Arrays.copyOf(snpIndex, snpcount);
GenotypeTableBuilder builder = GenotypeTableBuilder.getTaxaIncremental(posBuilder.build());
nsnps = snpIndex.length;
for (int t = 0; t < ntaxa; t++) {
byte[] taxonGeno = new byte[nsnps];
for (int s = 0; s < nsnps; s++) {
byte Aallele = popdata.alleleA[snpIndex[s]];
byte Callele = popdata.alleleC[snpIndex[s]];
byte[] val = GenotypeTableUtils.getDiploidValues(popdata.original.genotype(t, snpIndex[s]));
if (val[0] == Aallele && val[1] == Aallele) taxonGeno[s] = AA;
else if (val[0] == Callele && val[1] == Callele) taxonGeno[s] = CC;
else if ((val[0] == Aallele && val[1] == Callele) || (val[0] == Callele && val[1] == Aallele)) taxonGeno[s] = AC;
else taxonGeno[s] = NN;
}
builder.addTaxon(popdata.original.taxa().get(t), taxonGeno);
}
popdata.imputed = builder.build();
}
public static void callParentAllelesUsingClusters(PopulationData popdata, double maxMissing, double minMaf, int windowSize, boolean checkSubpops) {
HaplotypeCluster.ReturnHaplotype = HaplotypeCluster.TYPE.majority;
int ntaxa = popdata.original.numberOfTaxa();
int nOriginalSites = popdata.original.numberOfSites();
popdata.alleleA = new byte[nOriginalSites];
popdata.alleleC = new byte[nOriginalSites];
popdata.snpIndex = new OpenBitSet(nOriginalSites);
// int minCount = (int) Math.ceil(1 - (maxMissing * ntaxa));
// Alignment a = AlignmentUtils.removeSitesBasedOnFreqIgnoreMissing(popdata.original, minMaf, 1.0, minCount);
// a = deleteSnpsFromSameTag(a);
double maxHet = 0.06;
if (minMaf < 0) minMaf = 0.05;
GenotypeTable a = filterSnpsByTag(popdata.original, minMaf, maxMissing, maxHet);
int nFilteredSites = a.numberOfSites();
int[] siteTranslationArray = new int[nFilteredSites];
//FilterGenotypeTable fa = (FilterGenotypeTable) a;
//for (int s = 0; s < nFilteredSites; s++) siteTranslationArray[s] = fa.translateSite(s);
int[] translateSites = a.siteTranslations();
for (int s = 0; s < nFilteredSites; s++) siteTranslationArray[s] = translateSites[s];
//find a window with only two distinct haplotype clusters
int maxdist = 100;
ArrayList clusters = null;
int start = 0;
while (maxdist > 10) {
clusters = clusterWindow(a, start, windowSize, 4);
int mindist2 = 0, mindist3 = 0;
if (clusters.size() > 2) mindist2 = Math.min(HaplotypeClusterer.clusterDistanceMaxPairDiff(clusters.get(0), clusters.get(2)),
HaplotypeClusterer.clusterDistanceMaxPairDiff(clusters.get(1), clusters.get(2)));
if (clusters.size() > 3) mindist3 = Math.min(HaplotypeClusterer.clusterDistanceMaxPairDiff(clusters.get(0), clusters.get(3)),
HaplotypeClusterer.clusterDistanceMaxPairDiff(clusters.get(1), clusters.get(3)));
maxdist = Math.max(mindist2, mindist3);
start += windowSize;
}
HaplotypeCluster.ReturnHaplotype = HaplotypeCluster.TYPE.majority;
for (int i = 0; i < 4; i++) if (clusters.size() > i) System.out.println(clusters.get(i));
int[] sites = new int[windowSize];
int currentSite = start - windowSize;
for (int i = 0; i < windowSize; i++) sites[i] = currentSite++;
//extend to the right
ArrayList seedClusters = new ArrayList(clusters);
int[] siteNumber = new int[]{windowSize};
int[] firstWindowSites = null;
while (siteNumber[0] == windowSize && sites[windowSize - 1] < nFilteredSites - 1) {
seedClusters = extendClusters(a, seedClusters, sites, siteNumber, true);
if (firstWindowSites == null) firstWindowSites = Arrays.copyOf(sites, sites.length); //start here to extend to the left
if (siteNumber[0] > 0) {
if (siteNumber[0] < windowSize) sites = Arrays.copyOf(sites, siteNumber[0]);
int[] origSites = translateSitesBackToOriginal(sites, siteTranslationArray);
updateAlleleCalls(seedClusters.get(0), origSites, popdata.alleleA);
updateAlleleCalls(seedClusters.get(1), origSites, popdata.alleleC);
for (int site : origSites) popdata.snpIndex.fastSet(site);
}
}
//extend to the left
seedClusters = new ArrayList(clusters);
siteNumber = new int[]{windowSize};
sites = firstWindowSites;
while (siteNumber[0] == windowSize && sites[0] > 0) {
seedClusters = extendClusters(a, seedClusters, sites, siteNumber, false);
if (siteNumber[0] > 0) {
if (siteNumber[0] < windowSize) sites = Arrays.copyOfRange(sites, windowSize - siteNumber[0], windowSize);
int[] origSites = translateSitesBackToOriginal(sites, siteTranslationArray);
updateAlleleCalls(seedClusters.get(0), origSites, popdata.alleleA);
updateAlleleCalls(seedClusters.get(1), origSites, popdata.alleleC);
for (int site : origSites) popdata.snpIndex.fastSet(site);
}
}
//check parents to see if alleles A and C should be exchanged
checkParentage(popdata);
//create the imputed array with A/C calls
int nsnps = (int) popdata.snpIndex.cardinality();
ntaxa = popdata.original.numberOfTaxa();
int nsites = popdata.original.numberOfSites();
int[] snpIndex = new int[nsnps];
int snpcount = 0;
for (int s = 0; s < nsites; s++) {
if (popdata.snpIndex.fastGet(s)) snpIndex[snpcount++] = s;
}
snpIndex = Arrays.copyOf(snpIndex, snpcount);
nsnps = snpIndex.length;
GenotypeTable target = FilterGenotypeTable.getInstance(popdata.original, snpIndex);
createSubPops(popdata, checkSubpops);
checksubpops(popdata, 20);
//filter on taxa ids
int npops = popdata.subpopulationGroups.size();
TaxaList[] taxaLists = new TaxaList[popdata.subpopulationGroups.size()];
popdata.subpopulationGroups.toArray(taxaLists);
TaxaList allTaxa = TaxaListUtils.getAllTaxa(taxaLists);
target = FilterGenotypeTable.getInstance(target, allTaxa);
GenotypeTableBuilder builder = GenotypeTableBuilder.getTaxaIncremental(target.positions());
for (int p = 0; p < npops; p++) {
//create an index back to target for the taxa
TaxaList popTaxaList = popdata.subpopulationGroups.get(p);
ntaxa = popTaxaList.numberOfTaxa();
int[] taxaIndex = new int[ntaxa];
for (int t = 0; t < ntaxa; t++) taxaIndex[t] = target.taxa().indexOf(popTaxaList.get(t));
BitSet subpopSnpUse = popdata.subpoplationSiteIndex.get(p);
nsnps = target.numberOfSites();
for (int t = 0; t < ntaxa; t++) {
byte[] taxonGeno = new byte[nsnps];
String gstr = target.genotypeAsStringRow(t);
for (int s = 0; s < nsnps; s++) {
byte Aallele = popdata.alleleA[snpIndex[s]];
byte Callele = popdata.alleleC[snpIndex[s]];
byte val = target.genotype(taxaIndex[t], s);
if (val == Aallele) taxonGeno[s] = AA;
else if (val == Callele) taxonGeno[s] = CC;
else if (GenotypeTableUtils.isHeterozygous(val)) taxonGeno[s] = AC;
else taxonGeno[s] = NN;
}
builder.addTaxon(popTaxaList.get(t), taxonGeno);
}
}
popdata.imputed = builder.build();
}
public static void callParentAllelesUsingClustersOnly(PopulationData popdata, double maxMissing, double minMaf, int windowSize, boolean checkSubpops ) {
double mindiff = 0.7; //the minimum distance for which two clusters will be considered to be from different haplotypes
int overlap = 10; //overlap between windows used to link adjacent windows
HaplotypeCluster.ReturnHaplotype = HaplotypeCluster.TYPE.majority;
int ntaxa = popdata.original.numberOfTaxa();
int nOriginalSites = popdata.original.numberOfSites();
popdata.alleleA = new byte[nOriginalSites];
popdata.alleleC = new byte[nOriginalSites];
popdata.snpIndex = new OpenBitSet(nOriginalSites);
double maxHet = 0.06;
GenotypeTable a = filterSnpsByTag(popdata.original, minMaf, maxMissing, maxHet);
int nFilteredSites = a.numberOfSites();
int[] siteTranslationArray = new int[nFilteredSites];
//FilterGenotypeTable fa = (FilterGenotypeTable) a;
//for (int s = 0; s < nFilteredSites; s++) siteTranslationArray[s] = fa.translateSite(s);
int[] translateSites = a.siteTranslations();
for (int s = 0; s < nFilteredSites; s++) siteTranslationArray[s] = translateSites[s];
//iterate through windows
int maxsite = nFilteredSites - windowSize - overlap;
ArrayList clusters = null;
HaplotypeCluster.ReturnHaplotype = HaplotypeCluster.TYPE.unanimous;
byte[] prevA = null;
byte[] prevC = null;
for (int start = 0; start <= maxsite; start += windowSize - overlap) {
if (start + windowSize > maxsite) clusters = clusterWindow(a, start, nFilteredSites - start, 2);
else clusters = clusterWindow(a, start, windowSize, 2);
/*
* Strategy for separating clusters, populations with homozygous parents
* Cluster taxa in overlapping windows (3 sites is enough), maxdiff=4
* If cluster diff proportion between biggest cluster and next biggest is < 0.8 (mindiff), try third biggest. If diff < 0.8 raise an error and quit.
* Get haplotype from biggest cluster and the next biggest differnt cluster. Match the overlap to tell which haplotype it goes with.
* If the overlap does not match perfectly raise an error.
*
* */
double diff1 = HaplotypeClusterer.clusterDistanceClusterDiffProportion(clusters.get(0), clusters.get(1));
double diff2 = HaplotypeClusterer.clusterDistanceClusterDiffProportion(clusters.get(0), clusters.get(2));
double diff3 = HaplotypeClusterer.clusterDistanceClusterDiffProportion(clusters.get(0), clusters.get(3));
double diff4 = HaplotypeClusterer.clusterDistanceClusterDiffProportion(clusters.get(0), clusters.get(4));
byte[] hap0 = clusters.get(0).getHaplotype();
byte[] hap1;
if (diff1 >= mindiff) {
hap1 = clusters.get(1).getHaplotype();
} else if (diff2 >= mindiff){
hap1 = clusters.get(2).getHaplotype();
} else if (diff3 >= mindiff){
hap1 = clusters.get(3).getHaplotype();
} else if (diff4 >= mindiff){
hap1 = clusters.get(4).getHaplotype();
} else {
throw new IllegalArgumentException(String.format("No second haplotype at site %d, position %d.", start, a.chromosomalPosition(start)));
}
if (start == 0) {
//update sites that are different in hap0 and 1
for (int s = 0; s < windowSize; s++) {
if (hap0[s] != NN && hap1[s] != NN && hap0[s] != hap1[s]) {
int origSite = siteTranslationArray[s];
popdata.snpIndex.fastSet(origSite);
popdata.alleleA[origSite] = hap0[s];
popdata.alleleC[origSite] = hap1[s];
}
}
prevA = hap0;
prevC = hap1;
} else {
//use overlap to match this cluster to existing haplotypes
//if there is a .8 or better match, invalidate any non-matching sites
int seqlength = hap0.length;
if (haplotypeOverlapSimilarity(prevA, hap0, overlap, windowSize) > 0.8 && haplotypeOverlapSimilarity(prevC, hap1, overlap, windowSize) > 0.8) {
for (int s = 0; s < seqlength; s++) {
boolean overlapMatchingSite = true;
if (s < overlap) {
int prevSite = s + windowSize - overlap;
if (prevA[prevSite] != NN && hap0[s] != NN && (prevA[prevSite] != hap0[s] || prevC[prevSite] != hap1[s])) overlapMatchingSite = false;
}
if (hap0[s] != NN && hap1[s] != NN && hap0[s] != hap1[s] && overlapMatchingSite) {
int origSite = siteTranslationArray[start + s];
popdata.snpIndex.fastSet(origSite);
popdata.alleleA[origSite] = hap0[s];
popdata.alleleC[origSite] = hap1[s];
}
}
prevA = hap0;
prevC = hap1;
} else if (haplotypeOverlapSimilarity(prevA, hap1, overlap, windowSize) > 0.8 && haplotypeOverlapSimilarity(prevC, hap0, overlap, windowSize) > 0.8) {
for (int s = 0; s < seqlength; s++) {
boolean overlapMatchingSite = true;
if (s < overlap) {
int prevSite = s + windowSize - overlap;
if (prevA[prevSite] != NN && hap1[s] != NN && (prevA[prevSite] != hap1[s] || prevC[prevSite] != hap0[s])) overlapMatchingSite = false;
}
if (hap0[s] != NN && hap1[s] != NN && hap0[s] != hap1[s] && overlapMatchingSite) {
int origSite = siteTranslationArray[start + s];
popdata.snpIndex.fastSet(origSite);
popdata.alleleA[origSite] = hap1[s];
popdata.alleleC[origSite] = hap0[s];
}
}
prevA = hap1;
prevC = hap0;
} else {
throw new IllegalArgumentException(String.format("Haplotypes fail to approximately match previous haplotypes at site %d, position %d.", start, a.chromosomalPosition(start)));
}
}
}
//test each called site for LD with neighbors
createSubPops(popdata, checkSubpops);
checksubpops(popdata, 20);
int nsnps = (int) popdata.snpIndex.cardinality();
// ntaxa = popdata.original.getSequenceCount();
int nsites = popdata.original.numberOfSites();
int[] snpIndex = new int[nsnps];
int snpcount = 0;
for (int s = 0; s < nsites; s++) {
if (popdata.snpIndex.fastGet(s)) snpIndex[snpcount++] = s;
}
snpIndex = Arrays.copyOf(snpIndex, snpcount);
nsnps = snpIndex.length;
GenotypeTable target = FilterGenotypeTable.getInstance(popdata.original, snpIndex);
GenotypeTableBuilder builder = GenotypeTableBuilder.getTaxaIncremental(target.positions());
//assign parent calls to imputed alignment
int nsubpops = popdata.subpopulationGroups.size();
for (int sp = 0; sp < nsubpops; sp++) {
BitSet SI = popdata.subpoplationSiteIndex.get(sp);
TaxaList subpopTaxa = popdata.subpopulationGroups.get(sp);
ntaxa = subpopTaxa.numberOfTaxa();
for (int t = 0; t < ntaxa; t++) {
int taxaTargetIndex = target.taxa().indexOf(subpopTaxa.get(t));
byte[] taxonGeno = new byte[nsnps];
for (int s = 0; s < nsnps; s++) {
if (SI.fastGet(snpIndex[s])) {
byte Aallele = popdata.alleleA[snpIndex[s]];
byte Callele = popdata.alleleC[snpIndex[s]];
byte[] val = GenotypeTableUtils.getDiploidValues(target.genotype(taxaTargetIndex, s));
if (val[0] == Aallele && val[1] == Aallele) taxonGeno[s] = AA;
else if (val[0] == Callele && val[1] == Callele) taxonGeno[s] = CC;
else if ((val[0] == Aallele && val[1] == Callele) || (val[0] == Callele && val[1] == Aallele)) taxonGeno[s] = AC;
else taxonGeno[s] = NN;
}
}
builder.addTaxon(subpopTaxa.get(t), taxonGeno);
}
}
popdata.imputed = builder.build();
}
public static boolean compareHaplotypes(byte[] h0, byte[] h1, int overlap, int haplength) {
boolean same = true;
int ptr = 0;
int start = haplength - overlap;
while (same && ptr < overlap) {
if (h0[start + ptr] != h1[ptr] && h0[start + ptr] != NN && h1[ptr] != NN) same = false;
ptr++;
}
return same;
}
public static double haplotypeOverlapSimilarity(byte[] h0, byte[] h1, int overlap, int haplength) {
int ptr = 0;
int start = haplength - overlap;
int matchCount = 0;
int nonmissingCount = 0;
while (ptr < overlap) {
if (h0[start + ptr] != NN && h1[ptr] != NN) {
nonmissingCount++;
if(h0[start + ptr] == h1[ptr]) matchCount++;
}
ptr++;
}
return ((double) matchCount) / ((double) nonmissingCount);
}
public static void createSubPops(PopulationData popdata, boolean isNAM) {
popdata.subpopulationGroups = new ArrayList();
if (isNAM) {
//create the population subgoups
TaxaListBuilder[] sublists = new TaxaListBuilder[3];
for (int i = 0; i < 3; i++) sublists[i] = new TaxaListBuilder();
for (Taxon taxon:popdata.original.taxa()) {
int pop = SubpopulationFinder.getNamSubPopulation(taxon);
if (pop >= 0) sublists[pop].add(taxon);
}
//add the subgroups to popdata
for (int i = 0; i < 3; i++) {
TaxaList poplist = sublists[i].build();
if (poplist.size() > 0) popdata.subpopulationGroups.add(poplist);
}
} else {
popdata.subpopulationGroups.add(popdata.original.taxa());
}
}
public static void checksubpops(PopulationData popdata, int halfWindowSize) {
double minMatchScore = 0.9;
int nsubpops = popdata.subpopulationGroups.size();
int totalsites = popdata.original.numberOfSites();
popdata.subpoplationSiteIndex = new ArrayList();
for (int i = 0; i < nsubpops; i++) {
GenotypeTable a = FilterGenotypeTable.getInstance(popdata.original, popdata.subpopulationGroups.get(i) );
OpenBitSet snpUseIndex = new OpenBitSet(totalsites);
popdata.subpoplationSiteIndex.add(snpUseIndex);
//goodsites are the sites to be used for this population, that is the sites that have been assigned parent allele calls
//remove sites with maf = 0 from goodsites
int nsites = (int) popdata.snpIndex.cardinality();
int[] goodsites = new int[nsites];
int ptr = 0;
for (int ts = 0; ts < totalsites; ts++) if (popdata.snpIndex.fastGet(ts) && a.minorAlleleFrequency(ts) > 0) goodsites[ptr++] = ts;
nsites = ptr;
goodsites = Arrays.copyOf(goodsites, ptr);
//calculate site scores for the goodsites
int ntaxa = a.numberOfTaxa();
double[] score = new double[nsites];
for (int s = 0; s < nsites; s++) {
int firstSite = s - halfWindowSize;
int lastSite = s + halfWindowSize;
if (firstSite < 0) {
lastSite -= firstSite;
firstSite = 0;
} else if (lastSite > nsites - 1) {
firstSite -= lastSite - nsites + 1;
lastSite = nsites - 1;
}
byte Aallele = popdata.alleleA[goodsites[s]];
byte Callele = popdata.alleleC[goodsites[s]];
int matchCount = 0;
int totalCount = 0;
for (int t = 0; t < ntaxa; t++) {
byte tgeno = a.genotype(t, goodsites[s]);
byte[] matchAllele;
if (tgeno == Aallele || tgeno == Callele) {
if (tgeno == Aallele) matchAllele = popdata.alleleA;
else matchAllele = popdata.alleleC;
for (int s2 = firstSite; s2 <= lastSite; s2++) if (s2 != s) {
byte tgeno2 = a.genotype(t, goodsites[s2]);
if (tgeno2 != NN) {
totalCount += 1;
if (tgeno2 == matchAllele[goodsites[s2]]) matchCount++;
}
}
}
}
score[s] = ((double) matchCount) / ((double) totalCount);
}
//iterative improvement in this block (experimental)
int iter = 1;
double[] cuts = new double[]{0.7, 0.8, 0.9};
for (double cutoff:cuts) {
int[] bestsites = new int[nsites];
ptr = 0;
for (int s = 0; s < nsites; s++) {
if (score[s] > cutoff) bestsites[ptr++] = goodsites[s];
}
bestsites = Arrays.copyOf(bestsites, ptr);
int nbest = ptr;
for (int s = 0; s < nsites; s++) {
int closeSite = Arrays.binarySearch(bestsites, goodsites[s]);
if (closeSite < 0) closeSite = -closeSite - 1;
int firstSite = closeSite - halfWindowSize;
int lastSite = closeSite + halfWindowSize;
if (firstSite < 0) {
lastSite -= firstSite;
firstSite = 0;
} else if (lastSite > nbest - 1) {
firstSite -= lastSite - nbest + 1;
lastSite = nbest - 1;
}
byte Aallele = popdata.alleleA[goodsites[s]];
byte Callele = popdata.alleleC[goodsites[s]];
int matchCount = 0;
int totalCount = 0;
for (int t = 0; t < ntaxa; t++) {
byte tgeno = a.genotype(t, goodsites[s]);
byte[] matchAllele;
if (tgeno == Aallele || tgeno == Callele) {
if (tgeno == Aallele) matchAllele = popdata.alleleA;
else matchAllele = popdata.alleleC;
for (int s2 = firstSite; s2 <= lastSite; s2++) if (bestsites[s2] != goodsites[s]) {
byte tgeno2 = a.genotype(t, bestsites[s2]);
if (tgeno2 != NN) {
totalCount += 1;
if (tgeno2 == matchAllele[bestsites[s2]]) matchCount++;
}
}
}
}
double matchScore = ((double) matchCount) / ((double) totalCount);
score[s] = matchScore;
}
}
//use score to set use index
//note that scores are the goodsite scores, so the snp index is contained in the goodsite index
for (int s = 0; s < nsites; s++) {
if (score[s] >= minMatchScore) {
snpUseIndex.fastSet(goodsites[s]);
}
}
}
}
public static int[] translateSitesBackToOriginal(int[] sites, int[] translation) {
int n = sites.length;
int[] orig = new int[n];
for (int i = 0; i < n; i++) {
orig[i] = translation[sites[i]];
}
return orig;
}
public static ArrayList extendClusters(GenotypeTable a, ArrayList clusters, int[] sites, int[] windowSize, boolean forward) {
int maxDiff = 4;
int totalSites = a.numberOfSites();
int targetSize = windowSize[0];
int direction;
int nextSite;
int nSelected = 0;
if (forward) {
direction = 1;
nextSite = sites[sites.length - 1] + 1;
} else {
direction = -1;
nextSite = sites[0] - 1;
}
String[] hap0 = new String[targetSize];
String[] hap1 = new String[targetSize];
while (nSelected < targetSize && nextSite < totalSites && nextSite > -1) {
//snpset0 provides a count of alleles for cluster0, snpset1 for cluster1
Multiset snpset0 = HashMultiset.create();
Multiset snpset1 = HashMultiset.create();
HaplotypeCluster hc = clusters.get(0);
for (Iterator iterator = hc.getIterator(); iterator.hasNext();) {
Haplotype hap = iterator.next();
int taxon = hap.taxonIndex;
byte allele = a.genotype(taxon, nextSite);
if (allele != Haplotype.N) {
snpset0.add(NucleotideAlignmentConstants.getNucleotideIUPAC(allele));
}
}
hc = clusters.get(1);
for (Iterator iterator = hc.getIterator(); iterator.hasNext();) {
Haplotype hap = iterator.next();
int taxon = hap.taxonIndex;
byte allele = a.genotype(taxon, nextSite);
if (allele != Haplotype.N) {
snpset1.add(NucleotideAlignmentConstants.getNucleotideIUPAC(allele));
}
}
//if either cluster has two alleles with roughly equal classes, then do not use
String mj0 = getMajorAlleleFromSnpset(snpset0);
if (mj0 != null) { /*only one big class in snpset0*/
String mj1 = getMajorAlleleFromSnpset(snpset1);
if (mj1 != null && !mj0.equals(mj1)) { /*only one big class in snpset1*/ /*the alleles are not equal*/
if (forward) {
hap0[nSelected] = mj0;
hap1[nSelected] = mj1;
sites[nSelected] = nextSite;
} else {
int ptr = targetSize - 1 - nSelected;
hap0[ptr] = mj0;
hap1[ptr] = mj1;
sites[ptr] = nextSite;
}
nSelected ++;
}
}
nextSite += direction;
}
windowSize[0] = nSelected;
//create 2 new clusters
//add taxa to each cluster that have diff <= 4 compared to either hap0 or hap1 but not both
int[] selectedSites;
if (forward) {
selectedSites = Arrays.copyOf(sites, nSelected);
} else {
selectedSites = Arrays.copyOfRange(sites, targetSize - nSelected, targetSize);
}
int numberOfSites = selectedSites.length;
byte[] seq = new byte[numberOfSites];
if (forward) {
for (int s = 0; s < numberOfSites; s++) seq[s] = NucleotideAlignmentConstants.getNucleotideDiploidByte(hap0[s]);
} else {
for (int s = 0; s < numberOfSites; s++) seq[s] = NucleotideAlignmentConstants.getNucleotideDiploidByte(hap0[s + targetSize - numberOfSites]);
}
Haplotype haplotype0 = new Haplotype(seq);
seq = new byte[numberOfSites];
if (forward || numberOfSites == targetSize) {
for (int s = 0; s < numberOfSites; s++) seq[s] = NucleotideAlignmentConstants.getNucleotideDiploidByte(hap1[s]);
} else {
int dif = targetSize - numberOfSites;
for (int s = 0; s < numberOfSites; s++) seq[s] = NucleotideAlignmentConstants.getNucleotideDiploidByte(hap1[s + dif]);
}
Haplotype haplotype1 = new Haplotype(seq);
ArrayList cluster0 = new ArrayList<>();
ArrayList cluster1 = new ArrayList<>();
int ntaxa = a.numberOfTaxa();
GenotypeTable b = FilterGenotypeTable.getInstance(a, selectedSites);
for (int t = 0; t < ntaxa; t++) {
seq = b.genotypeAllSites(t);
Haplotype hap = new Haplotype(seq, t);
int dist0 = hap.distanceFrom(haplotype0);
int dist1 = hap.distanceFrom(haplotype1);
if (dist0 <= maxDiff && dist1 > maxDiff) cluster0.add(hap);
else if (dist0 > maxDiff && dist1 <= maxDiff) cluster1.add(hap);
else if (numberOfSites < maxDiff) {
if (dist0 == 0 && dist1 > 1) cluster0.add(hap);
else if (dist0 > 1 && dist1 == 0) cluster1.add(hap);
}
}
//return the clusters
ArrayList newClusters = new ArrayList<>();
newClusters.add(new HaplotypeCluster(cluster0));
newClusters.add(new HaplotypeCluster(cluster1));
return newClusters;
}
public static String getMajorAlleleFromSnpset(Multiset snpset, double alpha) {
ArrayList> snplist = new ArrayList<>(snpset.entrySet());
if (snplist.size() == 0) return null;
if (snplist.size() == 1) return snplist.get(0).getElement();
Collections.sort(snplist, new Comparator>(){
@Override
public int compare(Entry entry1, Entry entry2) {
if (entry1.getCount() > entry2.getCount()) return -1;
else if (entry1.getCount() < entry2.getCount()) return 1;
return 0;
}
});
long[] observed = new long[]{snplist.get(0).getCount(), snplist.get(1).getCount()};
double total = observed[0] + observed[1];
double[] expected = new double[]{ total/2.0, total/2.0 };
boolean different = false;
try {
different = TestUtils.chiSquareTest(expected, observed, alpha);
} catch (Exception e) {
System.out.println("Exception calculating chi-square in NucleotideImputationUtils.testClassSize(): ");
e.printStackTrace();
}
if (different) {
return snplist.get(0).getElement();
}
return null;
}
public static String getMajorAlleleFromSnpset(Multiset snpset) {
return getMajorAlleleFromSnpset(snpset, 0.05);
}
public static void updateAlleleCalls(HaplotypeCluster cluster, int[] sites, byte[] alleleCalls) {
byte[] haplotype = cluster.getHaplotype();
int n = haplotype.length;
for (int i = 0; i < n; i++) alleleCalls[sites[i]] = haplotype[i];
}
public static void callParentAllelesByWindowForBackcrosses(PopulationData popdata, double maxMissing, double minMaf, int windowSize, double minR) {
BitSet polybits = whichSitesSegregateCorrectly(popdata.original, maxMissing, .25);
myLogger.info("polybits cardinality = " + polybits.cardinality());
OpenBitSet filteredBits = whichSnpsAreFromSameTag(popdata.original, 0.8);
filteredBits.and(polybits);
System.out.println("filteredBits.cardinality = " + filteredBits.cardinality());
BitSet ldFilteredBits;
if (minR > 0) {
int halfWindow = windowSize / 2;
ldFilteredBits = ldfilter(popdata.original, halfWindow, minR, filteredBits);
} else {
ldFilteredBits = filteredBits;
}
myLogger.info("ldFilteredBits.cardinality = " + ldFilteredBits.cardinality());
int nsites = popdata.original.numberOfSites();
int ntaxa = popdata.original.numberOfTaxa();
popdata.alleleA = new byte[nsites];
popdata.alleleC = new byte[nsites];
popdata.snpIndex = new OpenBitSet(nsites);
for (int s = 0; s < nsites; s++) {
if(ldFilteredBits.fastGet(s)) {
popdata.alleleA[s] = popdata.original.majorAllele(s);
popdata.alleleC[s] = popdata.original.minorAllele(s);
popdata.snpIndex.fastSet(s);
}
}
myLogger.info("number of called snps = " + popdata.snpIndex.cardinality());
//create the imputed array with A/C calls
int nsnps = (int) popdata.snpIndex.cardinality();
ntaxa = popdata.original.numberOfTaxa();
nsites = popdata.original.numberOfSites();
int[] snpIndex = new int[nsnps];
int snpcount = 0;
for (int s = 0; s < nsites; s++) {
if (popdata.snpIndex.fastGet(s)) snpIndex[snpcount++] = s;
}
snpIndex = Arrays.copyOf(snpIndex, snpcount);
GenotypeTable target = FilterGenotypeTable.getInstance(popdata.original, snpIndex);
myLogger.info("filtered on snps");
//remove taxa with low coverage
TaxaListBuilder taxaBuilder = new TaxaListBuilder();
int minGametes = 200;
for (int t = 0; t < ntaxa; t++) {
if (target.totalGametesNonMissingForTaxon(t) > minGametes) taxaBuilder.add(target.taxa().get(t));
}
GenotypeTable target2 = FilterGenotypeTable.getInstance(target, taxaBuilder.build());
myLogger.info("identified low coverage taxa");
nsnps = snpIndex.length;
ntaxa = target2.numberOfTaxa();
GenotypeTableBuilder builder = GenotypeTableBuilder.getSiteIncremental(target2.taxa());
for (int s = 0; s < nsnps; s++) {
byte Aallele = popdata.alleleA[snpIndex[s]];
byte Callele = popdata.alleleC[snpIndex[s]];
byte[] siteGeno = new byte[ntaxa];
for (int t = 0; t < ntaxa; t++) {
byte[] val = GenotypeTableUtils.getDiploidValues(target2.genotype(t, s));
if (val[0] == Aallele && val[1] == Aallele) siteGeno[t] = AA;
else if (val[0] == Callele && val[1] == Callele) siteGeno[t] = CC;
else if ((val[0] == Aallele && val[1] == Callele)||(val[0] == Callele && val[1] == Aallele)) siteGeno[t] = AC;
else siteGeno[t] = NN;
}
builder.addSite(target2.positions().get(s), siteGeno);
}
myLogger.info("called alleles");
popdata.imputed = builder.build();
}
public static void callParentAllelesByWindowForMultipleBC(PopulationData popdata, double maxMissing, int minMinorAlleleCount, int windowSize) {
BitSet polybits = whichSitesArePolymorphic(popdata.original, maxMissing, minMinorAlleleCount);
myLogger.info("polybits cardinality = " + polybits.cardinality());
OpenBitSet filteredBits = whichSnpsAreFromSameTag(popdata.original, 0.8);
filteredBits.and(polybits);
System.out.println("filteredBits.cardinality = " + filteredBits.cardinality());
int nsites = popdata.original.numberOfSites();
int ntaxa = popdata.original.numberOfTaxa();
popdata.alleleA = new byte[nsites];
popdata.alleleC = new byte[nsites];
popdata.snpIndex = new OpenBitSet(nsites);
for (int s = 0; s < nsites; s++) {
if(filteredBits.fastGet(s)) {
popdata.alleleA[s] = popdata.original.majorAllele(s);
popdata.alleleC[s] = popdata.original.minorAllele(s);
popdata.snpIndex.fastSet(s);
}
}
myLogger.info("number of called snps = " + popdata.snpIndex.cardinality());
//create the imputed array with A/C calls
int nsnps = (int) popdata.snpIndex.cardinality();
ntaxa = popdata.original.numberOfTaxa();
nsites = popdata.original.numberOfSites();
int[] snpIndex = new int[nsnps];
int snpcount = 0;
for (int s = 0; s < nsites; s++) {
if (popdata.snpIndex.fastGet(s)) snpIndex[snpcount++] = s;
}
snpIndex = Arrays.copyOf(snpIndex, snpcount);
GenotypeTable target = FilterGenotypeTable.getInstance(popdata.original, snpIndex);
// MutableNucleotideAlignment mna = MutableNucleotideAlignment.getInstance(target);
nsnps = snpIndex.length;
GenotypeTableBuilder builder = GenotypeTableBuilder.getSiteIncremental(target.taxa());
for (int s = 0; s < nsnps; s++) {
byte Aallele = popdata.alleleA[snpIndex[s]];
byte Callele = popdata.alleleC[snpIndex[s]];
byte[] siteGeno = new byte[ntaxa];
for (int t = 0; t < ntaxa; t++) {
byte[] val = GenotypeTableUtils.getDiploidValues(target.genotype(t, s));
if (val[0] == Aallele && val[1] == Aallele) siteGeno[t] = AA;
else if (val[0] == Callele && val[1] == Callele) siteGeno[t] = CC;
else if ((val[0] == Aallele && val[1] == Callele)||(val[0] == Callele && val[1] == Aallele)) siteGeno[t] = AC;
else siteGeno[t] = NN;
}
builder.addSite(target.positions().get(s), siteGeno);
}
myLogger.info("called alleles");
popdata.imputed = builder.build();
}
public static void checkAlignmentOrderIgnoringParents(GenotypeTable[] alignments, PopulationData family, double r) {
boolean swapAlignments = false;
double minR = -0.05;
int p1group, p2group;
//r is the correlation between taxa assignments in this window and the previous one
//if r is negative then parental assignments should be swapped, which will make it positive
//this can happen when the parents are not in the data set and the assignment of parents to group is arbitrary
if (r < minR) swapAlignments = true;
if (swapAlignments) {
GenotypeTable temp = alignments[0];
alignments[0] = alignments[1];
alignments[1] = temp;
}
}
public static int[][] getWindows(BitSet ispoly, int windowSize) {
int npoly = (int) ispoly.cardinality();
int nsnps = (int) ispoly.size();
int nwindows = npoly/windowSize;
int remainder = npoly % windowSize;
if (remainder > windowSize/2) nwindows++; //round up
int[][] windows = new int[nwindows][];
int setsize = npoly/nwindows;
int windowCount = 0;
int snpCount = 0;
int polyCount = 0;
while (snpCount < nsnps && windowCount < nwindows) {
int numberLeft = npoly - polyCount;
if (numberLeft < setsize * 2) setsize = numberLeft;
int[] set = new int[setsize];
int setcount = 0;
while (setcount < setsize && snpCount < nsnps) {
if (ispoly.fastGet(snpCount)) {
set[setcount++] = snpCount;
polyCount++;
}
snpCount++;
}
windows[windowCount++] = set;
}
return windows;
}
/**
* @param family a PopulationData object containing information for this family
* @param taxaGroups an array of two alignments corresponding to two clusters of taxa
* @param snpList the list of snps to be called
*/
public static void callParentAllelesUsingTaxaGroups(PopulationData family, GenotypeTable[] taxaGroups, LinkedList snpList) {
int nsnps = taxaGroups[0].numberOfSites();
Iterator snpit = snpList.iterator();
for ( int s = 0; s < nsnps; s++) {
byte[] major = new byte[2];
major[0] = taxaGroups[0].majorAllele(s);
major[1] = taxaGroups[1].majorAllele(s);
Integer snpIndex = snpit.next();
if(major[0] != GenotypeTable.UNKNOWN_ALLELE && major[1] != GenotypeTable.UNKNOWN_ALLELE && major[0] != major[1]) {
family.alleleA[snpIndex] = major[0];
family.alleleC[snpIndex] = major[1];
family.snpIndex.fastSet(snpIndex);
}
}
}
public static double getIdCorrelation(TaxaList[][] id) {
double[][] counts = new double[2][2];
counts[0][0] = TaxaListUtils.getCommonTaxa(id[0][0], id[1][0]).numberOfTaxa();
counts[0][1] = TaxaListUtils.getCommonTaxa(id[0][0], id[1][1]).numberOfTaxa();
counts[1][0] = TaxaListUtils.getCommonTaxa(id[0][1], id[1][0]).numberOfTaxa();
counts[1][1] = TaxaListUtils.getCommonTaxa(id[0][1], id[1][1]).numberOfTaxa();
double num = counts[0][0] * counts[1][1] - counts[0][1] * counts[1][0];
double p1 = counts[0][0] + counts[0][1];
double q1 = counts[1][0] + counts[1][1];
double p2 = counts[0][0] + counts[1][0];
double q2 = counts[0][1] + counts[1][1];
return num / Math.sqrt(p1 * q1 * p2 * q2);
}
public static BitSet whichSitesArePolymorphic(GenotypeTable a, double maxMissing, int minMinorAlleleCount) {
int ntaxa = a.numberOfTaxa();
double minMaf = ((double) minMinorAlleleCount) / ((double) (2 * ntaxa));
return whichSitesArePolymorphic(a, maxMissing, minMaf);
}
public static BitSet whichSitesArePolymorphic(GenotypeTable a, double maxMissing, double minMaf) {
//which sites are polymorphic? minor allele count > 2 and exceed the minimum allele count
int nsites = a.numberOfSites();
int ntaxa = a.numberOfTaxa();
OpenBitSet polybits = new OpenBitSet(nsites);
for (int s = 0; s < nsites; s++) {
int notMissingCount = a.totalNonMissingForSite(s);
int minorAlleleCount = a.minorAlleleCount(s);
double maf = a.minorAlleleFrequency(s);
double pMissing = ((double) (ntaxa - notMissingCount)) / ((double) ntaxa);
if (pMissing <= maxMissing && maf >= minMaf) polybits.fastSet(s);
}
return polybits;
}
public static BitSet whichSitesSegregateCorrectly(GenotypeTable a, double maxMissing, double ratio) {
int nsites = a.numberOfSites();
int ntaxa = a.numberOfTaxa();
OpenBitSet polybits = new OpenBitSet(nsites);
for (int s = 0; s < nsites; s++) {
int[][] freq = a.allelesSortedByFrequency(s);
int nMissing = ntaxa - a.totalNonMissingForSite(s);
double pMissing = ((double) nMissing) / ((double) ntaxa);
if (freq[1].length > 1 && pMissing <= maxMissing) {
int Mj = freq[1][0];
int Mn = freq[1][1];
double pmono = binomialProbability(Mj + Mn, Mn, 0.002);
double pquarter = binomialProbability(Mj + Mn, Mn, 0.25);
double phalf = binomialProbability(Mj + Mn, Mn, 0.5);
if (ratio == 0.25 || ratio == 0.75) {
// if (pquarter / (pmono + phalf) > 2) polybits.fastSet(s);
// double poneseven = binomialProbability(Mj + Mn, Mn, .125);
// double pthreefive = binomialProbability(Mj + Mn, Mn, .375);
// if (pquarter > poneseven && pquarter > pthreefive) polybits.fastSet(s);
if (pquarter > phalf && pquarter > pmono) polybits.fastSet(s);
} else {
if (phalf / (pmono + pquarter) > 2) polybits.fastSet(s);
}
}
}
return polybits;
}
private static double binomialProbability(int trials, int successes, double pSuccess) {
double n = trials;
double k = successes;
double logprob = GammaFunction.lnGamma(n + 1.0) -
GammaFunction.lnGamma(k + 1.0) - GammaFunction.lnGamma(n - k + 1.0) + k * Math.log(pSuccess) + (n - k) * Math.log(1 - pSuccess);
return Math.exp(logprob);
}
public static GenotypeTable[] getTaxaGroupAlignments(GenotypeTable a, int[] parentIndex, LinkedList snpIndices) {
//cluster taxa for these snps to find parental haplotypes (cluster on taxa)
GenotypeTable[] taxaClusters = ImputationUtils.getTwoClusters(a, 20);
LinkedList originalList = new LinkedList(snpIndices);
int nsites = a.numberOfSites();
boolean[] include = new boolean[nsites];
int[] includedSnps = new int[nsites];
int snpcount = 0;
for (int s = 0; s < nsites; s++) {
Integer snpIndex = snpIndices.remove();
if ( taxaClusters[0].majorAllele(s) != taxaClusters[1].majorAllele(s) ) {
include[s] = true;
includedSnps[snpcount++] = s;
snpIndices.add(snpIndex);
} else {
include[s] = false;
}
}
if (snpcount > 5) {
includedSnps = Arrays.copyOf(includedSnps, snpcount);
if (snpcount == nsites) return taxaClusters;
else return ImputationUtils.getTwoClusters(FilterGenotypeTable.getInstance(a, includedSnps), 20);
} else {
snpIndices.clear();
snpIndices.addAll(originalList);
return taxaClusters;
}
}
public static double computeRForAlleles(int site1, int site2, GenotypeTable a) {
int s1Count = 0;
int s2Count = 0;
int prodCount = 0;
int totalCount = 0;
long[] m11 = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Major).getBits();
long[] m12 = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Minor).getBits();
long[] m21 = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Major).getBits();
long[] m22 = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Minor).getBits();
int n = m11.length;
for (int i = 0; i < n; i++) {
long valid = (m11[i] ^ m12[i]) & (m21[i] ^ m22[i]); //only count non-het & non-missing
long s1major = m11[i] & valid;
long s2major = m21[i] & valid;
long s1s2major = s1major & s2major & valid;
s1Count += BitUtil.pop(s1major);
s2Count += BitUtil.pop(s2major);
prodCount += BitUtil.pop(s1s2major);
totalCount += BitUtil.pop(valid);
}
if (totalCount < 2) return Double.NaN;
//Explanation of method:
// if major site one is x=1, minor site one is x = 0 and for site 2 y = 1 or 0
// r = [sum(xy) - sum(x)sum(y)/N] / sqrt[(sum(x) - sum(x)*sum(x)/N) * ((sum(y) - sum(y)*sum(y)/N)]
// and sum(x) - sum(x)*sum(x)/N = sum(x)(N - sum(x))/N
// because sum(x^2) = sum(x)
double num = ((double) prodCount - ((double) s1Count * s2Count) / ((double) totalCount));
double denom = ((double) (s1Count * (totalCount - s1Count))) / ((double) totalCount);
denom *= ((double) (s2Count * (totalCount - s2Count))) / ((double) totalCount);
if (denom == 0) return Double.NaN;
return num / Math.sqrt(denom);
}
public static double computeRForMissingness(int site1, int site2, GenotypeTable a) {
// int s1Count = 0;
// int s2Count = 0;
// int prodCount = 0;
int totalCount = a.numberOfTaxa();
BitSet m11 = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Major);
BitSet m12 = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Minor);
BitSet m21 = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Major);
BitSet m22 = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Minor);
OpenBitSet s1present = new OpenBitSet(m11.getBits(), m11.getNumWords());
s1present.union(m12);
OpenBitSet s2present = new OpenBitSet(m21.getBits(), m21.getNumWords());
s2present.union(m22);
long s1Count = s1present.cardinality();
long s2Count = s2present.cardinality();
long prodCount = OpenBitSet.intersectionCount(s1present, s2present);
//Explanation of method:
// if major site one is x=1, minor site one is x = 0 and for site 2 y = 1 or 0
// r = [sum(xy) - sum(x)sum(y)/N] / sqrt[(sum(x) - sum(x)*sum(x)/N) * ((sum(y) - sum(y)*sum(y)/N)]
// and sum(x) - sum(x)*sum(x)/N = sum(x)(N - sum(x))/N
// because sum(x^2) = sum(x)
double num = ((double) prodCount - ((double) s1Count * s2Count) / ((double) totalCount));
double denom = ((double) (s1Count * (totalCount - s1Count))) / ((double) totalCount);
denom *= ((double) (s2Count * (totalCount - s2Count))) / ((double) totalCount);
if (denom == 0) return Double.NaN;
return num / Math.sqrt(denom);
}
public static boolean areSameTag(int site1, int site2, GenotypeTable a) {
int pos1 = a.chromosomalPosition(site1);
int pos2 = a.chromosomalPosition(site2);
if (Math.abs(pos2 - pos1) > 64) return false;
long[] matchCount = matchCount(site1, site2, a);
long numberNotMatching = matchCount[1] - matchCount[0];
if (numberNotMatching > 5) return false;
return true;
}
public static long[] matchCount(int site1, int site2, GenotypeTable a) {
int totalCount = a.numberOfTaxa();
BitSet m11 = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Major);
BitSet m12 = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Minor);
BitSet m21 = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Major);
BitSet m22 = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Minor);
OpenBitSet majorSame = new OpenBitSet(m11);
majorSame.notXor(m21);
OpenBitSet minorSame = new OpenBitSet(m12);
minorSame.notXor(m22);
OpenBitSet equal1 = new OpenBitSet(majorSame);
equal1.and(minorSame);
OpenBitSet majorMinorSame = new OpenBitSet(m11.getBits());
majorMinorSame.notXor(m22);
OpenBitSet minorMajorSame = new OpenBitSet(m12.getBits());
minorMajorSame.notXor(m21);
OpenBitSet equal2 = new OpenBitSet(majorMinorSame.getBits());
equal2.and(minorMajorSame);
//Sites for which both the minor and minor alleles have the same value
long majorCardinality = equal1.cardinality(totalCount - 1);
long majorMinorCardinality = equal2.cardinality(totalCount - 1);
if (majorCardinality >= majorMinorCardinality) return new long[]{majorCardinality, totalCount};
return new long[]{majorMinorCardinality, totalCount};
}
public static double computeGenotypeR(int site1, int site2, GenotypeTable a) throws IllegalStateException {
BitSet s1mj = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Major);
BitSet s1mn = a.allelePresenceForAllTaxa(site1, WHICH_ALLELE.Minor);
BitSet s2mj = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Major);
BitSet s2mn = a.allelePresenceForAllTaxa(site2, WHICH_ALLELE.Minor);
OpenBitSet bothpresent = new OpenBitSet(s1mj);
bothpresent.union(s1mn);
OpenBitSet s2present = new OpenBitSet(s2mj);
s2present.union(s2mn);
bothpresent.intersect(s2present);
long nsites = a.numberOfTaxa();
double sum1 = 0;
double sum2 = 0;
double sumsq1 = 0;
double sumsq2 = 0;
double sumprod = 0;
double count = 0;
for (long i = 0; i < nsites; i++) {
if (bothpresent.fastGet(i)) {
double val1 = 0;
double val2 = 0;
if (s1mj.fastGet(i)) {
if (s1mn.fastGet(i)) {
val1++;
} else {
val1 += 2;
}
}
if (s2mj.fastGet(i)) {
if (s2mn.fastGet(i)) {
val2++;
} else {
val2 += 2;
}
}
sum1 += val1;
sumsq1 += val1*val1;
sum2 += val2;
sumsq2 += val2*val2;
sumprod += val1*val2;
count++;
}
}
//r = sum(xy)/sqrt[sum(x2)*sum(y2)], where x = X - Xbar and y = Y - Ybar
//num.r = sum(XY) - 1/n * sum(X)sum(y)
//denom.r = sqrt[ (sum(XX) - 1/n*sum(X)sum(X)) * (sum(YY) - 1/n*sum(Y)sum(Y)) ]
double num = sumprod - sum1 / count * sum2;
double denom = Math.sqrt( (sumsq1 - sum1 / count * sum1) * (sumsq2 - sum2 / count * sum2) );
if (denom == 0) return Double.NaN;
return num / denom;
}
public static GenotypeTable imputeUsingViterbiFiveState(GenotypeTable a, double probHeterozygous, String familyName) {
return imputeUsingViterbiFiveState(a, probHeterozygous, familyName, false);
}
public static GenotypeTable imputeUsingViterbiFiveState(GenotypeTable a, double probHeterozygous, String familyName, boolean useVariableRecombitionRates) {
//states are in {all A; 3A:1C; 1A:1C, 1A:3C; all C}
//obs are in {A, C, M}, where M is heterozygote A/C
int maxIterations = 50;
HashMap obsMap = new HashMap();
obsMap.put(AA, (byte) 0);
obsMap.put(AC, (byte) 1);
obsMap.put(CA, (byte) 1);
obsMap.put(CC, (byte) 2);
int ntaxa = a.numberOfTaxa();
int nsites = a.numberOfSites();
//initialize the transition matrix
double[][] transition = new double[][] {
{.999,.0001,.0003,.0001,.0005},
{.0002,.999,.00005,.00005,.0002},
{.0002,.00005,.999,.00005,.0002},
{.0002,.00005,.00005,.999,.0002},
{.0005,.0001,.0003,.0001,.999}
};
TransitionProbability tp;
if (useVariableRecombitionRates) {
tp = new TransitionProbabilityWithVariableRecombination(a.chromosomeName(0));
} else {
tp = new TransitionProbability();
}
tp.setTransitionProbability(transition);
int chrlength = a.chromosomalPosition(nsites - 1) - a.chromosomalPosition(0);
tp.setAverageSegmentLength( chrlength / nsites );
//initialize the emission matrix, states (5) in rows, observations (3) in columns
double[][] emission = new double[][] {
{.998,.001,.001},
{.6,.2,.2},
{.4,.2,.4},
{.2,.2,.6},
{.001,.001,.998}
};
EmissionProbability ep = new EmissionProbability();
ep.setEmissionProbability(emission);
//set up indices to non-missing data
ArrayList notMissingIndex = new ArrayList();
int[] notMissingCount = new int[ntaxa];
ArrayList nonMissingObs = new ArrayList();
ArrayList snpPositions = new ArrayList();
for (int t = 0; t < ntaxa; t++) {
long[] bits = a.allelePresenceForAllSites(t, WHICH_ALLELE.Major).getBits();
BitSet notMiss = new OpenBitSet(bits, bits.length);
notMiss.or(a.allelePresenceForAllSites(t, WHICH_ALLELE.Minor));
notMissingIndex.add(notMiss);
notMissingCount[t] = (int) notMiss.cardinality();
}
for (int t = 0; t < ntaxa; t++) {
byte[] obs = new byte[notMissingCount[t]];
int[] pos = new int[notMissingCount[t]];
nonMissingObs.add(obs);
snpPositions.add(pos);
BitSet isNotMissing = notMissingIndex.get(t);
int nmcount = 0;
for (int s = 0; s < nsites; s++) {
byte base = a.genotype(t, s);
if (isNotMissing.fastGet(s) && obsMap.get(a.genotype(t, s)) == null) {
myLogger.info("null from " + Byte.toString(base));
}
if (isNotMissing.fastGet(s)) {
obs[nmcount] = obsMap.get(a.genotype(t, s));
pos[nmcount++] = a.chromosomalPosition(s);
}
}
}
double phom = (1 - probHeterozygous) / 2;
double[] pTrue = new double[]{phom, .25*probHeterozygous ,.5 * probHeterozygous, .25*probHeterozygous, phom};
//iterate
ArrayList bestStates = new ArrayList();
int[][] previousStateCount = new int[5][3];
int iter = 0;
boolean hasNotConverged = true;
while (iter < maxIterations && hasNotConverged) {
//apply Viterbi
myLogger.info("Iteration " + iter++ + " for " + familyName);
bestStates.clear();
for (int t = 0; t < ntaxa; t++) {
tp.setPositions(snpPositions.get(t));
int nobs = notMissingCount[t];
if (nobs >= 20) {
ViterbiAlgorithm va = new ViterbiAlgorithm(nonMissingObs.get(t), tp, ep, pTrue);
va.calculate();
bestStates.add(va.getMostProbableStateSequence());
} else { //do not impute if obs < 20
myLogger.info("Fewer then 20 observations for " + a.taxa().taxaName(t));
byte[] states = new byte[nobs];
byte[] obs = nonMissingObs.get(t);
for (int i = 0; i < nobs; i++) {
if (obs[i] == AA) states[i] = 0;
else if (obs[i] == CC) states[i] = 4;
else states[i] = 2;
}
bestStates.add(states);
}
}
//re-estimate transition probabilities
int[][] transitionCounts = new int[5][5];
double[][] transitionProb = new double[5][5];
for (int t = 0; t < ntaxa; t++) {
byte[] states = bestStates.get(t);
for (int s = 1; s < notMissingCount[t]; s++) {
transitionCounts[states[s-1]][states[s]]++;
}
}
//transition is prob(state2 | state1) = count(cell)/count(row)
for (int row = 0; row < 5; row++) {
double rowsum = 0;
for (int col = 0; col < 5; col++) rowsum += transitionCounts[row][col];
for (int col = 0; col < 5; col++) transitionProb[row][col] = ((double) transitionCounts[row][col]) / rowsum;
}
tp.setTransitionCounts(transitionCounts, chrlength, ntaxa);
//re-estimate emission probabilities
int[][] emissionCounts = new int[5][3];
double[][] emissionProb = new double[5][3];
for (int t = 0; t < ntaxa; t++) {
byte[] obs = nonMissingObs.get(t);
byte[] states = bestStates.get(t);
for (int s = 0; s < notMissingCount[t]; s++) {
emissionCounts[states[s]][obs[s]]++;
}
}
//debug - print observation/state counts
// StringBuilder strb = new StringBuilder("Imputation counts, rows=states, columns=observations:\n");
// for (int[] row:emissionCounts) {
// for (int cell:row) {
// strb.append(cell).append("\t");
// }
// strb.append("\n");
// }
// strb.append("\n");
// myLogger.info(strb.toString());
//check to see if there is a change in the observation/state counts
hasNotConverged = false;
for (int r = 0; r < 5; r++) {
for (int c = 0; c < 3; c++) {
if (previousStateCount[r][c] != emissionCounts[r][c]) {
hasNotConverged = true;
previousStateCount[r][c] = emissionCounts[r][c];
}
}
}
//emission is prob(obs | state) = count(cell)/count(row)
double[] rowSums = new double[5];
double total = 0;
for (int row = 0; row < 5; row++) {
double rowsum = 0;
for (int col = 0; col < 3; col++) rowsum += emissionCounts[row][col];
for (int col = 0; col < 3; col++) emissionProb[row][col] = ((double) emissionCounts[row][col]) / rowsum;
rowSums[row] = rowsum;
total += rowsum;
}
ep.setEmissionProbability(emissionProb);
//re-estimate pTrue
for (int i = 0; i < 5; i++) {
pTrue[i
] = rowSums[i] / total;
}
//if the model has converged or if the max iterations has been reached print tables
if (!hasNotConverged || iter == maxIterations) {
StringBuilder sb = new StringBuilder("Family ");
sb.append(familyName).append(", chromosome ").append(a.chromosomeName(0));
if (iter < maxIterations) {
sb.append(": EM algorithm converged at iteration ").append(iter).append(".\n");
} else {
sb.append(": EM algorithm failed to converge after ").append(iter).append(" iterations.\n");
}
//print transition counts
sb = new StringBuilder("Transition counts from row to column:\n");
for (int[] row:transitionCounts) {
for (int cell:row) {
sb.append(cell).append("\t");
}
sb.append("\n");
}
sb.append("\n");
myLogger.info(sb.toString());
//print transition probabilities
sb = new StringBuilder("Transition probabilities:\n");
for (double[] row:transitionProb) {
for (double cell:row) {
sb.append(cell).append("\t");
}
sb.append("\n");
}
sb.append("\n");
myLogger.info(sb.toString());
//print observation/state counts
sb = new StringBuilder("Imputation counts, rows=states, columns=observations:\n");
for (int[] row:emissionCounts) {
for (int cell:row) {
sb.append(cell).append("\t");
}
sb.append("\n");
}
sb.append("\n");
myLogger.info(sb.toString());
//print emission probabilities
sb = new StringBuilder("Emission probabilities:\n");
for (double[] row:emissionProb) {
for (double cell:row) {
sb.append(cell).append("\t");
}
sb.append("\n");
}
sb.append("\n");
myLogger.info(sb.toString());
}
}
GenotypeTableBuilder builder = GenotypeTableBuilder.getTaxaIncremental(a.positions());
nsites = a.numberOfSites();
for (int t = 0; t < ntaxa; t++) {
BitSet hasData = notMissingIndex.get(t);
byte[] states = bestStates.get(t);
int stateCount = 0;
byte[] taxonGeno = new byte[nsites];
Arrays.fill(taxonGeno, NN);
for (int s = 0; s < nsites; s++) {
if (hasData.fastGet(s)) {
if (states[stateCount] == 0) taxonGeno[s] = AA;
else if (states[stateCount] < 4) taxonGeno[s] = AC;
else if (states[stateCount] == 4) taxonGeno[s] = CC;
stateCount++;
}
}
builder.addTaxon(a.taxa().get(t), taxonGeno);
}
return builder.build();
}
public static void fillGapsInAlignment(PopulationData popdata) {
popdata.imputed = fillGapsInImputedAlignment(popdata);
}
public static GenotypeTable fillGapsInImputedAlignment(PopulationData popdata) {
int ntaxa = popdata.imputed.numberOfTaxa();
int nsites = popdata.imputed.numberOfSites();
GenotypeTableBuilder builder = GenotypeTableBuilder.getTaxaIncremental(popdata.imputed.positions());
for (int t = 0; t < ntaxa; t++) {
int prevsite = -1;
byte prevValue = -1;
byte[] taxonGeno = new byte[nsites];
Arrays.fill(taxonGeno, NN);
for (int s = 0; s < nsites; s++) {
byte val = popdata.imputed.genotype(t, s);
if (val != NN) {
if (prevsite == -1) {
prevsite = s;
prevValue = val;
taxonGeno[s] = val;
} else if(val == prevValue) {
for (int site = prevsite + 1; site <= s; site++) {
taxonGeno[site] = prevValue;
}
prevsite = s;
} else {
prevsite = s;
prevValue = val;
taxonGeno[s] = val;
}
}
}
builder.addTaxon(popdata.imputed.taxa().get(t), taxonGeno);
}
return builder.build();
}
public static GenotypeTable convertParentCallsToNucleotides(PopulationData popdata) {
//set monomorphic sites to major (or only allele) (or not)
//set polymorphic sites consistent with flanking markers if equal, unchanged otherwise
//do not change sites that are not clearly monomorhpic or polymorphic
GenotypeTableBuilder builder = GenotypeTableBuilder.getSiteIncremental(popdata.imputed.taxa());
BitSet isPopSnp = popdata.snpIndex;
int nsites = popdata.original.numberOfSites();
int ntaxa = popdata.imputed.taxa().numberOfTaxa();
int imputedSnpCount = 0;
for (int s = 0; s < nsites; s++) {
if (isPopSnp.fastGet(s)) {
byte AAcall = popdata.alleleA[s];
byte CCcall = popdata.alleleC[s];
byte ACcall;
if (GenotypeTableUtils.isHeterozygous(AAcall)) ACcall = AAcall;
else if (GenotypeTableUtils.isHeterozygous(CCcall)) ACcall = CCcall;
else ACcall = GenotypeTableUtils.getDiploidValue(AAcall, CCcall);
byte[] geno = new byte[ntaxa];
for (int t = 0; t < ntaxa; t++) {
byte parentCall = popdata.imputed.genotype(t, imputedSnpCount);
if (parentCall == AA) {
geno[t] = AAcall;
} else if (parentCall == CC) {
geno[t] = CCcall;
} else if (parentCall == AC || parentCall == CA) {
geno[t] = ACcall;
} else {
geno[t] = NN;
}
}
builder.addSite(popdata.imputed.positions().get(imputedSnpCount), geno);
imputedSnpCount++;
}
}
myLogger.info("Original alignment updated for family " + popdata.name + " chromosome " + popdata.original.chromosomeName(0) + ".\n");
return builder.build();
}
public static OpenBitSet whichSnpsAreFromSameTag(GenotypeTable geno, double minRsq) {
//if (alignIn instanceof SBitAlignment) sba = (SBitAlignment) alignIn;
//else sba = SBitAlignment.getInstance(alignIn);
int nsites = geno.numberOfSites();
int firstSite = 0;
OpenBitSet isSelected = new OpenBitSet(nsites);
isSelected.fastSet(0);
Chromosome firstSnpChr = geno.chromosome(0);
int firstSnpPos = geno.chromosomalPosition(0);
while (firstSite < nsites - 1) {
int nextSite = firstSite + 1;
int nextSnpPos = geno.chromosomalPosition(nextSite);
Chromosome nextSnpChr = geno.chromosome(nextSite);
while (firstSnpChr.equals(nextSnpChr) && nextSnpPos - firstSnpPos < 64) {
//calculate r^2 between snps
BitSet rMj = geno.allelePresenceForAllTaxa(firstSite, WHICH_ALLELE.Major);
BitSet rMn = geno.allelePresenceForAllTaxa(firstSite, WHICH_ALLELE.Minor);
BitSet cMj = geno.allelePresenceForAllTaxa(nextSite, WHICH_ALLELE.Major);
BitSet cMn = geno.allelePresenceForAllTaxa(nextSite, WHICH_ALLELE.Minor);
int[][] contig = new int[2][2];
contig[0][0] = (int) OpenBitSet.intersectionCount(rMj, cMj);
contig[1][0] = (int) OpenBitSet.intersectionCount(rMn, cMj);
contig[0][1] = (int) OpenBitSet.intersectionCount(rMj, cMn);
contig[1][1] = (int) OpenBitSet.intersectionCount(rMn, cMn);
double rsq = calculateRSqr(contig[0][0], contig[0][1], contig[1][0], contig[1][1], 2);
//if rsq cannot be calculated or rsq is less than the minimum rsq for a snp to be considered highly correlated, select this snp
if (Double.isNaN(rsq) || rsq < minRsq) {
isSelected.fastSet(nextSite);
break;
}
nextSite++;
if (nextSite >= nsites) break;
nextSnpPos = geno.chromosomalPosition(nextSite);
nextSnpChr = geno.chromosome(nextSite);
}
firstSite = nextSite;
if (firstSite < nsites) {
firstSnpChr = nextSnpChr;
firstSnpPos = nextSnpPos;
isSelected.fastSet(firstSite);
}
}
return isSelected;
}
public static OpenBitSet whichSnpsAreFromSameTag(GenotypeTable geno, BitSet polybits) {
if (polybits.cardinality() == 0) {
if (polybits instanceof OpenBitSet) return (OpenBitSet) polybits;
else return new OpenBitSet(polybits.getBits(), polybits.getNumWords());
}
int nsites = geno.numberOfSites();
int firstSite = 0;
while (!polybits.fastGet(firstSite)) firstSite++;
OpenBitSet isSelected = new OpenBitSet(nsites);
isSelected.fastSet(firstSite);
int firstSnpPos = geno.chromosomalPosition(firstSite);
Chromosome firstChr = geno.chromosome(firstSite);
while (firstSite < nsites - 1) {
int nextSite = firstSite + 1;
int nextSnpPos = geno.chromosomalPosition(nextSite);
Chromosome nextChr = geno.chromosome(nextSite);
boolean skip = !polybits.fastGet(nextSite) || ( firstChr.equals(nextChr) && nextSnpPos - firstSnpPos < 64 /*&& computeRForMissingness(firstSite, nextSite, geno) > 0.8*/) ;
while (skip) {
boolean validSite = nextSite < nsites;
while (validSite && !polybits.fastGet(nextSite)) {
nextSite++;
validSite = nextSite < nsites;
}
if (validSite) {
nextSnpPos = geno.chromosomalPosition(nextSite);
nextChr = geno.chromosome(nextSite);
int dist = nextSnpPos - firstSnpPos;
boolean nearbySite = firstChr.equals(nextChr) && dist < 64;
double r = computeRForMissingness(firstSite, nextSite, geno);
boolean correlatedSite = Double.isNaN(r) || r > 0.7;
if (nearbySite && correlatedSite) {
skip = true;
nextSite++;
validSite = nextSite < nsites;
} else skip = false;
} else skip = false;
}
firstSite = nextSite;
if (firstSite < nsites) {
firstChr = nextChr;
firstSnpPos = nextSnpPos;
isSelected.fastSet(firstSite);
}
}
return isSelected;
}
public static GenotypeTable filterSnpsByTag(GenotypeTable a, double minMaf, double maxMissing, double maxHet) {
int nOriginalSites = a.numberOfSites();
int[] selectedSites = new int[nOriginalSites];
int ntaxa = a.numberOfTaxa();
//start at the first site with sufficient data
int headSite = -1;
double maf, pmissing, phet;
int npresent, nhet;
do {
headSite++;
if (headSite >= nOriginalSites) {
myLogger.error("Unable to locate a starting site with MAF > " + minMaf + " and fraction missing < " +
maxMissing + " and fraction heterozygous < " + maxHet);
break;
}
maf = a.minorAlleleFrequency(headSite);
npresent = a.totalNonMissingForSite(headSite);
nhet = a.heterozygousCount(headSite);
pmissing = ((double) (ntaxa - npresent)) /((double) ntaxa);
if (npresent == 0) phet = 0;
else phet = ((double) nhet) / ((double) npresent);
} while (maf < minMaf || pmissing > maxMissing || phet > maxHet);
selectedSites[0] = headSite;
int selectedCount = 1;
for (int s = 1; s < nOriginalSites; s++) {
int dist = a.chromosomalPosition(s) - a.chromosomalPosition(headSite);
maf = a.minorAlleleFrequency(s);
npresent = a.totalNonMissingForSite(s);
nhet = a.heterozygousCount(s);
pmissing = ((double) (ntaxa - npresent)) /((double) ntaxa);
if (npresent == 0) phet = 0;
else phet = ((double) nhet) / ((double) npresent);
if ((dist >= 64 || computeRForMissingness(headSite, s, a) < 0.7) && maf >= minMaf && pmissing <= maxMissing && phet <= maxHet) {
selectedSites[selectedCount++] = s;
headSite = s;
}
}
selectedSites = Arrays.copyOf(selectedSites, selectedCount);
return FilterGenotypeTable.getInstance(a, selectedSites);
}
static double calculateRSqr(int countAB, int countAb, int countaB, int countab, int minTaxaForEstimate) {
//this is the Hill & Robertson measure as used in Awadella Science 1999 286:2524
double freqA, freqB, rsqr, nonmissingSampleSize;
nonmissingSampleSize = countAB + countAb + countaB + countab;
if (nonmissingSampleSize < minTaxaForEstimate) {
return Double.NaN;
}
freqA = (double) (countAB + countAb) / nonmissingSampleSize;
freqB = (double) (countAB + countaB) / nonmissingSampleSize;
//Through missing data & incomplete datasets some alleles can be fixed this returns missing value
if ((freqA == 0) || (freqB == 0) || (freqA == 1) || (freqB == 1)) {
return Double.NaN;
}
rsqr = ((double) countAB / nonmissingSampleSize) * ((double) countab / nonmissingSampleSize);
rsqr -= ((double) countaB / nonmissingSampleSize) * ((double) countAb / nonmissingSampleSize);
rsqr *= rsqr;
rsqr /= freqA * (1 - freqA) * freqB * (1 - freqB);
return rsqr;
}
public static BitSet ldfilter(GenotypeTable a, int window, double minR, BitSet filterBits) {
int nsites = a.numberOfSites();
OpenBitSet ldbits = new OpenBitSet(nsites);
for (int s = 0; s < nsites; s++) {
if (filterBits.fastGet(s)) {
double avgr = neighborLD(a, s, window, filterBits);
if (avgr >= minR) {
ldbits.fastSet(s);
}
}
}
return ldbits;
}
public static double neighborLD(GenotypeTable a, int snp, int window, BitSet filterBits) {
double sumr = 0;
int nsites = a.numberOfSites();
//test next sites or to the end of the chromosome
int site = snp + 10;
int siteCount = 0;
int sitesTestedCount = 0;
while (siteCount < window && site < nsites) {
if (filterBits.fastGet(site)) {
double r = computeGenotypeR(snp, site, a);
if (!Double.isNaN(r)) {
sumr += Math.abs(computeGenotypeR(snp, site, a));
siteCount++;
sitesTestedCount++;
}
}
site++;
}
//test previous sites or to the beginning of the chromosome
site = snp - 10;
siteCount = 0;
while (siteCount < window && site >= 0) {
if (filterBits.fastGet(site)) {
sumr += Math.abs(computeGenotypeR(snp, site, a));
siteCount++;
sitesTestedCount++;
}
site--;
}
return sumr / sitesTestedCount;
}
public static ArrayList clusterWindow(GenotypeTable a, int start, int length, int maxdif) {
int ntaxa = a.numberOfTaxa();
int end = start + length;
ArrayList haps = new ArrayList();
for (int t = 0; t < ntaxa; t++) {
haps.add(new Haplotype(a.genotypeRange(t, start, end), t));
}
HaplotypeClusterer hc = new HaplotypeClusterer(haps);
hc.makeClusters();
ArrayList mergedClusters = HaplotypeClusterer.getMergedClusters(hc.getClusterList(), maxdif);
//get sequential haplotypes
// System.out.println("Sequential haplotypes:");
// hc.sortClusters();
//
// for (int i = 0; i < 5; i++) {
// ArrayList clusters = hc.getClusterList();
// if (clusters.size() > 0) {
// double score = clusters.get(0).getScore();
// String hapstring = clusters.get(0).getHaplotypeAsString();
// HaplotypeCluster hapclust = hc.removeFirstHaplotypes(2);
// System.out.printf("Count = %d, score = %1.2f: %s\n",hapclust.getSize(), score, hapstring);
// }
// }
return mergedClusters;
}
/**
* Checks whether alleleA is parent1 and alleleC is parent2. If not, switches them.
* @param popdata
*/
public static void checkParentage(PopulationData popdata) {
myLogger.info(String.format("Checking parents of %s", popdata.name));
// IdGroup ids = popdata.original.getIdGroup();
TaxaList myTaxaList = popdata.original.taxa();
LinkedList p1list = new LinkedList();
LinkedList p2list = new LinkedList();
int nsites = popdata.original.numberOfSites();
int ntaxa = popdata.original.numberOfTaxa();
for (int t = 0; t < ntaxa; t++) {
String tname = myTaxaList.taxaName(t);
if (tname.toUpperCase().contains(popdata.parent1.toUpperCase())) p1list.add(t);
if (tname.toUpperCase().contains(popdata.parent2.toUpperCase())) p2list.add(t);
}
int consistent = 0;
int notConsistent = 0;
for (Integer tndx : p1list) {
int total = 0;
int amatches = 0;
int cmatches = 0;
int other = 0;
for (int s = 0; s < nsites; s++) {
if (popdata.snpIndex.fastGet(s)) {
byte thisAllele = popdata.original.genotype(tndx, s);
if (thisAllele != NN) {
total++;
if (thisAllele == popdata.alleleA[s]) amatches++;
else if (thisAllele == popdata.alleleC[s]) cmatches++;
else other++;
}
}
}
consistent += amatches;
notConsistent += cmatches;
myLogger.info(String.format("%s: total = %d, amatches = %d, cmatches = %d, other = %d", myTaxaList.taxaName(tndx), total, amatches, cmatches, other));
}
for (Integer tndx : p2list) {
int total = 0;
int amatches = 0;
int cmatches = 0;
int other = 0;
for (int s = 0; s < nsites; s++) {
if (popdata.snpIndex.fastGet(s)) {
byte thisAllele = popdata.original.genotype(tndx, s);
if (thisAllele != NN) {
total++;
if (thisAllele == popdata.alleleA[s]) amatches++;
else if (thisAllele == popdata.alleleC[s]) cmatches++;
else other++;
}
}
}
consistent += cmatches;
notConsistent += amatches;
myLogger.info(String.format("%s: total = %d, amatches = %d, cmatches = %d, other = %d", myTaxaList.taxaName(tndx), total, amatches, cmatches, other));
}
double probNotConsistent = 0;
if (notConsistent > 0) {
probNotConsistent = ((double) notConsistent) / ((double) (notConsistent + consistent));
if (probNotConsistent > 0.75) {
byte[] temp = popdata.alleleA;
popdata.alleleA = popdata.alleleC;
popdata.alleleC = temp;
}
}
myLogger.info(String.format("Finished checking parents. probNotConsistent = " + probNotConsistent));
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy