net.maizegenetics.analysis.imputation.PhaseHighCoverage Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
package net.maizegenetics.analysis.imputation;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.lang.SerializationUtils;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;
import org.apache.commons.math3.stat.inference.ChiSquareTest;
import net.maizegenetics.analysis.data.FileLoadPlugin;
import net.maizegenetics.analysis.data.FileLoadPlugin.TasselFileType;
import net.maizegenetics.analysis.distance.MultiDimensionalScalingPlugin;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.stats.PCA.ClassicMds;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.taxa.tree.Tree;
import net.maizegenetics.taxa.tree.TreeClusters;
import net.maizegenetics.taxa.tree.UPGMATree;
import net.maizegenetics.util.LoggingUtils;
public class PhaseHighCoverage {
//this class uses sites with depth of 5 to phase parents
//can phase parents using one progeny but will not know where xo's are
private static final byte N = GenotypeTable.UNKNOWN_ALLELE;
private static final byte NN = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
private Path parentagePath;
private Path genopath;
private Path outhapsSelfCross;
private Path outhapsCross;
private Path monomorphs;
private Path hapsSelf;
private Path outhapsCrossHighCover;
private Path outhapsAllProgeny;
private GenotypeTable myGenotypeTable;
private static ChiSquareTest chisqTest = new ChiSquareTest();
private int monoMultiplier = 100;
Map selfHaps;
public PhaseHighCoverage(GenotypeTable genotype) {
myGenotypeTable = genotype;
}
public List loadPlotInfo() {
List plotList = new ArrayList<>();
List taxaNames = myGenotypeTable.taxa().stream().map(t -> t.getName()).collect(Collectors.toList());
try (BufferedReader br = Files.newBufferedReader(parentagePath)) {
br.readLine();
String input;
while ((input = br.readLine()) != null) {
String[] data = input.split("\t");
if (data.length > 3) {
//check to make sure that myGenotypeTable contains the plot name
if (taxaNames.contains(data[0]))
plotList.add(data);
}
}
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
return plotList;
}
public void phaseParentsUsingAllAvailableProgeny(double minEigenRatio, Path savepath) {
outhapsCrossHighCover = savepath;
phaseParentsUsingAllAvailableProgeny(minEigenRatio);
}
public void phaseParentsUsingAllAvailableProgeny(double minEigenRatio) {
System.out.println("Phasing parents using method phaseParentsUsingAllAvailableProgeny().");
System.out.println("That in turn uses phaseParentUsingSelfAndCrossProgeny()");
System.out.println("-------------------------------------------------------");
int minNumberPhasedSites = 1000;
Map phasedParents = new HashMap<>();
// myGenotypeTable = loadC2TeoGenotype();
List plotInfo = loadPlotInfo();
TreeSet parentSet = new TreeSet<>();
for (String[] plot : plotInfo) {
parentSet.add(plot[1]);
parentSet.add(plot[2]);
}
for (String parent : parentSet) {
System.out.printf("Phasing %s\n", parent);
byte[][] phase = phaseParentUsingSelfAndCrossProgeny(parent, myGenotypeTable, minEigenRatio, plotInfo);
if (phase == null) {
System.out.println("Too few phased haplotypes, skipping.");
System.out.println();
} else {
int nsites = phase[0].length;
int phasedSiteCount = 0;
for (int s = 0; s < nsites; s++) {
if (phase[0][s] != N) {
phasedSiteCount++;
}
}
System.out.printf("%d sites phased for %s\n", phasedSiteCount, parent);
if (phasedSiteCount < minNumberPhasedSites) {
System.out.println("Too few sites phased, skipping.");
} else {
phasedParents.put(parent, phase);
}
}
}
SelfedHaplotypeFinder.serializePhasedHaplotypes(phasedParents, outhapsCrossHighCover);
System.out.println("Finished phasing and storing haplotypes.");
}
public byte[][] phaseParentUsingSelfAndCrossProgeny(String parent, GenotypeTable myGeno, double minEigenRatio, List plotInfo) {
//set the min chisquare statistic
double alpha = 0.0001;
double chisqLimit = new ChiSquaredDistribution(1).inverseCumulativeProbability(1 - alpha);
List phasedHaplotypes = new ArrayList<>();
int parentIndex = myGeno.taxa().indexOf(parent);
byte[] parentGeno = myGeno.genotypeAllSites(parentIndex);
byte[][] phasedParent = new byte[2][];
for (int i = 0; i < 2; i++) {
phasedParent[i] = new byte[myGeno.numberOfSites()];
Arrays.fill(phasedParent[i], N);
}
for (String[] plot : plotInfo) {
if (plot[1].equals(parent) || plot[2].equals(parent)) {
String otherParent;
if (plot[1].equals(parent)) otherParent = plot[2];
else otherParent = plot[1];
System.out.println("phasing " + plot[0]);
byte[][] haps = phaseParentUsingOneProgeny(parent, otherParent, plot[0], myGeno);
phasedHaplotypes.add(haps[0]);
phasedHaplotypes.add(haps[1]);
}
}
if (phasedHaplotypes.size() < 10) return null;
//need to decide how to handle IBD segments (minimize xo's?)
//initially, only polymorphic sites are being phased, which will eliminate IBD segments
//adding monomorphic sites back in will capture those
//have to do this by chromosome
int[] chrstart = myGeno.chromosomesOffsets();
int[] chrend = new int[10];
System.arraycopy(chrstart, 1, chrend, 0, 9);
chrend[9] = myGeno.numberOfSites();
int window = 40; //number of sites with sufficient coverage & polymorphic
int minWindow = 20;
int minPresent = 4;
List monomorphicSites = new ArrayList<>();
for (int c = 0; c < 10; c++) {
int s = chrstart[c];
List prevHapIndices1 = null;
List prevHapIndices2 = null;
int[] previousSiteIndex = null;
boolean isPreviousHapValid = false;
while (s < chrend[c]) {
//index of polymorphic sites
int[] siteIndex = new int[window];
int indexCount = 0;
//this while segment finds the next window polymorphic loci
//stores the sites in siteIndex. indexCount is the number found, which is < window only at the end of the chromosome
int startS = s;
while (s < chrend[c] && indexCount < window) {
int[] alleleCounts = countAllelesAtSite(phasedHaplotypes, s);
int npresent = Arrays.stream(alleleCounts).sum();
if (npresent > minPresent) {
//generate sorted Index
List index = IntStream.range(0, 6).boxed().collect(Collectors.toList());
Collections.sort(index, (a,b) -> alleleCounts[a] >= alleleCounts[b] ? -1 : 1);
//test for polymorphism (minor allele count * 4 > major allele count)
int mult = 10; //was 4 for phase 1, but changing to 10 in phase2 teosinte
if (alleleCounts[index.get(1)] > 1 && alleleCounts[index.get(1)] * mult > alleleCounts[index.get(0)]) {
siteIndex[indexCount++] = s;
}
//test for monomorphism
if (alleleCounts[index.get(0)] > 20 && alleleCounts[index.get(1)] == 0 ) {
monomorphicSites.add(new int[]{s, index.get(0)});
}
}
s++;
}
int nSitesScanned = s - startS;
//if indexCount is too small, create a window by adding the new sites on the end of the previous window
if (indexCount < minWindow) {
if (!isPreviousHapValid) {
//at end of chromosome and no valid window has been found. Probably, the entire chromosome is homozygous.
System.out.printf("No windows phased in chromosome %d%n", c + 1);
continue;
}
int combinedCount = previousSiteIndex.length + indexCount;
int[] combinedIndex = new int[combinedCount];
System.arraycopy(previousSiteIndex, 0, combinedIndex, 0, previousSiteIndex.length);
System.arraycopy(siteIndex, 0, combinedIndex, previousSiteIndex.length, indexCount);
siteIndex = combinedIndex;
indexCount = siteIndex.length;
}
//create a list of these segments
List seglist = new ArrayList<>();
for (byte[] haps : phasedHaplotypes) {
byte[] seg = new byte[indexCount];
for (int i = 0; i < indexCount; i++) seg[i] = haps[siteIndex[i]];
seglist.add(seg);
}
//calculate pairwise distances as mismatch proportion
double[][] dist = mismatchDistanceMatrix(seglist);
List dummyTaxa = IntStream.range(0, dist.length).mapToObj(i -> new Taxon(Integer.toString(i))).collect(Collectors.toList());
DistanceMatrix dm = new DistanceMatrix(dist, new TaxaListBuilder().addAll(dummyTaxa).build());
//test for single haplotype
ClassicMds mds = new ClassicMds(dm);
System.out.printf("Eigenvalue ratio = %1.3f, Eigenvalues: %1.3f, %1.3f, %1.3f, %1.3f\n", mds.getEigenvalue(0)/mds.getEigenvalue(1), mds.getEigenvalue(0), mds.getEigenvalue(1), mds.getEigenvalue(2), mds.getEigenvalue(3));
System.out.printf("%d sites scanned to generate this interval\n", nSitesScanned);
if (mds.getEigenvalue(0)/mds.getEigenvalue(1) < minEigenRatio) continue; //skip this interval
UPGMATree myTree = new UPGMATree(dm);
TreeClusters myClusters = new TreeClusters(myTree);
int[] myGroups = myClusters.getGroups(2);
//make two groups
List hapIndices1 = new ArrayList<>();
List hapIndices2 = new ArrayList<>();
int n = myGroups.length;
for (int i = 0; i < n; i++) {
String name = myTree.getExternalNode(i).getIdentifier().getName();
if (myGroups[i] == 0) hapIndices1.add(Integer.parseInt(name));
else hapIndices2.add(Integer.parseInt(name));
}
//keep track of which haplotypes are assigned to which windows
boolean reverseHaps;
if (prevHapIndices1 == null) reverseHaps = false;
else {
int[][] matches = new int[2][2];
matches[0][0] = countSharedMembers(hapIndices1, prevHapIndices1);
matches[0][1] = countSharedMembers(hapIndices1, prevHapIndices2);
matches[1][0] = countSharedMembers(hapIndices2, prevHapIndices1);
matches[1][1] = countSharedMembers(hapIndices2, prevHapIndices2);
int mainDiagSum = matches[0][0] + matches[1][1];
int offDiagSum = matches[0][1] + matches[1][0];
if (mainDiagSum > 2 * offDiagSum) {
reverseHaps = false;
isPreviousHapValid = true;
}
else if (offDiagSum > 2 * mainDiagSum) {
reverseHaps = true;
isPreviousHapValid = true;
}
else {
//if the previous haplotype is the first one, it has not been validated (by making sure that it can be matched to the adjacent haplotype)
//if the previous haplotype has not been validated, invalidate it (do not use it)
if (!isPreviousHapValid) {
//set previous Hap sites to missing
for (int prevSite : previousSiteIndex) {
phasedParent[0][prevSite] = N;
phasedParent[1][prevSite] = N;
}
reverseHaps = false;
} else {
continue; //do not process this interval but go on to next
}
}
System.out.printf("haplotype matches at chr %d, site %d: %d, %d, %d, %d, reverse = %b\n", c + 1, s, matches[0][0],matches[0][1],matches[1][0],matches[1][1], reverseHaps);
}
System.out.printf("hap list 1 has %d members, 2 has %d members (chr %d, site %d)\n", hapIndices1.size(), hapIndices2.size(), c + 1, s);
List hapList1, hapList2;
if (reverseHaps) {
hapList1 = hapIndices2.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
hapList2 = hapIndices1.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
} else {
hapList1 = hapIndices1.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
hapList2 = hapIndices2.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
}
System.out.println(haplotypeAsString(consensusHaplotype(hapList1)));
System.out.println(haplotypeAsString(consensusHaplotype(hapList2)));
//update phasedHaplotypes with sites that segregate with haplotypes
//update phasedHaplotypes[0] with hapList1, phasedHaplotypes[1] with hapList2
for (int i = 0; i < indexCount; i++) {
int[] alleleCount1 = countAllelesAtSite(hapList1, i);
int[] alleleCount2 = countAllelesAtSite(hapList2, i);
long[][] counts = new long[2][2];
int which = 0;
for (int j = 0; j < 4; j++) {
if (alleleCount1[j] > 0 || alleleCount2[j] > 0) {
if (which == 2) {
System.out.printf("Site %d has more than 2 alleles\n", siteIndex[i]);
break;
}
counts[0][which] = alleleCount1[j];
counts[1][which] = alleleCount2[j];
which++;
}
}
double[] ratio = new double[]{counts[0][0] / (double) counts[0][1], counts[1][0] / (double) counts[1][1]};
if ((ratio[0] >= 2 && ratio[1] <= 0.5) || (ratio[1] >= 2 && ratio[0] <= 0.5)) {
double testval = chisqTest.chiSquare(counts);
if (testval >= chisqLimit) { //update haplotypes with this site
phasedParent[0][siteIndex[i]] = maxAllele(alleleCount1);
phasedParent[1][siteIndex[i]] = maxAllele(alleleCount2);
}
}
}
if (reverseHaps) {
prevHapIndices1 = hapIndices2;
prevHapIndices2 = hapIndices1;
} else {
prevHapIndices1 = hapIndices1;
prevHapIndices2 = hapIndices2;
}
previousSiteIndex = siteIndex;
}
}
//If there are not at least 1500 phased polymorphic sites, return null
int siteCount = 0;
int nsites = myGeno.numberOfSites();
for (int s = 0; s < nsites; s++) {
if (phasedParent[0][s] != N && phasedParent[1][s] != N) siteCount++;
}
System.out.printf("There were %d polymorphic sites phased for %s\n", siteCount, parent);
if (siteCount < 1500) return null;
//add in the monomorphic sites
for (int[] site : monomorphicSites) {
phasedParent[0][site[0]] = (byte) site[1];
phasedParent[1][site[0]] = (byte) site[1];
}
return phasedParent;
}
private byte maxAllele(int[] alleleCounts) {
int max = 0;
for (int i = 1; i < alleleCounts.length; i++) {
if (alleleCounts[i] > alleleCounts[max]) max = i;
}
return (byte) max;
}
private int countSharedMembers(List list1, List list2) {
List refList = new ArrayList(list1);
Collections.sort(refList);
int count = 0;
for (Integer I : list2)
if (Collections.binarySearch(refList, I) > -1) count++;
return count;
}
private byte[] consensusHaplotype(List haplotypes) {
int nsites = haplotypes.get(0).length;
byte[] consensus = new byte[nsites];
for (int i = 0; i < nsites; i++) {
int[] alleleCounts = new int[6];
for (byte[] hap:haplotypes) {
if (hap[i] < 6) alleleCounts[hap[i]]++;
}
int ndx = 0;
for (int j = 1; j < 6; j++) {
if (alleleCounts[j] > alleleCounts[ndx]) ndx = j;
}
consensus[i] = (byte) ndx;
}
return consensus;
}
private String haplotypeAsString(byte[] hap) {
StringBuilder sb = new StringBuilder();
for (byte b:hap) sb.append(NucleotideAlignmentConstants.getHaplotypeNucleotide(b));
return sb.toString();
}
private int[] countAllelesAtSite(List haps, int site) {
int[] alleleCounts = new int[6];
for (byte[] hap:haps) {
if (hap[site] < 6) alleleCounts[hap[site]]++;
}
return alleleCounts;
}
public byte[][] phaseParentUsingOneProgeny(String parent, String otherParent, String progeny, GenotypeTable gt) {
//phase only at sites with sufficient depth for parents and progeny (all hets are okay, homozygotes with depth >= minDepth)
int minDepth = 7;
int nsites = gt.numberOfSites();
byte[][] phasedGenotype = new byte[2][nsites];
Arrays.fill(phasedGenotype[0], N);
Arrays.fill(phasedGenotype[1], N);
int parentIndex = gt.taxa().indexOf(parent);
int otherParentIndex = gt.taxa().indexOf(otherParent);
int progenyIndex = gt.taxa().indexOf(progeny);
for (int s = 0; s < nsites; s++) {
byte parentGenotype = gt.genotype(parentIndex, s);
byte otherParentGenotype = gt.genotype(otherParentIndex, s);
byte progenyGenotype = gt.genotype(progenyIndex, s);
boolean parentHomozygous = !GenotypeTableUtils.isHeterozygous(parentGenotype) && gt.depth().depth(parentIndex, s) >= minDepth;
boolean otherParentHomozygous = !GenotypeTableUtils.isHeterozygous(otherParentGenotype) && gt.depth().depth(otherParentIndex, s) >= minDepth;
boolean progenyHomozygous = !GenotypeTableUtils.isHeterozygous(progenyGenotype) && gt.depth().depth(progenyIndex, s) >= minDepth;
//1. if parent is homozygous (minDepth) then haplotype = parent allele
//2. if progeny is homozygous (minDepth) then haplotype = progeny allele
//3. if other parent is homozygous (minDepth) then
//3a. if progeny is not homozygous other parent then parent is opposite allele of other parent
//otherwise haplotype == N
if (parentHomozygous) {
phasedGenotype[0][s] = phasedGenotype[1][s] = GenotypeTableUtils.getDiploidValues(parentGenotype)[0];
} else if (progenyHomozygous) {
phasedGenotype[0][s] = GenotypeTableUtils.getDiploidValues(progenyGenotype)[0];
if (parentGenotype != NN) {
byte progenyAllele = GenotypeTableUtils.getDiploidValues(progenyGenotype)[0];
byte[] parentAlleles = GenotypeTableUtils.getDiploidValues(parentGenotype);
if (parentAlleles[0] != progenyAllele) phasedGenotype[1][s] = parentAlleles[0];
else if (parentAlleles[1] != progenyAllele) phasedGenotype[1][s] = parentAlleles[1];
}
} else if (otherParentHomozygous) {
byte[] progenyAlleles = GenotypeTableUtils.getDiploidValues(progenyGenotype);
byte otherAllele = GenotypeTableUtils.getDiploidValues(otherParentGenotype)[0];
if (progenyAlleles[0] != otherAllele || progenyAlleles[1] != otherAllele) {
if (progenyAlleles[0] != otherAllele) phasedGenotype[0][s] = progenyAlleles[0];
else phasedGenotype[0][s] = progenyAlleles[1];
byte[] parentAlleles = GenotypeTableUtils.getDiploidValues(parentGenotype);
if (parentAlleles[0] != phasedGenotype[0][s]) phasedGenotype[1][s] = parentAlleles[0];
else if (parentAlleles[1] != phasedGenotype[0][s]) phasedGenotype[1][s] = parentAlleles[1];
}
}
}
return phasedGenotype;
}
private double[][] mismatchDistanceMatrix(List segments) {
int n = segments.size();
double[][] dist = new double[n][n];
for (int i = 0; i < n - 1; i++) {
byte[] seg1 = segments.get(i);
for (int j = i + 1; j < n; j++) {
byte[] seg2 = segments.get(j);
int notMissingCount = 0;
int notMatchCount = 0;
for (int k = 0; k < seg1.length; k++) {
if (seg1[k] != N && seg2[k] != N) {
notMissingCount++;
if ((seg1[k] != seg2[k])) notMatchCount++;
}
}
if (notMissingCount > 0) {
dist[i][j] = dist[j][i] = notMatchCount / (double) notMissingCount;
}
}
}
return dist;
}
private double averageDistanceToCluster(double[][] distance, List cluster, int ndx) {
int n = cluster.size();
double total = 0;
for (int i = 0; i < n; i++) {
total += distance[cluster.get(i)][ndx];
}
return total / (double) n;
}
private void saveSeglist(List seglist, String filename) {
try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(filename))) {
for (byte[] seg : seglist) {
for (byte b : seg) bw.write(NucleotideAlignmentConstants.getHaplotypeNucleotide(b));
}
bw.write("\n");
} catch(IOException e) {
e.printStackTrace();
}
}
private void saveDistanceMatrix(DistanceMatrix dm, String filename) {
int n = dm.numberOfTaxa();
try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(filename))) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
bw.write(String.format("%1.5f ", dm.getDistance(i, j)));
}
}
} catch(IOException e) {
e.printStackTrace();
}
}
public void setOuthapsAllProgeny(String filename) {
outhapsAllProgeny = Paths.get(filename);
}
public void setParentage(String filename) {
parentagePath = Paths.get(filename);
}
public void setGenotypeTable(GenotypeTable genotype) {
myGenotypeTable = genotype;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy