All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.imputation.PhaseHighCoverage Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

There is a newer version: 5.2.94
Show newest version
package net.maizegenetics.analysis.imputation;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.commons.lang.SerializationUtils;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;
import org.apache.commons.math3.stat.inference.ChiSquareTest;

import net.maizegenetics.analysis.data.FileLoadPlugin;
import net.maizegenetics.analysis.data.FileLoadPlugin.TasselFileType;
import net.maizegenetics.analysis.distance.MultiDimensionalScalingPlugin;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.stats.PCA.ClassicMds;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.taxa.tree.Tree;
import net.maizegenetics.taxa.tree.TreeClusters;
import net.maizegenetics.taxa.tree.UPGMATree;
import net.maizegenetics.util.LoggingUtils;

public class PhaseHighCoverage {
	//this class uses sites with depth of 5 to phase parents
	//can phase parents using one progeny but will not know where xo's are
	private static final byte N = GenotypeTable.UNKNOWN_ALLELE;
	private static final byte NN = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;

	private Path parentagePath;
	private Path genopath;
	
	private Path outhapsSelfCross;
	private Path outhapsCross;
	private Path monomorphs;
	private Path hapsSelf;
	private Path outhapsCrossHighCover;
	private Path outhapsAllProgeny;
	
	private GenotypeTable myGenotypeTable;

	private static ChiSquareTest chisqTest = new ChiSquareTest();
	private int monoMultiplier = 100;
	
	Map selfHaps;

	public PhaseHighCoverage(GenotypeTable genotype) {
		myGenotypeTable = genotype;
	}
	
	public List loadPlotInfo() {
    	List plotList = new ArrayList<>();
    	List taxaNames = myGenotypeTable.taxa().stream().map(t -> t.getName()).collect(Collectors.toList());
		try (BufferedReader br = Files.newBufferedReader(parentagePath)) {
			br.readLine();
			String input;
			while ((input = br.readLine()) != null) {
				String[] data = input.split("\t");
				if (data.length > 3) {
					//check to make sure that myGenotypeTable contains the plot name
					if (taxaNames.contains(data[0]))
						plotList.add(data);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
			System.exit(-1);
		}
		return plotList;
	}

	public void phaseParentsUsingAllAvailableProgeny(double minEigenRatio, Path savepath) {
		outhapsCrossHighCover = savepath;
		phaseParentsUsingAllAvailableProgeny(minEigenRatio);
	}
	
	public void phaseParentsUsingAllAvailableProgeny(double minEigenRatio) {
		System.out.println("Phasing parents using method phaseParentsUsingAllAvailableProgeny().");
		System.out.println("That in turn uses phaseParentUsingSelfAndCrossProgeny()");
		System.out.println("-------------------------------------------------------");
		int minNumberPhasedSites = 1000;
		Map phasedParents = new HashMap<>();
//		myGenotypeTable = loadC2TeoGenotype();
		
		List plotInfo = loadPlotInfo();
		
		TreeSet parentSet = new TreeSet<>();
		for (String[] plot : plotInfo) {
			parentSet.add(plot[1]);
			parentSet.add(plot[2]);
		}
		
		for (String parent : parentSet) {
			System.out.printf("Phasing %s\n", parent);
			byte[][] phase = phaseParentUsingSelfAndCrossProgeny(parent, myGenotypeTable, minEigenRatio, plotInfo);
			if (phase == null) {
				System.out.println("Too few phased haplotypes, skipping.");
				System.out.println();
			} else {
				int nsites = phase[0].length;
				int phasedSiteCount = 0;
				for (int s = 0; s < nsites; s++) {
					if (phase[0][s] != N) {
						phasedSiteCount++;
					}
					
				}
				System.out.printf("%d sites phased for %s\n", phasedSiteCount, parent);
				if (phasedSiteCount < minNumberPhasedSites) {
					System.out.println("Too few sites phased, skipping.");
				} else {
					phasedParents.put(parent, phase);
				}
			}

		}
		
		SelfedHaplotypeFinder.serializePhasedHaplotypes(phasedParents, outhapsCrossHighCover);
		System.out.println("Finished phasing and storing haplotypes.");

	}
	
	public byte[][] phaseParentUsingSelfAndCrossProgeny(String parent, GenotypeTable myGeno, double minEigenRatio, List plotInfo) {
		//set the min chisquare statistic
		double alpha = 0.0001;
		double chisqLimit = new ChiSquaredDistribution(1).inverseCumulativeProbability(1 - alpha);
		
		List phasedHaplotypes = new ArrayList<>();
		int parentIndex = myGeno.taxa().indexOf(parent);
		byte[] parentGeno = myGeno.genotypeAllSites(parentIndex);
		
		byte[][] phasedParent = new byte[2][];
		for (int i = 0; i < 2; i++) {
			phasedParent[i] = new byte[myGeno.numberOfSites()];
			Arrays.fill(phasedParent[i], N);
		}
		
		for (String[] plot : plotInfo) {
			if (plot[1].equals(parent) || plot[2].equals(parent)) {
				String otherParent;
				if (plot[1].equals(parent)) otherParent = plot[2];
				else otherParent = plot[1];
				System.out.println("phasing " + plot[0]);
				byte[][] haps = phaseParentUsingOneProgeny(parent, otherParent, plot[0], myGeno);
				phasedHaplotypes.add(haps[0]);
				phasedHaplotypes.add(haps[1]);
			}
		}
		
		if (phasedHaplotypes.size() < 10) return null;

		//need to decide how to handle IBD segments (minimize xo's?)
		//initially, only polymorphic sites are being phased, which will eliminate IBD segments
		//adding monomorphic sites back in will capture those
		//have to do this by chromosome
		int[] chrstart = myGeno.chromosomesOffsets();
		int[] chrend = new int[10];
		System.arraycopy(chrstart, 1, chrend, 0, 9);
		chrend[9] = myGeno.numberOfSites();
		int window = 40; //number of sites with sufficient coverage & polymorphic
		int minWindow = 20;
		int minPresent = 4;
		List monomorphicSites = new ArrayList<>();
		for (int c = 0; c < 10; c++) { 
			int s = chrstart[c];
			List prevHapIndices1 = null;
			List prevHapIndices2 = null;
			int[] previousSiteIndex = null;
			boolean isPreviousHapValid = false;
			while (s < chrend[c]) {
				//index of polymorphic sites
				int[] siteIndex = new int[window];
				int indexCount = 0;
				
				//this while segment finds the next window polymorphic loci
				//stores the sites in siteIndex. indexCount is the number found, which is < window only at the end of the chromosome
				int startS = s;
				while (s < chrend[c] && indexCount < window) {
					int[] alleleCounts = countAllelesAtSite(phasedHaplotypes, s);
					int npresent = Arrays.stream(alleleCounts).sum();
					if (npresent > minPresent) {
						
						//generate sorted Index
						List index = IntStream.range(0, 6).boxed().collect(Collectors.toList());
						Collections.sort(index, (a,b) -> alleleCounts[a] >= alleleCounts[b] ? -1 : 1);
						
						//test for polymorphism (minor allele count * 4 > major allele count)
						int mult = 10; //was 4 for phase 1, but changing to 10 in phase2 teosinte
						if (alleleCounts[index.get(1)] > 1 && alleleCounts[index.get(1)] * mult > alleleCounts[index.get(0)]) {
							siteIndex[indexCount++] = s;
						}
						
						//test for monomorphism
						if (alleleCounts[index.get(0)] > 20 && alleleCounts[index.get(1)] == 0 ) {
							monomorphicSites.add(new int[]{s, index.get(0)});
						}
					}
					s++;
				}
				
				int nSitesScanned = s - startS;
				
				//if indexCount is too small, create a window by adding the new sites on the end of the previous window
				if (indexCount < minWindow) {
					if (!isPreviousHapValid) {
						//at end of chromosome and no valid window has been found. Probably, the entire chromosome is homozygous.
						System.out.printf("No windows phased in chromosome %d%n", c + 1);
						continue; 
					}
					int combinedCount = previousSiteIndex.length + indexCount;
					int[] combinedIndex = new int[combinedCount];
					System.arraycopy(previousSiteIndex, 0, combinedIndex, 0, previousSiteIndex.length);
					System.arraycopy(siteIndex, 0, combinedIndex, previousSiteIndex.length, indexCount);
					siteIndex = combinedIndex;
					indexCount = siteIndex.length;
				}
				
				//create a list of these segments
				List seglist = new ArrayList<>();
				for (byte[] haps : phasedHaplotypes) {
					byte[] seg = new byte[indexCount];
					for (int i = 0; i < indexCount; i++) seg[i] = haps[siteIndex[i]];
					seglist.add(seg);
				}
				
				//calculate pairwise distances as mismatch proportion
				double[][] dist = mismatchDistanceMatrix(seglist);
				List dummyTaxa = IntStream.range(0,  dist.length).mapToObj(i -> new Taxon(Integer.toString(i))).collect(Collectors.toList());
				DistanceMatrix dm = new DistanceMatrix(dist, new TaxaListBuilder().addAll(dummyTaxa).build());
				
				//test for single haplotype
				ClassicMds mds = new ClassicMds(dm);
				System.out.printf("Eigenvalue ratio = %1.3f, Eigenvalues: %1.3f, %1.3f, %1.3f, %1.3f\n", mds.getEigenvalue(0)/mds.getEigenvalue(1), mds.getEigenvalue(0), mds.getEigenvalue(1), mds.getEigenvalue(2), mds.getEigenvalue(3));
				System.out.printf("%d sites scanned to generate this interval\n", nSitesScanned);
				if (mds.getEigenvalue(0)/mds.getEigenvalue(1) < minEigenRatio) continue; //skip this interval
				
				UPGMATree myTree = new UPGMATree(dm);
				TreeClusters myClusters = new TreeClusters(myTree);
				int[] myGroups = myClusters.getGroups(2);
				
				//make two groups
				List hapIndices1 = new ArrayList<>();
				List hapIndices2 = new ArrayList<>();
				int n = myGroups.length;
				for (int i = 0; i < n; i++) {
					String name = myTree.getExternalNode(i).getIdentifier().getName();
					if (myGroups[i] == 0) hapIndices1.add(Integer.parseInt(name));
					else hapIndices2.add(Integer.parseInt(name));
				}
				
				//keep track of which haplotypes are assigned to which windows
				boolean reverseHaps;
				if (prevHapIndices1 == null) reverseHaps = false;
				else {
					int[][] matches = new int[2][2];
					matches[0][0] = countSharedMembers(hapIndices1, prevHapIndices1);
					matches[0][1] = countSharedMembers(hapIndices1, prevHapIndices2);
					matches[1][0] = countSharedMembers(hapIndices2, prevHapIndices1);
					matches[1][1] = countSharedMembers(hapIndices2, prevHapIndices2);
					int mainDiagSum = matches[0][0] + matches[1][1];
					int offDiagSum = matches[0][1] + matches[1][0];
					if (mainDiagSum > 2 * offDiagSum) {
						reverseHaps = false;
						isPreviousHapValid = true;
					}
					else if (offDiagSum > 2 * mainDiagSum) {
						reverseHaps = true;
						isPreviousHapValid = true;
					}
					else {
						//if the previous haplotype is the first one, it has not been validated (by making sure that it can be matched to the adjacent haplotype)
						//if the previous haplotype has not been validated, invalidate it (do not use it)
						if (!isPreviousHapValid) {
							//set previous Hap sites to missing
							for (int prevSite : previousSiteIndex) {
								phasedParent[0][prevSite] = N;
								phasedParent[1][prevSite] = N;
							}
							reverseHaps = false;
						} else {
							continue; //do not process this interval but go on to next
						}
						
					}
					System.out.printf("haplotype matches at chr %d, site %d: %d, %d, %d, %d, reverse = %b\n", c + 1, s, matches[0][0],matches[0][1],matches[1][0],matches[1][1], reverseHaps);
				}

				System.out.printf("hap list 1 has %d members, 2 has %d members (chr %d, site %d)\n", hapIndices1.size(), hapIndices2.size(), c + 1, s);
				List hapList1, hapList2;
				if (reverseHaps) {
					hapList1 = hapIndices2.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
					hapList2 = hapIndices1.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
				} else {
					hapList1 = hapIndices1.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
					hapList2 = hapIndices2.stream().map(I -> seglist.get(I)).collect(Collectors.toList());
				}
				
				System.out.println(haplotypeAsString(consensusHaplotype(hapList1)));
				System.out.println(haplotypeAsString(consensusHaplotype(hapList2)));

				//update phasedHaplotypes with sites that segregate with haplotypes
				//update phasedHaplotypes[0] with hapList1, phasedHaplotypes[1] with hapList2
				for (int i = 0; i < indexCount; i++) {
					int[] alleleCount1 = countAllelesAtSite(hapList1, i);
					int[] alleleCount2 = countAllelesAtSite(hapList2, i);
					long[][] counts = new long[2][2];
					int which = 0;
					for (int j = 0; j < 4; j++) {
						if (alleleCount1[j] > 0 || alleleCount2[j] > 0) {
							if (which == 2) {
								System.out.printf("Site %d has more than 2 alleles\n", siteIndex[i]);
								break;
							}
							counts[0][which] = alleleCount1[j];
							counts[1][which] = alleleCount2[j];
							which++;
						}
					}
					
					double[] ratio = new double[]{counts[0][0] / (double) counts[0][1], counts[1][0] / (double) counts[1][1]};
					if ((ratio[0] >= 2 && ratio[1] <= 0.5) || (ratio[1] >= 2 && ratio[0] <= 0.5)) {
						double testval = chisqTest.chiSquare(counts);
						if (testval >= chisqLimit) { //update haplotypes with this site
							phasedParent[0][siteIndex[i]] = maxAllele(alleleCount1);
							phasedParent[1][siteIndex[i]] = maxAllele(alleleCount2);
						}
					}
					
				}
				
				if (reverseHaps) {
					prevHapIndices1 = hapIndices2;
					prevHapIndices2 = hapIndices1;
				} else {
					prevHapIndices1 = hapIndices1;
					prevHapIndices2 = hapIndices2;
				}
				previousSiteIndex = siteIndex;
			}
		}
		
		//If there are not at least 1500 phased polymorphic sites, return null
		int siteCount = 0;
		int nsites = myGeno.numberOfSites();
		for (int s = 0; s < nsites; s++) {
			if (phasedParent[0][s] != N && phasedParent[1][s] != N) siteCount++;
		}
		System.out.printf("There were %d polymorphic sites phased for %s\n", siteCount, parent);
		if (siteCount < 1500) return null;
		
		//add in the monomorphic sites
		for (int[] site : monomorphicSites) {
			phasedParent[0][site[0]] = (byte) site[1];
			phasedParent[1][site[0]] = (byte) site[1];
		}
		return phasedParent;
	}
	
	private byte maxAllele(int[] alleleCounts) {
		int max = 0;
		for (int i = 1; i < alleleCounts.length; i++) {
			if (alleleCounts[i] > alleleCounts[max]) max = i;
		}
		return (byte) max;
	}
	
	private int countSharedMembers(List list1, List list2) {
		List refList = new ArrayList(list1);
		Collections.sort(refList);
		int count = 0;
		for (Integer I : list2) 
			if (Collections.binarySearch(refList, I) > -1) count++;
		return count;
	}
	
	private byte[] consensusHaplotype(List haplotypes) {
		int nsites = haplotypes.get(0).length;
		byte[] consensus = new byte[nsites];
		for (int i = 0; i < nsites; i++) {
			int[] alleleCounts = new int[6];
			for (byte[] hap:haplotypes) {
				if (hap[i] < 6) alleleCounts[hap[i]]++;
			}
			int ndx = 0;
			for (int j = 1; j < 6; j++) {
				if (alleleCounts[j] > alleleCounts[ndx]) ndx = j;
			}
			consensus[i] = (byte) ndx;
		}
		return consensus;
	}
	
	private String haplotypeAsString(byte[] hap) {
		StringBuilder sb = new StringBuilder();
		for (byte b:hap) sb.append(NucleotideAlignmentConstants.getHaplotypeNucleotide(b));
		return sb.toString();
	}
	
	private int[] countAllelesAtSite(List haps, int site) {
		int[] alleleCounts = new int[6];
		for (byte[] hap:haps) {
			if (hap[site] < 6)  alleleCounts[hap[site]]++;
		}
		return alleleCounts;
	}
	
	public byte[][] phaseParentUsingOneProgeny(String parent, String otherParent, String progeny, GenotypeTable gt) {
		//phase only at sites with sufficient depth for parents and progeny (all hets are okay, homozygotes with depth >= minDepth)
		int minDepth = 7;
		int nsites = gt.numberOfSites();
		byte[][] phasedGenotype = new byte[2][nsites];
		Arrays.fill(phasedGenotype[0], N);
		Arrays.fill(phasedGenotype[1], N);
		
		int parentIndex = gt.taxa().indexOf(parent);
		int otherParentIndex = gt.taxa().indexOf(otherParent);
		int progenyIndex = gt.taxa().indexOf(progeny);
		
		for (int s = 0; s < nsites; s++) {
			byte parentGenotype = gt.genotype(parentIndex, s);
			byte otherParentGenotype = gt.genotype(otherParentIndex, s);
			byte progenyGenotype = gt.genotype(progenyIndex, s);
			
			boolean parentHomozygous = !GenotypeTableUtils.isHeterozygous(parentGenotype) && gt.depth().depth(parentIndex, s) >= minDepth;
			boolean otherParentHomozygous = !GenotypeTableUtils.isHeterozygous(otherParentGenotype) && gt.depth().depth(otherParentIndex, s) >= minDepth;
			boolean progenyHomozygous = !GenotypeTableUtils.isHeterozygous(progenyGenotype) && gt.depth().depth(progenyIndex, s) >= minDepth;
			
			//1. if parent is homozygous (minDepth) then haplotype = parent allele
			//2. if progeny is homozygous (minDepth) then haplotype = progeny allele
			//3. if other parent is homozygous (minDepth) then
			//3a. if progeny is not homozygous other parent then parent is opposite allele of other parent
			//otherwise haplotype == N
			
			if (parentHomozygous) {
				phasedGenotype[0][s] = phasedGenotype[1][s] = GenotypeTableUtils.getDiploidValues(parentGenotype)[0];
			} else if (progenyHomozygous) {
				phasedGenotype[0][s] = GenotypeTableUtils.getDiploidValues(progenyGenotype)[0];
				if (parentGenotype != NN) {
					byte progenyAllele = GenotypeTableUtils.getDiploidValues(progenyGenotype)[0];
					byte[] parentAlleles = GenotypeTableUtils.getDiploidValues(parentGenotype);
					if (parentAlleles[0] != progenyAllele) phasedGenotype[1][s] = parentAlleles[0];
					else if (parentAlleles[1] != progenyAllele) phasedGenotype[1][s] = parentAlleles[1];
				}
			} else if (otherParentHomozygous) {
				byte[] progenyAlleles = GenotypeTableUtils.getDiploidValues(progenyGenotype);
				byte otherAllele = GenotypeTableUtils.getDiploidValues(otherParentGenotype)[0];
				if (progenyAlleles[0] != otherAllele || progenyAlleles[1] != otherAllele) {
					if (progenyAlleles[0] != otherAllele) phasedGenotype[0][s] = progenyAlleles[0];
					else phasedGenotype[0][s] = progenyAlleles[1];
					byte[] parentAlleles = GenotypeTableUtils.getDiploidValues(parentGenotype);
					if (parentAlleles[0] != phasedGenotype[0][s]) phasedGenotype[1][s] = parentAlleles[0];
					else if (parentAlleles[1] != phasedGenotype[0][s]) phasedGenotype[1][s] = parentAlleles[1];
				}
			} 
		}
		
		return phasedGenotype;
	}
	
	private double[][] mismatchDistanceMatrix(List segments) {
		int n = segments.size();
		double[][] dist = new double[n][n];
		for (int i = 0; i < n - 1; i++) {
			byte[] seg1 = segments.get(i);
			for (int j = i + 1; j < n; j++) {
				byte[] seg2 = segments.get(j);
				int notMissingCount = 0;
				int notMatchCount = 0;
				for (int k = 0; k < seg1.length; k++) {
					if (seg1[k] != N && seg2[k] != N) {
						notMissingCount++;
						if ((seg1[k] != seg2[k])) notMatchCount++;
					}
				}
				if (notMissingCount > 0) {
					dist[i][j] = dist[j][i] = notMatchCount / (double) notMissingCount;
				}
			}
		}
		return dist;
	}

	private double averageDistanceToCluster(double[][] distance, List cluster, int ndx) {
		int n = cluster.size();
		double total = 0;
		for (int  i = 0; i < n; i++) {
			total += distance[cluster.get(i)][ndx];
		}
		return total / (double) n;
	}
	
	private void saveSeglist(List seglist, String filename) {
		try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(filename))) {
			for (byte[] seg : seglist) {
				for (byte b : seg) bw.write(NucleotideAlignmentConstants.getHaplotypeNucleotide(b));
			}
			bw.write("\n");
		} catch(IOException e) {
			e.printStackTrace();
		}
	}
	
	private void saveDistanceMatrix(DistanceMatrix dm, String filename) {
		int n = dm.numberOfTaxa();
		try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(filename))) {
			for (int i = 0; i < n; i++) {
				for (int j = 0; j < n; j++) {
					bw.write(String.format("%1.5f ", dm.getDistance(i, j)));
				}
			}
		} catch(IOException e) {
			e.printStackTrace();
		}

	}
	
	public void setOuthapsAllProgeny(String filename) {
		outhapsAllProgeny = Paths.get(filename);
	}
	
	public void setParentage(String filename) {
		parentagePath = Paths.get(filename);
	}
	
	public void setGenotypeTable(GenotypeTable genotype) {
		myGenotypeTable = genotype;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy