All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.imputation.FILLINImputationPlugin Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate trait associations, evolutionary patterns, and linkage disequilibrium.

There is a newer version: 5.2.94
Show newest version
package net.maizegenetics.analysis.imputation;

import com.google.common.primitives.Bytes;
import com.google.common.primitives.Ints;
import net.maizegenetics.analysis.popgen.DonorHypoth;
import net.maizegenetics.dna.map.DonorHaplotypes;
import net.maizegenetics.dna.snp.*;
import net.maizegenetics.dna.snp.io.ProjectionGenotypeIO;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.OpenBitSet;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;

import javax.swing.*;
import java.awt.*;
import java.io.File;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
import static net.maizegenetics.dna.WHICH_ALLELE.Major;
import static net.maizegenetics.dna.WHICH_ALLELE.Minor;
import static net.maizegenetics.dna.snp.GenotypeTableUtils.isHeterozygous;
import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;



/**
 * FILLIN imputation relies on a library of haplotypes and uses nearest neighbor searches
 * followed by HMM Viterbi resolution or block-based resolution.
 * It is the best approach for substantially unrelated taxa in TASSEL.  BEAGLE4 is a better
 * approach currently for landraces, while FILLIN outperforms if there is a good reference
 * set of haplotypes.
 * 

 * <p>The algorithm relies on having a series of donor haplotypes. If phased haplotypes are
 * already known, they can be used directly. If they need to be reconstructed, the
 * {@link FindMergeHaplotypesPlugin} can be used to create haplotypes for windows
 * across the genome.</p>

 * <p>Imputation is done one taxon at a time using the donor haplotypes. The strategy is as
 * follows:</p>
  • Every 64 sites is considered a block (or a word in the bitset terminology — the length of a long). There is
    a separate initial search for nearest neighbors centered around each block. The flanks
    of the scan window vary but always contain the number of minor alleles specified (a key parameter).
    Minor alleles approximate the information content of the search.
  • Calculate the distance between the target taxon and the donor haplotypes for every window, and rank
    the top 10 donor haplotypes for each 64-site focus block.
  • Evaluate by Viterbi whether 1 or 2 haplotypes will explain all the sites across the
    donor haplotype window. If successful, move to the next region for donor haplotypes.
  • Resolve each focus block by nearest neighbors. If inbred NN are not close enough,
    then do a hybrid search seeded with one parent being the initial 10 haplotypes.
    Set bases based on whichever is resolved first, inbred or hybrid.

    * Error rates are bounded away from zero by adding 0.5 error to all error
    * rates that were observed to be zero.

    * @author Edward Buckler * @author Kelly Swarts * @cite */ //@Citation("Two papers: Viterbi from Bradbury, et al (in prep) Recombination patterns in maize\n"+ // "NearestNeighborSearch Swarts,...,Buckler (in prep) Imputation with large genotyping by sequencing data\n") public class FILLINImputationPlugin extends AbstractPlugin { private int minMajorRatioToMinorCnt=10; //refinement of minMinorCnt to account for regions with all major //Plugin parameters private PluginParameter hmpFile= new PluginParameter.Builder<>("hmp",null,String.class).guiName("Target file").inFile().required(true) .description("Input HapMap file of target genotypes to impute. Accepts all file types supported by TASSEL5. This file should include _ALL_ available sites, not " + "just those segregating in your material. (ie: don't filter the input)").build(); private PluginParameter donorFile= new PluginParameter.Builder<>("d",null,String.class).guiName("Donor").required(true) .description("Directory containing donor haplotype files from output of FILLINFindHaplotypesPlugin. All files with '.gc' in the filename will be read in, " + "only those with matching sites are used. 
Alternately, a single file to use as a donor, will be cut into sub genos in size specified (eg, high density" + "SNP file for projection").build(); private PluginParameter outFileBase= new PluginParameter.Builder<>("o",null,String.class).guiName("Output filename").outFile().required(true) .description("Output file; hmp.txt.gz and .hmp.h5 accepted.").build(); private PluginParameter appoxSitesPerDonorGenotypeTable= new PluginParameter.Builder<>("hapSize",8000,Integer.class).guiName("Preferred haplotype size") .description("Preferred haplotype block size in sites (use same as in FILLINFindHaplotypesPlugin)").build(); private PluginParameter hetThresh= new PluginParameter.Builder<>("mxHet",0.02,Double.class).guiName("Heterozygosity threshold") .description("Threshold per taxon heterozygosity for treating taxon as heterozygous (no Viterbi, het thresholds).").build(); private PluginParameter maximumInbredError= new PluginParameter.Builder<>("mxInbErr",0.01,Double.class).guiName("Max error to impute one donor") .description("Maximum error rate for applying one haplotype to entire site window").build(); private PluginParameter maxHybridErrorRate= new PluginParameter.Builder<>("mxHybErr",0.003,Double.class).guiName("Max combined error to impute two donors") .description("Maximum error rate for applying Viterbi with two haplotypes to entire site window").build(); private PluginParameter minTestSites= new PluginParameter.Builder<>("mnTestSite",20,Integer.class).guiName("Min sites to test match") .description("Minimum number of sites to test for IBS between haplotype and target in focus block").build(); private PluginParameter minMinorCnt= new PluginParameter.Builder<>("minMnCnt",20,Integer.class).guiName("Min num of minor alleles to compare") .description("Minimum number of informative minor alleles in the search window (or "+minMajorRatioToMinorCnt+"X major)").build(); private PluginParameter maxDonorHypotheses= new 
PluginParameter.Builder<>("mxDonH",20,Integer.class).guiName("Max donor hypotheses") .description("Maximum number of donor hypotheses to be explored. (Note: For heterozygous samples you may want to set this number higher, but the computation load goes up quickly.)").build(); private PluginParameter imputeAllHets= new PluginParameter.Builder<>("impAllHets",false,Boolean.class).guiName("Impute all het calls") .description("Write all imputed heterozygous calls as such, even if the original file has a homozygous call. (Not recommended for inbred lines.)").build(); //if true, uses combination mode in focus block, else set don't impute (default is true) private PluginParameter hybridNN= new PluginParameter.Builder<>("hybNN",true,Boolean.class).guiName("Combine two haplotypes as heterozygote") .description("If true, uses combination mode in focus block, else does not impute").build(); private PluginParameter isOutputProjection= new PluginParameter.Builder<>("ProjA",false,Boolean.class).guiName("Output projection alignment") .description("Create a projection alignment for high density markers").build(); private PluginParameter imputeDonorFile= new PluginParameter.Builder<>("impDonor",false,Boolean.class).guiName("Impute donor file") .description("Impute the donor file itself").build(); private PluginParameter nonverboseOutput= new PluginParameter.Builder<>("nV",false,Boolean.class).guiName("Supress system out") .description("Supress system out").build(); //for calculating accuracy private PluginParameter accuracy= new PluginParameter.Builder<>("accuracy",false,Boolean.class).guiName("Calculate accuracy") .description("Masks input file before imputation and calculates accuracy based on masked genotypes").build(); private PluginParameter propSitesMask= new PluginParameter.Builder<>("propSitesMask",0.01,Double.class).guiName("Proportion of genotypes to mask if no depth") .description("Proportion of genotypes to mask for accuracy calculation if depth not 
available").dependentOnParameter(accuracy).build(); private PluginParameter depthToMask= new PluginParameter.Builder<>("depthMask",9,Integer.class).guiName("Depth of genotypes to mask") .description("Depth of genotypes to mask for accuracy calculation if depth information available").dependentOnParameter(accuracy).build(); private PluginParameter propDepthSitesMask= new PluginParameter.Builder<>("propDepthSitesMask",0.2,Double.class).guiName("Proportion of depth genotypes to mask") .description("Proportion of genotypes of given depth to mask for accuracy calculation if depth available").dependentOnParameter(accuracy).build(); private PluginParameter maskKey= new PluginParameter.Builder<>("maskKey",null,String.class).inFile().required(false).guiName("Optional key to calculate accuracy") .description("Key to calculate accuracy. Genotypes missing (masked) in target file should be present in key, with all other sites set to missing. Overrides other" + "accuracy options if present and all sites and taxa present in target file present").dependentOnParameter(accuracy).build(); private PluginParameter byMAF= new PluginParameter.Builder<>("byMAF",false,Boolean.class).guiName("Calculate accuracy within MAF categories") .description("Calculate R2 accuracy within MAF categories based on donor file").dependentOnParameter(accuracy).build(); //Additional variables private boolean verboseOutput= true; private boolean twoWayViterbi= true;//if true, the viterbi runs in both directions (the longest path length wins, if inconsistencies) private double minimumDonorDistance=maximumInbredError.value()*5; //used to prevent Viterbi errors with too similar sequences private double maxNonMedelian=maximumInbredError.value()*5; //if used avoid Viterbi if too many sites are non-Mendelian //options for focus blocks private double maxHybridErrFocusHomo= .3333*maxHybridErrorRate.value();////max error rate for discrepacy between two haplotypes for the focus block. 
it's default is higher because calculating for fewer sites private double maxInbredErrFocusHomo= .3*maximumInbredError.value();//.003; private double maxSmashErrFocusHomo= maximumInbredError.value();//.01; private double maxInbredErrFocusHet= .1*maximumInbredError.value();//.001;//the error rate for imputing one haplotype in focus block for a het taxon private double maxSmashErrFocusHet= maximumInbredError.value();//.01; public static GenotypeTable unimpAlign; //the unimputed alignment to be imputed, unphased public static GenotypeTable maskKeyAlign= null; //the key to the unimputed alignment mask public static double[] MAFClass= new double[]{0,.02,.05,.10,.20,.3,.4,.5,1};//must retain 0 and 1 private int testing=0; //level of reporting to stdout //major and minor alleles can be differ between the donor and unimp alignment private boolean isSwapMajorMinor=true; //if swapped try to fix it private boolean resolveHetIfUndercalled=false;//if true, sets genotypes called to a het to a het, even if already a homozygote //initialize the transition matrix (T1) double[][] transition = new double[][] { {.999,.0001,.0003,.0001,.0005}, {.0002,.999,.00005,.00005,.0002}, {.0002,.00005,.999,.00005,.0002}, {.0002,.00005,.00005,.999,.0002}, {.0005,.0001,.0003,.0001,.999} }; //initialize the emission matrix, states (5) in rows, observations (3) in columns double[][] emission = new double[][] { {.998,.001,.001}, {.6,.2,.2}, {.4,.2,.4}, {.2,.2,.6}, {.001,.001,.998} }; FILLINImputationAccuracy acc= null; //holds the accuracy information if accuracy flagged private static final Logger myLogger = Logger.getLogger(FILLINImputationPlugin.class); @Override protected void postProcessParameters() { System.out.println("Calling FILLINImputationPlugin.postProcessParameters()..."); File parent = new File(outputFilename()).getParentFile(); if ((parent != null) && (!parent.exists())) { parent.mkdirs(); } minimumDonorDistance=maximumInbredError.value()*5; maxNonMedelian=maximumInbredError.value()*5; 
maxInbredErrFocusHomo= .3*maximumInbredError.value();//.003; maxSmashErrFocusHomo= maximumInbredError.value();//.01; maxInbredErrFocusHet= .1*maximumInbredError.value();//.001;//the error rate for imputing one haplotype in focus block for a het taxon maxSmashErrFocusHet= maximumInbredError.value();//.01; maxHybridErrFocusHomo= .3333*maxHybridErrorRate.value(); resolveHetIfUndercalled = imputeAllHets.value(); if (nonverboseOutput.value()) verboseOutput= false; if (byMAF.value()==false) MAFClass= null; if (!new File(donorFile.value()).exists()) System.out.println("Donor is not a file or folder: "+donorFile.value()); } public FILLINImputationPlugin() { super(null, false); } public FILLINImputationPlugin(Frame parentFrame, boolean isInteractive) { super(parentFrame, isInteractive); } @Override public String getCitation() { return "Swarts K, Li H, Romero Navarro JA, An D, Romay MC, Hearne S, Acharya C, Glaubitz JC, Mitchell S, Elshire RJ, Buckler ES, Bradbury PJ " + "(2014) " + "Novel methods to optimize genotypic imputation for low-coverage, next-generation sequence data in crop plants. " + "Plant Genome 7(3). 
doi:10.3835/plantgenome2014.05.0023"; } @Override public DataSet processData(DataSet input) { long time=System.currentTimeMillis(); String donor= donorFile.value(); unimpAlign=ImportUtils.read(hmpFile.value()); if (isOutputProjection.value()) { unimpAlign= FILLINDonorGenotypeUtils.RemoveSitesThatDoNotMatchMinMaj(donorFile.value(), unimpAlign,verboseOutput); donor= donor.subSequence(0, donor.lastIndexOf("."))+".matchMinMaj.hmp.txt.gz"; } GenotypeTable[] donorAlign=FILLINDonorGenotypeUtils.loadDonors(donor, unimpAlign, minTestSites.value(), verboseOutput,appoxSitesPerDonorGenotypeTable.value()); if (accuracy.value()) { time= System.currentTimeMillis()-time; //holds the time so far if (maskKey.value()!=null) maskKeyAlign= ImportUtils.readGuessFormat(maskKey.value()); acc= new FILLINImputationAccuracy(unimpAlign,maskKeyAlign,donorAlign,propSitesMask.value(),depthToMask.value(), propDepthSitesMask.value(),outFileBase.value(),MAFClass,verboseOutput); unimpAlign= acc.initiateAccuracy(); time= System.currentTimeMillis()-time;//restarts the time, including the time to load donors and target but not including the time to set up accuracy } OpenBitSet[][] conflictMasks=FILLINDonorGenotypeUtils.createMaskForAlignmentConflicts(unimpAlign, donorAlign, verboseOutput); System.out.printf("Unimputed taxa:%d sites:%d %n",unimpAlign.numberOfTaxa(),unimpAlign.numberOfSites()); System.out.println("Creating Export GenotypeTable:"+outFileBase.value()); Object mna; if(isOutputProjection.value()) { mna=new ProjectionBuilder(ImportUtils.readGuessFormat(donorFile.value())); } else { if(outFileBase.value().contains(".h5")) { mna= GenotypeTableBuilder.getTaxaIncremental(this.unimpAlign.positions(),outFileBase.value()); }else { mna= GenotypeTableBuilder.getTaxaIncremental(this.unimpAlign.positions()); } } int numThreads = Runtime.getRuntime().availableProcessors(); System.out.println("Time to read in files and generate masks: "+((System.currentTimeMillis()-time)/1000)+" sec"); ExecutorService 
pool = Executors.newFixedThreadPool(numThreads); for (int taxon = 0; taxon < unimpAlign.numberOfTaxa(); taxon+=1) { int[] trackBlockNN= new int[5];//global variable to track number of focus blocks solved in NN search for system out; index 0 is inbred, 1 is viterbi, 2 is smash, 3 is not solved, 4 is total for all modes ImputeOneTaxon theTaxon= (((double)unimpAlign.heterozygousCountForTaxon(taxon)/(double)unimpAlign.totalNonMissingForTaxon(taxon))minSitesPresent); int countFullLength=0; int countByFocus= 0; for (int da = 0; (da < donorAlign.length)&&enoughData ; da++) { int donorOffset=unimpAlign.siteOfPhysicalPosition(donorAlign[da].chromosomalPosition(0), donorAlign[da].chromosome(0)); int blocks=donorAlign[da].allelePresenceForAllSites(0, Major).getNumWords(); BitSet[] maskedTargetBits=FILLINDonorGenotypeUtils.arrangeMajorMinorBtwAlignments(unimpAlign, taxon, donorOffset, donorAlign[da].numberOfSites(), conflictMasks[da][0], conflictMasks[da][1], isSwapMajorMinor); //if imputing the donor file, these donor indices prevent self imputation int[] donorIndices; if(imputeDonorFile){ donorIndices=new int[donorAlign[da].numberOfTaxa()-1]; for (int i = 0; i < donorIndices.length; i++) {donorIndices[i]=i; if(i>=taxon) donorIndices[i]++;} } else { donorIndices=new int[donorAlign[da].numberOfTaxa()]; for (int i = 0; i < donorIndices.length; i++) {donorIndices[i]=i;} } //Finds the best haplotype donors for each focus block within a donorGenotypeTable DonorHypoth[][] regionHypthInbred=new DonorHypoth[blocks][maxDonorHypotheses.value()]; byte[][][] targetToDonorDistances=FILLINImputationUtils.calcAllelePresenceCountsBtwTargetAndDonors(maskedTargetBits, donorAlign[da]); for (int focusBlock = 0; focusBlock < blocks; focusBlock++) { int[] resultRange=FILLINImputationUtils.getBlockWithMinMinorCount(maskedTargetBits[0].getBits(), maskedTargetBits[1].getBits(), focusBlock, minMinorCnt.value(), minMinorCnt.value()*minMajorRatioToMinorCnt); if(resultRange==null) continue; //no data in 
the focus Block //search for the best inbred donors for a segment regionHypthInbred[focusBlock]=FILLINImputationUtils.findHomozygousDonorHypoth(taxon, resultRange[0], resultRange[2], focusBlock, donorIndices, targetToDonorDistances, minTestSites.value(), maxDonorHypotheses.value()); } impTaxon.setSegmentSolved(false); //tries to solve the entire donorAlign region by Virterbi or Inbred impTaxon=solveEntireDonorRegion(taxon, donorAlign[da], donorOffset, regionHypthInbred, impTaxon, maskedTargetBits, maxHybridErrorRate.value(), targetToDonorDistances); if(impTaxon.isSegmentSolved()) {countFullLength++; continue;} //resorts to solving block by block, first by inbred, then by viterbi, and then by hybrid impTaxon=solveByBlockNearestNeighbor(impTaxon, taxon, donorAlign[da], donorOffset, regionHypthInbred, hybridNN.value(), maskedTargetBits, minMinorCnt.value(), focusInbredErr, focusHybridErr, focusSmashErr, donorIndices, trackBlockNN, hetsMiss); if(impTaxon.isSegmentSolved()) {countByFocus++;} } double totalFocus= (double)trackBlockNN[3]+(double)trackBlockNN[4]; sb.append(String.format("InbredOrViterbi:%d FocusBlock:%d PropFocusInbred:%f PropFocusViterbi:%f PropFocusSmash:%f PropFocusMissing:%f BlocksSolved:%d ", countFullLength, countByFocus, (double)trackBlockNN[0]/totalFocus, (double)trackBlockNN[1]/totalFocus, (double)trackBlockNN[2]/totalFocus, (double)trackBlockNN[3]/totalFocus, impTaxon.getBlocksSolved())); //sb.append(String.format("InbredOrViterbi:%d FocusBlock:%d BlocksSolved:%d ", countFullLength, countByFocus, impTaxon.getBlocksSolved())); int[] unk=FILLINImputationUtils.countUnknownAndHeterozygotes(impTaxon.resolveGeno); sb.append(String.format("Unk:%d PropMissing:%g ", unk[0], (double) unk[0] / (double) impTaxon.getOrigGeno().length)); sb.append(String.format("Het:%d PropHet:%g ", unk[1], (double)unk[1]/(double)impTaxon.getOrigGeno().length)); sb.append(" BreakPoints:"+impTaxon.getBreakPoints().size()); if(!isOutputProjection.value()) { 
alignBuilder.addTaxon(unimpAlign.taxa().get(taxon), impTaxon.resolveGeno); } else { projBuilder.addTaxon(unimpAlign.taxa().get(taxon),impTaxon.getBreakPoints()); } if(verboseOutput) System.out.println(sb.toString()); } } /** * Solve the entire donor alignment window with either one or two haplotypes. Generally, it is solved with the * HMM Viterbi algorithm. The Viterbi algorithm is relatively slow the big choice is how to choose the potential * donors. * * * @param taxon * @param donorAlign * @param donorOffset * @param regionHypoth * @param impT * @param maskedTargetBits * @param maxHybridErrorRate * @return */ private ImputedTaxon solveEntireDonorRegion(int taxon, GenotypeTable donorAlign, int donorOffset, DonorHypoth[][] regionHypoth, ImputedTaxon impT, BitSet[] maskedTargetBits, double maxHybridErrorRate, byte[][][] targetToDonorDistances) { int blocks=maskedTargetBits[0].getNumWords(); if(testing==1) System.out.println("Starting complete hybrid search"); //create a list of the best donors based on showing up frequently high in many focus blocks //in the test data set the best results achieved with on the best hypothesis recovered. 
//todo consider whether to use this approach // int[] d=FILLINImputationUtils.mostFrequentDonorsAcrossFocusBlocks(regionHypoth, maxDonorHypotheses); //Alternative test is find best donors int[] d=FILLINImputationUtils.bestDonorsAcrossEntireRegion(targetToDonorDistances, minTestSites.value(),maxDonorHypotheses.value()); int[] testList=FILLINImputationUtils.fillInc(0,donorAlign.numberOfTaxa()-1); int[] bestDonorList=Arrays.copyOfRange(d,0,Math.min(d.length,5)); DonorHypoth[] bestDBasedOnBest=FILLINImputationUtils.findHeterozygousDonorHypoth(taxon, maskedTargetBits[0].getBits(), maskedTargetBits[1].getBits(), 0, blocks-1, blocks/2, donorAlign, bestDonorList, testList, maxDonorHypotheses.value(), minTestSites.value()); //make all combinations of best donor and find the the pairs that minimize errors //with the true switch also will make inbreds DonorHypoth[] best2Dsearchdonors=FILLINImputationUtils.findHeterozygousDonorHypoth(taxon, maskedTargetBits[0].getBits(), maskedTargetBits[1].getBits(), 0, blocks-1, blocks/2, donorAlign, d, d, maxDonorHypotheses.value(), minTestSites.value()); DonorHypoth[] best2donors=FILLINImputationUtils.combineDonorHypothArrays(maxDonorHypotheses.value(),bestDBasedOnBest,best2Dsearchdonors); if(testing==1) System.out.println(Arrays.toString(best2donors)); ArrayList goodDH=new ArrayList(); for (DonorHypoth dh : best2donors) { if(dh==null) continue; if(dh.isInbred() && (dh.getErrorRate() goodDH= new ArrayList(); //KLS0201 if (best2donors[0].getErrorRate()=donorAlign.numberOfSites())?donorAlign.numberOfSites()-1: dh.endSite; int sites=endSite-dh.startSite+1; //Find the informative sites StatePositionChain informative=createInformativeStateChainForViterbi(dh.startSite, maxNonMedelian, minimumDonorDistance, unimpAlign.genotypeRange(dh.targetTaxon, dh.startSite+donorOffset, endSite+1+donorOffset), donorAlign.genotypeRange(dh.donor1Taxon, dh.startSite, endSite+1), donorAlign.genotypeRange(dh.donor2Taxon, dh.startSite, endSite+1)); 
if(informative==null) return null; //Find the most likely states by Virterbi int chrlength = donorAlign.chromosomalPosition(endSite) - donorAlign.chromosomalPosition(dh.startSite); byte[] callsF=callsFromViterbi(trans, chrlength/sites, informative); DonorHypoth dh2=new DonorHypoth(dh.targetTaxon,dh.donor1Taxon, dh.donor2Taxon, dh.startBlock, dh.focusBlock, dh.endBlock); dh2.phasedResults= callsF; if (forwardReverse) { byte[] callsR=callsFromViterbi(trans, chrlength/sites, StatePositionChain.reverseInstance(informative)); ArrayUtils.reverse(callsR); //These are now in same direction as callsF byte[] callsC=new byte[callsF.length]; System.arraycopy(callsR,0,callsC,0,callsC.length/2); System.arraycopy(callsF,callsC.length/2,callsC,callsC.length/2,callsC.length-callsC.length/2); dh2.phasedResults= callsC; } return dh2; } private byte[] callsFromViterbi(double[][] trans, int avgChrLength, StatePositionChain informative) { //There is no EM optimization of Viterbi, clearly something that could be changed TransitionProbability tpF = new TransitionProbability(); EmissionProbability ep = new EmissionProbability(); tpF.setTransitionProbability(trans); ep.setEmissionProbability(emission); final double probHeterozygous=0.5; final double phom = (1 - probHeterozygous) / 2; final double[] pTrue = new double[]{phom, .25*probHeterozygous ,.5 * probHeterozygous, .25*probHeterozygous, phom}; tpF.setAverageSegmentLength(avgChrLength); tpF.setPositions(informative.informSites); ViterbiAlgorithm vaF = new ViterbiAlgorithm(informative.informStates, tpF, ep, pTrue); vaF.calculate(); byte[] resultStatesF=vaF.getMostProbableStateSequence(); int currPos=0; //converts the informative states back to all states byte[] callsF=new byte[informative.totalSiteCnt]; for(int cs=0; cs nonMissingObs = new ArrayList<>(); ArrayList informSites = new ArrayList<>(); //Selects only the informative sites for the Viterbi algorithm (known & gap free & homozygous in donor) for(int cs=0; csmaxNonMendelian) return 
null; //if the donors are too similar Viterbi performs poorly. Only accepted different donors if((double)donorDifferences/(double)informSites.size() donors= new HashSet<>(); for (int h = 0; h < regionHypth[block].length; h++) { if (regionHypth[block][h]!=null) { donors.add(regionHypth[block][h].donor1Taxon); donors.add(regionHypth[block][h].donor2Taxon); } } if (donors.isEmpty()) return null; return Ints.toArray(donors); } // /** // * Simple algorithm that tests every possible two donor combination to minimize // * the number of unmatched informative alleles. Currently, there is little tie // * breaking, longer matches are favored. // * @param targetTaxon // * @param startBlock // * @param endBlock // * @param focusBlock // * @return int[] array of {donor1, donor2, testSites} sorted by testPropUnmatched // */ // private DonorHypoth[] getBestHybridDonors(int targetTaxon, long[] mjT, long[] mnT, // int startBlock, int endBlock, int focusBlock, GenotypeTable donorAlign, // int[] donor1Indices, int[] donor2Indices, // boolean viterbiSearch) { // TreeMap bestDonors=new TreeMap(); // Set testedDonorPairs=new HashSet<>(); // double lastKeytestPropUnmatched=1.0; // double inc=1e-9; // double[] donorDist; // for (int d1: donor1Indices) { // long[] mj1=donorAlign.allelePresenceForSitesBlock(d1, Major, startBlock, endBlock + 1); // long[] mn1=donorAlign.allelePresenceForSitesBlock(d1, Minor, startBlock, endBlock + 1); // for (int d2 : donor2Indices) { // if((!viterbiSearch)&&(d1==d2)) continue; // if(testedDonorPairs.contains(((long)d1<<32)+(long)d2)) continue; // if(testedDonorPairs.contains(((long)d2<<32)+(long)d1)) continue; // testedDonorPairs.add(((long)d1<<32)+(long)d2); // long[] mj2=donorAlign.allelePresenceForSitesBlock(d2, Major, startBlock, endBlock + 1); // long[] mn2=donorAlign.allelePresenceForSitesBlock(d2, Minor, startBlock, endBlock + 1); // if(viterbiSearch) { // donorDist=IBSDistanceMatrix.computeHetBitDistances(mj1, mn1, mj2, mn2, minTestSites); // 
if((d1!=d2)&&(donorDist[0]maxDonorHypotheses) { // bestDonors.remove(bestDonors.lastKey()); // lastKeytestPropUnmatched=bestDonors.lastKey(); // } // } // } // } // DonorHypoth[] result=new DonorHypoth[maxDonorHypotheses]; // int count=0; // for (DonorHypoth dh : bestDonors.values()) { // result[count]=dh; // count++; // } // return result; // } // // private int[] mendelErrorComparison(long[] mjT, long[] mnT, long[] mj1, long[] mn1, // long[] mj2, long[] mn2) { // int mjUnmatched=0; // int mnUnmatched=0; // int testSites=0; // for (int i = 0; i < mjT.length; i++) { // long siteMask=(mjT[i]|mnT[i])&(mj1[i]|mn1[i])&(mj2[i]|mn2[i]); // mjUnmatched+=Long.bitCount(siteMask&mjT[i]&(mjT[i]^mj1[i])&(mjT[i]^mj2[i])); // mnUnmatched+=Long.bitCount(siteMask&mnT[i]&(mnT[i]^mn1[i])&(mnT[i]^mn2[i])); // testSites+=Long.bitCount(siteMask); // } // int totalMendelianErrors=mjUnmatched+mnUnmatched; // // double testPropUnmatched=(double)(totalMendelianErrors)/(double)testSites; // return (new int[] {totalMendelianErrors, testSites}); // } private ImputedTaxon setAlignmentWithDonors(GenotypeTable donorAlign, DonorHypoth[] theDH, int donorOffset, boolean setJustFocus, ImputedTaxon impT, boolean smashOn, boolean hetsMiss) { if(theDH[0].targetTaxon<0) return impT; boolean print=false; int startSite=(setJustFocus)?theDH[0].getFocusStartSite():theDH[0].startSite; int endSite=(setJustFocus)?theDH[0].getFocusEndSite():theDH[0].endSite; if(endSite>=donorAlign.numberOfSites()) endSite=donorAlign.numberOfSites()-1; //prevDonors is used to track recombination breakpoint start and stop int[] prevDonors=new int[]{-1, -1}; if(theDH[0].getPhaseForSite(startSite)==0) {prevDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor1Taxon};} else if(theDH[0].getPhaseForSite(startSite)==2) {prevDonors=new int[]{theDH[0].donor2Taxon, theDH[0].donor2Taxon};} else if(theDH[0].getPhaseForSite(startSite)==1) {prevDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor2Taxon};} int prevDonorStart=startSite; int[] 
currDonors=prevDonors; for(int cs=startSite; cs<=endSite; cs++) { byte donorEst=UNKNOWN_DIPLOID_ALLELE; byte neighbor=0; //site imputation will try to use all donor hypotheses; breakpoints only use the best (e.g. i==0 tests) for (int i = 0; (i < theDH.length) && (donorEst==UNKNOWN_DIPLOID_ALLELE); i++) { neighbor++; if((theDH[i]==null)||(theDH[i].donor1Taxon<0)) continue; if(theDH[i].getErrorRate()>this.maximumInbredError.value()) continue; byte bD1=donorAlign.genotype(theDH[i].donor1Taxon, cs); if(theDH[i].getPhaseForSite(cs)==0) { donorEst=bD1; if(i==0) currDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor1Taxon}; } else { byte bD2=donorAlign.genotype(theDH[i].donor2Taxon, cs); if(theDH[i].getPhaseForSite(cs)==2) { donorEst=bD2; if(i==0) currDonors=new int[]{theDH[0].donor2Taxon, theDH[0].donor2Taxon}; } else { donorEst=GenotypeTableUtils.getUnphasedDiploidValueNoHets(bD1, bD2); if(i==0) currDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor2Taxon}; } } } byte knownBase=impT.getOrigGeno(cs+donorOffset); //record recombination breakpoint if it changes if(!Arrays.equals(prevDonors, currDonors)) { DonorHaplotypes dhaps=new DonorHaplotypes(donorAlign.chromosome(prevDonorStart), donorAlign.chromosomalPosition(prevDonorStart), donorAlign.chromosomalPosition(cs),prevDonors[0],prevDonors[1]); impT.addBreakPoint(dhaps); prevDonors=currDonors; prevDonorStart=cs; } //chgHis records the section of the imputation pipeline that made the change, Viterbi negative, blockNN positive if(theDH[0].phasedResults==null) {impT.chgHis[cs+donorOffset]=(byte)-neighbor;} else {impT.chgHis[cs+donorOffset]=(byte)neighbor;} impT.impGeno[cs+donorOffset]= donorEst; //predicted based on neighbor //if genotype is unknown or het undercall then resolves //todo there is currently not an option if the predicted disagrees with a homozygous call if(knownBase==UNKNOWN_DIPLOID_ALLELE) { if (isHeterozygous(donorEst)) { if (smashOn && hetsMiss) {//if imputing a heterozygote, just set to missing 
impT.resolveGeno[cs+donorOffset]= knownBase; } else impT.resolveGeno[cs+donorOffset]= donorEst; //if not in hybrid, set to het } else {//if not imputed to a het impT.resolveGeno[cs+donorOffset]= donorEst;} } else if (isHeterozygous(donorEst)){ if(resolveHetIfUndercalled&&GenotypeTableUtils.isPartiallyEqual(knownBase, donorEst)&&smashOn==false){//if smash off, set homozygotes imputed to het to het //System.out.println("ResolveHet:"+theDH[0].targetTaxon+":"+cs+donorOffset+":"+knownBaseString+":"+donorEstString); impT.resolveGeno[cs+donorOffset]= donorEst; } } } //end of cs loop //enter a stop of the DH at the beginning of the next block DonorHaplotypes dhaps=new DonorHaplotypes(donorAlign.chromosome(prevDonorStart), donorAlign.chromosomalPosition(prevDonorStart), donorAlign.chromosomalPosition(endSite),prevDonors[0],prevDonors[1]); impT.addBreakPoint(dhaps); return impT; } @Override public ImageIcon getIcon() { return null; } @Override public String getButtonName() { return "Impute By FILLIN"; } @Override public String getToolTipText() { return "Imputation that relies on a combination of HMM and Nearest Neighbor using donors derived from FILLINFindHaplotypesPlugin"; } // The following getters and setters were auto-generated. // Please use this method to re-generate. // public static void main(String[] args) { GeneratePluginCode.generate(FILLINImputationPlugin.class); } /** * Convenience method to run plugin with one return object. */ /*// TODO: Replace with specific type. public runPlugin(DataSet input) { return () performFunction(input).getData(0).getData(); }*/ /** * Input HapMap file of target genotypes to impute. Accepts * all file types supported by TASSEL5 * * @return Target file */ public String targetFile() { return hmpFile.value(); } /** * Set Target file. Input HapMap file of target genotypes * to impute. 
     * Accepts all file types supported by TASSEL5.
     *
     * @param value Target file
     *
     * @return this plugin
     */
    public FILLINImputationPlugin targetFile(String value) {
        hmpFile = new PluginParameter<>(hmpFile, value);
        return this;
    }

    /**
     * Directory containing donor haplotype files from output of
     * FILLINFindHaplotypesPlugin. All files with '.gc' in the filename
     * will be read in; only those with matching sites are used.
     *
     * @return Donor Dir
     */
    public String donorDir() {
        return donorFile.value();
    }

    /**
     * Set Donor Dir. Directory containing donor haplotype files from
     * output of FILLINFindHaplotypesPlugin.
     *
     * @param value Donor Dir
     *
     * @return this plugin
     */
    public FILLINImputationPlugin donorDir(String value) {
        donorFile = new PluginParameter<>(donorFile, value);
        return this;
    }

    /**
     * Output file; hmp.txt.gz and .hmp.h5 accepted.
     *
     * @return Output filename
     */
    public String outputFilename() {
        return outFileBase.value();
    }

    /**
     * Set Output filename. Output file; hmp.txt.gz and .hmp.h5 accepted.
     *
     * @param value Output filename
     *
     * @return this plugin
     */
    public FILLINImputationPlugin outputFilename(String value) {
        outFileBase = new PluginParameter<>(outFileBase, value);
        return this;
    }

    /**
     * Preferred haplotype block size in sites (use same as in
     * FILLINFindHaplotypesPlugin).
     *
     * @return Preferred haplotype size
     */
    public Integer preferredHaplotypeSize() {
        return appoxSitesPerDonorGenotypeTable.value();
    }

    /**
     * Set Preferred haplotype size. Preferred haplotype block size in
     * sites (use same as in FILLINFindHaplotypesPlugin).
     *
     * @param value Preferred haplotype size
     *
     * @return this plugin
     */
    public FILLINImputationPlugin preferredHaplotypeSize(Integer value) {
        appoxSitesPerDonorGenotypeTable = new PluginParameter<>(appoxSitesPerDonorGenotypeTable, value);
        return this;
    }

    /**
     * Threshold per-taxon heterozygosity for treating a taxon as
     * heterozygous (no Viterbi, het thresholds).
     *
     * @return Heterozygosity threshold
     */
    public Double heterozygosityThreshold() {
        return hetThresh.value();
    }

    /**
     * Set Heterozygosity threshold. Threshold per-taxon heterozygosity
     * for treating a taxon as heterozygous (no Viterbi, het thresholds).
     *
     * @param value Heterozygosity threshold
     *
     * @return this plugin
     */
    public FILLINImputationPlugin heterozygosityThreshold(Double value) {
        hetThresh = new PluginParameter<>(hetThresh, value);
        return this;
    }

    /**
     * Maximum error rate for applying one haplotype to an entire site window.
     *
     * @return Max error to impute one donor
     */
    public Double maxErrorToImputeOneDonor() {
        return maximumInbredError.value();
    }

    /**
     * Set Max error to impute one donor. Maximum error rate for applying
     * one haplotype to an entire site window.
     *
     * @param value Max error to impute one donor
     *
     * @return this plugin
     */
    public FILLINImputationPlugin maxErrorToImputeOneDonor(Double value) {
        maximumInbredError = new PluginParameter<>(maximumInbredError, value);
        return this;
    }

    /**
     * Maximum error rate for applying Viterbi with two haplotypes to an
     * entire site window.
     *
     * @return Max combined error to impute two donors
     */
    public Double maxCombinedErrorToImputeTwoDonors() {
        return maxHybridErrorRate.value();
    }

    /**
     * Set Max combined error to impute two donors. Maximum error rate for
     * applying Viterbi with two haplotypes to an entire site window.
     *
     * @param value Max combined error to impute two donors
     *
     * @return this plugin
     */
    public FILLINImputationPlugin maxCombinedErrorToImputeTwoDonors(Double value) {
        maxHybridErrorRate = new PluginParameter<>(maxHybridErrorRate, value);
        return this;
    }

    /**
     * Minimum number of sites to test for IBS between haplotype and
     * target in a focus block.
     *
     * @return Min sites to test match
     */
    public Integer minSitesToTestMatch() {
        return minTestSites.value();
    }

    /**
     * Set Min sites to test match. Minimum number of sites to test for
     * IBS between haplotype and target in a focus block.
     *
     * @param value Min sites to test match
     *
     * @return this plugin
     */
    public FILLINImputationPlugin minSitesToTestMatch(Integer value) {
        minTestSites = new PluginParameter<>(minTestSites, value);
        return this;
    }

    /**
     * Minimum number of informative minor alleles in the search window
     * (or 10X major).
     *
     * @return Min num of minor alleles to compare
     */
    public Integer minNumOfMinorAllelesToCompare() {
        return minMinorCnt.value();
    }

    /**
     * Set Min num of minor alleles to compare. Minimum number of
     * informative minor alleles in the search window (or 10X major).
     *
     * @param value Min num of minor alleles to compare
     *
     * @return this plugin
     */
    public FILLINImputationPlugin minNumOfMinorAllelesToCompare(Integer value) {
        minMinorCnt = new PluginParameter<>(minMinorCnt, value);
        return this;
    }

    /**
     * Maximum number of donor hypotheses to be explored.
     *
     * @return Max donor hypotheses
     */
    public Integer maxDonorHypotheses() {
        return maxDonorHypotheses.value();
    }

    /**
     * Set Max donor hypotheses. Maximum number of donor hypotheses to be
     * explored.
     *
     * @param value Max donor hypotheses
     *
     * @return this plugin
     */
    public FILLINImputationPlugin maxDonorHypotheses(Integer value) {
        maxDonorHypotheses = new PluginParameter<>(maxDonorHypotheses, value);
        return this;
    }

    /**
     * Write all imputed heterozygous calls as such, even if the original
     * file has a homozygous call. (Not recommended for inbred lines.)
     *
     * @return Impute all het calls
     */
    public Boolean imputeAllHetCalls() {
        return imputeAllHets.value();
    }

    /**
     * Set Impute all het calls. Write all imputed heterozygous calls as
     * such, even if the original file has a homozygous call. (Not
     * recommended for inbred lines.)
     *
     * @param value Impute all het calls
     *
     * @return this plugin
     */
    public FILLINImputationPlugin imputeAllHetCalls(Boolean value) {
        imputeAllHets = new PluginParameter<>(imputeAllHets, value);
        return this;
    }

    /**
     * If true, uses combination mode in the focus block; else does not impute.
     *
     * @return Combine two haplotypes as heterozygote
     */
    public Boolean combineTwoHaplotypesAsHeterozygote() {
        return hybridNN.value();
    }

    /**
     * Set Combine two haplotypes as heterozygote. If true, uses
     * combination mode in the focus block; else does not impute.
     *
     * @param value Combine two haplotypes as heterozygote
     *
     * @return this plugin
     */
    public FILLINImputationPlugin combineTwoHaplotypesAsHeterozygote(Boolean value) {
        hybridNN = new PluginParameter<>(hybridNN, value);
        return this;
    }

    /**
     * Create a projection alignment for high density markers.
     *
     * @return Output projection alignment
     */
    public Boolean outputProjectionAlignment() {
        return isOutputProjection.value();
    }

    /**
     * Set Output projection alignment. Create a projection alignment for
     * high density markers.
     *
     * @param value Output projection alignment
     *
     * @return this plugin
     */
    public FILLINImputationPlugin outputProjectionAlignment(Boolean value) {
        isOutputProjection = new PluginParameter<>(isOutputProjection, value);
        return this;
    }

    /**
     * Impute the donor file itself.
     *
     * @return Impute donor file
     */
    public Boolean imputeDonorFile() {
        return imputeDonorFile.value();
    }

    /**
     * Set Impute donor file. Impute the donor file itself.
     *
     * @param value Impute donor file
     *
     * @return this plugin
     */
    public FILLINImputationPlugin imputeDonorFile(Boolean value) {
        imputeDonorFile = new PluginParameter<>(imputeDonorFile, value);
        return this;
    }

    /**
     * Suppress system out.
     *
     * @return Suppress system out
     */
    public Boolean supressSystemOut() {
        return nonverboseOutput.value();
    }

    /**
     * Set Suppress system out.
     *
     * @param value Suppress system out
     *
     * @return this plugin
     */
    public FILLINImputationPlugin supressSystemOut(Boolean value) {
        nonverboseOutput = new PluginParameter<>(nonverboseOutput, value);
        return this;
    }

    /**
     * Masks input file before imputation and calculates accuracy based on
     * masked genotypes.
     *
     * @return Calculate accuracy
     */
    public Boolean calculateAccuracy() {
        return accuracy.value();
    }

    /**
     * Set Calculate accuracy. Masks input file before imputation and
     * calculates accuracy based on masked genotypes.
     *
     * @param value Calculate accuracy
     *
     * @return this plugin
     */
    public FILLINImputationPlugin calculateAccuracy(Boolean value) {
        accuracy = new PluginParameter<>(accuracy, value);
        return this;
    }

    /**
     * Proportion of genotypes to mask for accuracy calculation if depth
     * is not available.
     *
     * @return Proportion of genotypes to mask if no depth
     */
    public Double proportionOfGenotypesToMaskIfNoDepth() {
        return propSitesMask.value();
    }

    /**
     * Set Proportion of genotypes to mask if no depth. Proportion of
     * genotypes to mask for accuracy calculation if depth is not available.
     *
     * @param value Proportion of genotypes to mask if no depth
     *
     * @return this plugin
     */
    public FILLINImputationPlugin proportionOfGenotypesToMaskIfNoDepth(Double value) {
        propSitesMask = new PluginParameter<>(propSitesMask, value);
        return this;
    }

    /**
     * Depth of genotypes to mask for accuracy calculation if depth
     * information is available.
     *
     * @return Depth of genotypes to mask
     */
    public Integer depthOfGenotypesToMask() {
        return depthToMask.value();
    }

    /**
     * Set Depth of genotypes to mask. Depth of genotypes to mask for
     * accuracy calculation if depth information is available.
     *
     * @param value Depth of genotypes to mask
     *
     * @return this plugin
     */
    public FILLINImputationPlugin depthOfGenotypesToMask(Integer value) {
        depthToMask = new PluginParameter<>(depthToMask, value);
        return this;
    }

    /**
     * Proportion of genotypes of the given depth to mask for accuracy
     * calculation if depth is available.
     *
     * @return Proportion of depth genotypes to mask
     */
    public Double proportionOfDepthGenotypesToMask() {
        return propDepthSitesMask.value();
    }

    /**
     * Set Proportion of depth genotypes to mask. Proportion of genotypes
     * of the given depth to mask for accuracy calculation if depth is
     * available.
     *
     * @param value Proportion of depth genotypes to mask
     *
     * @return this plugin
     */
    public FILLINImputationPlugin proportionOfDepthGenotypesToMask(Double value) {
        propDepthSitesMask = new PluginParameter<>(propDepthSitesMask, value);
        return this;
    }

    /**
     * Key to calculate accuracy. Genotypes missing (masked) in the target
     * file should be present in the key, with all other sites set to
     * missing. Overrides other accuracy options if present and all sites
     * and taxa present in the target file are present.
     *
     * @return Optional key to calculate accuracy
     */
    public String optionalKeyToCalculateAccuracy() {
        return maskKey.value();
    }

    /**
     * Set Optional key to calculate accuracy. Key to calculate accuracy.
     * Genotypes missing (masked) in the target file should be present in
     * the key, with all other sites set to missing. Overrides other
     * accuracy options if present and all sites and taxa present in the
     * target file are present.
     *
     * @param value Optional key to calculate accuracy
     *
     * @return this plugin
     */
    public FILLINImputationPlugin optionalKeyToCalculateAccuracy(String value) {
        maskKey = new PluginParameter<>(maskKey, value);
        return this;
    }

    /**
     * Calculate R2 accuracy within MAF categories based on the donor file.
     *
     * @return Calculate accuracy within MAF categories
     */
    public Boolean calculateAccuracyWithinMAFCategories() {
        return byMAF.value();
    }

    /**
     * Set Calculate accuracy within MAF categories. Calculate R2 accuracy
     * within MAF categories based on the donor file.
     *
     * @param value Calculate accuracy within MAF categories
     *
     * @return this plugin
     */
    public FILLINImputationPlugin calculateAccuracyWithinMAFCategories(Boolean value) {
        byMAF = new PluginParameter<>(byMAF, value);
        return this;
    }
}




    © 2015 - 2025 Weber Informatics LLC | Privacy Policy