package net.maizegenetics.analysis.imputation;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Ints;
import net.maizegenetics.analysis.popgen.DonorHypoth;
import net.maizegenetics.dna.map.DonorHaplotypes;
import net.maizegenetics.dna.snp.*;
import net.maizegenetics.dna.snp.io.ProjectionGenotypeIO;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.OpenBitSet;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.File;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
import static net.maizegenetics.dna.WHICH_ALLELE.Major;
import static net.maizegenetics.dna.WHICH_ALLELE.Minor;
import static net.maizegenetics.dna.snp.GenotypeTableUtils.isHeterozygous;
import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
/**
 * FILLIN imputation relies on a library of haplotypes and uses nearest neighbor searches
 * followed by HMM Viterbi resolution or block-based resolution.
 * It is the best approach in TASSEL for substantially unrelated taxa. BEAGLE4 is currently a
 * better approach for landraces, while FILLIN outperforms it when a good reference set of
 * haplotypes is available.
 *
 * The algorithm relies on having a series of donor haplotypes. If phased haplotypes are
 * already known, they can be used directly. If they need to be reconstructed, the
 * {@link FindMergeHaplotypesPlugin} can be used to create haplotypes for windows
 * across the genome.
 *
 * Imputation is done one taxon at a time using the donor haplotypes. The strategy is as
 * follows:
 * 1. Every 64 sites is considered a block (a word in the bitset terminology - the length of a long).
 *    A separate initial search for nearest neighbors is centered around each block. The flanks
 *    of the scan window vary but always contain the number of minor alleles specified (a key
 *    parameter), since minor alleles approximate the information content of the search.
 * 2. Calculate the distance between the target taxon and the donor haplotypes for every window,
 *    and rank the top 10 donor haplotypes for each 64-site focus block.
 * 3. Evaluate by Viterbi whether 1 or 2 haplotypes explain all the sites across the
 *    donor haplotype window. If successful, move to the next region of donor haplotypes.
 * 4. Otherwise, resolve each focus block by nearest neighbors. If inbred nearest neighbors are
 *    not close enough, do a hybrid search seeded with one parent drawn from the initial 10
 *    haplotypes. Set bases based on whichever is resolved first, inbred or hybrid.
 *
 * Error rates are bounded away from zero by adding 0.5 errors to any error rate that
 * was observed to be zero.
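 *
 * A minimal usage sketch (illustrative only: it assumes the generated parameter setters below and
 * AbstractPlugin.performFunction; the file names are placeholders):
 * <pre>{@code
 * FILLINImputationPlugin plugin = new FILLINImputationPlugin(null, false); // non-interactive
 * plugin.targetFile("target.hmp.txt.gz")        // genotypes to impute (do not pre-filter sites)
 *       .donorDir("donorHaplotypeDirOrFile")    // output of FILLINFindHaplotypesPlugin
 *       .outputFilename("imputed.hmp.txt.gz")   // hmp.txt.gz or .hmp.h5
 *       .performFunction(null);                 // the parameters, not the DataSet, supply the inputs
 * }</pre>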
*
* @author Edward Buckler
* @author Kelly Swarts
* @cite
 */
//@Citation("Two papers: Viterbi from Bradbury, et al (in prep) Recombination patterns in maize\n"+
//"NearestNeighborSearch Swarts,...,Buckler (in prep) Imputation with large genotyping by sequencing data\n")
public class FILLINImputationPlugin extends AbstractPlugin {
private int minMajorRatioToMinorCnt=10; //refinement of minMinorCnt to account for regions with all major
//Plugin parameters
private PluginParameter<String> hmpFile= new PluginParameter.Builder<>("hmp",null,String.class).guiName("Target file").inFile().required(true)
.description("Input HapMap file of target genotypes to impute. Accepts all file types supported by TASSEL5. This file should include _ALL_ available sites, not " +
"just those segregating in your material. (ie: don't filter the input)").build();
private PluginParameter<String> donorFile= new PluginParameter.Builder<>("d",null,String.class).guiName("Donor").required(true)
.description("Directory containing donor haplotype files from output of FILLINFindHaplotypesPlugin. All files with '.gc' in the filename will be read in, "
+ "only those with matching sites are used. Alternately, a single file to use as a donor will be cut into sub genos of the size specified (eg, a high density "
+ "SNP file for projection).").build();
private PluginParameter<String> outFileBase= new PluginParameter.Builder<>("o",null,String.class).guiName("Output filename").outFile().required(true)
.description("Output file; hmp.txt.gz and .hmp.h5 accepted.").build();
private PluginParameter<Integer> appoxSitesPerDonorGenotypeTable= new PluginParameter.Builder<>("hapSize",8000,Integer.class).guiName("Preferred haplotype size")
.description("Preferred haplotype block size in sites (use same as in FILLINFindHaplotypesPlugin)").build();
private PluginParameter<Double> hetThresh= new PluginParameter.Builder<>("mxHet",0.02,Double.class).guiName("Heterozygosity threshold")
.description("Threshold per taxon heterozygosity for treating taxon as heterozygous (no Viterbi, het thresholds).").build();
private PluginParameter<Double> maximumInbredError= new PluginParameter.Builder<>("mxInbErr",0.01,Double.class).guiName("Max error to impute one donor")
.description("Maximum error rate for applying one haplotype to entire site window").build();
private PluginParameter<Double> maxHybridErrorRate= new PluginParameter.Builder<>("mxHybErr",0.003,Double.class).guiName("Max combined error to impute two donors")
.description("Maximum error rate for applying Viterbi with two haplotypes to entire site window").build();
private PluginParameter<Integer> minTestSites= new PluginParameter.Builder<>("mnTestSite",20,Integer.class).guiName("Min sites to test match")
.description("Minimum number of sites to test for IBS between haplotype and target in focus block").build();
private PluginParameter<Integer> minMinorCnt= new PluginParameter.Builder<>("minMnCnt",20,Integer.class).guiName("Min num of minor alleles to compare")
.description("Minimum number of informative minor alleles in the search window (or "+minMajorRatioToMinorCnt+"X major)").build();
private PluginParameter<Integer> maxDonorHypotheses= new PluginParameter.Builder<>("mxDonH",20,Integer.class).guiName("Max donor hypotheses")
.description("Maximum number of donor hypotheses to be explored. (Note: For heterozygous samples you may want to set this number higher, but the computation load goes up quickly.)").build();
private PluginParameter<Boolean> imputeAllHets= new PluginParameter.Builder<>("impAllHets",false,Boolean.class).guiName("Impute all het calls")
.description("Write all imputed heterozygous calls as such, even if the original file has a homozygous call. (Not recommended for inbred lines.)").build();
//if true, uses combination mode in focus block, else set don't impute (default is true)
private PluginParameter<Boolean> hybridNN= new PluginParameter.Builder<>("hybNN",true,Boolean.class).guiName("Combine two haplotypes as heterozygote")
.description("If true, uses combination mode in focus block, else does not impute").build();
private PluginParameter<Boolean> isOutputProjection= new PluginParameter.Builder<>("ProjA",false,Boolean.class).guiName("Output projection alignment")
.description("Create a projection alignment for high density markers").build();
private PluginParameter<Boolean> imputeDonorFile= new PluginParameter.Builder<>("impDonor",false,Boolean.class).guiName("Impute donor file")
.description("Impute the donor file itself").build();
private PluginParameter<Boolean> nonverboseOutput= new PluginParameter.Builder<>("nV",false,Boolean.class).guiName("Supress system out")
.description("Supress system out").build();
//for calculating accuracy
private PluginParameter<Boolean> accuracy= new PluginParameter.Builder<>("accuracy",false,Boolean.class).guiName("Calculate accuracy")
.description("Masks input file before imputation and calculates accuracy based on masked genotypes").build();
private PluginParameter<Double> propSitesMask= new PluginParameter.Builder<>("propSitesMask",0.01,Double.class).guiName("Proportion of genotypes to mask if no depth")
.description("Proportion of genotypes to mask for accuracy calculation if depth not available").dependentOnParameter(accuracy).build();
private PluginParameter<Integer> depthToMask= new PluginParameter.Builder<>("depthMask",9,Integer.class).guiName("Depth of genotypes to mask")
.description("Depth of genotypes to mask for accuracy calculation if depth information available").dependentOnParameter(accuracy).build();
private PluginParameter<Double> propDepthSitesMask= new PluginParameter.Builder<>("propDepthSitesMask",0.2,Double.class).guiName("Proportion of depth genotypes to mask")
.description("Proportion of genotypes of given depth to mask for accuracy calculation if depth available").dependentOnParameter(accuracy).build();
private PluginParameter<String> maskKey= new PluginParameter.Builder<>("maskKey",null,String.class).inFile().required(false).guiName("Optional key to calculate accuracy")
.description("Key to calculate accuracy. Genotypes missing (masked) in target file should be present in key, with all other sites set to missing. Overrides other "
+ "accuracy options if present and all sites and taxa present in target file present").dependentOnParameter(accuracy).build();
private PluginParameter<Boolean> byMAF= new PluginParameter.Builder<>("byMAF",false,Boolean.class).guiName("Calculate accuracy within MAF categories")
.description("Calculate R2 accuracy within MAF categories based on donor file").dependentOnParameter(accuracy).build();

//Additional variables
private boolean verboseOutput= true;
private boolean twoWayViterbi= true; //if true, the Viterbi runs in both directions (the longest path length wins, if inconsistent)
private double minimumDonorDistance=maximumInbredError.value()*5; //used to prevent Viterbi errors with too similar sequences
private double maxNonMedelian=maximumInbredError.value()*5; //if used, avoid Viterbi if too many sites are non-Mendelian
//options for focus blocks
private double maxHybridErrFocusHomo= .3333*maxHybridErrorRate.value(); //max error rate for discrepancy between two haplotypes for the focus block; its default is higher because it is calculated on fewer sites
private double maxInbredErrFocusHomo= .3*maximumInbredError.value(); //.003;
private double maxSmashErrFocusHomo= maximumInbredError.value(); //.01;
private double maxInbredErrFocusHet= .1*maximumInbredError.value(); //.001; the error rate for imputing one haplotype in focus block for a het taxon
private double maxSmashErrFocusHet= maximumInbredError.value(); //.01;
public static GenotypeTable unimpAlign;  //the unimputed alignment to be imputed, unphased
public static GenotypeTable maskKeyAlign= null;  //the key to the unimputed alignment mask
public static double[] MAFClass= new double[]{0,.02,.05,.10,.20,.3,.4,.5,1}; //must retain 0 and 1
private int testing=0;  //level of reporting to stdout
//major and minor alleles can differ between the donor and unimp alignment
private boolean isSwapMajorMinor=true;  //if swapped try to fix it
private boolean resolveHetIfUndercalled=false; //if true, sets genotypes called to a het to a het, even if already a homozygote
//initialize the transition matrix (T1)
double[][] transition = new double[][] {
{.999,.0001,.0003,.0001,.0005},
{.0002,.999,.00005,.00005,.0002},
{.0002,.00005,.999,.00005,.0002},
{.0002,.00005,.00005,.999,.0002},
{.0005,.0001,.0003,.0001,.999}
};
//initialize the emission matrix, states (5) in rows, observations (3) in columns
double[][] emission = new double[][] {
{.998,.001,.001},
{.6,.2,.2},
{.4,.2,.4},
{.2,.2,.6},
{.001,.001,.998}
};
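//note: the emission matrix above is used directly in callsFromViterbi() (via EmissionProbability);
//the transition matrix is passed into that method as its trans argument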
FILLINImputationAccuracy acc= null; //holds the accuracy information if accuracy flagged
private static final Logger myLogger = Logger.getLogger(FILLINImputationPlugin.class);
@Override
protected void postProcessParameters() {
System.out.println("Calling FILLINImputationPlugin.postProcessParameters()...");
File parent = new File(outputFilename()).getParentFile();
if ((parent != null) && (!parent.exists())) {
parent.mkdirs();
}
minimumDonorDistance=maximumInbredError.value()*5;
maxNonMedelian=maximumInbredError.value()*5;
maxInbredErrFocusHomo= .3*maximumInbredError.value();//.003;
maxSmashErrFocusHomo= maximumInbredError.value();//.01;
maxInbredErrFocusHet= .1*maximumInbredError.value();//.001;//the error rate for imputing one haplotype in focus block for a het taxon
maxSmashErrFocusHet= maximumInbredError.value();//.01;
maxHybridErrFocusHomo= .3333*maxHybridErrorRate.value();
resolveHetIfUndercalled = imputeAllHets.value();
if (nonverboseOutput.value()) verboseOutput= false;
if (byMAF.value()==false) MAFClass= null;
if (!new File(donorFile.value()).exists()) System.out.println("Donor is not a file or folder: "+donorFile.value());
}
public FILLINImputationPlugin() {
super(null, false);
}
public FILLINImputationPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public String getCitation() {
return"Swarts K, Li H, Romero Navarro JA, An D, Romay MC, Hearne S, Acharya C, Glaubitz JC, Mitchell S, Elshire RJ, Buckler ES, Bradbury PJ "
+ "(2014) "
+ "Novel methods to optimize genotypic imputation for low-coverage, next-generation sequence data in crop plants. "
+ "Plant Genome 7(3). doi:10.3835/plantgenome2014.05.0023";
}
@Override
public DataSet processData(DataSet input) {
long time=System.currentTimeMillis();
String donor= donorFile.value();
unimpAlign=ImportUtils.read(hmpFile.value());
if (isOutputProjection.value()) {
unimpAlign= FILLINDonorGenotypeUtils.RemoveSitesThatDoNotMatchMinMaj(donorFile.value(), unimpAlign,verboseOutput);
donor= donor.subSequence(0, donor.lastIndexOf("."))+".matchMinMaj.hmp.txt.gz";
}
GenotypeTable[] donorAlign=FILLINDonorGenotypeUtils.loadDonors(donor, unimpAlign, minTestSites.value(),
verboseOutput,appoxSitesPerDonorGenotypeTable.value());
if (accuracy.value()) {
time= System.currentTimeMillis()-time; //holds the time so far
if (maskKey.value()!=null) maskKeyAlign= ImportUtils.readGuessFormat(maskKey.value());
acc= new FILLINImputationAccuracy(unimpAlign,maskKeyAlign,donorAlign,propSitesMask.value(),depthToMask.value(),
propDepthSitesMask.value(),outFileBase.value(),MAFClass,verboseOutput);
unimpAlign= acc.initiateAccuracy();
time= System.currentTimeMillis()-time;//restarts the time, including the time to load donors and target but not including the time to set up accuracy
}
OpenBitSet[][] conflictMasks=FILLINDonorGenotypeUtils.createMaskForAlignmentConflicts(unimpAlign, donorAlign,
verboseOutput);
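//conflictMasks marks sites where the major/minor allele assignment disagrees between the target and each donor
//alignment; arrangeMajorMinorBtwAlignments() uses these masks (with isSwapMajorMinor) when building the target bitsets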
System.out.printf("Unimputed taxa:%d sites:%d %n",unimpAlign.numberOfTaxa(),unimpAlign.numberOfSites());
System.out.println("Creating Export GenotypeTable:"+outFileBase.value());
Object mna;
if(isOutputProjection.value()) {
mna=new ProjectionBuilder(ImportUtils.readGuessFormat(donorFile.value()));
} else {
if(outFileBase.value().contains(".h5")) {
mna= GenotypeTableBuilder.getTaxaIncremental(this.unimpAlign.positions(),outFileBase.value());
}else {
mna= GenotypeTableBuilder.getTaxaIncremental(this.unimpAlign.positions());
}
}
int numThreads = Runtime.getRuntime().availableProcessors();
System.out.println("Time to read in files and generate masks: "+((System.currentTimeMillis()-time)/1000)+" sec");
ExecutorService pool = Executors.newFixedThreadPool(numThreads);
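//each taxon is imputed independently (ImputeOneTaxon), so the work can be distributed across the thread pool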
for (int taxon = 0; taxon < unimpAlign.numberOfTaxa(); taxon+=1) {
int[] trackBlockNN= new int[5]; //tracks the number of focus blocks solved in NN search for system out; index 0 is inbred, 1 is viterbi, 2 is smash, 3 is not solved, 4 is total for all modes
ImputeOneTaxon theTaxon= (((double)unimpAlign.heterozygousCountForTaxon(taxon)/(double)unimpAlign.totalNonMissingForTaxon(taxon))minSitesPresent);
int countFullLength=0;
int countByFocus= 0;
for (int da = 0; (da < donorAlign.length)&&enoughData ; da++) {
int donorOffset=unimpAlign.siteOfPhysicalPosition(donorAlign[da].chromosomalPosition(0), donorAlign[da].chromosome(0));
int blocks=donorAlign[da].allelePresenceForAllSites(0, Major).getNumWords();
BitSet[] maskedTargetBits=FILLINDonorGenotypeUtils.arrangeMajorMinorBtwAlignments(unimpAlign, taxon, donorOffset, donorAlign[da].numberOfSites(), conflictMasks[da][0], conflictMasks[da][1], isSwapMajorMinor);
//if imputing the donor file, these donor indices prevent self imputation
int[] donorIndices;
if(imputeDonorFile){
donorIndices=new int[donorAlign[da].numberOfTaxa()-1];
for (int i = 0; i < donorIndices.length; i++) {donorIndices[i]=i; if(i>=taxon) donorIndices[i]++;}
} else {
donorIndices=new int[donorAlign[da].numberOfTaxa()];
for (int i = 0; i < donorIndices.length; i++) {donorIndices[i]=i;}
}
//Finds the best haplotype donors for each focus block within a donorGenotypeTable
DonorHypoth[][] regionHypthInbred=new DonorHypoth[blocks][maxDonorHypotheses.value()];
byte[][][] targetToDonorDistances=FILLINImputationUtils.calcAllelePresenceCountsBtwTargetAndDonors(maskedTargetBits,
donorAlign[da]);
for (int focusBlock = 0; focusBlock < blocks; focusBlock++) {
int[] resultRange=FILLINImputationUtils.getBlockWithMinMinorCount(maskedTargetBits[0].getBits(), maskedTargetBits[1].getBits(), focusBlock, minMinorCnt.value(), minMinorCnt.value()*minMajorRatioToMinorCnt);
if(resultRange==null) continue; //no data in the focus block
//search for the best inbred donors for a segment
regionHypthInbred[focusBlock]=FILLINImputationUtils.findHomozygousDonorHypoth(taxon, resultRange[0], resultRange[2],
focusBlock, donorIndices, targetToDonorDistances, minTestSites.value(), maxDonorHypotheses.value());
}
impTaxon.setSegmentSolved(false);
//tries to solve the entire donorAlign region by Viterbi or Inbred
impTaxon=solveEntireDonorRegion(taxon, donorAlign[da], donorOffset, regionHypthInbred, impTaxon, maskedTargetBits, maxHybridErrorRate.value(), targetToDonorDistances);
if(impTaxon.isSegmentSolved()) {countFullLength++; continue;}
//resorts to solving block by block, first by inbred, then by viterbi, and then by hybrid
impTaxon=solveByBlockNearestNeighbor(impTaxon, taxon, donorAlign[da], donorOffset, regionHypthInbred, hybridNN.value(), maskedTargetBits, minMinorCnt.value(), focusInbredErr, focusHybridErr, focusSmashErr, donorIndices, trackBlockNN, hetsMiss);
if(impTaxon.isSegmentSolved()) {countByFocus++;}
}
double totalFocus= (double)trackBlockNN[3]+(double)trackBlockNN[4];
sb.append(String.format("InbredOrViterbi:%d FocusBlock:%d PropFocusInbred:%f PropFocusViterbi:%f PropFocusSmash:%f PropFocusMissing:%f BlocksSolved:%d ",
countFullLength, countByFocus, (double)trackBlockNN[0]/totalFocus, (double)trackBlockNN[1]/totalFocus, (double)trackBlockNN[2]/totalFocus, (double)trackBlockNN[3]/totalFocus, impTaxon.getBlocksSolved()));
//sb.append(String.format("InbredOrViterbi:%d FocusBlock:%d BlocksSolved:%d ", countFullLength, countByFocus, impTaxon.getBlocksSolved()));int[] unk=FILLINImputationUtils.countUnknownAndHeterozygotes(impTaxon.resolveGeno);
sb.append(String.format("Unk:%d PropMissing:%g ", unk[0], (double) unk[0] / (double) impTaxon.getOrigGeno().length));
sb.append(String.format("Het:%d PropHet:%g ", unk[1], (double)unk[1]/(double)impTaxon.getOrigGeno().length));
sb.append(" BreakPoints:"+impTaxon.getBreakPoints().size());
if(!isOutputProjection.value()) {
alignBuilder.addTaxon(unimpAlign.taxa().get(taxon), impTaxon.resolveGeno);
} else {
projBuilder.addTaxon(unimpAlign.taxa().get(taxon),impTaxon.getBreakPoints());
}
if(verboseOutput) System.out.println(sb.toString());
}
}
/**
* Solve the entire donor alignment window with either one or two haplotypes. Generally, it is solved with the
 * HMM Viterbi algorithm. The Viterbi algorithm is relatively slow; the big choice is how to choose the potential
* donors.
*
*
* @param taxon
* @param donorAlign
* @param donorOffset
* @param regionHypoth
* @param impT
* @param maskedTargetBits
* @param maxHybridErrorRate
* @return
 */
private ImputedTaxon solveEntireDonorRegion(int taxon, GenotypeTable donorAlign, int donorOffset,
DonorHypoth[][] regionHypoth, ImputedTaxon impT, BitSet[] maskedTargetBits, double maxHybridErrorRate, byte[][][] targetToDonorDistances) {
int blocks=maskedTargetBits[0].getNumWords();
if(testing==1) System.out.println("Starting complete hybrid search");
//create a list of the best donors based on showing up frequently high in many focus blocks
//in the test data set the best results were achieved with only the best hypothesis recovered
//todo consider whether to use this approach
// int[] d=FILLINImputationUtils.mostFrequentDonorsAcrossFocusBlocks(regionHypoth, maxDonorHypotheses);
//Alternative test is to find the best donors
int[] d=FILLINImputationUtils.bestDonorsAcrossEntireRegion(targetToDonorDistances, minTestSites.value(),maxDonorHypotheses.value());
int[] testList=FILLINImputationUtils.fillInc(0,donorAlign.numberOfTaxa()-1);
int[] bestDonorList=Arrays.copyOfRange(d,0,Math.min(d.length,5));
DonorHypoth[] bestDBasedOnBest=FILLINImputationUtils.findHeterozygousDonorHypoth(taxon, maskedTargetBits[0].getBits(),
maskedTargetBits[1].getBits(), 0, blocks-1, blocks/2, donorAlign, bestDonorList, testList, maxDonorHypotheses.value(), minTestSites.value());
//make all combinations of the best donors and find the pairs that minimize errors
//with the true switch this will also make inbreds
DonorHypoth[] best2Dsearchdonors=FILLINImputationUtils.findHeterozygousDonorHypoth(taxon, maskedTargetBits[0].getBits(),
maskedTargetBits[1].getBits(), 0, blocks-1, blocks/2, donorAlign, d, d, maxDonorHypotheses.value(), minTestSites.value());
DonorHypoth[] best2donors=FILLINImputationUtils.combineDonorHypothArrays(maxDonorHypotheses.value(),bestDBasedOnBest,best2Dsearchdonors);
if(testing==1) System.out.println(Arrays.toString(best2donors));
ArrayList<DonorHypoth> goodDH=new ArrayList<>();
for (DonorHypoth dh : best2donors) {
if(dh==null) continue;
if(dh.isInbred() && (dh.getErrorRate() goodDH= new ArrayList(); //KLS0201if (best2donors[0].getErrorRate()=donorAlign.numberOfSites())?donorAlign.numberOfSites()-1: dh.endSite;
int sites=endSite-dh.startSite+1;
//Find the informative sites
StatePositionChain informative=createInformativeStateChainForViterbi(dh.startSite,
maxNonMedelian, minimumDonorDistance,
unimpAlign.genotypeRange(dh.targetTaxon, dh.startSite+donorOffset, endSite+1+donorOffset),
donorAlign.genotypeRange(dh.donor1Taxon, dh.startSite, endSite+1),
donorAlign.genotypeRange(dh.donor2Taxon, dh.startSite, endSite+1));
if(informative==null) return null;
//Find the most likely states by Viterbi
int chrlength = donorAlign.chromosomalPosition(endSite) - donorAlign.chromosomalPosition(dh.startSite);
byte[] callsF=callsFromViterbi(trans, chrlength/sites, informative);
DonorHypoth dh2=new DonorHypoth(dh.targetTaxon,dh.donor1Taxon, dh.donor2Taxon, dh.startBlock, dh.focusBlock, dh.endBlock);
dh2.phasedResults= callsF;
if (forwardReverse) {
byte[] callsR=callsFromViterbi(trans, chrlength/sites, StatePositionChain.reverseInstance(informative));
ArrayUtils.reverse(callsR); //These are now in same direction as callsF
byte[] callsC=new byte[callsF.length];
System.arraycopy(callsR,0,callsC,0,callsC.length/2);
System.arraycopy(callsF,callsC.length/2,callsC,callsC.length/2,callsC.length-callsC.length/2);
dh2.phasedResults= callsC;
}
return dh2;
}
private byte[] callsFromViterbi(double[][] trans, int avgChrLength, StatePositionChain informative) {
//There is no EM optimization of Viterbi, clearly something that could be changed
TransitionProbability tpF = new TransitionProbability();
EmissionProbability ep = new EmissionProbability();
tpF.setTransitionProbability(trans);
ep.setEmissionProbability(emission);
final double probHeterozygous=0.5;
final double phom = (1 - probHeterozygous) / 2;
final double[] pTrue = new double[]{phom, .25*probHeterozygous, .5*probHeterozygous, .25*probHeterozygous, phom};
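//pTrue is the initial state distribution for the Viterbi run: each homozygous parent state gets (1 - probHeterozygous)/2,
//and the three heterozygous states split probHeterozygous as 1/4, 1/2, 1/4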
tpF.setAverageSegmentLength(avgChrLength);
tpF.setPositions(informative.informSites);
ViterbiAlgorithm vaF = new ViterbiAlgorithm(informative.informStates, tpF, ep, pTrue);
vaF.calculate();
byte[] resultStatesF=vaF.getMostProbableStateSequence();
int currPos=0;
//converts the informative states back to all states
byte[] callsF=new byte[informative.totalSiteCnt];
for(int cs=0; cs nonMissingObs = new ArrayList<>();
ArrayList informSites = new ArrayList<>();
//Selects only the informative sites for the Viterbi algorithm (known & gap free & homozygous in donor)for(int cs=0; csmaxNonMendelian) returnnull;
//if the donors are too similar Viterbi performs poorly. Only accepted different donorsif((double)donorDifferences/(double)informSites.size() donors= new HashSet<>();
for (int h = 0; h < regionHypth[block].length; h++) {
if (regionHypth[block][h]!=null) {
donors.add(regionHypth[block][h].donor1Taxon);
donors.add(regionHypth[block][h].donor2Taxon);
}
}
if (donors.isEmpty()) returnnull;
return Ints.toArray(donors);
}
// /**
//  * Simple algorithm that tests every possible two donor combination to minimize
//  * the number of unmatched informative alleles. Currently, there is little tie
//  * breaking, longer matches are favored.
//  * @param targetTaxon
//  * @param startBlock
//  * @param endBlock
//  * @param focusBlock
//  * @return int[] array of {donor1, donor2, testSites} sorted by testPropUnmatched
//  */
// private DonorHypoth[] getBestHybridDonors(int targetTaxon, long[] mjT, long[] mnT,
//         int startBlock, int endBlock, int focusBlock, GenotypeTable donorAlign,
//         int[] donor1Indices, int[] donor2Indices,
//         boolean viterbiSearch) {
//     TreeMap bestDonors=new TreeMap();
//     Set testedDonorPairs=new HashSet<>();
//     double lastKeytestPropUnmatched=1.0;
//     double inc=1e-9;
//     double[] donorDist;
//     for (int d1: donor1Indices) {
//         long[] mj1=donorAlign.allelePresenceForSitesBlock(d1, Major, startBlock, endBlock + 1);
//         long[] mn1=donorAlign.allelePresenceForSitesBlock(d1, Minor, startBlock, endBlock + 1);
//         for (int d2 : donor2Indices) {
//             if((!viterbiSearch)&&(d1==d2)) continue;
//             if(testedDonorPairs.contains(((long)d1<<32)+(long)d2)) continue;
//             if(testedDonorPairs.contains(((long)d2<<32)+(long)d1)) continue;
//             testedDonorPairs.add(((long)d1<<32)+(long)d2);
//             long[] mj2=donorAlign.allelePresenceForSitesBlock(d2, Major, startBlock, endBlock + 1);
//             long[] mn2=donorAlign.allelePresenceForSitesBlock(d2, Minor, startBlock, endBlock + 1);
//             if(viterbiSearch) {
//                 donorDist=IBSDistanceMatrix.computeHetBitDistances(mj1, mn1, mj2, mn2, minTestSites);
//                 if((d1!=d2)&&(donorDist[0]maxDonorHypotheses) {
//                     bestDonors.remove(bestDonors.lastKey());
//                     lastKeytestPropUnmatched=bestDonors.lastKey();
//                 }
//             }
//         }
//     }
//     DonorHypoth[] result=new DonorHypoth[maxDonorHypotheses];
//     int count=0;
//     for (DonorHypoth dh : bestDonors.values()) {
//         result[count]=dh;
//         count++;
//     }
//     return result;
// }
//
// private int[] mendelErrorComparison(long[] mjT, long[] mnT, long[] mj1, long[] mn1,
//         long[] mj2, long[] mn2) {
//     int mjUnmatched=0;
//     int mnUnmatched=0;
//     int testSites=0;
//     for (int i = 0; i < mjT.length; i++) {
//         long siteMask=(mjT[i]|mnT[i])&(mj1[i]|mn1[i])&(mj2[i]|mn2[i]);
//         mjUnmatched+=Long.bitCount(siteMask&mjT[i]&(mjT[i]^mj1[i])&(mjT[i]^mj2[i]));
//         mnUnmatched+=Long.bitCount(siteMask&mnT[i]&(mnT[i]^mn1[i])&(mnT[i]^mn2[i]));
//         testSites+=Long.bitCount(siteMask);
//     }
//     int totalMendelianErrors=mjUnmatched+mnUnmatched;
//     // double testPropUnmatched=(double)(totalMendelianErrors)/(double)testSites;
//     return (new int[] {totalMendelianErrors, testSites});
// }
private ImputedTaxon setAlignmentWithDonors(GenotypeTable donorAlign, DonorHypoth[] theDH, int donorOffset,
boolean setJustFocus, ImputedTaxon impT, boolean smashOn, boolean hetsMiss) {
if(theDH[0].targetTaxon<0) return impT;
boolean print=false;
int startSite=(setJustFocus)?theDH[0].getFocusStartSite():theDH[0].startSite;
int endSite=(setJustFocus)?theDH[0].getFocusEndSite():theDH[0].endSite;
if(endSite>=donorAlign.numberOfSites()) endSite=donorAlign.numberOfSites()-1;
//prevDonors is used to track recombination breakpoint start and stop
int[] prevDonors=new int[]{-1, -1};
if(theDH[0].getPhaseForSite(startSite)==0) {prevDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor1Taxon};}
else if(theDH[0].getPhaseForSite(startSite)==2) {prevDonors=new int[]{theDH[0].donor2Taxon, theDH[0].donor2Taxon};}
else if(theDH[0].getPhaseForSite(startSite)==1) {prevDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor2Taxon};}
int prevDonorStart=startSite;
int[] currDonors=prevDonors;
for(int cs=startSite; cs<=endSite; cs++) {
byte donorEst=UNKNOWN_DIPLOID_ALLELE;
byte neighbor=0;
//site imputation will try to use all donor hypotheses; breakpoints only use the best (e.g. i==0 tests)
for (int i = 0; (i < theDH.length) && (donorEst==UNKNOWN_DIPLOID_ALLELE); i++) {
neighbor++;
if((theDH[i]==null)||(theDH[i].donor1Taxon<0)) continue;
if(theDH[i].getErrorRate()>this.maximumInbredError.value()) continue;
byte bD1=donorAlign.genotype(theDH[i].donor1Taxon, cs);
if(theDH[i].getPhaseForSite(cs)==0) {
donorEst=bD1;
if(i==0) currDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor1Taxon};
}
else {
byte bD2=donorAlign.genotype(theDH[i].donor2Taxon, cs);
if(theDH[i].getPhaseForSite(cs)==2) {
donorEst=bD2;
if(i==0) currDonors=new int[]{theDH[0].donor2Taxon, theDH[0].donor2Taxon};
}
else {
donorEst=GenotypeTableUtils.getUnphasedDiploidValueNoHets(bD1, bD2);
if(i==0) currDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor2Taxon};
}
}
}
byte knownBase=impT.getOrigGeno(cs+donorOffset);
//record recombination breakpoint if it changes
if(!Arrays.equals(prevDonors, currDonors)) {
DonorHaplotypes dhaps=new DonorHaplotypes(donorAlign.chromosome(prevDonorStart), donorAlign.chromosomalPosition(prevDonorStart),
donorAlign.chromosomalPosition(cs),prevDonors[0],prevDonors[1]);
impT.addBreakPoint(dhaps);
prevDonors=currDonors;
prevDonorStart=cs;
}
//chgHis records the section of the imputation pipeline that made the change, Viterbi negative, blockNN positive
if(theDH[0].phasedResults==null) {impT.chgHis[cs+donorOffset]=(byte)-neighbor;}
else {impT.chgHis[cs+donorOffset]=(byte)neighbor;}
impT.impGeno[cs+donorOffset]= donorEst; //predicted based on neighbor
//if genotype is unknown or het undercalled then resolve
//todo there is currently no option if the prediction disagrees with a homozygous call
if(knownBase==UNKNOWN_DIPLOID_ALLELE) {
if (isHeterozygous(donorEst)) {
if (smashOn && hetsMiss) {//if imputing a heterozygote, just set to missing
impT.resolveGeno[cs+donorOffset]= knownBase;
}
else impT.resolveGeno[cs+donorOffset]= donorEst; //if not in hybrid, set to het
}
else {//if not imputed to a het
impT.resolveGeno[cs+donorOffset]= donorEst;}
} else if (isHeterozygous(donorEst)){
if(resolveHetIfUndercalled&&GenotypeTableUtils.isPartiallyEqual(knownBase, donorEst)&&smashOn==false){
//if smash off, set homozygotes imputed to het to het
//System.out.println("ResolveHet:"+theDH[0].targetTaxon+":"+cs+donorOffset+":"+knownBaseString+":"+donorEstString);
impT.resolveGeno[cs+donorOffset]= donorEst;
}
}
} //end of cs loop
//enter a stop of the DH at the beginning of the next block
DonorHaplotypes dhaps=new DonorHaplotypes(donorAlign.chromosome(prevDonorStart), donorAlign.chromosomalPosition(prevDonorStart),
donorAlign.chromosomalPosition(endSite),prevDonors[0],prevDonors[1]);
impT.addBreakPoint(dhaps);
return impT;
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return"Impute By FILLIN";
}
@Override
public String getToolTipText() {
return"Imputation that relies on a combination of HMM and Nearest Neighbor using donors derived from FILLINFindHaplotypesPlugin";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
public static void main(String[] args) {
GeneratePluginCode.generate(FILLINImputationPlugin.class);
}
/**
* Convenience method to run plugin with one return object.
 */
/*
// TODO: Replace <Type> with specific type.
public <Type> runPlugin(DataSet input) {
    return (<Type>) performFunction(input).getData(0).getData();
}
*/

/**
* Input HapMap file of target genotypes to impute. Accepts
* all file types supported by TASSEL5
*
* @return Target file
 */
public String targetFile() {
return hmpFile.value();
}
/**
* Set Target file. Input HapMap file of target genotypes
* to impute. Accepts all file types supported by TASSEL5
*
* @param value Target file
*
* @return this plugin
 */
public FILLINImputationPlugin targetFile(String value) {
hmpFile = new PluginParameter<>(hmpFile, value);
return this;
}
/**
* Directory containing donor haplotype files from output
* of FILLINFindHaplotypesPlugin. All files with '.gc'
* in the filename will be read in, only those with matching
* sites are used
*
* @return Donor Dir
 */
public String donorDir() {
return donorFile.value();
}
/**
* Set Donor Dir. Directory containing donor haplotype
* files from output of FILLINFindHaplotypesPlugin. All
* files with '.gc' in the filename will be read in, only
* those with matching sites are used
*
* @param value Donor Dir
*
* @return this plugin
 */
public FILLINImputationPlugin donorDir(String value) {
donorFile = new PluginParameter<>(donorFile, value);
return this;
}
/**
* Output file; hmp.txt.gz and .hmp.h5 accepted.
*
* @return Output filename
 */
public String outputFilename() {
return outFileBase.value();
}
/**
* Set Output filename. Output file; hmp.txt.gz and .hmp.h5
* accepted.
*
* @param value Output filename
*
* @return this plugin
 */
public FILLINImputationPlugin outputFilename(String value) {
outFileBase = new PluginParameter<>(outFileBase, value);
return this;
}
/**
* Preferred haplotype block size in sites (use same as
* in FILLINFindHaplotypesPlugin)
*
* @return Preferred haplotype size
 */
public Integer preferredHaplotypeSize() {
return appoxSitesPerDonorGenotypeTable.value();
}
/**
* Set Preferred haplotype size. Preferred haplotype block
* size in sites (use same as in FILLINFindHaplotypesPlugin)
*
* @param value Preferred haplotype size
*
* @return this plugin
 */
public FILLINImputationPlugin preferredHaplotypeSize(Integer value) {
appoxSitesPerDonorGenotypeTable = new PluginParameter<>(appoxSitesPerDonorGenotypeTable, value);
return this;
}
/**
* Threshold per taxon heterozygosity for treating taxon
* as heterozygous (no Viterbi, het thresholds).
*
* @return Heterozygosity threshold
 */
public Double heterozygosityThreshold() {
return hetThresh.value();
}
/**
* Set Heterozygosity threshold. Threshold per taxon heterozygosity
* for treating taxon as heterozygous (no Viterbi, het
* thresholds).
*
* @param value Heterozygosity threshold
*
* @return this plugin
 */
public FILLINImputationPlugin heterozygosityThreshold(Double value) {
hetThresh = new PluginParameter<>(hetThresh, value);
return this;
}
/**
* Maximum error rate for applying one haplotype to entire
* site window
*
* @return Max error to impute one donor
 */
public Double maxErrorToImputeOneDonor() {
return maximumInbredError.value();
}
/**
* Set Max error to impute one donor. Maximum error rate
* for applying one haplotype to entire site window
*
* @param value Max error to impute one donor
*
* @return this plugin
 */
public FILLINImputationPlugin maxErrorToImputeOneDonor(Double value) {
maximumInbredError = new PluginParameter<>(maximumInbredError, value);
return this;
}
/**
 * Maximum error rate for applying Viterbi with two haplotypes
 * to entire site window
 *
 * @return Max combined error to impute two donors
 */
public Double maxCombinedErrorToImputeTwoDonors() {
return maxHybridErrorRate.value();
}
/**
* Set Max combined error to impute two donors. Maximum
 * error rate for applying Viterbi with two haplotypes
 * to entire site window
 *
 * @param value Max combined error to impute two donors
 *
 * @return this plugin
 */
public FILLINImputationPlugin maxCombinedErrorToImputeTwoDonors(Double value) {
maxHybridErrorRate = new PluginParameter<>(maxHybridErrorRate, value);
return this;
}
/**
* Minimum number of sites to test for IBS between haplotype
* and target in focus block
*
* @return Min sites to test match
 */
public Integer minSitesToTestMatch() {
return minTestSites.value();
}
/**
* Set Min sites to test match. Minimum number of sites
* to test for IBS between haplotype and target in focus
* block
*
* @param value Min sites to test match
*
* @return this plugin
 */
public FILLINImputationPlugin minSitesToTestMatch(Integer value) {
minTestSites = new PluginParameter<>(minTestSites, value);
return this;
}
/**
* Minimum number of informative minor alleles in the
* search window (or 10X major)
*
* @return Min num of minor alleles to compare
 */
public Integer minNumOfMinorAllelesToCompare() {
return minMinorCnt.value();
}
/**
* Set Min num of minor alleles to compare. Minimum number
* of informative minor alleles in the search window (or
* 10X major)
*
* @param value Min num of minor alleles to compare
*
* @return this plugin
 */
public FILLINImputationPlugin minNumOfMinorAllelesToCompare(Integer value) {
minMinorCnt = new PluginParameter<>(minMinorCnt, value);
return this;
}
/**
* Maximum number of donor hypotheses to be explored
*
* @return Max donor hypotheses
 */
public Integer maxDonorHypotheses() {
return maxDonorHypotheses.value();
}
/**
* Set Max donor hypotheses. Maximum number of donor hypotheses
* to be explored
*
* @param value Max donor hypotheses
*
* @return this plugin
 */
public FILLINImputationPlugin maxDonorHypotheses(Integer value) {
maxDonorHypotheses = new PluginParameter<>(maxDonorHypotheses, value);
return this;
}
/**
* Write all imputed heterozygous calls as such, even
* if the original file has a homozygous call. (Not recommended
* for inbred lines.)
*
* @return Impute all het calls
 */
public Boolean imputeAllHetCalls() {
return imputeAllHets.value();
}
/**
* Set Impute all het calls. Write all imputed heterozygous
* calls as such, even if the original file has a homozygous
* call. (Not recommended for inbred lines.)
*
* @param value Impute all het calls
*
* @return this plugin
 */
public FILLINImputationPlugin imputeAllHetCalls(Boolean value) {
imputeAllHets = new PluginParameter<>(imputeAllHets, value);
return this;
}
/**
* If true, uses combination mode in focus block, else
* does not impute
*
* @return Combine two haplotypes as heterozygote
 */
public Boolean combineTwoHaplotypesAsHeterozygote() {
return hybridNN.value();
}
/**
* Set Combine two haplotypes as heterozygote. If true,
* uses combination mode in focus block, else does not
* impute
*
* @param value Combine two haplotypes as heterozygote
*
* @return this plugin
 */
public FILLINImputationPlugin combineTwoHaplotypesAsHeterozygote(Boolean value) {
hybridNN = new PluginParameter<>(hybridNN, value);
return this;
}
/**
* Create a projection alignment for high density markers
*
* @return Output projection alignment
 */
public Boolean outputProjectionAlignment() {
return isOutputProjection.value();
}
/**
* Set Output projection alignment. Create a projection
* alignment for high density markers
*
* @param value Output projection alignment
*
* @return this plugin
 */
public FILLINImputationPlugin outputProjectionAlignment(Boolean value) {
isOutputProjection = new PluginParameter<>(isOutputProjection, value);
return this;
}
/**
* Impute the donor file itself
*
* @return Impute donor file
 */
public Boolean imputeDonorFile() {
return imputeDonorFile.value();
}
/**
* Set Impute donor file. Impute the donor file itself
*
* @param value Impute donor file
*
* @return this plugin
 */
public FILLINImputationPlugin imputeDonorFile(Boolean value) {
imputeDonorFile = new PluginParameter<>(imputeDonorFile, value);
return this;
}
/**
* Supress system out
*
* @return Supress system out
 */
public Boolean supressSystemOut() {
return nonverboseOutput.value();
}
/**
* Set Supress system out. Supress system out
*
* @param value Supress system out
*
* @return this plugin
 */
public FILLINImputationPlugin supressSystemOut(Boolean value) {
nonverboseOutput = new PluginParameter<>(nonverboseOutput, value);
return this;
}
/**
* Masks input file before imputation and calculates accuracy
* based on masked genotypes
*
* @return Calculate accuracy
 */
public Boolean calculateAccuracy() {
return accuracy.value();
}
/**
* Set Calculate accuracy. Masks input file before imputation
* and calculates accuracy based on masked genotypes
*
* @param value Calculate accuracy
*
* @return this plugin
 */
public FILLINImputationPlugin calculateAccuracy(Boolean value) {
accuracy = new PluginParameter<>(accuracy, value);
return this;
}
/**
* Proportion of genotypes to mask for accuracy calculation
* if depth not available
*
* @return Proportion of genotypes to mask if no depth
 */
public Double proportionOfGenotypesToMaskIfNoDepth() {
return propSitesMask.value();
}
/**
* Set Proportion of genotypes to mask if no depth. Proportion
* of genotypes to mask for accuracy calculation if depth
* not available
*
* @param value Proportion of genotypes to mask if no depth
*
* @return this plugin
 */
public FILLINImputationPlugin proportionOfGenotypesToMaskIfNoDepth(Double value) {
propSitesMask = new PluginParameter<>(propSitesMask, value);
return this;
}
/**
* Depth of genotypes to mask for accuracy calculation
* if depth information available
*
* @return Depth of genotypes to mask
 */
public Integer depthOfGenotypesToMask() {
return depthToMask.value();
}
/**
* Set Depth of genotypes to mask. Depth of genotypes
* to mask for accuracy calculation if depth information
* available
*
* @param value Depth of genotypes to mask
*
* @return this plugin
 */
public FILLINImputationPlugin depthOfGenotypesToMask(Integer value) {
depthToMask = new PluginParameter<>(depthToMask, value);
return this;
}
/**
* Proportion of genotypes of given depth to mask for
* accuracy calculation if depth available
*
* @return Proportion of depth genotypes to mask
 */
public Double proportionOfDepthGenotypesToMask() {
return propDepthSitesMask.value();
}
/**
* Set Proportion of depth genotypes to mask. Proportion
* of genotypes of given depth to mask for accuracy calculation
* if depth available
*
* @param value Proportion of depth genotypes to mask
*
* @return this plugin
 */
public FILLINImputationPlugin proportionOfDepthGenotypesToMask(Double value) {
propDepthSitesMask = new PluginParameter<>(propDepthSitesMask, value);
return this;
}
/**
* Key to calculate accuracy. Genotypes missing (masked)
* in target file should be present in key, with all other
 * sites set to missing. Overrides other accuracy options
 * if present and all sites and taxa present in target
 * file present
 *
 * @return Optional key to calculate accuracy
 */
public String optionalKeyToCalculateAccuracy() {
return maskKey.value();
}
/**
* Set Optional key to calculate accuracy. Key to calculate
* accuracy. Genotypes missing (masked) in target file
* should be present in key, with all other sites set
 * to missing. Overrides other accuracy options if present
 * and all sites and taxa present in target file present
 *
 * @param value Optional key to calculate accuracy
 *
 * @return this plugin
 */
public FILLINImputationPlugin optionalKeyToCalculateAccuracy(String value) {
maskKey = new PluginParameter<>(maskKey, value);
return this;
}
/**
* Calculate R2 accuracy within MAF categories based on
* donor file
*
* @return Calculate accuracy within MAF categories
 */
public Boolean calculateAccuracyWithinMAFCategories() {
return byMAF.value();
}
/**
* Set Calculate accuracy within MAF categories. Calculate
* R2 accuracy within MAF categories based on donor file
*
* @param value Calculate accuracy within MAF categories
*
* @return this plugin
 */
public FILLINImputationPlugin calculateAccuracyWithinMAFCategories(Boolean value) {
byMAF = new PluginParameter<>(byMAF, value);
return this;
}
}