package net.maizegenetics.analysis.imputation;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Ints;
import net.maizegenetics.analysis.popgen.DonorHypoth;
import net.maizegenetics.dna.map.DonorHaplotypes;
import net.maizegenetics.dna.snp.*;
import net.maizegenetics.dna.snp.io.ProjectionGenotypeIO;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.OpenBitSet;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.File;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
import static net.maizegenetics.dna.WHICH_ALLELE.Major;
import static net.maizegenetics.dna.WHICH_ALLELE.Minor;
import static net.maizegenetics.dna.snp.GenotypeTableUtils.isHeterozygous;
import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
/**
 * FILLIN imputation relies on a library of haplotypes and uses nearest neighbor searches
 * followed by HMM Viterbi resolution or block-based resolution.
 * It is the best approach in TASSEL for substantially unrelated taxa. BEAGLE4 is currently a
 * better approach for landraces, while FILLIN outperforms it when a good reference set of
 * haplotypes is available.
 *
 * The algorithm relies on having a series of donor haplotypes. If phased haplotypes are
 * already known, they can be used directly. If they need to be reconstructed, the
 * {@link FindMergeHaplotypesPlugin} can be used to create haplotypes for windows
 * across the genome.
 *
 * Imputation is done one taxon at a time using the donor haplotypes. The strategy is as
 * follows:
 * 1. Every 64 sites is considered a block (a word in the bitset terminology - the length of a long).
 *    A separate initial search for nearest neighbors is centered around each block. The flanks
 *    of the scan window vary but always contain the number of minor alleles specified (a key
 *    parameter), since minor alleles approximate the information content of the search.
 * 2. Calculate the distance between the target taxon and the donor haplotypes for every window,
 *    and rank the top 10 donor haplotypes for each 64-site focus block.
 * 3. Evaluate by Viterbi whether 1 or 2 haplotypes explain all the sites across the
 *    donor haplotype window. If successful, move to the next region of donor haplotypes.
 * 4. Otherwise, resolve each focus block by nearest neighbors. If inbred nearest neighbors are
 *    not close enough, do a hybrid search seeded with one parent drawn from the initial 10
 *    haplotypes. Set bases based on whichever is resolved first, inbred or hybrid.
 *
 * Error rates are bounded away from zero by adding 0.5 errors to any error rate that
 * was observed to be zero.
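 *
 * A minimal usage sketch (illustrative only: it assumes the generated parameter setters below and
 * AbstractPlugin.performFunction; the file names are placeholders):
 * <pre>{@code
 * FILLINImputationPlugin plugin = new FILLINImputationPlugin(null, false); // non-interactive
 * plugin.targetFile("target.hmp.txt.gz")        // genotypes to impute (do not pre-filter sites)
 *       .donorDir("donorHaplotypeDirOrFile")    // output of FILLINFindHaplotypesPlugin
 *       .outputFilename("imputed.hmp.txt.gz")   // hmp.txt.gz or .hmp.h5
 *       .performFunction(null);                 // the parameters, not the DataSet, supply the inputs
 * }</pre>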
*
* @author Edward Buckler
* @author Kelly Swarts
* @cite
 */
//@Citation("Two papers: Viterbi from Bradbury, et al (in prep) Recombination patterns in maize\n"+
//"NearestNeighborSearch Swarts,...,Buckler (in prep) Imputation with large genotyping by sequencing data\n")
public class FILLINImputationPlugin extends AbstractPlugin {
private int minMajorRatioToMinorCnt=10; //refinement of minMinorCnt to account for regions with all major
//Plugin parameters
private PluginParameter<String> hmpFile= new PluginParameter.Builder<>("hmp",null,String.class).guiName("Target file").inFile().required(true)
.description("Input HapMap file of target genotypes to impute. Accepts all file types supported by TASSEL5. This file should include _ALL_ available sites, not " +
"just those segregating in your material. (ie: don't filter the input)").build();
private PluginParameter<String> donorFile= new PluginParameter.Builder<>("d",null,String.class).guiName("Donor").required(true)
.description("Directory containing donor haplotype files from output of FILLINFindHaplotypesPlugin. All files with '.gc' in the filename will be read in, "
+ "only those with matching sites are used. Alternately, a single file to use as a donor will be cut into sub genos of the size specified (eg, a high density "
+ "SNP file for projection).").build();
private PluginParameter<String> outFileBase= new PluginParameter.Builder<>("o",null,String.class).guiName("Output filename").outFile().required(true)
.description("Output file; hmp.txt.gz and .hmp.h5 accepted.").build();
private PluginParameter<Integer> appoxSitesPerDonorGenotypeTable= new PluginParameter.Builder<>("hapSize",8000,Integer.class).guiName("Preferred haplotype size")
.description("Preferred haplotype block size in sites (use same as in FILLINFindHaplotypesPlugin)").build();
private PluginParameter<Double> hetThresh= new PluginParameter.Builder<>("mxHet",0.02,Double.class).guiName("Heterozygosity threshold")
.description("Threshold per taxon heterozygosity for treating taxon as heterozygous (no Viterbi, het thresholds).").build();
private PluginParameter<Double> maximumInbredError= new PluginParameter.Builder<>("mxInbErr",0.01,Double.class).guiName("Max error to impute one donor")
.description("Maximum error rate for applying one haplotype to entire site window").build();
private PluginParameter<Double> maxHybridErrorRate= new PluginParameter.Builder<>("mxHybErr",0.003,Double.class).guiName("Max combined error to impute two donors")
.description("Maximum error rate for applying Viterbi with two haplotypes to entire site window").build();
private PluginParameter<Integer> minTestSites= new PluginParameter.Builder<>("mnTestSite",20,Integer.class).guiName("Min sites to test match")
.description("Minimum number of sites to test for IBS between haplotype and target in focus block").build();
private PluginParameter<Integer> minMinorCnt= new PluginParameter.Builder<>("minMnCnt",20,Integer.class).guiName("Min num of minor alleles to compare")
.description("Minimum number of informative minor alleles in the search window (or "+minMajorRatioToMinorCnt+"X major)").build();
private PluginParameter<Integer> maxDonorHypotheses= new PluginParameter.Builder<>("mxDonH",20,Integer.class).guiName("Max donor hypotheses")
.description("Maximum number of donor hypotheses to be explored. (Note: For heterozygous samples you may want to set this number higher, but the computation load goes up quickly.)").build();
private PluginParameter<Boolean> imputeAllHets= new PluginParameter.Builder<>("impAllHets",false,Boolean.class).guiName("Impute all het calls")
.description("Write all imputed heterozygous calls as such, even if the original file has a homozygous call. (Not recommended for inbred lines.)").build();
//if true, uses combination mode in focus block, else set don't impute (default is true)
private PluginParameter<Boolean> hybridNN= new PluginParameter.Builder<>("hybNN",true,Boolean.class).guiName("Combine two haplotypes as heterozygote")
.description("If true, uses combination mode in focus block, else does not impute").build();
private PluginParameter<Boolean> isOutputProjection= new PluginParameter.Builder<>("ProjA",false,Boolean.class).guiName("Output projection alignment")
.description("Create a projection alignment for high density markers").build();
private PluginParameter<Boolean> imputeDonorFile= new PluginParameter.Builder<>("impDonor",false,Boolean.class).guiName("Impute donor file")
.description("Impute the donor file itself").build();
private PluginParameter<Boolean> nonverboseOutput= new PluginParameter.Builder<>("nV",false,Boolean.class).guiName("Supress system out")
.description("Supress system out").build();
//for calculating accuracy
private PluginParameter<Boolean> accuracy= new PluginParameter.Builder<>("accuracy",false,Boolean.class).guiName("Calculate accuracy")
.description("Masks input file before imputation and calculates accuracy based on masked genotypes").build();
private PluginParameter<Double> propSitesMask= new PluginParameter.Builder<>("propSitesMask",0.01,Double.class).guiName("Proportion of genotypes to mask if no depth")
.description("Proportion of genotypes to mask for accuracy calculation if depth not available").dependentOnParameter(accuracy).build();
private PluginParameter<Integer> depthToMask= new PluginParameter.Builder<>("depthMask",9,Integer.class).guiName("Depth of genotypes to mask")
.description("Depth of genotypes to mask for accuracy calculation if depth information available").dependentOnParameter(accuracy).build();
private PluginParameter<Double> propDepthSitesMask= new PluginParameter.Builder<>("propDepthSitesMask",0.2,Double.class).guiName("Proportion of depth genotypes to mask")
.description("Proportion of genotypes of given depth to mask for accuracy calculation if depth available").dependentOnParameter(accuracy).build();
private PluginParameter<String> maskKey= new PluginParameter.Builder<>("maskKey",null,String.class).inFile().required(false).guiName("Optional key to calculate accuracy")
.description("Key to calculate accuracy. Genotypes missing (masked) in target file should be present in key, with all other sites set to missing. Overrides other "
+ "accuracy options if present and all sites and taxa present in target file present").dependentOnParameter(accuracy).build();
private PluginParameter<Boolean> byMAF= new PluginParameter.Builder<>("byMAF",false,Boolean.class).guiName("Calculate accuracy within MAF categories")
.description("Calculate R2 accuracy within MAF categories based on donor file").dependentOnParameter(accuracy).build();

//Additional variables
private boolean verboseOutput= true;
private boolean twoWayViterbi= true; //if true, the Viterbi runs in both directions (the longest path length wins, if inconsistent)
private double minimumDonorDistance=maximumInbredError.value()*5; //used to prevent Viterbi errors with too similar sequences
private double maxNonMedelian=maximumInbredError.value()*5; //if used, avoid Viterbi if too many sites are non-Mendelian
//options for focus blocks
private double maxHybridErrFocusHomo= .3333*maxHybridErrorRate.value(); //max error rate for discrepancy between two haplotypes for the focus block; its default is higher because it is calculated on fewer sites
private double maxInbredErrFocusHomo= .3*maximumInbredError.value(); //.003;
private double maxSmashErrFocusHomo= maximumInbredError.value(); //.01;
private double maxInbredErrFocusHet= .1*maximumInbredError.value(); //.001; the error rate for imputing one haplotype in focus block for a het taxon
private double maxSmashErrFocusHet= maximumInbredError.value(); //.01;
public static GenotypeTable unimpAlign;  //the unimputed alignment to be imputed, unphased
public static GenotypeTable maskKeyAlign= null;  //the key to the unimputed alignment mask
public static double[] MAFClass= new double[]{0,.02,.05,.10,.20,.3,.4,.5,1}; //must retain 0 and 1
private int testing=0;  //level of reporting to stdout
//major and minor alleles can differ between the donor and unimp alignment
private boolean isSwapMajorMinor=true;  //if swapped try to fix it
private boolean resolveHetIfUndercalled=false; //if true, sets genotypes called to a het to a het, even if already a homozygote
//initialize the transition matrix (T1)
double[][] transition = new double[][] {
{.999,.0001,.0003,.0001,.0005},
{.0002,.999,.00005,.00005,.0002},
{.0002,.00005,.999,.00005,.0002},
{.0002,.00005,.00005,.999,.0002},
{.0005,.0001,.0003,.0001,.999}
};
//initialize the emission matrix, states (5) in rows, observations (3) in columns
double[][] emission = new double[][] {
{.998,.001,.001},
{.6,.2,.2},
{.4,.2,.4},
{.2,.2,.6},
{.001,.001,.998}
};
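//note: the emission matrix above is used directly in callsFromViterbi() (via EmissionProbability);
//the transition matrix is passed into that method as its trans argument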
FILLINImputationAccuracy acc= null; //holds the accuracy information if accuracy flagged
private static final Logger myLogger = Logger.getLogger(FILLINImputationPlugin.class);
@Override
protected void postProcessParameters() {
System.out.println("Calling FILLINImputationPlugin.postProcessParameters()...");
File parent = new File(outputFilename()).getParentFile();
if ((parent != null) && (!parent.exists())) {
parent.mkdirs();
}
minimumDonorDistance=maximumInbredError.value()*5;
maxNonMedelian=maximumInbredError.value()*5;
maxInbredErrFocusHomo= .3*maximumInbredError.value();//.003;
maxSmashErrFocusHomo= maximumInbredError.value();//.01;
maxInbredErrFocusHet= .1*maximumInbredError.value();//.001;//the error rate for imputing one haplotype in focus block for a het taxon
maxSmashErrFocusHet= maximumInbredError.value();//.01;
maxHybridErrFocusHomo= .3333*maxHybridErrorRate.value();
resolveHetIfUndercalled = imputeAllHets.value();
if (nonverboseOutput.value()) verboseOutput= false;
if (byMAF.value()==false) MAFClass= null;
if (!new File(donorFile.value()).exists()) System.out.println("Donor is not a file or folder: "+donorFile.value());
}
public FILLINImputationPlugin() {
super(null, false);
}
public FILLINImputationPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public String getCitation() {
return"Swarts K, Li H, Romero Navarro JA, An D, Romay MC, Hearne S, Acharya C, Glaubitz JC, Mitchell S, Elshire RJ, Buckler ES, Bradbury PJ "
+ "(2014) "
+ "Novel methods to optimize genotypic imputation for low-coverage, next-generation sequence data in crop plants. "
+ "Plant Genome 7(3). doi:10.3835/plantgenome2014.05.0023";
}
@Override
public DataSet processData(DataSet input) {
long time=System.currentTimeMillis();
String donor= donorFile.value();
unimpAlign=ImportUtils.read(hmpFile.value());
if (isOutputProjection.value()) {
unimpAlign= FILLINDonorGenotypeUtils.RemoveSitesThatDoNotMatchMinMaj(donorFile.value(), unimpAlign,verboseOutput);
donor= donor.subSequence(0, donor.lastIndexOf("."))+".matchMinMaj.hmp.txt.gz";
}
GenotypeTable[] donorAlign=FILLINDonorGenotypeUtils.loadDonors(donor, unimpAlign, minTestSites.value(),
verboseOutput,appoxSitesPerDonorGenotypeTable.value());
if (accuracy.value()) {
time= System.currentTimeMillis()-time; //holds the time so far
if (maskKey.value()!=null) maskKeyAlign= ImportUtils.readGuessFormat(maskKey.value());
acc= new FILLINImputationAccuracy(unimpAlign,maskKeyAlign,donorAlign,propSitesMask.value(),depthToMask.value(),
propDepthSitesMask.value(),outFileBase.value(),MAFClass,verboseOutput);
unimpAlign= acc.initiateAccuracy();
time= System.currentTimeMillis()-time;//restarts the time, including the time to load donors and target but not including the time to set up accuracy
}
OpenBitSet[][] conflictMasks=FILLINDonorGenotypeUtils.createMaskForAlignmentConflicts(unimpAlign, donorAlign,
verboseOutput);
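//conflictMasks marks sites where the major/minor allele assignment disagrees between the target and each donor
//alignment; arrangeMajorMinorBtwAlignments() uses these masks (with isSwapMajorMinor) when building the target bitsets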
System.out.printf("Unimputed taxa:%d sites:%d %n",unimpAlign.numberOfTaxa(),unimpAlign.numberOfSites());
System.out.println("Creating Export GenotypeTable:"+outFileBase.value());
Object mna;
if(isOutputProjection.value()) {
mna=new ProjectionBuilder(ImportUtils.readGuessFormat(donorFile.value()));
} else {
if(outFileBase.value().contains(".h5")) {
mna= GenotypeTableBuilder.getTaxaIncremental(this.unimpAlign.positions(),outFileBase.value());
}else {
mna= GenotypeTableBuilder.getTaxaIncremental(this.unimpAlign.positions());
}
}
int numThreads = Runtime.getRuntime().availableProcessors();
System.out.println("Time to read in files and generate masks: "+((System.currentTimeMillis()-time)/1000)+" sec");
ExecutorService pool = Executors.newFixedThreadPool(numThreads);
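//each taxon is imputed independently (ImputeOneTaxon), so the work can be distributed across the thread pool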
for (int taxon = 0; taxon < unimpAlign.numberOfTaxa(); taxon+=1) {
int[] trackBlockNN= new int[5]; //tracks the number of focus blocks solved in NN search for system out; index 0 is inbred, 1 is viterbi, 2 is smash, 3 is not solved, 4 is total for all modes
ImputeOneTaxon theTaxon= (((double)unimpAlign.heterozygousCountForTaxon(taxon)/(double)unimpAlign.totalNonMissingForTaxon(taxon))minSitesPresent);
int countFullLength=0;
int countByFocus= 0;
for (int da = 0; (da < donorAlign.length)&&enoughData ; da++) {
int donorOffset=unimpAlign.siteOfPhysicalPosition(donorAlign[da].chromosomalPosition(0), donorAlign[da].chromosome(0));
int blocks=donorAlign[da].allelePresenceForAllSites(0, Major).getNumWords();
BitSet[] maskedTargetBits=FILLINDonorGenotypeUtils.arrangeMajorMinorBtwAlignments(unimpAlign, taxon, donorOffset, donorAlign[da].numberOfSites(), conflictMasks[da][0], conflictMasks[da][1], isSwapMajorMinor);
//if imputing the donor file, these donor indices prevent self imputation
int[] donorIndices;
if(imputeDonorFile){
donorIndices=new int[donorAlign[da].numberOfTaxa()-1];
for (int i = 0; i < donorIndices.length; i++) {donorIndices[i]=i; if(i>=taxon) donorIndices[i]++;}
} else {
donorIndices=new int[donorAlign[da].numberOfTaxa()];
for (int i = 0; i < donorIndices.length; i++) {donorIndices[i]=i;}
}
//Finds the best haplotype donors for each focus block within a donorGenotypeTable
DonorHypoth[][] regionHypthInbred=new DonorHypoth[blocks][maxDonorHypotheses.value()];
byte[][][] targetToDonorDistances=FILLINImputationUtils.calcAllelePresenceCountsBtwTargetAndDonors(maskedTargetBits,
donorAlign[da]);
for (int focusBlock = 0; focusBlock < blocks; focusBlock++) {
int[] resultRange=FILLINImputationUtils.getBlockWithMinMinorCount(maskedTargetBits[0].getBits(), maskedTargetBits[1].getBits(), focusBlock, minMinorCnt.value(), minMinorCnt.value()*minMajorRatioToMinorCnt);
if(resultRange==null) continue; //no data in the focus block
//search for the best inbred donors for a segment
regionHypthInbred[focusBlock]=FILLINImputationUtils.findHomozygousDonorHypoth(taxon, resultRange[0], resultRange[2],
focusBlock, donorIndices, targetToDonorDistances, minTestSites.value(), maxDonorHypotheses.value());
}
impTaxon.setSegmentSolved(false);
//tries to solve the entire donorAlign region by Viterbi or Inbred
impTaxon=solveEntireDonorRegion(taxon, donorAlign[da], donorOffset, regionHypthInbred, impTaxon, maskedTargetBits, maxHybridErrorRate.value(), targetToDonorDistances);
if(impTaxon.isSegmentSolved()) {countFullLength++; continue;}
//resorts to solving block by block, first by inbred, then by viterbi, and then by hybrid
impTaxon=solveByBlockNearestNeighbor(impTaxon, taxon, donorAlign[da], donorOffset, regionHypthInbred, hybridNN.value(), maskedTargetBits, minMinorCnt.value(), focusInbredErr, focusHybridErr, focusSmashErr, donorIndices, trackBlockNN, hetsMiss);
if(impTaxon.isSegmentSolved()) {countByFocus++;}
}
double totalFocus= (double)trackBlockNN[3]+(double)trackBlockNN[4];
sb.append(String.format("InbredOrViterbi:%d FocusBlock:%d PropFocusInbred:%f PropFocusViterbi:%f PropFocusSmash:%f PropFocusMissing:%f BlocksSolved:%d ",
countFullLength, countByFocus, (double)trackBlockNN[0]/totalFocus, (double)trackBlockNN[1]/totalFocus, (double)trackBlockNN[2]/totalFocus, (double)trackBlockNN[3]/totalFocus, impTaxon.getBlocksSolved()));
//sb.append(String.format("InbredOrViterbi:%d FocusBlock:%d BlocksSolved:%d ", countFullLength, countByFocus, impTaxon.getBlocksSolved()));int[] unk=FILLINImputationUtils.countUnknownAndHeterozygotes(impTaxon.resolveGeno);
sb.append(String.format("Unk:%d PropMissing:%g ", unk[0], (double) unk[0] / (double) impTaxon.getOrigGeno().length));
sb.append(String.format("Het:%d PropHet:%g ", unk[1], (double)unk[1]/(double)impTaxon.getOrigGeno().length));
sb.append(" BreakPoints:"+impTaxon.getBreakPoints().size());
if(!isOutputProjection.value()) {
alignBuilder.addTaxon(unimpAlign.taxa().get(taxon), impTaxon.resolveGeno);
} else {
projBuilder.addTaxon(unimpAlign.taxa().get(taxon),impTaxon.getBreakPoints());
}
if(verboseOutput) System.out.println(sb.toString());
}
}
/**
* Solve the entire donor alignment window with either one or two haplotypes. Generally, it is solved with the
 * HMM Viterbi algorithm. The Viterbi algorithm is relatively slow; the big choice is how to choose the potential
* donors.
*
*
* @param taxon
* @param donorAlign
* @param donorOffset
* @param regionHypoth
* @param impT
* @param maskedTargetBits
* @param maxHybridErrorRate
* @return
 */
private ImputedTaxon solveEntireDonorRegion(int taxon, GenotypeTable donorAlign, int donorOffset,
DonorHypoth[][] regionHypoth, ImputedTaxon impT, BitSet[] maskedTargetBits, double maxHybridErrorRate, byte[][][] targetToDonorDistances) {
int blocks=maskedTargetBits[0].getNumWords();
if(testing==1) System.out.println("Starting complete hybrid search");
//create a list of the best donors based on showing up frequently high in many focus blocks
//in the test data set the best results were achieved with only the best hypothesis recovered
//todo consider whether to use this approach
// int[] d=FILLINImputationUtils.mostFrequentDonorsAcrossFocusBlocks(regionHypoth, maxDonorHypotheses);
//Alternative test is to find the best donors
int[] d=FILLINImputationUtils.bestDonorsAcrossEntireRegion(targetToDonorDistances, minTestSites.value(),maxDonorHypotheses.value());
int[] testList=FILLINImputationUtils.fillInc(0,donorAlign.numberOfTaxa()-1);
int[] bestDonorList=Arrays.copyOfRange(d,0,Math.min(d.length,5));
DonorHypoth[] bestDBasedOnBest=FILLINImputationUtils.findHeterozygousDonorHypoth(taxon, maskedTargetBits[0].getBits(),
maskedTargetBits[1].getBits(), 0, blocks-1, blocks/2, donorAlign, bestDonorList, testList, maxDonorHypotheses.value(), minTestSites.value());
//make all combinations of the best donors and find the pairs that minimize errors
//with the true switch this will also make inbreds
DonorHypoth[] best2Dsearchdonors=FILLINImputationUtils.findHeterozygousDonorHypoth(taxon, maskedTargetBits[0].getBits(),
maskedTargetBits[1].getBits(), 0, blocks-1, blocks/2, donorAlign, d, d, maxDonorHypotheses.value(), minTestSites.value());
DonorHypoth[] best2donors=FILLINImputationUtils.combineDonorHypothArrays(maxDonorHypotheses.value(),bestDBasedOnBest,best2Dsearchdonors);
if(testing==1) System.out.println(Arrays.toString(best2donors));
ArrayList<DonorHypoth> goodDH=new ArrayList<>();
for (DonorHypoth dh : best2donors) {
if(dh==null) continue;
if(dh.isInbred() && (dh.getErrorRate() goodDH= new ArrayList(); //KLS0201if (best2donors[0].getErrorRate()=donorAlign.numberOfSites())?donorAlign.numberOfSites()-1: dh.endSite;
int sites=endSite-dh.startSite+1;
//Find the informative sites
StatePositionChain informative=createInformativeStateChainForViterbi(dh.startSite,
maxNonMedelian, minimumDonorDistance,
unimpAlign.genotypeRange(dh.targetTaxon, dh.startSite+donorOffset, endSite+1+donorOffset),
donorAlign.genotypeRange(dh.donor1Taxon, dh.startSite, endSite+1),
donorAlign.genotypeRange(dh.donor2Taxon, dh.startSite, endSite+1));
if(informative==null) return null;
//Find the most likely states by Viterbi
int chrlength = donorAlign.chromosomalPosition(endSite) - donorAlign.chromosomalPosition(dh.startSite);
byte[] callsF=callsFromViterbi(trans, chrlength/sites, informative);
DonorHypoth dh2=new DonorHypoth(dh.targetTaxon,dh.donor1Taxon, dh.donor2Taxon, dh.startBlock, dh.focusBlock, dh.endBlock);
dh2.phasedResults= callsF;
if (forwardReverse) {
byte[] callsR=callsFromViterbi(trans, chrlength/sites, StatePositionChain.reverseInstance(informative));
ArrayUtils.reverse(callsR); //These are now in same direction as callsF
byte[] callsC=new byte[callsF.length];
System.arraycopy(callsR,0,callsC,0,callsC.length/2);
System.arraycopy(callsF,callsC.length/2,callsC,callsC.length/2,callsC.length-callsC.length/2);
dh2.phasedResults= callsC;
}
return dh2;
}
private byte[] callsFromViterbi(double[][] trans, int avgChrLength, StatePositionChain informative) {
//There is no EM optimization of Viterbi, clearly something that could be changed
TransitionProbability tpF = new TransitionProbability();
EmissionProbability ep = new EmissionProbability();
tpF.setTransitionProbability(trans);
ep.setEmissionProbability(emission);
final double probHeterozygous=0.5;
final double phom = (1 - probHeterozygous) / 2;
final double[] pTrue = new double[]{phom, .25*probHeterozygous, .5*probHeterozygous, .25*probHeterozygous, phom};
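//pTrue is the initial state distribution for the Viterbi run: each homozygous parent state gets (1 - probHeterozygous)/2,
//and the three heterozygous states split probHeterozygous as 1/4, 1/2, 1/4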
tpF.setAverageSegmentLength(avgChrLength);
tpF.setPositions(informative.informSites);
ViterbiAlgorithm vaF = new ViterbiAlgorithm(informative.informStates, tpF, ep, pTrue);
vaF.calculate();
byte[] resultStatesF=vaF.getMostProbableStateSequence();
int currPos=0;
//converts the informative states back to all states
byte[] callsF=new byte[informative.totalSiteCnt];
for(int cs=0; cs nonMissingObs = new ArrayList<>();
ArrayList informSites = new ArrayList<>();
//Selects only the informative sites for the Viterbi algorithm (known & gap free & homozygous in donor)for(int cs=0; csmaxNonMendelian) returnnull;
//if the donors are too similar Viterbi performs poorly. Only accepted different donorsif((double)donorDifferences/(double)informSites.size() donors= new HashSet<>();
for (int h = 0; h < regionHypth[block].length; h++) {
if (regionHypth[block][h]!=null) {
donors.add(regionHypth[block][h].donor1Taxon);
donors.add(regionHypth[block][h].donor2Taxon);
}
}
if (donors.isEmpty()) returnnull;
return Ints.toArray(donors);
}
// /**
//  * Simple algorithm that tests every possible two donor combination to minimize
//  * the number of unmatched informative alleles. Currently, there is little tie
//  * breaking, longer matches are favored.
//  * @param targetTaxon
//  * @param startBlock
//  * @param endBlock
//  * @param focusBlock
//  * @return int[] array of {donor1, donor2, testSites} sorted by testPropUnmatched
//  */
// private DonorHypoth[] getBestHybridDonors(int targetTaxon, long[] mjT, long[] mnT,
//         int startBlock, int endBlock, int focusBlock, GenotypeTable donorAlign,
//         int[] donor1Indices, int[] donor2Indices,
//         boolean viterbiSearch) {
//     TreeMap bestDonors=new TreeMap();
//     Set testedDonorPairs=new HashSet<>();
//     double lastKeytestPropUnmatched=1.0;
//     double inc=1e-9;
//     double[] donorDist;
//     for (int d1: donor1Indices) {
//         long[] mj1=donorAlign.allelePresenceForSitesBlock(d1, Major, startBlock, endBlock + 1);
//         long[] mn1=donorAlign.allelePresenceForSitesBlock(d1, Minor, startBlock, endBlock + 1);
//         for (int d2 : donor2Indices) {
//             if((!viterbiSearch)&&(d1==d2)) continue;
//             if(testedDonorPairs.contains(((long)d1<<32)+(long)d2)) continue;
//             if(testedDonorPairs.contains(((long)d2<<32)+(long)d1)) continue;
//             testedDonorPairs.add(((long)d1<<32)+(long)d2);
//             long[] mj2=donorAlign.allelePresenceForSitesBlock(d2, Major, startBlock, endBlock + 1);
//             long[] mn2=donorAlign.allelePresenceForSitesBlock(d2, Minor, startBlock, endBlock + 1);
//             if(viterbiSearch) {
//                 donorDist=IBSDistanceMatrix.computeHetBitDistances(mj1, mn1, mj2, mn2, minTestSites);
//                 if((d1!=d2)&&(donorDist[0]maxDonorHypotheses) {
//                     bestDonors.remove(bestDonors.lastKey());
//                     lastKeytestPropUnmatched=bestDonors.lastKey();
//                 }
//             }
//         }
//     }
//     DonorHypoth[] result=new DonorHypoth[maxDonorHypotheses];
//     int count=0;
//     for (DonorHypoth dh : bestDonors.values()) {
//         result[count]=dh;
//         count++;
//     }
//     return result;
// }
//
// private int[] mendelErrorComparison(long[] mjT, long[] mnT, long[] mj1, long[] mn1,
//         long[] mj2, long[] mn2) {
//     int mjUnmatched=0;
//     int mnUnmatched=0;
//     int testSites=0;
//     for (int i = 0; i < mjT.length; i++) {
//         long siteMask=(mjT[i]|mnT[i])&(mj1[i]|mn1[i])&(mj2[i]|mn2[i]);
//         mjUnmatched+=Long.bitCount(siteMask&mjT[i]&(mjT[i]^mj1[i])&(mjT[i]^mj2[i]));
//         mnUnmatched+=Long.bitCount(siteMask&mnT[i]&(mnT[i]^mn1[i])&(mnT[i]^mn2[i]));
//         testSites+=Long.bitCount(siteMask);
//     }
//     int totalMendelianErrors=mjUnmatched+mnUnmatched;
//     // double testPropUnmatched=(double)(totalMendelianErrors)/(double)testSites;
//     return (new int[] {totalMendelianErrors, testSites});
// }
private ImputedTaxon setAlignmentWithDonors(GenotypeTable donorAlign, DonorHypoth[] theDH, int donorOffset,
boolean setJustFocus, ImputedTaxon impT, boolean smashOn, boolean hetsMiss) {
if(theDH[0].targetTaxon<0) return impT;
boolean print=false;
int startSite=(setJustFocus)?theDH[0].getFocusStartSite():theDH[0].startSite;
int endSite=(setJustFocus)?theDH[0].getFocusEndSite():theDH[0].endSite;
if(endSite>=donorAlign.numberOfSites()) endSite=donorAlign.numberOfSites()-1;
//prevDonors is used to track recombination breakpoint start and stop
int[] prevDonors=new int[]{-1, -1};
if(theDH[0].getPhaseForSite(startSite)==0) {prevDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor1Taxon};}
else if(theDH[0].getPhaseForSite(startSite)==2) {prevDonors=new int[]{theDH[0].donor2Taxon, theDH[0].donor2Taxon};}
else if(theDH[0].getPhaseForSite(startSite)==1) {prevDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor2Taxon};}
int prevDonorStart=startSite;
int[] currDonors=prevDonors;
for(int cs=startSite; cs<=endSite; cs++) {
byte donorEst=UNKNOWN_DIPLOID_ALLELE;
byte neighbor=0;
//site imputation will try to use all donor hypotheses; breakpoints only use the best (e.g. i==0 tests)
for (int i = 0; (i < theDH.length) && (donorEst==UNKNOWN_DIPLOID_ALLELE); i++) {
neighbor++;
if((theDH[i]==null)||(theDH[i].donor1Taxon<0)) continue;
if(theDH[i].getErrorRate()>this.maximumInbredError.value()) continue;
byte bD1=donorAlign.genotype(theDH[i].donor1Taxon, cs);
if(theDH[i].getPhaseForSite(cs)==0) {
donorEst=bD1;
if(i==0) currDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor1Taxon};
}
else {
byte bD2=donorAlign.genotype(theDH[i].donor2Taxon, cs);
if(theDH[i].getPhaseForSite(cs)==2) {
donorEst=bD2;
if(i==0) currDonors=new int[]{theDH[0].donor2Taxon, theDH[0].donor2Taxon};
}
else {
donorEst=GenotypeTableUtils.getUnphasedDiploidValueNoHets(bD1, bD2);
if(i==0) currDonors=new int[]{theDH[0].donor1Taxon, theDH[0].donor2Taxon};
}
}
}
byte knownBase=impT.getOrigGeno(cs+donorOffset);
//record recombination breakpoint if it changes
if(!Arrays.equals(prevDonors, currDonors)) {
DonorHaplotypes dhaps=new DonorHaplotypes(donorAlign.chromosome(prevDonorStart), donorAlign.chromosomalPosition(prevDonorStart),
donorAlign.chromosomalPosition(cs),prevDonors[0],prevDonors[1]);
impT.addBreakPoint(dhaps);
prevDonors=currDonors;
prevDonorStart=cs;
}
//chgHis records the section of the imputation pipeline that made the change, Viterbi negative, blockNN positive
if(theDH[0].phasedResults==null) {impT.chgHis[cs+donorOffset]=(byte)-neighbor;}
else {impT.chgHis[cs+donorOffset]=(byte)neighbor;}
impT.impGeno[cs+donorOffset]= donorEst; //predicted based on neighbor
//if genotype is unknown or het undercalled then resolve
//todo there is currently no option if the prediction disagrees with a homozygous call
if(knownBase==UNKNOWN_DIPLOID_ALLELE) {
if (isHeterozygous(donorEst)) {
if (smashOn && hetsMiss) {//if imputing a heterozygote, just set to missing
impT.resolveGeno[cs+donorOffset]= knownBase;
}
else impT.resolveGeno[cs+donorOffset]= donorEst; //if not in hybrid, set to het
}
else {//if not imputed to a het
impT.resolveGeno[cs+donorOffset]= donorEst;}
} else if (isHeterozygous(donorEst)){
if(resolveHetIfUndercalled&&GenotypeTableUtils.isPartiallyEqual(knownBase, donorEst)&&smashOn==false){
//if smash off, set homozygotes imputed to het to het
//System.out.println("ResolveHet:"+theDH[0].targetTaxon+":"+cs+donorOffset+":"+knownBaseString+":"+donorEstString);
impT.resolveGeno[cs+donorOffset]= donorEst;
}
}
} //end of cs loop
//enter a stop of the DH at the beginning of the next block
DonorHaplotypes dhaps=new DonorHaplotypes(donorAlign.chromosome(prevDonorStart), donorAlign.chromosomalPosition(prevDonorStart),
donorAlign.chromosomalPosition(endSite),prevDonors[0],prevDonors[1]);
impT.addBreakPoint(dhaps);
return impT;
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return"Impute By FILLIN";
}
@Override
public String getToolTipText() {
return"Imputation that relies on a combination of HMM and Nearest Neighbor using donors derived from FILLINFindHaplotypesPlugin";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
public static void main(String[] args) {
GeneratePluginCode.generate(FILLINImputationPlugin.class);
}
/**
* Convenience method to run plugin with one return object.
 */
/*
// TODO: Replace <Type> with specific type.
public <Type> runPlugin(DataSet input) {
    return (<Type>) performFunction(input).getData(0).getData();
}
*/

/**
* Input HapMap file of target genotypes to impute. Accepts
* all file types supported by TASSEL5
*
* @return Target file
 */
public String targetFile() {
return hmpFile.value();
}
/**
* Set Target file. Input HapMap file of target genotypes
* to impute. Accepts all file types supported by TASSEL5
*
* @param value Target file
*
* @return this plugin
 */
public FILLINImputationPlugin targetFile(String value) {
hmpFile = new PluginParameter<>(hmpFile, value);
return this;
}
/**
* Directory containing donor haplotype files from output
* of FILLINFindHaplotypesPlugin. All files with '.gc'
* in the filename will be read in, only those with matching
* sites are used
*
* @return Donor Dir
 */
public String donorDir() {
return donorFile.value();
}
/**
* Set Donor Dir. Directory containing donor haplotype
* files from output of FILLINFindHaplotypesPlugin. All
* files with '.gc' in the filename will be read in, only
* those with matching sites are used
*
* @param value Donor Dir
*
* @return this plugin
 */
public FILLINImputationPlugin donorDir(String value) {
donorFile = new PluginParameter<>(donorFile, value);
return this;
}
/**
* Output file; hmp.txt.gz and .hmp.h5 accepted.
*
* @return Output filename
 */
public String outputFilename() {
return outFileBase.value();
}
/**
* Set Output filename. Output file; hmp.txt.gz and .hmp.h5
* accepted.
*
* @param value Output filename
*
* @return this plugin
 */
public FILLINImputationPlugin outputFilename(String value) {
outFileBase = new PluginParameter<>(outFileBase, value);
return this;
}
/**
* Preferred haplotype block size in sites (use same as
* in FILLINFindHaplotypesPlugin)
*
* @return Preferred haplotype size
 */
public Integer preferredHaplotypeSize() {
return appoxSitesPerDonorGenotypeTable.value();
}
/**
* Set Preferred haplotype size. Preferred haplotype block
* size in sites (use same as in FILLINFindHaplotypesPlugin)
*
* @param value Preferred haplotype size
*
* @return this plugin
 */
public FILLINImputationPlugin preferredHaplotypeSize(Integer value) {
appoxSitesPerDonorGenotypeTable = new PluginParameter<>(appoxSitesPerDonorGenotypeTable, value);
return this;
}
/**
* Threshold per taxon heterozygosity for treating taxon
* as heterozygous (no Viterbi, het thresholds).
*
* @return Heterozygosity threshold
 */
public Double heterozygosityThreshold() {
return hetThresh.value();
}
/**
* Set Heterozygosity threshold. Threshold per taxon heterozygosity
* for treating taxon as heterozygous (no Viterbi, het
* thresholds).
*
* @param value Heterozygosity threshold
*
* @return this plugin
 */
public FILLINImputationPlugin heterozygosityThreshold(Double value) {
hetThresh = new PluginParameter<>(hetThresh, value);
return this;
}
/**
* Maximum error rate for applying one haplotype to entire
* site window
*
* @return Max error to impute one donor
 */
public Double maxErrorToImputeOneDonor() {
return maximumInbredError.value();
}
/**
* Set Max error to impute one donor. Maximum error rate
* for applying one haplotype to entire site window
*
* @param value Max error to impute one donor
*
* @return this plugin
 */
public FILLINImputationPlugin maxErrorToImputeOneDonor(Double value) {
maximumInbredError = new PluginParameter<>(maximumInbredError, value);
return this;
}
/**
 * Maximum error rate for applying Viterbi with two haplotypes
 * to entire site window
 *
 * @return Max combined error to impute two donors
 */
public Double maxCombinedErrorToImputeTwoDonors() {
return maxHybridErrorRate.value();
}
/**
* Set Max combined error to impute two donors. Maximum
 * error rate for applying Viterbi with two haplotypes
 * to entire site window
 *
 * @param value Max combined error to impute two donors
 *
 * @return this plugin
 */
public FILLINImputationPlugin maxCombinedErrorToImputeTwoDonors(Double value) {
maxHybridErrorRate = new PluginParameter<>(maxHybridErrorRate, value);
return this;
}
/**
* Minimum number of sites to test for IBS between haplotype
* and target in focus block
*
* @return Min sites to test match
 */
public Integer minSitesToTestMatch() {
return minTestSites.value();
}
/**
* Set Min sites to test match. Minimum number of sites
* to test for IBS between haplotype and target in focus
* block
*
* @param value Min sites to test match
*
* @return this plugin
 */
public FILLINImputationPlugin minSitesToTestMatch(Integer value) {
minTestSites = new PluginParameter<>(minTestSites, value);
return this;
}
/**
* Minimum number of informative minor alleles in the
* search window (or 10X major)
*
* @return Min num of minor alleles to compare
 */
public Integer minNumOfMinorAllelesToCompare() {
return minMinorCnt.value();
}
/**
* Set Min num of minor alleles to compare. Minimum number
* of informative minor alleles in the search window (or
* 10X major)
*
* @param value Min num of minor alleles to compare
*
* @return this plugin
 */
public FILLINImputationPlugin minNumOfMinorAllelesToCompare(Integer value) {
minMinorCnt = new PluginParameter<>(minMinorCnt, value);
return this;
}
/**
* Maximum number of donor hypotheses to be explored
*
* @return Max donor hypotheses
 */
public Integer maxDonorHypotheses() {
return maxDonorHypotheses.value();
}
/**
* Set Max donor hypotheses. Maximum number of donor hypotheses
* to be explored
*
* @param value Max donor hypotheses
*
* @return this plugin
 */
public FILLINImputationPlugin maxDonorHypotheses(Integer value) {
maxDonorHypotheses = new PluginParameter<>(maxDonorHypotheses, value);
return this;
}
/**
* Write all imputed heterozygous calls as such, even
* if the original file has a homozygous call. (Not recommended
* for inbred lines.)
*
* @return Impute all het calls
 */
public Boolean imputeAllHetCalls() {
return imputeAllHets.value();
}
/**
* Set Impute all het calls. Write all imputed heterozygous
* calls as such, even if the original file has a homozygous
* call. (Not recommended for inbred lines.)
*
* @param value Impute all het calls
*
* @return this plugin
 */
public FILLINImputationPlugin imputeAllHetCalls(Boolean value) {
imputeAllHets = new PluginParameter<>(imputeAllHets, value);
return this;
}
/**
* If true, uses combination mode in focus block, else
* does not impute
*
* @return Combine two haplotypes as heterozygote
 */
public Boolean combineTwoHaplotypesAsHeterozygote() {
return hybridNN.value();
}
/**
* Set Combine two haplotypes as heterozygote. If true,
* uses combination mode in focus block, else does not
* impute
*
* @param value Combine two haplotypes as heterozygote
*
* @return this plugin
 */
public FILLINImputationPlugin combineTwoHaplotypesAsHeterozygote(Boolean value) {
hybridNN = new PluginParameter<>(hybridNN, value);
return this;
}
/**
* Create a projection alignment for high density markers
*
* @return Output projection alignment
 */
public Boolean outputProjectionAlignment() {
return isOutputProjection.value();
}
/**
* Set Output projection alignment. Create a projection
* alignment for high density markers
*
* @param value Output projection alignment
*
* @return this plugin
 */
public FILLINImputationPlugin outputProjectionAlignment(Boolean value) {
isOutputProjection = new PluginParameter<>(isOutputProjection, value);
return this;
}
/**
* Impute the donor file itself
*
* @return Impute donor file
 */
public Boolean imputeDonorFile() {
return imputeDonorFile.value();
}
/**
* Set Impute donor file. Impute the donor file itself
*
* @param value Impute donor file
*
* @return this plugin
 */
public FILLINImputationPlugin imputeDonorFile(Boolean value) {
imputeDonorFile = new PluginParameter<>(imputeDonorFile, value);
return this;
}
/**
* Supress system out
*
* @return Supress system out
 */
public Boolean supressSystemOut() {
return nonverboseOutput.value();
}
/**
* Set Supress system out. Supress system out
*
* @param value Supress system out
*
* @return this plugin
 */
public FILLINImputationPlugin supressSystemOut(Boolean value) {
nonverboseOutput = new PluginParameter<>(nonverboseOutput, value);
return this;
}
/**
* Masks input file before imputation and calculates accuracy
* based on masked genotypes
*
* @return Calculate accuracy
 */
public Boolean calculateAccuracy() {
return accuracy.value();
}
/**
* Set Calculate accuracy. Masks input file before imputation
* and calculates accuracy based on masked genotypes
*
* @param value Calculate accuracy
*
* @return this plugin
 */
public FILLINImputationPlugin calculateAccuracy(Boolean value) {
accuracy = new PluginParameter<>(accuracy, value);
return this;
}
/**
* Proportion of genotypes to mask for accuracy calculation
* if depth not available
*
* @return Proportion of genotypes to mask if no depth
 */
public Double proportionOfGenotypesToMaskIfNoDepth() {
return propSitesMask.value();
}
/**
* Set Proportion of genotypes to mask if no depth. Proportion
* of genotypes to mask for accuracy calculation if depth
* not available
*
* @param value Proportion of genotypes to mask if no depth
*
* @return this plugin
 */
public FILLINImputationPlugin proportionOfGenotypesToMaskIfNoDepth(Double value) {
propSitesMask = new PluginParameter<>(propSitesMask, value);
return this;
}
/**
* Depth of genotypes to mask for accuracy calculation
* if depth information available
*
* @return Depth of genotypes to mask
 */
public Integer depthOfGenotypesToMask() {
return depthToMask.value();
}
/**
* Set Depth of genotypes to mask. Depth of genotypes
* to mask for accuracy calculation if depth information
* available
*
* @param value Depth of genotypes to mask
*
* @return this plugin
 */
public FILLINImputationPlugin depthOfGenotypesToMask(Integer value) {
depthToMask = new PluginParameter<>(depthToMask, value);
return this;
}
/**
* Proportion of genotypes of given depth to mask for
* accuracy calculation if depth available
*
* @return Proportion of depth genotypes to mask
 */
public Double proportionOfDepthGenotypesToMask() {
return propDepthSitesMask.value();
}
/**
* Set Proportion of depth genotypes to mask. Proportion
* of genotypes of given depth to mask for accuracy calculation
* if depth available
*
* @param value Proportion of depth genotypes to mask
*
* @return this plugin
 */
public FILLINImputationPlugin proportionOfDepthGenotypesToMask(Double value) {
propDepthSitesMask = new PluginParameter<>(propDepthSitesMask, value);
return this;
}
/**
* Key to calculate accuracy. Genotypes missing (masked)
* in target file should be present in key, with all other
 * sites set to missing. Overrides other accuracy options
 * if present and all sites and taxa present in target
 * file present
 *
 * @return Optional key to calculate accuracy
 */
public String optionalKeyToCalculateAccuracy() {
return maskKey.value();
}
/**
* Set Optional key to calculate accuracy. Key to calculate
* accuracy. Genotypes missing (masked) in target file
* should be present in key, with all other sites set
 * to missing. Overrides other accuracy options if present
 * and all sites and taxa present in target file present
 *
 * @param value Optional key to calculate accuracy
 *
 * @return this plugin
 */
public FILLINImputationPlugin optionalKeyToCalculateAccuracy(String value) {
maskKey = new PluginParameter<>(maskKey, value);
return this;
}
/**
* Calculate R2 accuracy within MAF categories based on
* donor file
*
* @return Calculate accuracy within MAF categories
 */
public Boolean calculateAccuracyWithinMAFCategories() {
return byMAF.value();
}
/**
* Set Calculate accuracy within MAF categories. Calculate
* R2 accuracy within MAF categories based on donor file
*
* @param value Calculate accuracy within MAF categories
*
* @return this plugin
 */
public FILLINImputationPlugin calculateAccuracyWithinMAFCategories(Boolean value) {
byMAF = new PluginParameter<>(byMAF, value);
return this;
}
}