net.maizegenetics.analysis.imputation.FILLINImputationUtils
TASSEL is a software package to evaluate trait associations, evolutionary patterns, and linkage disequilibrium.
package net.maizegenetics.analysis.imputation;
import com.google.common.collect.*;
import com.google.common.primitives.Ints;
import net.maizegenetics.analysis.popgen.DonorHypoth;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.BitUtil;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
import static net.maizegenetics.dna.WHICH_ALLELE.Major;
import static net.maizegenetics.dna.WHICH_ALLELE.Minor;
import static net.maizegenetics.dna.snp.GenotypeTableUtils.isHeterozygous;
/**
* Basic utility functions to support imputation by blocks.
*
* @author Ed Buckler
*/
public class FILLINImputationUtils {
/**
* Counts union and intersection of major and minor alleles between the target genotype and potential
* donor genotypes. The counts are done per 64-site block. These counts can be quickly used to estimate
* distance for the set of blocks.
* @param modBitsOfTarget major and minor presence bits for target genotype (must be aligned same as donor)
* @param donorAlign genotypeTable with potential donor genotypes
* @return array with [donor index][sites, same count, diff count, het count index][block index]
*/
public static byte[][][] calcAllelePresenceCountsBtwTargetAndDonors(BitSet[] modBitsOfTarget, GenotypeTable donorAlign) {
int blocks=modBitsOfTarget[0].getNumWords();
byte[][][] allDist=new byte[donorAlign.numberOfTaxa()][4][blocks];
long[] iMj=modBitsOfTarget[0].getBits();
long[] iMn=modBitsOfTarget[1].getBits();
for (int donor1 = 0; donor1 < allDist.length; donor1++) {
long[] jMj=donorAlign.allelePresenceForAllSites(donor1, Major).getBits();
long[] jMn=donorAlign.allelePresenceForAllSites(donor1, Minor).getBits();
for (int i = 0; i < blocks; i++) {
long same=(iMj[i]&jMj[i])|(iMn[i]&jMn[i]);  //sites where target and donor share an allele
long diff=(iMj[i]&jMn[i])|(iMn[i]&jMj[i]);  //sites where they carry opposite alleles
long hets=same&diff;  //sites counted in both, i.e. heterozygous comparisons
int sameCnt=Long.bitCount(same), diffCnt=Long.bitCount(diff), hetCnt=Long.bitCount(hets);
allDist[donor1][0][i]=(byte)(sameCnt+diffCnt-hetCnt);  //informative sites
allDist[donor1][1][i]=(byte)sameCnt;
allDist[donor1][2][i]=(byte)diffCnt;
allDist[donor1][3][i]=(byte)hetCnt;
}
}
return allDist;
}
/**
* Finds the best homozygous donor hypotheses for the target taxon over a range of 64-site blocks, using the
* precomputed target-to-donor allele presence counts.
* (NOTE: the block-count loop above and this declaration are reconstructed from context; names are inferred
* from the surviving code and call sites.)
*/
public static DonorHypoth[] findHomozygousDonorHypoth(int targetTaxon, int firstBlock, int lastBlock, int focusBlock, byte[][][] targetToDonorDistances, int[] donor1indices, int minTestSites, int maxDonorHypotheses) {
MinMaxPriorityQueue<DonorHypoth> bestDonors=MinMaxPriorityQueue.orderedBy(DonorHypoth.byErrorRateOrdering)
.maximumSize(maxDonorHypotheses).create();
for (int d1 : donor1indices) {
int sameCnt = 0, diffCnt = 0, hetCnt = 0;
for (int i = firstBlock; i <=lastBlock; i++) {
sameCnt+=targetToDonorDistances[d1][1][i];
diffCnt+=targetToDonorDistances[d1][2][i];
hetCnt+=targetToDonorDistances[d1][3][i];
}
int testSites= sameCnt + diffCnt - hetCnt;
if(testSites<minTestSites) continue;
int totalMendelianErrors=(int)(diffCnt-(0.5*hetCnt));  //heterozygous comparisons count as half mismatches
DonorHypoth theDH=new DonorHypoth(targetTaxon, d1, d1, firstBlock, focusBlock, lastBlock, testSites, totalMendelianErrors);
bestDonors.add(theDH);
}
DonorHypoth[] result=bestDonors.toArray(new DonorHypoth[0]);
Arrays.sort(result,DonorHypoth.byErrorRateOrdering); //Queues keep the top values, but not ordered
return result;
}
/**
* Ranks donors by how often, and how highly, they appear among the per-focus-block donor hypotheses and
* returns the indices of the most frequently chosen donors.
* (NOTE: the tail of the previous method and this declaration are reconstructed from context; the name and
* parameters are inferred from the surviving body.)
*/
public static int[] bestDonorsAcrossEntireRegion(DonorHypoth[][] allDH, int maxDonors) {
HashMap<Integer,Integer> bd=new HashMap<>();
for (int i = 0; i < allDH.length; i++) {
int rank=allDH[i].length;
for (DonorHypoth dh: allDH[i]) {
if (dh==null) continue;
if (bd.containsKey(dh.donor1Taxon)) { //counts the frequency of best hit
bd.put(dh.donor1Taxon, bd.get(dh.donor1Taxon)+ rank);
} else {
bd.put(dh.donor1Taxon, rank);
}
rank--;
//break;//this line allows only the best hypothesis to go into set (original method)
}
}
TreeMultimap<Integer,Integer> rankings=TreeMultimap.create(Ordering.natural().reverse(),Ordering.natural());
for (Map.Entry<Integer,Integer> dhs : bd.entrySet()) {
rankings.put(dhs.getValue(),dhs.getKey());
}
int resultSize=(rankings.size()<maxDonors)?rankings.size():maxDonors;
int[] result=new int[resultSize];
Iterator<Integer> iDH=rankings.values().iterator();
for (int i=0; i<resultSize; i++) {result[i]=iDH.next();}
return result;
}
/**
* Finds the best heterozygous donor hypotheses for the target taxon by pairing one fixed donor (d1) with each
* of a set of second donors and scoring the pairs by Mendelian error rate over the block range.
* (NOTE: this declaration is reconstructed; the signature is inferred from the call in the method below.)
*/
public static DonorHypoth[] findHeterozygousDonorHypoth(int targetTaxon, long[] mjT, long[] mnT, int firstBlock, int lastBlock, int focusBlock, GenotypeTable donorAlign, int d1, int[] donor2Indices, int maxDonorHypotheses, int minTestSites) {
MinMaxPriorityQueue<DonorHypoth> bestDonors=MinMaxPriorityQueue.orderedBy(DonorHypoth.byErrorRateOrdering)
.maximumSize(maxDonorHypotheses).create();
long[] mj1=donorAlign.allelePresenceForSitesBlock(d1, Major, firstBlock, lastBlock + 1);
long[] mn1=donorAlign.allelePresenceForSitesBlock(d1, Minor, firstBlock, lastBlock + 1);
for (int d2 : donor2Indices) {
long[] mj2=donorAlign.allelePresenceForSitesBlock(d2, Major, firstBlock, lastBlock + 1);
long[] mn2=donorAlign.allelePresenceForSitesBlock(d2, Minor, firstBlock, lastBlock + 1);
int[] mendErr=mendelErrorComparison(mjT, mnT, mj1, mn1, mj2, mn2);
if(mendErr[1]<minTestSites) continue;
DonorHypoth theDH=new DonorHypoth(targetTaxon, d1, d2, firstBlock, focusBlock, lastBlock, mendErr[1], mendErr[0]);
bestDonors.add(theDH);
}
DonorHypoth[] result=bestDonors.toArray(new DonorHypoth[0]);
Arrays.sort(result,DonorHypoth.byErrorRateOrdering); //Queues keep the top values, but not ordered
return result;
}
/**
* Finds the best heterozygous donor hypotheses for the target taxon by testing unordered pairs of donors drawn
* from two candidate sets; each pair is scored by the single-donor overload above.
* (NOTE: the tail of the previous method and this declaration are reconstructed from context.)
*/
public static DonorHypoth[] findHeterozygousDonorHypoth(int targetTaxon, long[] mjT, long[] mnT, int firstBlock, int lastBlock, int focusBlock, GenotypeTable donorAlign, int[] donor1Indices, int[] donor2Indices, int maxDonorHypotheses, int minTestSites) {
Multimap<Integer,Integer> tests=HashMultimap.create();
for (int d1 : donor1Indices) {
for (int d2 : donor2Indices) {
if(d1<d2) {tests.put(d1,d2);} else {tests.put(d2,d1);}  //key each unordered pair by the smaller donor index
}
}
MinMaxPriorityQueue<DonorHypoth> bestDonors=MinMaxPriorityQueue.orderedBy(DonorHypoth.byErrorRateOrdering)
.maximumSize(maxDonorHypotheses).create();
for (int d1 : tests.keySet()) {
int[] d2donors=Ints.toArray(tests.get(d1));
DonorHypoth[] oneDimenHypoth=findHeterozygousDonorHypoth(targetTaxon, mjT, mnT, firstBlock, lastBlock,
focusBlock, donorAlign, d1, d2donors, maxDonorHypotheses, minTestSites);
for (DonorHypoth donorHypoth : oneDimenHypoth) {bestDonors.add(donorHypoth);}
}
DonorHypoth[] result=bestDonors.toArray(new DonorHypoth[0]);
Arrays.sort(result,DonorHypoth.byErrorRateOrdering); //Queues keep the top values, but not ordered
return result;
}
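//Illustrative sketch, not part of TASSEL: shows how the same/different/heterozygous site counts used by
//calcAllelePresenceCountsBtwTargetAndDonors above fall out of the major/minor presence bits for one 64-site block.
//The bit patterns below are hypothetical.
private static int[] exampleBlockCounts() {
long tMj=0b0111L, tMn=0b0110L;  //target: site 0 homozygous major, sites 1-2 heterozygous
long dMj=0b0001L, dMn=0b0110L;  //donor: site 0 homozygous major, sites 1-2 homozygous minor
long same=(tMj&dMj)|(tMn&dMn);  //sites where target and donor share an allele
long diff=(tMj&dMn)|(tMn&dMj);  //sites where they carry opposite alleles
long hets=same&diff;  //sites counted in both, i.e. heterozygous comparisons
int sameCnt=Long.bitCount(same), diffCnt=Long.bitCount(diff), hetCnt=Long.bitCount(hets);
return new int[]{sameCnt+diffCnt-hetCnt, sameCnt, diffCnt, hetCnt};  //{sites, same, diff, het} = {3, 3, 2, 2}
}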
/**
* Combines arrays of donorHypoth, sorts them, and returns the best limited by maxDonorHypotheses
* @param maxDonorHypotheses maximum number of donor hypotheses to retain
* @param dhs arrays of DonorHypoth[]
* @return combined arrays of DonorHypoth (length<=maxDonorHypotheses) ordered by error rate
*/
public static DonorHypoth[] combineDonorHypothArrays(int maxDonorHypotheses, DonorHypoth[] ... dhs) {
MinMaxPriorityQueue<DonorHypoth> bestDonors=MinMaxPriorityQueue.orderedBy(DonorHypoth.byErrorRateOrdering)
.maximumSize(maxDonorHypotheses).create();
for (DonorHypoth[] aDHArray : dhs) {
for (DonorHypoth donorHypoth : aDHArray) {
bestDonors.add(donorHypoth);
}
}
DonorHypoth[] result=bestDonors.toArray(new DonorHypoth[0]);
Arrays.sort(result,DonorHypoth.byErrorRateOrdering); //Queues keep the top values, but not ordered
return result;
}
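//Illustrative sketch, not part of TASSEL: hypotheses from the homozygous and heterozygous searches above can be
//merged into a single best set for a focus block. The argument names here are hypothetical.
private static DonorHypoth[] exampleCombine(DonorHypoth[] homozygousHypoth, DonorHypoth[] heterozygousHypoth, int maxDonorHypotheses) {
//retains at most maxDonorHypotheses hypotheses, returned from lowest to highest error rate
return combineDonorHypothArrays(maxDonorHypotheses, homozygousHypoth, heterozygousHypoth);
}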
/**
* Given a starting 64-site focus block, expands the block window to the left and right until it reaches the
* minimum minor site count or the minimum major site count in the target taxon.
* @param mjT - major allele bit presence in a series of longs
* @param mnT - minor allele bit presence in a series of longs
* @param focusBlock index of the focus block
* @param minMinorCnt minimum count to stop expanding for minor allele sites
* @param minMajorCnt minimum count to stop expanding for major allele sites
* @return arrays of blocks {startBlock, focusBlock, endBlock}
*/
public static int[] getBlockWithMinMinorCount(long[] mjT, long[] mnT, int focusBlock, int minMinorCnt,
int minMajorCnt) {
int blocks=mjT.length;
int majorCnt=Long.bitCount(mjT[focusBlock]);
int minorCnt=Long.bitCount(mnT[focusBlock]);
int endBlock=focusBlock, startBlock=focusBlock;
//NOTE: the remainder of this method is reconstructed from the javadoc above; the original lines were truncated.
while((minorCnt<minMinorCnt)&&(majorCnt<minMajorCnt)) {
boolean expanded=false;
if(startBlock>0) {startBlock--; minorCnt+=Long.bitCount(mnT[startBlock]); majorCnt+=Long.bitCount(mjT[startBlock]); expanded=true;}
if(endBlock<blocks-1) {endBlock++; minorCnt+=Long.bitCount(mnT[endBlock]); majorCnt+=Long.bitCount(mjT[endBlock]); expanded=true;}
if(!expanded) break;  //cannot expand past the ends of the chromosome
}
int[] result={startBlock, focusBlock, endBlock};
return result;
}
}
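//Illustrative sketch, not part of TASSEL: expands the 64-site block window around a focus block until enough
//minor-allele sites are covered. All bit patterns and thresholds below are hypothetical.
final class BlockWindowExample {
public static void main(String[] args) {
long[] mjT=new long[5];  //major-allele presence, one 64-site word per block
long[] mnT=new long[5];  //minor-allele presence
mjT[1]=mjT[2]=mjT[3]=0xFFFFL;  //blocks 1-3 carry plenty of major-allele sites
mnT[2]=0b1L;  //the focus block covers only one minor-allele site
mnT[1]=0xFFL;  //a neighbouring block contributes eight more
int[] window=FILLINImputationUtils.getBlockWithMinMinorCount(mjT, mnT, 2, 5, 1000);
System.out.println(Arrays.toString(window));  //{startBlock, focusBlock, endBlock}
}
}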