net.maizegenetics.pangenome.hapcollapse.ConsensusProcessingUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
/**
 * 
 */
package net.maizegenetics.pangenome.hapcollapse;

import java.sql.Connection;
import java.util.*;

import net.maizegenetics.pangenome.api.*;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.taxa.distance.DistanceMatrixBuilder;
import org.apache.log4j.Logger;

import com.google.common.collect.BiMap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Tuple;

/**
 * This class holds methods for processing consensus using the variants and alleles tables
 * or data derived from them.
 * 
 * @author lcj34
 *
 */
public class ConsensusProcessingUtils {
    private static final Logger myLogger = Logger.getLogger(ConsensusProcessingUtils.class);
    
    public static  byte[] gtvalues = new byte[] {GenotypeTable.UNKNOWN_DIPLOID_ALLELE,
            NucleotideAlignmentConstants.getNucleotideDiploidByte("AA"),
            NucleotideAlignmentConstants.getNucleotideDiploidByte("CC"),
            NucleotideAlignmentConstants.getNucleotideDiploidByte("GG"),
            NucleotideAlignmentConstants.getNucleotideDiploidByte("TT"),
            NucleotideAlignmentConstants.getNucleotideDiploidByte("++"),
            NucleotideAlignmentConstants.getNucleotideDiploidByte("--"),
    };
    
    public static Map createGtvalueMap()
    {
        Map gtMap = new HashMap();
        gtMap.put(GenotypeTable.UNKNOWN_DIPLOID_ALLELE, 0);
        gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("AA"),1);
        gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("CC"), 2);
        gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("GG"), 3);
        gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("TT"), 4);
        gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("++"), 5);
        gtMap.put(NucleotideAlignmentConstants.getNucleotideDiploidByte("--"), 6);
        return gtMap;
    }
    /**
     * RefAltData comes from processing the Longs stored for each haplotype.
     * It is used for consensus processing.  Different data is available than is
     * with the VariantMappingData used for processing single haplotypes.
     *
     * @param rad
     * @return
     */
    public static long getLongFromRefAltData(RefAltData rad) {
        long vmLong = 0;
        if ( rad.getVariantId() == -1) {
            // create long for reference record
           // format:  1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
            vmLong = rad.getLen(); // length gets upper 3 bytes
            vmLong = (vmLong << 8) + rad.getRefDepth(); // refDepth get next byte
            vmLong = (vmLong << 32) + rad.getPosition(); // position on chrom gets last 4 bytes
            vmLong |= 1L << 63; // set upper bit to indicate this is a reference record
        } else {
            // create long for variant record
            // format: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 2 bytes=GC
            vmLong = rad.getVariantId(); // id gets 4 upper bytes
            vmLong = (vmLong << 8) + rad.getRefDepth(); // refDepth gets next byte
            vmLong = (vmLong << 8) + rad.getAltDepth(); // altDepth gets a byte
            byte isIndel = rad.getLen() > 1 ? (byte)1 : 0; // RefAltData now stores actual indel length
            vmLong = (vmLong << 8) + isIndel; // isIndel gets a byte
            vmLong = (vmLong << 8) + 0; // undefined other data as last byte
            
        }
        
        return vmLong;
    }

    // Determine if any taxon have a reference range at startPos.  If yes, find longest stretch where most have refRange
    // that occurs before the next SNP is seen (nextSnpPos)
    // parameter "maxRefLen" is the maximum length for a refBlock.  This is determined by where the next SNP falls -
    // we stop the refBLock before the next SNP.  If maxRefLen = -1, means no more snps so length isn't limited.
    // Returns:  RefAltData.  A variant ID of -1 means is refblock.  0 means store N.  BUt we don't have variantID for N block!  ANd it needs to be N to Ref
    public static RefAltData findTaxonRefCoverage(Position startPos, int maxRefLen, TaxaList tList, Map> taxonToPosDataMap, double maxError, double minTaxaCoverage) {
        int refDepth = 0; // filled in for refBLock
        int altDepth = 0; // filled in for refBlock (will be 0, altdepth not stored for refBlock)
        int indelRefDepth = 0; // filled in when we have non-ref block
        int indelAltDepth = 0; // filled in when we have non-ref block
        int taxaRefCount = 0;
        int middleOfVariantCount = 0;
                
        int refLen = Integer.MAX_VALUE; // start high - we pick smallest Ref covered
        
        //  Split taxa names back to original so their Ref data can be grabbed 
        TaxaList splitTList = splitConsensusTaxa(tList); 
        
        // This map holds a list of variantIds and the number of times
        // they were seen.  The RefAltData for each variantId is also stored. 
        Map> varidCountRADMap = new HashMap<> ();
        
        for (Taxon taxa : splitTList) {            
            RangeMap txRangeMap = taxonToPosDataMap.get(taxa);
            if (txRangeMap == null) {
                throw new IllegalArgumentException("findTaxonRefCoverage: no data for taxa " + taxa.getName());
            }
            Map.Entry, RefAltData> posRadEntry = txRangeMap.getEntry(startPos);
            RefAltData rad  = null;
            if (posRadEntry != null ) {
                rad = posRadEntry.getValue();
            }
            if (rad != null && rad.getVariantId() == -1) { // have data at this position and it is a ref block
                taxaRefCount++;
                refDepth += rad.getRefDepth();
                altDepth += rad.getAltDepth();
                // the range contains the startpos.  Determine actual ref covered based on parameter startPos
                // We want the minimum number covered. rad.getPosition() is the start of the range.
                // ex: range starts at 3, is of length 9.  The range was inclusive/inclusive so
                //     goes from 3-11.  The current ref index (startPos) is at 7.  From 7 until the
                //     end there are 5 position (7-11).  (3 + 9) - 7 = 5
                int taxaRefCovered = (rad.getPosition() + rad.getLen()) - startPos.getPosition();
                if (taxaRefCovered < refLen) {
                    refLen = taxaRefCovered; // keeping the smallest ref length
                }           
            } else if (rad != null){ // is indel or SNP
                // The range start position must equal the current ref start position or it is skipped.
                // To be compared, variants must start at the same positions
                if (rad.getPosition() != startPos.getPosition()) {
                    middleOfVariantCount ++;
                    continue; // are in the middle of a variant
                }

                indelRefDepth += rad.getRefDepth();
                indelAltDepth += rad.getAltDepth();
                if (varidCountRADMap.containsKey(rad.getVariantId())) {
                    Tuple oldData = varidCountRADMap.get(rad.getVariantId());
                    Tuple newValues = new Tuple(oldData.getX()+1,oldData.getY());
                    varidCountRADMap.put(rad.getVariantId(), newValues);
                } else {
                    varidCountRADMap.put(rad.getVariantId(), new Tuple(1,rad));
                }
            } // otherwise do nothing
        }

        int totalCounter = taxaRefCount + middleOfVariantCount;
        int currentMax = taxaRefCount;
        if (maxRefLen > 0 && refLen > maxRefLen) refLen = maxRefLen; // stop refStretch before next SNP.
        RefAltData currentMaxRefAltData = null;

        if(varidCountRADMap.size() > 0) {
            for (Tuple data : varidCountRADMap.values()) {
                if(data.getX() > currentMax) {
                    currentMax = data.getX();
                    currentMaxRefAltData = data.getY();
                }
                totalCounter += data.getX();
            }
        }

        // Check the maxError - avoid dividing by 0
        double errorRate = (totalCounter == 0 ) ? Double.MAX_VALUE : 1.0 - ((double)currentMax / (double)totalCounter);
        
        // If coverage level is too low or the error rate is too high, return missing (variant ID = 0)
        if((double)totalCounter/splitTList.size() < minTaxaCoverage || errorRate > maxError) {
            return new RefAltData(0, startPos.getChromosome().getName(), startPos.getPosition(), 1, indelRefDepth, indelAltDepth);
        }
        else {
            if(currentMaxRefAltData == null) { // return Ref call (variant ID = -1)
                if (refLen <=0 ) { // this should not happen
                    myLogger.debug("findTaxonRefCoverage: currentMaxRefAltData is NULL, using bad refLen of " + refLen);
                }
                return new RefAltData(-1, startPos.getChromosome().getName(), startPos.getPosition(), refLen, refDepth, altDepth);
            }
            if (currentMaxRefAltData.getLen() <= 0 ) {  // should not happen
                myLogger.debug("findTaxonRefCoverage: using currentMaxRefAltData wih bad refLen of " + currentMaxRefAltData.getLen());
            }
            return currentMaxRefAltData; // return the variant info
        }
    }
    
    // FindHaplotypeClusters merged the taxa into 1, changing the name
    // to be the concatenation of the taxa names, with a "_0" added to each name.
    // Names are separated by ":". Split these back up and make a taxa list to traverse
    public static TaxaList splitConsensusTaxa(TaxaList tList){
        TaxaListBuilder taxaBuilder = new TaxaListBuilder();               
        tList.stream().forEach(taxa ->{
            String taxaName = taxa.getName();
            
            StringTokenizer tokenizer = new StringTokenizer(taxaName, ":");
            while (tokenizer.hasMoreElements()) {
                String name = (String)tokenizer.nextElement();
                int nameEnd = name.lastIndexOf("_");
                if (nameEnd < 0) {
                    throw new IllegalArgumentException("ConsensusProcessingUtils:splitConsensusTaxa: no underscore found in name " + name);
                }
                name = name.substring(0,nameEnd);
                taxaBuilder.add(name);
            }
            
        });              
        TaxaList taxaList = taxaBuilder.build();
        return taxaList;
    }
    
    
    // need ordered listof taxon INCLUDING the _ for sending to
    // putConsensusData for loading to DB
    private static List getOrderedConsensusTaxon(TaxaList tList){
        List orderedTaxonList = new ArrayList();               
        tList.stream().forEach(taxa ->{
            String taxaName = taxa.getName();
            
            StringTokenizer tokenizer = new StringTokenizer(taxaName, ":");
            while (tokenizer.hasMoreElements()) {
                orderedTaxonList.add((String)tokenizer.nextElement());                
            }
            
        });              
        Collections.sort(orderedTaxonList);
        return orderedTaxonList;
    }
    
    // THis method chooses the most frequent value found at each position.
    // Currently, it does not check counts, N's, etc.  And breaking of ties
    // is arbitrary.
    public static Map chooseVarIdForSNPPositionFromGenotypeTable(GenotypeTable genotypeTable, Map> posCallVarIdMap ) {
 
        Map positionToVariantIDMap = new HashMap();
        
        // genotypes defined in ConsensusProcessinUtils.gtvalues.  alleleRedirect map has index into this array
        // Get the reverse map
        Map genotypeToIndex = createGtvalueMap();
              
        for (int idx = 0; idx < genotypeTable.numberOfSites(); idx ++) {

            // There is only 1 taxon in the genotypeTable as this is consensus and
            // the genotype at each position was already called.
            byte majorGenotype = genotypeTable.genotype(0, idx);;                      

            // store call for this position to the map.
            Position pos = genotypeTable.positions().get(idx);
            
            BiMap varidtoCallBiMap = posCallVarIdMap.get(idx);  // positions index, not physical position, was stored
            Integer varId;

            Integer gtMapIndex = genotypeToIndex.get(majorGenotype);
            if (gtMapIndex == null) {
                throw new IllegalStateException("chooseVarIdForSNPPositionFromGenotypeTable: genotype " + majorGenotype + " not found in map!!");
            }
            if (gtMapIndex == 0) {                
                varId = -2; // is missing
            } else if (gtMapIndex == 1) {
                varId = -1;  // is ref
            } else {
                varId = varidtoCallBiMap.inverse().get(gtMapIndex); // translate the call to the variantID
                if (varId == null) {
                    throw new IllegalArgumentException("VariantProcessingUtils:getPositionForSNPsMap: no entry for chrom " 
                       + pos.getChromosome().getName() + " position " + pos.getPosition());
                }
            }
           positionToVariantIDMap.put(pos, varId); 
        }
        
        return positionToVariantIDMap;
    }
    
    /**
     * Returns a map of variantID to RefAltData for each variantId/data lookup.
     * variantID of -1 (ref) is handled in calling method.    
     * @param taxonToPosDataMap
     * @return
     */
    public static Map  createVarIDtoRefAltData(Map> taxonToPosDataMap) {
        Map varIdToDataMap = new HashMap();
        for (RangeMap mapData : taxonToPosDataMap.values()) {
            for (RefAltData rad : mapData.asMapOfRanges().values()) {
                varIdToDataMap.put(rad.getVariantId(),rad);
            }
        }
        return varIdToDataMap;
    }    

    public static String convertVariantsToSequence(List variants, ReferenceRange refRange, GenomeSequence refSequence) {
        //make sure variants are in order by start and have unique start positions, resolve any overlapping variants
        //rules for resolving overlaps
        //1. Overwrite any multisite ref calls
        //2. Do not overwrite alt calls (this will almost always be a deletion because SNPS only cover one ref position)

        //variants will already be in order by start and the starts will be non-overlapping.
        //So, just need to compare each variant to the previous variant.
        //Once the previous variant range has been determined its sequence can be added to the string
        Chromosome chr = refRange.chromosome();
        if (!variantsInOrder(variants)) throw new IllegalArgumentException("Variants not sorted by start position or variant start not unique");

        VariantInfo previousVariant = null;
        StringBuilder mySequence = new StringBuilder();
        for (VariantInfo info : variants) {
            if (previousVariant != null) {
                //does this variant overlap the previous variant?
                if (info.start() > previousVariant.end()) {
                    //no overlap
                    addVariantToSequence(mySequence, previousVariant, chr, refSequence);
                    //if there is a gap, pad with N's after previous sequence
                    for (int pos = previousVariant.end() + 1; pos < info.start(); pos++) mySequence.append("N");
                    previousVariant = info;

                } else {
                    //apply rules one and two to determine range for these variants
                    //then add sequence from previousVariant to mySequence
                    boolean prevAltCall = previousVariant.isVariant() && previousVariant.genotypeString().equals(previousVariant.altAlleleString());
                    if (prevAltCall) {
                        //do not overwrite the previous call (leave its range as is)
                        addVariantToSequence(mySequence, previousVariant, chr, refSequence);

                        int numberOfCharactersToTrim = previousVariant.end() - info.start() + 1;

                        //if this info is a variant trim necessary characters of the beginning of the genotype string
                        if (info.isVariant()) {
                            String trimmedGenotype;
                            if (info.genotypeString().length() <= numberOfCharactersToTrim) trimmedGenotype = "";
                            else trimmedGenotype = info.genotypeString().substring(numberOfCharactersToTrim);
                            previousVariant = new VariantInfo(info.chromosome(), info.start(), info.end(), trimmedGenotype,
                                    info.refAlleleString(), info.altAlleleString(), info.isVariant(), info.depth());
                        } else {
                            //if info is a ref block adjust its range to start after the previous one ends
                            previousVariant = new VariantInfo(info.chromosome(), previousVariant.end() + 1, info.end(), info.genotypeString(),
                                    info.refAlleleString(), info.altAlleleString(), info.isVariant(), info.depth());
                        }

                    } else if (previousVariant.isVariant()) {
                        //trim the end off the previous genotype so that it ends before this one
                        int numberOfCharactersInGenotype = info.start() - previousVariant.start();
                        String trimmedGenotype = previousVariant.genotypeString().substring(0,numberOfCharactersInGenotype);
                        previousVariant = new VariantInfo(previousVariant.chromosome(), previousVariant.start(), previousVariant.end(), trimmedGenotype,
                                previousVariant.refAlleleString(), previousVariant.altAlleleString(), previousVariant.isVariant(), previousVariant.depth());
                        addVariantToSequence(mySequence, previousVariant, chr, refSequence);
                        previousVariant = info;
                    } else {
                        //reduce the previous range so that it ends before this one
                        previousVariant = new VariantInfo(previousVariant.chromosome(), previousVariant.start(), info.start() - 1, previousVariant.genotypeString(),
                                previousVariant.refAlleleString(), previousVariant.altAlleleString(), previousVariant.isVariant(), previousVariant.depth());
                        addVariantToSequence(mySequence, previousVariant, chr, refSequence);
                        previousVariant = info;
                    }
                }

            } else {
                previousVariant = info;
            }

        }

        //add the final variant
        addVariantToSequence(mySequence, previousVariant, chr, refSequence);

        //Convert range map to String
        //Add N's for any reference positions not covered by the range map
        return mySequence.toString();
    }

    private static void addVariantToSequence(StringBuilder sequence, VariantInfo variant, Chromosome chr, GenomeSequence refseq) {
        if (variant.end() < variant.start()) {
            StringBuilder sb = new StringBuilder("Variant end less than variant start for chr ");
            sb.append(variant.chromosome()).append(" ").append(variant.start());
            sb.append(" to ").append(variant.end()).append(", geno=");
            sb.append(variant.genotypeString()).append(" ref=").append(variant.refAlleleString()).append(" alt=").append(variant.altAlleleString());
            myLogger.debug(sb.toString());
            throw new IllegalArgumentException("variant end less than variant start");
        }
        if (variant.isVariant()) {
            //add the genotypeString
            sequence.append(variant.genotypeString());
        } else {
            //add ref for the variant range
            sequence.append(refseq.genotypeAsString(chr, variant.start(), variant.end()));
        }
    }

    public static boolean variantsInOrder(List variants) {
        Iterator variantIterator = variants.iterator();
        VariantInfo previousVariant;
        if (variantIterator.hasNext()) previousVariant = variantIterator.next();
        else return true;

        while (variantIterator.hasNext()) {
            VariantInfo currentVariant = variantIterator.next();
            if (currentVariant.chromosome().equals(previousVariant.chromosome())) {
                if (currentVariant.start() < previousVariant.start()) return false;
            }
        }
        return true;
    }

    /**
     * Encode a reference block to a Long
     * @param refLength length of the reference block
     * @param refDepth  readDepth in the reference block. Max depth is 127 because it is encoded by a single byte.
     * @param pos       the start position of the block on a chromosome
     * @return          the long encoding this information
     *
     * */
    public static Long encodeRefBlockToLong(int refLength, int refDepth, int pos ) {
        //ref: 1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
        int MAX_REF_LENGTH = 8388607;
        int MAX_REF_DEPTH = 127;

        int reflen = Math.min(MAX_REF_LENGTH, refLength);
        int refdep = Math.min(MAX_REF_DEPTH, refDepth);

        long encodedLong = reflen;
        encodedLong <<= 8;
        encodedLong += refdep;
        encodedLong <<= 32;
        encodedLong += pos;
        encodedLong |= 1L << 63; // set upper bit to indicate this is a reference record

        return encodedLong;
    }

    /**
     * Encode variant information as a Long
     * @param variantId
     * @param refDepth  read depth of the reference allele. Max = 127
     * @param altDepth  read depth of the alt allele. Max = 127
     * @param isIndel   true if the variant is an indel, false otherwise.
     * @return          the Long encoding this information
     * */
    public static Long encodeVariantToLong(int variantId, int refDepth, int altDepth, boolean isIndel) {
        //Variant: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 isIndel | 1 byte=unused
        int MAX_DEPTH = 127;

        int refdep = Math.min(MAX_DEPTH, refDepth);
        int altdep = Math.min(MAX_DEPTH, altDepth);

        long encodedLong = variantId;
        encodedLong <<= 8;
        encodedLong += refdep;
        encodedLong <<= 8;
        encodedLong += altdep;
        encodedLong <<= 8;
        encodedLong += isIndel ? 1 : 0;
        encodedLong <<= 8;

        return encodedLong;
    }

    /**
     * Method to verify that the taxa in the graph are in the ranking file.
     * We can have more taxon in the ranking file than in the graph, but not the other way around.
     * @param graph
     * @param rankingMap
     * @return
     */
    public static boolean areGraphTaxaInRankingMap(HaplotypeGraph graph, Map rankingMap) {
        //Get out the taxa list for the graph
        TaxaList taxaList = graph.taxaInGraph();

        long numGraphTaxonNotInRank = taxaList.stream().map(taxon -> taxon.getName())
                .filter(taxonName -> !rankingMap.keySet().contains(taxonName))
                .count();

        //If the number of graph taxon not in the ranking file is above 0, we are not able to pick the correct ranking.
        if(numGraphTaxonNotInRank>0) {
            for (Taxon taxon : taxaList) {
                if (rankingMap.get(taxon.getName()) == null) {
                    myLogger.warn("areGraphTaxaInRankingMap: missing from rankingFile:" + taxon.getName());
                }
            }
            return false;
        }
        else {
            return true;
        }
    }

    /**
     * Method to verify that rankings are unique.  This is used to throw a Warning message in the Assembly consensus.
     * @param rankingMap
     * @return
     */
    public static boolean areRankingsUnique(Map rankingMap) {
        Set rankingSet = new HashSet<>();
        for(String taxonName : rankingMap.keySet()) {
            if(rankingSet.contains(rankingMap.get(taxonName))) {
                return false;
            }
            else {
                rankingSet.add(rankingMap.get(taxonName));
            }
        }

        return true;
    }

    /**
     * Function to create a disntance matrix given a set of variants for a single reference range.
     *
     * This will ignore Ns and indels
     *
     * Distance is #SNPs/(#SNPs + #RefPos)
     *
     * @param ntaxa
     * @param chr
     * @param taxaWithInfo
     * @param taxonToVariantInfoMap
     * @return
     */
    public static DistanceMatrix createDistanceMatrix(int ntaxa, Chromosome chr, ReferenceRange currentRefRange, TaxaList taxaWithInfo, Map> taxonToVariantInfoMap ) {
        DistanceMatrixBuilder dmBuilder = DistanceMatrixBuilder.getInstance(taxaWithInfo);

        myLogger.debug(taxaWithInfo.size() + " taxa used to build distance matrix in createDistanceMatrix (line 549)");

        //set diagonal to 0
        for (int i = 0; i < ntaxa; i++) dmBuilder.set(i, i, 0.0);

        for (int i = 0; i < ntaxa - 1; i++) {
            RangeMap headMap = taxonToVariantInfoMap.get(taxaWithInfo.get(i));
            for (int j = i + 1; j < ntaxa; j++) {
                RangeMap compMap = taxonToVariantInfoMap.get(taxaWithInfo.get(j));
                int totalSiteCount = 0;
                int diffSiteCount = 0;
                int pos = currentRefRange.start();
                int prevPos = 0;
                while (pos <= currentRefRange.end()) {

                    if(pos > prevPos) {
                        prevPos = pos;
                    }
                    VariantInfo headInfo = headMap.get(pos);
                    VariantInfo compInfo = compMap.get(pos);

                    //if either is null increment pos, no information so cannot make a comparison
                    if (headInfo == null || compInfo == null) pos++;

                        //if !isVariant for both, then these are ref blocks
                        //increment total sites by min(head.end, comp.end) - current pos + 1
                        //set pos = min(head.end, comp.end) + 1
                    else if (!headInfo.isVariant() && !compInfo.isVariant()) {

                        int blockend = Math.min(headInfo.end(), compInfo.end());

                        if(pos>blockend) {
                            System.out.println("Pos:"+pos+" "+blockend);
                        }
                        blockend = Math.min(blockend, currentRefRange.end()); //ref blocks may extend beyond the end of the range
                        int coveredLength = blockend - pos + 1;
                        if (coveredLength < 1) {
                            String msg = String.format("CoveredLength < 1 at chr %s pos %d, taxa = %s, %s",
                                    chr.getName(), pos, taxaWithInfo.taxaName(i), taxaWithInfo.taxaName((j)));
                            throw new IllegalArgumentException(msg);
                        }
                        totalSiteCount += coveredLength;
                        pos = blockend + 1;
                    }

                    //if head is not variant and comp is variant:
                    //if comp.start != pos, we landed in the middle of an indel, skip to comp.end + 1
                    //if comp.start == pos, increment total site count by 1
                    //set pos = comp.end + 1
                    else if (!headInfo.isVariant() && compInfo.isVariant()) {
                        if (compInfo.start() == pos) {
                            totalSiteCount++;
                            if (!compInfo.isIndel() && compInfo.genotypeString().equals(compInfo.altAlleleString())) {
                                diffSiteCount++;
                            }
                        }
                        pos = compInfo.end() + 1;
                    }

                    //same for head is variant and comp is not variant
                    else if (headInfo.isVariant() && !compInfo.isVariant()) {
                        if (headInfo.start() == pos) {
                            totalSiteCount++;
                            if (!headInfo.isIndel() && headInfo.genotypeString().equals(headInfo.altAlleleString())) {
                                diffSiteCount++;
                            }
                        }
                        pos = headInfo.end() + 1;
                    }

                    //if both are variant
                    //if head.start = comp.start = pos process otherwise skip to max(head.end, comp.end)
                    //compare genotypes add same site count total, add diff site count to total and diff
                    //set pos = max(head.end, comp.end) + 1
                    else {
                        if (headInfo.start() == compInfo.start() && compInfo.start() == pos) {
                            totalSiteCount++;
                            if(!headInfo.isIndel() && !compInfo.isIndel() && !headInfo.genotypeString().equals(compInfo.genotypeString()))
                                diffSiteCount++;
                        }
                        pos = Math.max(headInfo.end(), compInfo.end()) + 1;
                    }
                }
                double dist = (double) diffSiteCount / totalSiteCount;
                dmBuilder.set(i, j, dist);
            }
        }

        return dmBuilder.build();
    }

    /**
     * Function to create a new DistanceMatrix setting NaNs to the maximum value in both its row and column.
     * @param originalDM
     * @return
     */
    public static DistanceMatrix setNsToMax(DistanceMatrix originalDM) {
        DistanceMatrixBuilder dmBuilder = DistanceMatrixBuilder.getInstance(originalDM.getTaxaList());

        List nanCoords = new ArrayList<>();

        for(int row = 0; row < originalDM.numberOfTaxa(); row++) {
            for( int col = row; col < originalDM.numberOfTaxa();col++) {
                if(Float.isNaN(originalDM.getDistance(row,col))) {
                    //Store coords for fixing later
                    nanCoords.add(new Integer[]{row,col});
                }
                else {
                    dmBuilder.set(row,col,originalDM.getDistance(row,col));
                }
            }
        }

        //Loop through the NanCoordinates
        for(Integer[] nanCoordinate : nanCoords) {
            dmBuilder.set(nanCoordinate[0],nanCoordinate[1],maxDistance(originalDM,nanCoordinate[0],nanCoordinate[1]));
        }

        return dmBuilder.build();
    }

    /**
     * Function to figure out what the maximum row and col distance are for a given position in the oringinal distance Matrix.
     * @param matrix
     * @param row
     * @param col
     * @return
     */
    public static float maxDistance(DistanceMatrix matrix,int row, int col) {
        float currentMax = 0.0f;
        for(int colCounter = 0; colCounter < matrix.numberOfTaxa(); colCounter++) {
            if( !Float.isNaN(matrix.getDistance(row,colCounter)) && matrix.getDistance(row,colCounter) > currentMax) {
                currentMax = matrix.getDistance(row,colCounter);
            }
        }

        for( int rowCounter = 0; rowCounter < matrix.numberOfTaxa(); rowCounter++) {
            if(!Float.isNaN(matrix.getDistance(rowCounter,col)) && matrix.getDistance(rowCounter,col) > currentMax) {
                currentMax = matrix.getDistance(rowCounter,col);
            }
        }

        return currentMax;
    }

}