package net.maizegenetics.pangenome.hapcollapse;

import java.awt.Frame;
import java.io.*;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.swing.ImageIcon;

import com.google.common.collect.*;
import net.maizegenetics.dna.map.*;
import net.maizegenetics.plugindef.*;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.taxa.distance.DistanceMatrixBuilder;
import net.maizegenetics.taxa.tree.Tree;
import net.maizegenetics.taxa.tree.TreeClusters;
import net.maizegenetics.taxa.tree.UPGMATree;
import org.apache.log4j.Logger;

import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.api.Variant;
import net.maizegenetics.pangenome.api.VariantUtils;
import net.maizegenetics.pangenome.db_loading.AnchorDataPHG;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.pangenome.db_loading.PHGDataWriter;
import net.maizegenetics.pangenome.db_loading.PHGdbAccess;
import net.maizegenetics.pangenome.db_loading.VariantsProcessingUtils;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * 
 * This plugin creates consensus haplotypes.
 * 
 * It processes each reference range in a separate thread using Java Futures.
 * Data is written to the database via a single thread.
 *
 * Simple plugin to run the full haplotype collapse pipeline.  It performs the following steps:
 * 1. Loop through each reference range in the graph:
 * 1.a Extract the HaplotypeNodes with their VariantInfo data.
 * 1.b Cluster haplotypes based on the FindHaplotypeClustersPlugin and a genotype table
 *     containing only variantId data for sites with SNPs.
 * 1.c Export clustered GenotypeTables.
 * 1.d Determine allele calls and create a List of ref/variant data for each cluster.
 * 
 * 2. For each consensus haplotype created, upload it to the database.
 * 
 * Based on RunHapCollapsePipelinePlugin.
 *
 * @author lcj34
 *
 */
public class RunHapConsensusPipelinePlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(RunHapConsensusPipelinePlugin.class);

    public enum CLUSTERING_MODE {upgma, upgma_assembly, kmer_assembly};


    private PluginParameter<String> myReference = new PluginParameter.Builder<>("referenceFasta", null, String.class)
            .description("Input Reference Fasta")
            .inFile()
            .build();

    private PluginParameter<String> dbConfigFile = new PluginParameter.Builder<>("dbConfigFile", null, String.class)
            .description("File holding the DB config information")
            .required(true)
            .inFile()
            .build();

    private PluginParameter<String> myCollapseMethod = new PluginParameter.Builder<>("collapseMethod", null, String.class)
            .description("Name of the collapse method to be stored in the database")
            .required(true)
            .build();

    private PluginParameter<String> myCollapseMethodDetails = new PluginParameter.Builder<>("collapseMethodDetails", null, String.class)
            .description("Details for the collapse method to be stored in the database")
            .required(false)
            .build();

    private PluginParameter<Double> minAlleleFrequency = new PluginParameter.Builder<>("minFreq", 0.5, Double.class)
            .guiName("Minimum Allele Frequency")
            .description("At each position, if no allele has the minimum frequency, the consensus haplotype allele will be set to missing.")
            .required(false)
            .build();
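    // Worked example (illustrative, not part of the original source): with minFreq = 0.5 and six
    // haplotypes in a cluster calling A, A, A, G, G, T at a position, the A frequency is 3/6 = 0.5,
    // which meets the threshold and is a unique maximum, so the consensus allele is A. If the calls
    // were A, A, A, G, G, G instead, the two top alleles tie and the site is left missing
    // (see the majority-calling logic in consensusVariants() below).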

    private PluginParameter<String> rankingFile = new PluginParameter.Builder<>("rankingFile", null, String.class)
            .description("File containing the ranking for each taxon in the DB.  This is used to break ties.")
            .required(false)
            .inFile()
            .build();
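    // The ranking file read in runConsensusPipeline() is expected to be tab-delimited, one taxon per
    // line, with the taxon name in column 1 and a numeric ranking in column 2 (higher is better).
    // Hypothetical example (taxa names and values are placeholders; columns are shown with spaces here
    // for readability, but the file itself must use tabs):
    //
    //   LineA    3.0
    //   LineB    2.5
    //   LineC    1.0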

    private PluginParameter<Integer> maxNumberOfClusters = new PluginParameter.Builder<>("maxClusters", 30, Integer.class)
            .required(false)
            .description("The maximum number of clusters that will be created for a reference range. " +
                    "If mxDiv produces too many clusters then the cut height that produces maxClusters number of clusters will be substituted.")
            .guiName("Maximum Cluster Number")
            .build();

    private PluginParameter<Integer> minSiteForComp = new PluginParameter.Builder<>("minSites", 30, Integer.class)
            .required(false)
            .description("The minimum number of shared sites that can be used to calculate the distance between two taxa")
            .guiName("Minimum Sites with Data ")
            .build();

    private PluginParameter<Double> minTaxaCoverage = new PluginParameter.Builder<>("minCoverage", 0.1, Double.class)
            .required(false)
            .description("For each range, any taxon with coverage of less than this amount will not be used" +
                    " to generate consensus haplotypes and will not be included in any haplotype group for that range.")
            .guiName("Minimum Coverage")
            .range(Range.closed(0.0,1.0))
            .build();

    private PluginParameter<Integer> maxThreads = new PluginParameter.Builder<>("maxThreads", 1000, Integer.class)
            .description("The maximum number of threads to be used to create consensi. " +
                    "The actual number of threads used will not be greater than the number of available CPUs - 2.")
            .build();

    private PluginParameter<Integer> minTaxa = new PluginParameter.Builder<>("minTaxa", 1, Integer.class)
            .description("Minimum number of taxa")
            .build();

    private PluginParameter<Double> mxDiv = new PluginParameter.Builder<>("mxDiv", 0.01, Double.class)
            .description("Maximum divergence.")
            .build();

    private PluginParameter<CLUSTERING_MODE> clusteringMode = new PluginParameter.Builder<>("clusteringMode", CLUSTERING_MODE.upgma, CLUSTERING_MODE.class)
            .description("Clustering mode")
            .range(CLUSTERING_MODE.values())
            .build();

    private PluginParameter<Integer> kmerSize = new PluginParameter.Builder<>("kmerSize", 7, Integer.class)
            .description("Kmer size")
            .dependentOnParameter(clusteringMode, CLUSTERING_MODE.kmer_assembly)
            .build();

    private PluginParameter<DistanceCalculation> distanceCalculation = new PluginParameter.Builder<>("distanceCalculation", DistanceCalculation.Euclidean, DistanceCalculation.class)
            .description("Distance calculation type")
            .dependentOnParameter(clusteringMode, CLUSTERING_MODE.kmer_assembly)
            .range(DistanceCalculation.values())
            .build();

    public RunHapConsensusPipelinePlugin() {
        super(null, false);
    }

    public RunHapConsensusPipelinePlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public RunHapConsensusPipelinePlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    @Override
    protected void postProcessParameters() {

        if (clusteringMode() == CLUSTERING_MODE.upgma && (reference() == null || reference().isEmpty())) {
            throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: postProcessParameters: reference fasta must be specified for clustering mode upgma");
        }

        if ((clusteringMode() == CLUSTERING_MODE.upgma_assembly || clusteringMode() == CLUSTERING_MODE.kmer_assembly) && (rankingFile() == null || rankingFile().isEmpty())) {
            throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: postProcessParameters: ranking file must be specified for clustering modes upgma_assembly and kmer_assembly");
        }

    }

    @Override
    public DataSet processData(DataSet input) {

        //Load up the PHG
        List<Datum> temp = input.getDataOfType(HaplotypeGraph.class);
        if (temp.size() != 1) {
            throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: processData: must input one HaplotypeGraph: " + temp.size());
        }
        HaplotypeGraph hapgraph = (HaplotypeGraph) temp.get(0).getData();

        runConsensusPipeline(hapgraph, reference());

        return null;
    }
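    // Illustrative usage sketch (not part of the original source). It assumes the standard TASSEL
    // plugin conventions: parameters are set by name via AbstractPlugin.setParameter(name, value), and
    // the graph is handed in as a Datum wrapped in a DataSet, exactly as processData() above expects.
    // The parameter values and the way the HaplotypeGraph is obtained are placeholders.
    //
    //   HaplotypeGraph graph = ...; // built elsewhere in the PHG pipeline
    //   RunHapConsensusPipelinePlugin consensusPlugin = new RunHapConsensusPipelinePlugin(null, false);
    //   consensusPlugin.setParameter("dbConfigFile", "/path/to/dbConfig.txt");
    //   consensusPlugin.setParameter("collapseMethod", "CONSENSUS");
    //   consensusPlugin.setParameter("referenceFasta", "/path/to/reference.fa");
    //   List<Datum> data = new ArrayList<>();
    //   data.add(new Datum("graph", graph, "PHG haplotype graph"));
    //   consensusPlugin.processData(new DataSet(data, null));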
    
    /**
     * This method will loop through each reference range in the graph and will:
     * 1. Merge the List of variant/ref data for a given haplotype method:
     * 2. Cluster the haplotypes together into groups
     * Then when done with all reference ranges, load data for each cluster to the db.
     *
     * This is the main function that processes each reference range in the graph
     *
     * @param graph haplotype graph
     * @param referenceFasta reference fasta
     */
    public void runConsensusPipeline(HaplotypeGraph graph, String referenceFasta) {

        final GenomeSequence referenceSequence;
        if (reference() != null && !reference().isEmpty()) {
            referenceSequence = GenomeSequenceBuilder.instance(referenceFasta);
        } else {
            referenceSequence = null;
        }

        myLogger.info("RunHapConsensusPipelinePlugin: checking masterVariantMap for empty");
        if (Variant.masterVariantMap.isEmpty()) {
            Long mTime = System.nanoTime();
            Connection dbConn = DBLoadingUtils.connection(dbConfigFile(), false);
            myLogger.info("RunHapConsensusPipelinePlugin: masterVariantMap is empty - call addGraphVariantsToVariantMap");
            VariantUtils.addGraphVariantsToVariantMap(graph, dbConn);
            try {
                dbConn.close();
            } catch (SQLException exc) {

                myLogger.info("runConsensusPipeline: error closing db connection after creating masterVariantMap");
            }
            double totalTime = (System.nanoTime() - mTime)/1e9;
            myLogger.info("time to create masterVariantMap in seconds: " + totalTime);
            
        }
        myLogger.info("\nrunHapConsensusPipelinePlugin: after masterVariantMap check, size of masterVariantMap: " + Variant.masterVariantMap.size());
        // Get the properties object
        Properties dbProperties = loadProperties();
        myLogger.info("Starting up the threadpool");
        ExecutorService threadpool = ForkJoinPool.commonPool();
        int availableCPUs = Runtime.getRuntime().availableProcessors();
        if (maxThreads() < availableCPUs - 2 ) {
            threadpool = new ForkJoinPool(maxThreads());
        }
        myLogger.info("Thread Pool started");
        BlockingQueue<Future<RunMergeAndCluster>> futures = new LinkedBlockingQueue<>(5100);

        // Start thread that processes futures
        Future<?> processingFuture = threadpool.submit(new ProcessFutures(futures));

        myLogger.info("Loading up the ranking file");
        Map<String, Double> rankingMap = new HashMap<>();
        if(rankingFile() != null) {
            try {
                BufferedReader reader = Utils.getBufferedReader(rankingFile());
                String currentLine = "";
                while ((currentLine = reader.readLine()) != null) {
                    String[] currentLineSplit = currentLine.split("\t");
                    rankingMap.put(currentLineSplit[0], Double.parseDouble(currentLineSplit[1]));
                }
            } catch (Exception e) {
                myLogger.info("RunHapConsensusPipelinePlugin: runConsensusPipeline: Problem reading ranking file: " + e.getMessage());

            }

            //Verify the ranking map
            if(!ConsensusProcessingUtils.areGraphTaxaInRankingMap(graph,rankingMap)) {
                throw new IllegalStateException("Ranking file does not have all the taxa in the graph.  " +
                        "Please check your ranking file and make sure you have rankings for each taxon.");
            }

            if(!ConsensusProcessingUtils.areRankingsUnique(rankingMap)) {
                myLogger.warn("WARNING found multiple taxa with the same ranking.  This has the potential to select incorrect representative haplotypes.");
            }


        }
        else {
            myLogger.info("No Ranking file was supplied. Running Consensus in upgma mode ");
        }

        myLogger.info("Walking through reference ranges and starting up future threads");

        boolean normalTermination = true;

        //processingFuture is running in a thread. When an exception occurs within processingFuture,
        // it prints an error message in the log and dies but does not kill the calling thread.
        // That blocks the futures queue and things just hang waiting for completion.
        // So, the approach is to check to make sure the processingFuture thread is still alive before adding another reference range
        // and, just to be safe, also check to make sure that processingFuture does not die after that check.
        for (ReferenceRange referenceRange : graph.referenceRanges()) {
            List<Datum> datumList = new ArrayList<>();
            datumList.add(new Datum("graph", graph, "Graph Passed In"));
            if (referenceSequence != null) {
                datumList.add(new Datum("referenceGenomeSequence", referenceSequence, "GenomeSequence holding the reference"));
            }
            datumList.add(new Datum("refRange", referenceRange, " Current Reference Range"));
            datumList.add(new Datum("dbConfigFile",dbConfigFile(), " Database configuration info")); // lcj added

            DataSet input = new DataSet(datumList, null);

            //processingFuture.isDone() returns true if processingFuture has terminated for any reason.
            // If the futures queue is empty it blocks but does not terminate. It will not terminate
            // until it encounters an empty RunMergeAndCluster or an exception occurs.
            if (processingFuture.isDone()) {
                myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
                myLogger.error("processingFuture was terminated prematurely. Stopped trying to add ref ranges at " + referenceRange.toString());
                myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
                normalTermination = false;
                break;
            }
            try {
                // Add processing of the merging and then clustering to the blocking queue
                Future<RunMergeAndCluster> myFuture = threadpool.submit(new RunMergeAndCluster(input,
                        dbProperties, referenceSequence, rankingMap));
                Boolean added = false;
                while (!added && !processingFuture.isDone()) {
                    added = futures.offer(myFuture, 1L, TimeUnit.MINUTES);
                }

            } catch (Exception e) {
                // Catch the error, but finish processing the reference ranges
                myLogger.debug(e.getMessage(), e);
                myLogger.error("Error Processing refRange: " + referenceRange.intervalString() + "\nError: " + e.getMessage());
            }
        }

        try {
            //add an empty RunMergeAndCluster to the processing queue to tell processingFutures to terminate
            if (processingFuture.isDone()) {
                myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
                myLogger.error("processingFuture was terminated prematurely. All reference ranges were added to the processing queue.");
                myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
                normalTermination = false;
            } else {

                Future<RunMergeAndCluster> lastFuture = threadpool.submit(new RunMergeAndCluster());
                Boolean added = false;
                while (!added && !processingFuture.isDone()) {
                    added = futures.offer(lastFuture, 1L, TimeUnit.MINUTES);
                }
            }

        } catch (InterruptedException e) {
            myLogger.error("Error putting empty RunMergeAndCluster in runConsensusPipeline. May interfere with thread shutdown.");
            myLogger.debug(e.getMessage(), e);
        }

        // Wait until processing of all futures is done
        try {
            processingFuture.get();
        } catch (Exception ex) {
            myLogger.debug(ex.getMessage(), ex);
        }
        
        myLogger.info("Shutting down the threadpool");
        threadpool.shutdown();
        if (!normalTermination) throw new IllegalArgumentException("The consensus pipeline terminated prematurely");
    }

    private Properties loadProperties() {
        Properties configProperties = new Properties();
        try {
            configProperties.load(Utils.getBufferedReader(dbConfigFile()));
        } catch (Exception e) {
            myLogger.error("RunHapConsensusPipelinePlugin: loadProperties: Failed to Load Properties.", e);
            throw new IllegalStateException("RunHapConsensusPipelinePlugin: loadProperties: Failed to Load Properties.", e);
        }
        return configProperties;
    }
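    // The dbConfigFile is read twice: as a JDBC connection config by DBLoadingUtils.connection() and as
    // a plain java.util.Properties key=value file here. A minimal sketch of such a file, assuming the
    // key names used in the PHG database configuration documentation (the keys are not defined in this
    // class, so treat them as placeholders):
    //
    //   host=localHost
    //   user=sqlite
    //   password=sqlite
    //   DB=/path/to/phg.db
    //   DBtype=sqlite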
    
    
    /**
     * The ProcessFutures class is the single consumer thread that takes the completed RunMergeAndCluster futures and uploads their results to the db.
     * Basically this thread will run until it finds the final (empty) RunMergeAndCluster object, blocking execution until each future in the queue is done.
     * Once more than 5000 haplotype entries have accumulated, they are written to the db and the buffer is cleared.
     *
     */
    private class ProcessFutures implements Runnable {

        private final BlockingQueue<Future<RunMergeAndCluster>> myQueue;

        public ProcessFutures(BlockingQueue<Future<RunMergeAndCluster>> queue) {
            myQueue = queue;
        }

        @Override
        public void run() {
            try (Connection dbConnect = DBLoadingUtils.connection(dbConfigFile(), false)) {

                PHGDataWriter phg = new PHGdbAccess(dbConnect);

                Multimap<Position, Tuple<AnchorDataPHG, List<String>>> consensusDataMap = HashMultimap.create();

                int methodId = phg.putMethod(collapseMethod(), DBLoadingUtils.MethodType.ANCHOR_HAPLOTYPES, pluginParameters());
                Future<RunMergeAndCluster> future = myQueue.take();
                RunMergeAndCluster mergeAndClusterObject = future.get();
                while (mergeAndClusterObject == null || !mergeAndClusterObject.myIsFinal) {

                    if (mergeAndClusterObject == null) {
                        //Skip over this one and take the next future.
                        future = myQueue.take();
                        mergeAndClusterObject = future.get();
                        continue;
                    }
                    consensusDataMap.putAll(mergeAndClusterObject.getConsensusDataMap());
                    ReferenceRange currentRefRange = mergeAndClusterObject.getReferenceRange();

                    myLogger.debug("RefRange: "+currentRefRange.intervalString());

                    //TODO make this a parameter
                    if (consensusDataMap.size() > 5000) {
                        myLogger.debug("Writing to the database:");
                        //Load the consensus to the db
                        Long time = System.nanoTime();
                        phg.putConsensusSequences(consensusDataMap, methodId);
                        double totalTime = (System.nanoTime() - time)/1e9;
                        myLogger.debug("time to write " + consensusDataMap.size() + " entries in seconds: " + totalTime);
                        //Clear out the multimap
                        consensusDataMap.clear();
                    }

                    future = myQueue.take();
                    mergeAndClusterObject = future.get();
                }

                //Load the consensus to the db
                phg.putConsensusSequences(consensusDataMap, methodId);

            } catch (Exception e) {
                myLogger.debug(e.getMessage(), e);
                throw new IllegalStateException("RunHapConsensusPipelinePlugin: ProcessFutures: problem: " + e.getMessage());
            }

        }

    }
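    // A minimal, generic sketch of the producer/consumer pattern ProcessFutures implements (illustrative
    // only, not part of the original source): a producer queues Futures in submission order while a
    // single consumer drains the queue and stops when it sees a sentinel value, analogous to the final
    // empty RunMergeAndCluster used above.
    //
    //   import java.util.concurrent.*;
    //
    //   class PoisonPillDemo {
    //       public static void main(String[] args) throws Exception {
    //           ExecutorService pool = ForkJoinPool.commonPool();
    //           BlockingQueue<Future<String>> queue = new LinkedBlockingQueue<>();
    //           Future<?> consumer = pool.submit(() -> {
    //               try {
    //                   String result = queue.take().get();
    //                   while (!result.isEmpty()) {            // the empty string acts as the sentinel
    //                       System.out.println("consumed " + result);
    //                       result = queue.take().get();
    //                   }
    //               } catch (Exception e) {
    //                   throw new IllegalStateException(e);
    //               }
    //           });
    //           for (int i = 0; i < 5; i++) {
    //               final int id = i;
    //               queue.put(pool.submit(() -> "result-" + id));  // producer: queue futures in order
    //           }
    //           queue.put(pool.submit(() -> ""));                  // sentinel terminates the consumer
    //           consumer.get();                                    // wait for the consumer to finish
    //       }
    //   }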
    
    /**
     *
     * This class takes a reference range, runs through all the
     * taxa with their variant lists, and creates a genotype table based on variant_id from the PHG variants table.
     * It sends this genotype table through FindHaplotypeClustersPlugin and
     * decodes the results from the clusters plugin into actual alleles.
     * 
     */
    public class RunMergeAndCluster implements Callable<RunMergeAndCluster> {

        private DataSet graphToMerge;
        private ReferenceRange currentRefRange;
        private final boolean myIsFinal;
        private Multimap<Position, Tuple<AnchorDataPHG, List<String>>> consensusDataMap = HashMultimap.create();
        private GenomeSequence referenceGenotype = null;
        private final Map<String, Double> rankingMap;
        private final Pattern NUC = Pattern.compile("[ACGTR]"); //R denotes a ref nucleotide here
        private final Pattern GAM = Pattern.compile(".+_[0-9]+");

        public RunMergeAndCluster(DataSet graphToMerge, Properties dbProperties, GenomeSequence referenceSequence) {
            this.graphToMerge = graphToMerge;
            this.referenceGenotype = referenceSequence;
            myIsFinal = false;
            this.rankingMap = null;
        }

        public RunMergeAndCluster(DataSet graphToMerge, Properties dbProperties, GenomeSequence referenceSequence, Map<String, Double> rankingMap) {
            this.graphToMerge = graphToMerge;
            this.referenceGenotype = referenceSequence;
            myIsFinal = false;
            this.rankingMap = rankingMap;
        }

        public RunMergeAndCluster() {
            this.graphToMerge = null;
            myIsFinal = true;
            this.rankingMap = null;
        }

        @Override
        public RunMergeAndCluster call() {
            if (myIsFinal) {
                return this;
            }
            try {

                if (clusteringMode() == CLUSTERING_MODE.upgma) {
                    clusterThenMerge();
                }
                else if (clusteringMode() == CLUSTERING_MODE.upgma_assembly || clusteringMode() == CLUSTERING_MODE.kmer_assembly) {
                    clusterAssemblies();
                }

                return this;
            } catch (Exception exc) {
                //Get out the current ReferenceRange:
                ReferenceRange referenceRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
                myLogger.error("Error processing ReferenceRange:" + referenceRange.intervalString() + " ErrorMessage:" + exc.getMessage());
                myLogger.debug(exc.getMessage(), exc);
                return this;
            } finally {
                graphToMerge = null;
            }
        }

        public void clusterThenMerge() {
            currentRefRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
            Chromosome chr = currentRefRange.chromosome();

            List<Datum> temp = graphToMerge.getDataOfType(HaplotypeGraph.class);
            if (temp.size() != 1) {
                throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: clusterThenMerge: must input one Haplotype Graph");
            }

            HaplotypeGraph graph = (HaplotypeGraph)temp.get(0).getData();

            myLogger.info("Loading variants into RangeMap");

            //load variants into a RangeMap
            Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap = loadTaxaRangeMaps(graph);

            DistanceMatrix dm = calculateDistanceMatrix(taxonToVariantInfoMap);
            if (dm == null) {
                myLogger.info("Not enough data to calculate distances. No consensus haplotypes created for " + currentRefRange.toString());
                return;
            }

            myLogger.debug(String.format("Finished building distance matrix for ref range %d", currentRefRange.id()));

//            debug - show dm
            StringBuilder sb = new StringBuilder();
            for (double[] row : dm.getDistances()) {
                sb.append(Arrays.stream(row).mapToObj(d -> String.format("%1.6f",d)).collect(Collectors.joining(" "))).append("\n");
            }
            sb.append("-----------");
            myLogger.debug(sb.toString());

            //cluster and extract groups
            UPGMATree myTree = new UPGMATree(dm);
            TreeClusters myClusters = new TreeClusters(myTree);
            int[] taxaGroups = myClusters.getGroups(mxDiv());
            int nclusters = Arrays.stream(taxaGroups).max().orElse(0) + 1;

            if (nclusters > maxNumberOfClusters()) {
                int oldClusterNumber = nclusters;
                taxaGroups = myClusters.getGroups(maxNumberOfClusters());
                nclusters = Arrays.stream(taxaGroups).max().orElse(0) + 1;
                myLogger.debug(String.format("%d clusters at %s using mxDiv = %1.8f, number of clusters reduced to %d",
                        oldClusterNumber, currentRefRange.toString(), mxDiv(), nclusters));
            }

            System.out.println(groupsToTaxaLists(taxaGroups,myTree));
            System.out.println(Arrays.toString(taxaGroups));

            myLogger.debug(String.format("ref range chr %s pos %d has %d clusters", currentRefRange.chromosome().getName(), currentRefRange.start(), nclusters));


            //add consensus haplotypes to the consensusDataMap
            //consensusDataMap is a Multimap<Position, Tuple<AnchorDataPHG, List<String>>>
            //Position is the start position of currentRefRange
            //Each Tuple represents a single haplotype
            //AnchorDataPHG is an AnchorDataPHG object for that haplotype
            //List<String> is the list of taxa names, each ending in _<haplotype number>

            for (List<Taxon> listOfTaxa : groupsToTaxaLists(taxaGroups, myTree)) {
                List<VariantInfo> consensusVariants = consensusVariants(taxonToVariantInfoMap, listOfTaxa, currentRefRange);
                List<Long> consensusLongs = consensusVariants.stream().map(VariantInfo::toLong).collect(Collectors.toList());
                String consensusSequence = ConsensusProcessingUtils.convertVariantsToSequence(consensusVariants, currentRefRange, referenceGenotype);
                AnchorDataPHG myAnchorData = createAnchorPHG(consensusLongs, consensusSequence, currentRefRange);
                List<String> gameteNames = taxaToGameteNames(listOfTaxa);
                consensusDataMap.put(Position.of(currentRefRange.chromosome(), currentRefRange.start()), new Tuple<>(myAnchorData, gameteNames));
            }

        }

        private List<String> taxaToGameteNames(List<Taxon> taxa) {
            //the code that adds the consensus haplotypes to the db expects the taxa names to end in _<haplotype number>
            //if a taxon name does not already end that way, append _0
            List<String> gameteNames = taxa.stream().map(Taxon::getName).map(tn -> {
//                if (GAM.matcher(tn).matches()) return tn;
//                else return tn + "_0";
                return tn + "_0";
            }).collect(Collectors.toList());
            return gameteNames;
        }

        private Map<Taxon, RangeMap<Integer, VariantInfo>> loadTaxaRangeMaps(HaplotypeGraph graph) {
            List<HaplotypeNode> nodes = graph.nodes(currentRefRange);
            Map<Taxon, RangeMap<Integer, VariantInfo>> rangeMaps = new HashMap<>();
            int sitesInRef = currentRefRange.end() - currentRefRange.start() + 1;

            for (HaplotypeNode node : nodes) {
                Taxon myTaxon = node.taxaList().get(0);
                Optional<List<VariantInfo>> optInfo = node.variantInfos();

                if (optInfo.isPresent()) {
                    //calculate taxon coverage
                    int sitesCalled = optInfo.get().stream().mapToInt(var -> var.length()).sum();
                    double coverage = (double) sitesCalled / sitesInRef;

                    //add taxon to map if coverage exceeds minTaxaCoverage
                    if (coverage >= minTaxaCoverage()) {
                        rangeMaps.put(myTaxon, rangeMapForTaxon(optInfo.get(), myTaxon.getName()));
                    } else {
                        myLogger.debug(String.format("%s dropped due to insufficient coverage = %1.3f at %s",
                                myTaxon.getName(), coverage, currentRefRange.toString()));
                    }
                }
            }

            return rangeMaps;
        }

        private RangeMap<Integer, VariantInfo> rangeMapForTaxon(List<VariantInfo> infoList, String taxonName) {
            TreeRangeMap<Integer, VariantInfo> infoMap = TreeRangeMap.create();

            for (VariantInfo info : infoList) {
                //if the genotype call is N do not process it
                if (info.genotypeString().equals("N")) continue;

                //add the new range after removing any potential overlap
                Range<Integer> infoRange = Range.closed(info.start(), info.end());
                infoMap.remove(infoRange);
                infoMap.put(infoRange, info);

            }
            return infoMap;
        }
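        // Example of the overlap handling above (standard Guava TreeRangeMap semantics): if the map
        // already holds [100,120] -> refBlock and a variant covering [110,112] arrives, remove([110,112])
        // first splits the existing entry into [100,109] and [113,120], then the variant is put at
        // [110,112], so no two entries in the map ever overlap.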

        public DistanceMatrix calculateDistanceMatrix(Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap) {
            List<Taxon> taxa = new ArrayList<>(taxonToVariantInfoMap.keySet());
            int nTaxaInMap = taxa.size();
            double[][] distanceMatrix = new double[nTaxaInMap][nTaxaInMap];

            //set the distance values in the matrix
            for (int i = 0; i < nTaxaInMap - 1; i++) {
                distanceMatrix[i][i] = 0.0; //set the diagonal to 0
                for (int j = i + 1; j < nTaxaInMap; j++) {
                    RangeMap<Integer, VariantInfo> varmapi = taxonToVariantInfoMap.get(taxa.get(i));
                    RangeMap<Integer, VariantInfo> varmapj = taxonToVariantInfoMap.get(taxa.get(j));
                    distanceMatrix[i][j] = distanceMatrix[j][i] = distanceFromSNPs(varmapi, varmapj);
                }
            }

            //calculate row maxima (column maxima are the same)
            double[] rowMax = new double[nTaxaInMap];
            Arrays.fill(rowMax, Double.NaN);

            for (int i = 0; i < nTaxaInMap; i++) {
                for (int j = 0; j < nTaxaInMap; j++) {
                    if (!Double.isNaN(distanceMatrix[i][j])) {
                        if (Double.isNaN(rowMax[i])) rowMax[i] = distanceMatrix[i][j];
                        else rowMax[i] = Math.max(rowMax[i], distanceMatrix[i][j]);
                    }
                }
            }


            //delete any rows/columns that are completely missing other than the diagonal
            List<Integer> rowsNotMissing = new ArrayList<>();
            for (int r = 0; r < nTaxaInMap; r++) {
                long missingCount = Arrays.stream(distanceMatrix[r]).filter(d -> Double.isNaN(d)).count();
                if (missingCount < nTaxaInMap - 1) rowsNotMissing.add(r);
            }


            if (rowsNotMissing.size() <= 1) return null;
            TaxaListBuilder taxaBuilder = new TaxaListBuilder();

            //delete rows/columns, taxa, replace NaN's with row/column max
            for (Integer ndx : rowsNotMissing) taxaBuilder.add(taxa.get(ndx));
            DistanceMatrixBuilder distanceBuilder = DistanceMatrixBuilder.getInstance(taxaBuilder.build());

            int reducedTaxaNumber = rowsNotMissing.size();
            for (int i = 0; i < reducedTaxaNumber; i++) {
                int originalRow = rowsNotMissing.get(i);
                for (int j = i + 1; j < reducedTaxaNumber; j++) {
                    int originalColumn = rowsNotMissing.get(j);
                    if (Double.isNaN(distanceMatrix[originalRow][originalColumn]))
                        distanceBuilder.set(i, j, Math.max(rowMax[originalRow], rowMax[originalColumn]));
                    else distanceBuilder.set(i, j, distanceMatrix[originalRow][originalColumn]);
                }
            }
            return distanceBuilder.build();
        }
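        // Example of the missing-distance handling above (hypothetical values): if taxa A and B share too
        // few sites their pairwise distance is NaN; provided each still has at least one non-missing
        // distance, the A-B cell is filled with max(rowMax[A], rowMax[B]), e.g. 0.06 when A's largest
        // observed distance is 0.04 and B's is 0.06. Taxa whose off-diagonal distances are all missing
        // are dropped from the matrix entirely.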

        public double distanceFromSNPs(RangeMap<Integer, VariantInfo> rangeMap1, RangeMap<Integer, VariantInfo> rangeMap2) {
            //this method is designed to work with variant calls from GATK/Sentieon, which separate insertions and deletions
            //calls generated from mummer alignments should use the method in createDistanceMatrix()

            //compare only positions represented by single nucleotides
            //test only the pos for variants because pos+1 could be covered by a different variant due to GVCF overlaps
            //except that it is okay to process entire ref blocks

            int pos = currentRefRange.start();
            int totalSites = 0;
            int variantSites = 0;
            while (pos <= currentRefRange.end()) {
                VariantInfo var1 = rangeMap1.get(pos);
                VariantInfo var2 = rangeMap2.get(pos);
                if (var1 == null || var2 == null) {
                    pos++; //do not add this site to counts
                } else if(var1.isIndel() || var2.isIndel()) {
                    pos++; //do not add this site to counts
                } else if(var1.genotypeString().equals("N") || var2.genotypeString().equals("N")) {
                    pos++; //do not add this site to counts
                } else if(var1.isVariant() && var2.isVariant()) {
                    //query position in each variant. If a single nucleotide increment total
                    //if nucleotides are unequal increment variantSites
                    String geno1 = var1.genotypeString();
                    String geno2 = var2.genotypeString();
                    if ((NUC.matcher(geno1).matches()) && NUC.matcher(geno2).matches()) {
                        if (!geno1.equals(geno2)) variantSites++;
                        totalSites++;
                    }
                    pos++;
                } else if(var1.isVariant()) { // && !var2.isVariant
                    //var1 is a snp, so just need to check if it is a ref or alt call
                    totalSites++;
                    if (var1.genotypeString().equals(var1.altAlleleString())) variantSites++;
                    pos++;
                } else if(var2.isVariant()) { // && !var1.isVariant
                    //same as for var1 is variant
                    //var2 is a snp, so just need to check if it is a ref or alt call
                    totalSites++;
                    if (var2.genotypeString().equals(var2.altAlleleString())) variantSites++;
                    pos++;
                } else { //both are in ref block
                    //move to end of shortest ref block and increment totalSites by number of positions moved
                    //move pos 1 past the end of the block
                    //do not go beyond the end of currentRefRange
                    int endpos = Math.min(var1.end(), var2.end());
                    endpos = Math.min(endpos, currentRefRange.end());
                    totalSites += endpos - pos + 1;
                    pos = endpos + 1;
                }

            }

            if (totalSites < minSiteForComp()) return Double.NaN;

            return (double) variantSites / totalSites;
        }
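        // Worked example (illustrative): if two haplotypes share 200 comparable single-nucleotide
        // positions in this reference range (SNP calls plus ref-block bases, ignoring indels and Ns) and
        // differ at 10 of them, the distance is 10 / 200 = 0.05. If fewer than minSites (default 30)
        // comparable positions are shared, the distance is NaN and is handled by calculateDistanceMatrix().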


        /**
         * Method to cluster Assemblies
         *
         * Steps Taken Here:
         * For the current Reference Range, extract out all haplotypes in the graph
         * Compute a distance matrix using #SNPS/(#SNPs+#Ref) as the distance.  Ignore indels and Ns.
         * From this distance matrix, compute a UPGMA tree and cut at parameter mxDiv
         * Within each sub group, choose a single haplotype to be the representative haplotype.  This choice is made using the external rankingFile required to run this mode.
         * Once the representative haplotypes are chosen, write them to the db with the full Taxa list for the sub-group
         *
         */
        public void clusterAssemblies() {
            myLogger.info("Running Cluster Assemblies.");
            if(rankingMap == null) {
                throw new IllegalStateException("Unable to run clusterAssemblies: no ranking map was provided.");
            }
            currentRefRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
            Chromosome chr = currentRefRange.chromosome();

            List<Datum> temp = graphToMerge.getDataOfType(HaplotypeGraph.class);
            if (temp.size() != 1) {
                throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: clusterAssemblies: must input one Haplotype Graph");
            }

            HaplotypeGraph graph = (HaplotypeGraph)temp.get(0).getData();

            myLogger.info("Loading variants into RangeMap");
            //load variants into a RangeMap
            Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap = loadVariantsIntoRangeMap(graph,chr);

            //calculate a distance matrix
            int ntaxa = taxonToVariantInfoMap.keySet().size();
            TaxaList taxaWithInfo = new TaxaListBuilder().addAll(taxonToVariantInfoMap.keySet()).build();

            //Build a map from each taxon to its haplotype node so a representative node can be chosen for each cluster group
            Map<Taxon, HaplotypeNode> taxonToHapNodeMap = createTaxonToHaplotypeNode(graph,currentRefRange);

            //If we just have one haplotype, just write it to the DB immediately
            if(taxonToHapNodeMap.size()==1) {
                List<String> taxonNames = taxonToHapNodeMap.keySet().stream().map(taxon -> taxon.getName()+"_0").collect(Collectors.toList());

                HaplotypeNode currentNode = taxonToHapNodeMap.get(taxonToHapNodeMap.keySet().stream().collect(Collectors.toList()).get(0));

                String bestSeq = currentNode.haplotypeSequence().sequence();
                List<Long> consensusLongs = currentNode.variantContextsLong().get();
                consensusDataMap.put(Position.of(currentRefRange.chromosome(), currentRefRange.start()),
                        new Tuple<>(createAnchorPHG(consensusLongs, currentRefRange, bestSeq),
                                taxonNames));
            }
            else {
                DistanceMatrix dm;
                switch (clusteringMode()) {

                    case upgma_assembly:
                        dm = ConsensusProcessingUtils.createDistanceMatrix(ntaxa, chr, currentRefRange, taxaWithInfo, taxonToVariantInfoMap);
                        break;

                    case kmer_assembly:
                        dm = KmerBasedConsensusUtils.kmerDistanceMatrix(currentRefRange, graph.nodes(currentRefRange), kmerSize(), distanceCalculation());
                        break;

                    default:
                        throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: clusterAssemblies: clustering mode: " + clusteringMode().name() + " not supported.");

                }
                myLogger.debug(String.format("Finished building distance matrix for ref range %d", currentRefRange.id()));

//                System.out.println("Before SettingNs:");
//                System.out.println(dm.getTaxaList().stream().map(taxon -> taxon.getName()).collect(Collectors.joining(",")));
//                for (double[] row : dm.getDistances()) {
//                    System.out.println(Arrays.stream(row).mapToObj(d -> String.format("%1.4f",d)).collect(Collectors.joining(" ")));
//                }
//                System.out.println("-------");

                //Need to set the NaNs to the maximum distance, otherwise the UPGMA clustering below cannot work with missing distances
                dm = ConsensusProcessingUtils.setNsToMax(dm);
//                System.out.println(dm.getTaxaList().stream().map(taxon -> taxon.getName()).collect(Collectors.joining(",")));

                //Uncomment to show distance matrix
//                System.out.println("After SettingNs:");
//                for (double[] row : dm.getDistances()) {
//                    System.out.println(Arrays.stream(row).mapToObj(d -> String.format("%1.4f",d)).collect(Collectors.joining(" ")));
//                }
//                System.out.println("-------");

                //cluster and extract groups
                UPGMATree myTree = new UPGMATree(dm);
                TreeClusters myClusters = new TreeClusters(myTree);
                int[] taxaGroups = myClusters.getGroups(mxDiv());
                int nclusters = Arrays.stream(taxaGroups).max().orElse(0) + 1;

                if (nclusters > maxNumberOfClusters()) {
                    int oldClusterNumber = nclusters;
                    taxaGroups = myClusters.getGroups(maxNumberOfClusters());
                    nclusters = Arrays.stream(taxaGroups).max().orElse(0) + 1;
                    myLogger.debug(String.format("%d clusters at %s using mxDiv = %1.8f, number of clusters reduced to %d",
                            oldClusterNumber, currentRefRange.toString(), mxDiv(), nclusters));
                }

                List<List<Taxon>> clusters = groupsToTaxaLists(taxaGroups, myTree);

                //Display information about the cluster groups
                StringBuilder msgBuilder = new StringBuilder();
                msgBuilder.append("Cluster groups : ").append(Arrays.toString(taxaGroups));
                String groupSizes = clusters.stream().map(group -> Integer.toString(group.size())).collect(Collectors.joining(","));
                msgBuilder.append("\nNumber of taxa per group : ").append(groupSizes);
                myLogger.info(msgBuilder.toString());

                for (List<Taxon> group : clusters) {
                    //Use the ranking file to choose the best taxon.

                    double bestRanking = -Double.MAX_VALUE; //lowest possible value, so the first ranking encountered always replaces it
                    Taxon finalBestTaxon = null;
                    //go through the group and figure out which taxon is best according to the external ranking:
                    for (Taxon currentSubTaxon : group) {
                        double currentRank = rankingMap.get(currentSubTaxon.getName());
                        if (currentRank > bestRanking) {
                            bestRanking = currentRank;
                            finalBestTaxon = currentSubTaxon;
                        }
                    }


                    //Extract out the long for the representative taxon and create a list of taxa names to be clustered
                    List<String> taxonNames = group.stream().map(taxon -> taxon.getName() + "_0").collect(Collectors.toList());

                    String bestSeq = taxonToHapNodeMap.get(finalBestTaxon).haplotypeSequence().sequence();
                    List<Long> consensusLongs = taxonToHapNodeMap.get(finalBestTaxon).variantContextsLong().get();

                    //Put the output to the map to be written to the DB
                    consensusDataMap.put(Position.of(currentRefRange.chromosome(), currentRefRange.start()),
                            new Tuple<>(createAnchorPHG(consensusLongs, currentRefRange, bestSeq),
                                    taxonNames));
                }
            }
        }
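        // Worked example of the ranking-based selection above (hypothetical taxa and rankings): if a
        // cluster contains LineA, LineB and LineC with rankings 3.0, 1.5 and 2.0, LineA has the highest
        // ranking, so its haplotype sequence and variants are written to the db as the representative for
        // the group, with the taxa list LineA_0, LineB_0, LineC_0 attached to it.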

        /**
         * Method to load the variants for a given chromosome into a Range map.
         *
         * @param graph
         * @param chr
         * @return
         */
        private Map<Taxon, RangeMap<Integer, VariantInfo>> loadVariantsIntoRangeMap(HaplotypeGraph graph, Chromosome chr) {
            Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap = new HashMap<>();

            List<HaplotypeNode> myNodes = graph.nodes(currentRefRange);
            for (HaplotypeNode aNode : myNodes) {
                Optional<List<VariantInfo>> optVarInfo = aNode.variantInfos();

                if (optVarInfo.isPresent()) {
                    List<VariantInfo> myInfoList = optVarInfo.get();
                    if (myInfoList.size() > 0) {
                        ImmutableRangeMap.Builder<Integer, VariantInfo> mapBuilder = new ImmutableRangeMap.Builder<>();
                        Iterator<VariantInfo> infoIterator = myInfoList.iterator();
                        //This section of code tests for overlapping variants with if (info.start() > prevInfo.end())
                        //If variants overlap the second one is skipped, because including it would cause an error when calling
                        //mapBuilder.build(). As the distance method only ends up counting SNPs and overlapping variants involve indels
                        //there is no impact on the distance calculation.
                        VariantInfo prevInfo = null;
                        if (infoIterator.hasNext()) {
                            prevInfo = infoIterator.next();
                            mapBuilder.put(Range.closed(prevInfo.start(), prevInfo.end()), prevInfo);
                        }
                        while (infoIterator.hasNext()) {
                            VariantInfo info = infoIterator.next();
                            if (info.start() > prevInfo.end()) {
                                mapBuilder.put(Range.closed(info.start(), info.end()), info);
                                prevInfo = info;
                            }
                        }
                        for (Taxon taxon : aNode.taxaList()) taxonToVariantInfoMap.put(taxon, mapBuilder.build());
                    }
                }
            }

            return taxonToVariantInfoMap;
        }

        /**
         * Function to create a TaxonToHaplotypeNode mapping for use when creating consensus.
         * @param graph
         * @param referenceRange
         * @return
         */
        private Map<Taxon, HaplotypeNode> createTaxonToHaplotypeNode(HaplotypeGraph graph, ReferenceRange referenceRange) {
            Map<Taxon, HaplotypeNode> taxonHaplotypeNodeMap =
            graph.nodes(referenceRange).stream()
                    .flatMap(node -> {
                        List<Tuple<Taxon, HaplotypeNode>> intermediateHapNodes = new ArrayList<>();
                        TaxaList taxaList = node.taxaList();
                        for(Taxon tx : taxaList) {
                            intermediateHapNodes.add(new Tuple<>(tx, node));
                        }


                        return intermediateHapNodes.stream();
                    })
            .collect(Collectors.toMap(nodeTuple -> nodeTuple.getX(),nodeTuple -> nodeTuple.getY()));

            return taxonHaplotypeNodeMap;
        }

        private List<List<Taxon>> groupsToTaxaLists(int[] groups, Tree tree) {
            //the groups array is in the order of the external nodes in tree
            //to make groups into taxa lists, the Taxon must be retrieved from each external node using getIdentifier

            int maxGroup = Arrays.stream(groups).max().orElse(0);
            List<List<Taxon>> listOfLists = new ArrayList<>();
            for (int i = 0; i <= maxGroup; i++) listOfLists.add(new ArrayList<>());
            for (int ndx = 0; ndx < groups.length; ndx++) {
                Taxon taxon = tree.getExternalNode(ndx).getIdentifier();
                listOfLists.get(groups[ndx]).add(taxon);
            }

            //if minTaxa > 1, filter listOfLists on group size >= minTaxa
            if (minTaxa() > 1) {
                listOfLists = listOfLists.stream().filter(grp -> grp.size() >= minTaxa()).collect(Collectors.toList());
            }

            return listOfLists;
        }

        public List<VariantInfo> consensusVariants(Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap, List<Taxon> taxa, ReferenceRange refRange) {
            List<VariantInfo> consensusVariants = new ArrayList<>();
            Chromosome chr = refRange.chromosome();

            int pos = refRange.start();
            boolean inRefBlock = false;
            int refBlockStart = 0;
            int refBlockDepth = 0;

            while (pos <= refRange.end()) {
                final int currentPosition = pos;

                List<VariantInfo> currentVariants = taxa.stream()
                        .map(taxon -> taxonToVariantInfoMap.get(taxon))
                        .filter(rmap -> rmap != null)
                        .map(rmap -> rmap.get(currentPosition))
                        .filter(var -> var != null)
                        .map(var -> {
                            //convert any variants with a ref call to a ref block covering the same positions
                            if (var.isVariant() && var.genotypeString().equals(var.refAlleleString())) {
                                long varLong = ConsensusProcessingUtils.encodeRefBlockToLong(var.length(), var.depth()[0], currentPosition);
                                return new VariantInfo(chr.getName(), var.start(), var.end(), "REF", "REF", "NON-REF", false, varLong);
                            } else return var;
                        })
                        .collect(Collectors.toList());

                if (currentVariants.size() == 0) {
                    //no variants at this position
                    //if in a ref block, end the ref block at the previous position and add it to the variant list
                    if (inRefBlock) {
                        addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
                        inRefBlock = false;
                    }
                    pos++;

                } else {
                    //there are variants at this position. Are they all ref blocks
                    long nonRefCount = currentVariants.stream().filter(var -> var.isVariant()).count();
                    if (nonRefCount == 0) {
                        //all the variants are ref blocks
                        //if any taxa are missing, pos can only be incremented by one because the missingness could end at any time
                        //if all taxa have variants, find the shortest ref block
                        //if in a ref block, reset pos to the end of the shortest block + 1
                        //if not in a ref block, start one and reset pos

                        //the block cannot end before it starts, so resetting pos to a value less than currentPos would be problematic
                        //shouldn't happen but test just in case
                        int minEnd = currentPosition;
                        if (currentVariants.size() == taxa.size()) {
                            minEnd = currentVariants.stream().mapToInt(var -> var.end()).min().orElse(pos);
                            minEnd = Math.min(minEnd, refRange.end());  //do not go beyond the end of the currentRefRange
                            if (minEnd < currentPosition) {
                                String msg = "A ref block ends at " + minEnd + ", which is before the current position " + currentPosition;
                                throw new IllegalArgumentException(msg);
                            }
                        }

                        //calculate site ref block depth as sum of variant depths
                        int siteDepth = 0;
                        for (VariantInfo info : currentVariants) {
                            siteDepth += info.depth()[0];
                        }
                        if (inRefBlock) {
                            refBlockDepth = Math.min(refBlockDepth, siteDepth);
                        } else {
                            inRefBlock = true;
                            refBlockStart = pos;
                            refBlockDepth = siteDepth;
                        }

                        pos = minEnd + 1;
                    } else {
                        //Some variants are not ref blocks, so find the majority variant
                        //if the majority variant is not at first position add nothing
                        //if there are two variants with max freq, do not add a variant
                        //if the max freq < min freq, add nothing
                        //when adding a variant and in ref block, end the ref block at pos - 1 and add it to the consensus list
                        Multiset<Integer> idCounter = HashMultiset.create();
                        for (VariantInfo info : currentVariants) idCounter.add(info.variantId());

                        //sort the varids in order of descending count
                        List<int[]> varidCounts = idCounter.entrySet().stream()
                                .map(ent -> new int[]{ent.getElement().intValue(), ent.getCount()}).collect(Collectors.toList());
                        Collections.sort(varidCounts, (a,b) -> b[1] - a[1]);

                        int totalNumberOfVariants = currentVariants.size();
                        double majorFreq = (double) varidCounts.get(0)[1] / totalNumberOfVariants;

                        if (majorFreq >= minAlleleFrequency()
                                && ( varidCounts.size() == 1 || varidCounts.get(0)[1] > varidCounts.get(1)[1]) ) {

                            List<VariantInfo> majorityVariants = currentVariants.stream()
                                    .filter(var -> var.variantId() == varidCounts.get(0)[0]).collect(Collectors.toList());

                            //if the majority variant is a ref call (variantId == -1), extend or start a ref block
                            if (varidCounts.get(0)[0] == -1) {
                                //majority variant is ref, just start/extend a ref block
                                int siteDepth = 0;
                                for (VariantInfo info : majorityVariants) {
                                    siteDepth += info.depth()[0];
                                }
                                if (inRefBlock) {
                                    refBlockDepth = Math.min(refBlockDepth, siteDepth);
                                } else {
                                    inRefBlock = true;
                                    refBlockStart = pos;
                                    refBlockDepth = siteDepth;
                                }
                            } else {
                                int majorityStart = majorityVariants.get(0).start();
                                //only call the majority variant if its start position equals the current position
                                if (currentPosition == majorityStart) {

                                    //finally we have a real variant
                                    int[] siteDepth = new int[2];
                                    for (VariantInfo info : majorityVariants) {
                                        siteDepth[0] += info.depth()[0];
                                        siteDepth[1] += info.depth()[1];
                                    }
                                    if (inRefBlock) {
                                        //end the ref block at pos - 1
                                        addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
                                        inRefBlock = false;
                                    }
                                    VariantInfo aVariant = majorityVariants.get(0);
                                    long varLong = replaceDepthInLong(aVariant.toLong(), siteDepth, aVariant.isIndel());
                                    VariantInfo consensusInfo = new VariantInfo(chr.getName(), aVariant.start(), aVariant.end(), aVariant.genotypeString(), aVariant.refAlleleString(),
                                            aVariant.altAlleleString(), true, varLong);
                                    consensusVariants.add(consensusInfo);


                                } else {
                                    //did not call a variant at this position. It is probably covered by a previously called indel
                                    //just in case check inRefBlock and end it if true
                                    if (inRefBlock) {
                                        //end the ref block at pos - 1
                                        addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
                                        inRefBlock = false;
                                    }
                                }
                            }


                        } else {
                            //could not call a variant
                            //if ref blocks + first position of indels (==ref) is >= minfreq call the position ref
                            //but first must check insertions to make sure previous variant did not call this a deletion (this happens)
                            //the following code is kind of complex, but probably does not get used very often, maybe, hopefully
                            int refcount = 0;
                            int refDepth = 0;
                            //have to iterate through the taxa here because may need to get the previous position variant
                            for (Taxon taxon : taxa) {
                                VariantInfo info = taxonToVariantInfoMap.get(taxon).get(currentPosition);
                                if (info != null) {
                                    if (!info.isVariant()) {
                                        refcount++;
                                        refDepth += info.depth()[0];
                                    } else if (info.isIndel() && info.start() == currentPosition) {
                                        if (info.genotypeString().length() > info.refAlleleString().length()) {
                                            //insertion
                                            //if the previous variant for this taxon is a deletion covering this position,
                                            //   do not increment refcount
                                            VariantInfo previousInfo = taxonToVariantInfoMap.get(taxon).get(currentPosition - 1);
                                            if (previousInfo != null) {
                                                boolean wasIndel = previousInfo.isIndel();
                                                boolean wasNotDeletion = !wasIndel || previousInfo.genotypeString().length() >= previousInfo.refAlleleString().length();
                                                boolean coversThisPosition = previousInfo.start() < currentPosition && previousInfo.end() >= currentPosition;
                                                if (wasNotDeletion || !coversThisPosition) {
                                                    refcount++;
                                                    //because this is the alt variant, which has a ref call at its first position
                                                    refDepth += info.depth()[1];
                                                }

                                            }
                                        } else if (info.genotypeString().length() < info.refAlleleString().length()) {
                                            //deletion
                                            refcount++;
                                            refDepth += info.depth()[1];
                                        }
                                    }
                                }
                            }

                            if (refcount / totalNumberOfVariants >= minAlleleFrequency())  {
                                if (inRefBlock) {
                                    refBlockDepth = Math.min(refBlockDepth, refDepth);
                                } else {
                                    inRefBlock = true;
                                    refBlockStart = pos;
                                    refBlockDepth = refDepth;
                                }
                            } else if (inRefBlock) {
                                //otherwise end ref block if in one
                                //end the ref block at pos - 1
                                addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
                                inRefBlock = false;
                            }
                        }
                        pos++;
                    }
                }
            }

            //finished iterating through the ref range positions
            //if still in a ref block, add it to the consensus variants
            if (inRefBlock) {
                //end the ref block at pos - 1
                addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
                inRefBlock = false;
            }

            //sort the variants by start position
            Collections.sort(consensusVariants, (a,b) -> a.start() - b.start());
            
            //check for and reconcile overlapping variants
            //for a pair of overlapping variants, if the first is a ref call the overlap is okay
            //if the first is a deletion and the second is an insertion starting at the final base pair, that's okay
            //for anything else both variants should be deleted, but just report for now
            List<VariantInfo> checkedVariants = new ArrayList<>();
            Iterator<VariantInfo> iter = consensusVariants.iterator();
            VariantInfo previousVariant = null;
            if (iter.hasNext()) previousVariant = iter.next();
            while (iter.hasNext()) {
                VariantInfo currentVariant = iter.next();
                if (previousVariant == null) {
                    previousVariant = currentVariant;
                } else {
                    if (previousVariant.end() >= currentVariant.start()) {
                        //overlap
                        if (!previousVariant.isVariant()) {
                            checkedVariants.add(previousVariant);
                            previousVariant = currentVariant;
                        } else if (previousVariant.isIndel() && currentVariant.isIndel()) {
                            boolean isPreviousDeletion = previousVariant.genotypeString().length() < previousVariant.refAlleleString().length();
                            boolean isCurrentDeletion = currentVariant.genotypeString().length() < currentVariant.refAlleleString().length();

                            //deletion followed by insertion
                            //assumes the insertion is an alt call. Safe because ref calls were converted to ref blocks earlier in the code.
                            if (isPreviousDeletion && !isCurrentDeletion) {
                                //okay to leave as is
                                checkedVariants.add(previousVariant);
                                previousVariant = currentVariant;
                            } else {
                                //do not know why this happened, print message, skip previous variant, and continue
                                String taxanames = taxa.stream().map(t -> t.getName()).collect(Collectors.joining(","));
                                String message = new StringBuilder("OVERLAP for ")
                                        .append(taxanames)
                                        .append("\nvariant at chr ").append(previousVariant.chromosome())
                                        .append(": ").append(previousVariant.start()).append(" to ")
                                        .append(previousVariant.end()).append(" geno ")
                                        .append(previousVariant.genotypeString()).append(" ref ")
                                        .append(previousVariant.refAlleleString()).append(" alt ").append(previousVariant.altAlleleString())
                                        .append("\n").append("overlapped by variant at chr ").append(currentVariant.chromosome())
                                        .append(": ").append(currentVariant.start()).append(" to ")
                                        .append(currentVariant.end()).append(" geno ")
                                        .append(currentVariant.genotypeString()).append(" ref ")
                                        .append(currentVariant.refAlleleString()).append(" alt ").append(currentVariant.altAlleleString()).toString();
                                myLogger.debug(message);
                                previousVariant = currentVariant;
                            }
                        }
                    } else {
                        checkedVariants.add(previousVariant);
                        previousVariant = currentVariant;
                    }
                }
            }

            //add the final variant
            if (previousVariant != null) checkedVariants.add(previousVariant);

            if (checkedVariants.size() == 0) {
                String tn = taxa.stream().map(t -> t.getName()).collect(Collectors.joining(","));
                myLogger.debug("size of checked consensus variant list = 0 for " + tn + " at " + refRange + ". Size of consensusVariants = " +  consensusVariants.size());
            }

            return checkedVariants;
        }

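        /**
         * Adds a reference block covering positions start to end (inclusive) to the consensus
         * variant list. The block length, depth, and start are packed into a single long by
         * ConsensusProcessingUtils.encodeRefBlockToLong, and the block is stored as a
         * non-variant VariantInfo with placeholder REF/REF/NON_REF allele strings.
         */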
        private void addRefBlockToConsensusVariants(List<VariantInfo> consensus, String chrname, int start, int end, int depth) {
            int blockLength = end - start + 1;
            long varLong = ConsensusProcessingUtils.encodeRefBlockToLong(blockLength, depth, start);
            consensus.add(new VariantInfo(chrname, start, end, "REF","REF","NON_REF", false, varLong));
        }

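        /**
         * Replaces the depth value(s) in an encoded variant long with the summed consensus depths.
         * A negative long encodes a reference block (single read depth); a non-negative long
         * encodes a variant record (separate ref and alt depths).
         *
         * @param original  the encoded long whose depth field(s) are replaced
         * @param depth     two-element array of ref,alt depths
         * @param isIndel   whether the variant is an indel
         * @return the re-encoded long
         */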
        private Long replaceDepthInLong(Long original, int[] depth, boolean isIndel) {
            //depth is expected to be ref,alt
            int[] variantInfo = VariantUtils.decodeLongVariant(original);
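            //note (inferred from the encode calls below): for a ref block (original < 0)
            //  [1]=block length, [2]=read depth, [3]=start position;
            //for a variant record: [1]=variant_mapping id, [2]=ref depth, [3]=alt depth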

            if (original < 0) {
                // create long for reference record
                // format:  1bit=ref | 2 bytes 7 bits = refLength | 1 bytes=refDepth | 4 bytes=position on chrom
                //variantInfo[2] is read depth
                variantInfo[2] = depth[0];
                return ConsensusProcessingUtils.encodeRefBlockToLong(variantInfo[1],variantInfo[2], variantInfo[3]);
            } else {
                // create long for variant record
                // format: 4 bytes= variant_mapping table id | 1 byte=refDepth | 1 byte=altDepth | 1 bytes=isIndel | 1 byte unused
                variantInfo[2] = depth[0];
                variantInfo[3] = depth[1];
                //decode long variant does not grab indel info

                return ConsensusProcessingUtils.encodeVariantToLong(variantInfo[1],variantInfo[2], variantInfo[3], isIndel);

            }
        }

        public ReferenceRange getReferenceRange() {
            return currentRefRange;
        }

        public Multimap>> getConsensusDataMap() {
            return consensusDataMap;
        }



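        /**
         * Creates an AnchorDataPHG for a consensus haplotype.
         *
         * @param variants  a list of longs representing the variants in this haplotype
         * @param referenceRange    the ReferenceRange for this haplotype
         * @param hapSequence   the sequence string for this haplotype
         * @return an AnchorDataPHG for a haplotype
         */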
        private AnchorDataPHG createAnchorPHG(List<Long> variants, ReferenceRange referenceRange, String hapSequence) {
            try {

                Range<Position> intervalRange = Range.closed(Position.of(referenceRange.chromosome(), referenceRange.start()),
                        Position.of(referenceRange.chromosome(), referenceRange.end()));
                byte[] values = VariantsProcessingUtils.encodeVariantLongListToByteArray(variants);
                AnchorDataPHG adata = new AnchorDataPHG(intervalRange,
                        intervalRange,
                        "NA",
                        values,
                        hapSequence);
                return adata;
            } catch (Exception exc) {
                myLogger.error("RunHapCollapsePipelinePlugin: Error encoding List to byte[];");
                throw new IllegalStateException("RunHapCollapsePipelinePlugin: Error encoding List to byte[];", exc);
            }
        }
        /**
         *
         * @param variants  a list of longs representing the variants in this haplotype
         * @param sequence  the sequence string for this haplotype
         * @param referenceRange    the ReferenceRange for this haplotype
         * @return an AnchorDataPHG for a haplotype
         */
        private AnchorDataPHG createAnchorPHG(List<Long> variants, String sequence, ReferenceRange referenceRange) {
            try {

                Range<Position> intervalRange = Range.closed(Position.of(referenceRange.chromosome(), referenceRange.start()),
                        Position.of(referenceRange.chromosome(), referenceRange.end()));
                byte[] values = VariantsProcessingUtils.encodeVariantLongListToByteArray(variants);
                AnchorDataPHG adata = new AnchorDataPHG(intervalRange,
                        intervalRange,
                        "NA",
                        values,
                        sequence);
                return adata;
            } catch (Exception exc) {
                throw new IllegalStateException("RunHapCollapsePipelinePlugin: Error encoding List to byte[];", exc);
            }
        }

    }

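    /**
     * Builds the collapse method details string stored in the database. The user-supplied
     * collapseMethodDetails value is annotated with the clustering parameters read from the
     * config file (minTaxa, mxDiv, clusteringMode) and from the plugin parameters.
     * As an illustrative example (the minFreq, maxClusters, minSites, and minCoverage values
     * shown here are placeholders, not defaults), the result might look like:
     * "myDetails:minTaxa=1,mxDiv=0.01,mode=upgma,minFreq=0.5,maxClusters=30,minSites=30,minCoverage=0.1"
     */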
    private String getAnnotatedMethodDetails() {
        //configProperties comes from code in RunMergeAndCluster
        Properties dbProperties = loadProperties();
        int minTaxaPerHaplotype = Integer.parseInt(dbProperties.getProperty("minTaxa", "1"));
        double mxDiv = Double.parseDouble(dbProperties.getProperty("mxDiv", "0.01"));
        CLUSTERING_MODE clusteringMode = CLUSTERING_MODE.valueOf(dbProperties.getProperty("clusteringMode", "upgma"));
        StringBuilder detailBuilder = new StringBuilder();
        detailBuilder.append(collapseMethodDetails()).append(":");
        detailBuilder.append("minTaxa=").append(minTaxaPerHaplotype);
        detailBuilder.append(",mxDiv=").append(mxDiv);
        detailBuilder.append(",mode=").append(clusteringMode.toString());
        detailBuilder.append(",minFreq=").append(minAlleleFrequency());
        detailBuilder.append(",maxClusters=").append(maxNumberOfClusters());
        detailBuilder.append(",minSites=").append(minSiteForComp());
        detailBuilder.append(",minCoverage=").append(minTaxaCoverage());
        return detailBuilder.toString();
    }

    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        
        return ("Create Consensus Haplotypes");
    }

    @Override
    public String getToolTipText() {
        return ("Takes haplotypes for a given method, clusters them based on a divergence parameter, creates a consensus haplotype for each reference range interval");
    }
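
    // Minimal usage sketch (illustrative, not taken from the PHG documentation). It assumes the
    // standard TASSEL plugin constructor (Frame parentFrame, boolean isInteractive) and the
    // performFunction(DataSet) method inherited from AbstractPlugin; the file path and method
    // names below are placeholders, and the input DataSet is assumed to carry the HaplotypeGraph
    // built by earlier pipeline steps.
    //
    //   DataSet graphData = ...; // output of the graph-building step (assumed)
    //   new RunHapConsensusPipelinePlugin(null, false)
    //           .dbConfigFile("/path/to/configSQLite.txt")
    //           .collapseMethod("CONSENSUS")
    //           .collapseMethodDetails("consensus haplotypes, mxDiv=0.01")
    //           .minAlleleFrequency(0.5)
    //           .maxThreads(4)
    //           .performFunction(graphData);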
    
    /**
     * Input Reference Fasta
     *
     * @return Ref
     */
    public String reference() {
        return myReference.value();
    }

    /**
     * Set Ref. Input Reference Fasta
     *
     * @param value Ref
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin reference(String value) {
        myReference = new PluginParameter<>(myReference, value);
        return this;
    }

    /**
     * File holding the DB config information
     *
     * @return Db Config File
     */
    public String dbConfigFile() {
        return dbConfigFile.value();
    }

    /**
     * Set Db Config File. File holding the DB config information
     *
     * @param value Db Config File
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin dbConfigFile(String value) {
        dbConfigFile = new PluginParameter<>(dbConfigFile, value);
        return this;
    }

    /**
     * Name of the collapse method to be stored in the database
     *
     * @return Collapse Method
     */
    public String collapseMethod() {
        return myCollapseMethod.value();
    }

    /**
     * Set Collapse Method. Name of the collapse method to
     * be stored in the database
     *
     * @param value Collapse Method
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin collapseMethod(String value) {
        myCollapseMethod = new PluginParameter<>(myCollapseMethod, value);
        return this;
    }

    /**
     * Details for the collapse method to be stored in the
     * database
     *
     * @return Collapse Method Details
     */
    public String collapseMethodDetails() {
        return myCollapseMethodDetails.value();
    }

    /**
     * Set Collapse Method Details. Details for the collapse
     * method to be stored in the database
     *
     * @param value Collapse Method Details
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin collapseMethodDetails(String value) {
        myCollapseMethodDetails = new PluginParameter<>(myCollapseMethodDetails, value);
        return this;
    }

    /**
     * At each position, if no allele has the minimum frequency,
     * the consensus haplotype allele will be set to missing.
     *
     * @return Minimum Allele Frequency
     */
    public Double minAlleleFrequency() {
        return minAlleleFrequency.value();
    }

    /**
     * Set Minimum Allele Frequency. At each position, if
     * no allele has the minimum frequency, the consensus
     * haplotype allele will be set to missing.
     *
     * @param value Minimum Allele Frequency
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin minAlleleFrequency(Double value) {
        minAlleleFrequency = new PluginParameter<>(minAlleleFrequency, value);
        return this;
    }

    /**
     * File containing the ranking for the taxa in the DB. This is
     * used to break ties.
     *
     * @return Ranking File
     */
    public String rankingFile() {
        return rankingFile.value();
    }

    /**
     * Set Ranking File. File containing the ranking for the
     * taxa in the DB. This is used to break ties.
     *
     * @param value Ranking File
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin rankingFile(String value) {
        rankingFile = new PluginParameter<>(rankingFile, value);
        return this;
    }

    /**
     * The maximum number of clusters that will be created
     * for a reference range. If mxDiv produces too many clusters
     * then the cut height that produces maxClusters number
     * of clusters will be substituted.
     *
     * @return Maximum Cluster Number
     */
    public Integer maxNumberOfClusters() {
        return maxNumberOfClusters.value();
    }

    /**
     * Set Maximum Cluster Number. The maximum number of clusters
     * that will be created for a reference range. If mxDiv
     * produces too many clusters then the cut height that
     * produces maxClusters number of clusters will be substituted.
     *
     * @param value Maximum Cluster Number
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin maxNumberOfClusters(Integer value) {
        maxNumberOfClusters = new PluginParameter<>(maxNumberOfClusters, value);
        return this;
    }

    /**
     * The minimum number of shared sites that can be used
     * to calculate the distance between two taxa
     *
     * @return Minimum Sites with Data
     */
    public Integer minSiteForComp() {
        return minSiteForComp.value();
    }

    /**
     * Set Minimum Sites with Data . The minimum number of
     * shared sites that can be used to calculate the distance
     * between two taxa
     *
     * @param value Minimum Sites with Data
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin minSiteForComp(Integer value) {
        minSiteForComp = new PluginParameter<>(minSiteForComp, value);
        return this;
    }

    /**
     * Minimum Coverage
     *
     * @return Minimum Coverage
     */
    public Double minTaxaCoverage() {
        return minTaxaCoverage.value();
    }

    /**
     * Set Minimum Coverage. Minimum Coverage
     *
     * @param value Minimum Coverage
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin minTaxaCoverage(Double value) {
        minTaxaCoverage = new PluginParameter<>(minTaxaCoverage, value);
        return this;
    }

    /**
     * The maximum number of threads to be used to create
     * consensi. The actual number of threads used will not
     * be greater than the number of available CPUs - 2.
     *
     * @return Max Threads
     */
    public Integer maxThreads() {
        return maxThreads.value();
    }

    /**
     * Set Max Threads. The maximum number of threads to be
     * used to create consensi. The actual number of threads
     * used will not be greater than the number of available CPUs
     * - 2.
     *
     * @param value Max Threads
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin maxThreads(Integer value) {
        maxThreads = new PluginParameter<>(maxThreads, value);
        return this;
    }

    /**
     * Minimum number of taxa
     *
     * @return Min Taxa
     */
    public Integer minTaxa() {
        return minTaxa.value();
    }

    /**
     * Set Min Taxa. Minimum number of taxa
     *
     * @param value Min Taxa
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin minTaxa(Integer value) {
        minTaxa = new PluginParameter<>(minTaxa, value);
        return this;
    }

    /**
     * Maximum divergence.
     *
     * @return Mx Div
     */
    public Double mxDiv() {
        return mxDiv.value();
    }

    /**
     * Set Mx Div. Maximum divergence.
     *
     * @param value Mx Div
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin mxDiv(Double value) {
        mxDiv = new PluginParameter<>(mxDiv, value);
        return this;
    }

    /**
     * Clustering mode
     *
     * @return Clustering Mode
     */
    public CLUSTERING_MODE clusteringMode() {
        return clusteringMode.value();
    }

    /**
     * Set Clustering Mode. Clustering mode
     *
     * @param value Clustering Mode
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin clusteringMode(CLUSTERING_MODE value) {
        clusteringMode = new PluginParameter<>(clusteringMode, value);
        return this;
    }

    /**
     * Kmer size
     *
     * @return Kmer Size
     */
    public Integer kmerSize() {
        return kmerSize.value();
    }

    /**
     * Set Kmer Size. Kmer size
     *
     * @param value Kmer Size
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin kmerSize(Integer value) {
        kmerSize = new PluginParameter<>(kmerSize, value);
        return this;
    }

    /**
     * Distance calculation type
     *
     * @return Distance Calculation
     */
    public DistanceCalculation distanceCalculation() {
        return distanceCalculation.value();
    }

    /**
     * Set Distance Calculation. Distance calculation type
     *
     * @param value Distance Calculation
     *
     * @return this plugin
     */
    public RunHapConsensusPipelinePlugin distanceCalculation(DistanceCalculation value) {
        distanceCalculation = new PluginParameter<>(distanceCalculation, value);
        return this;
    }
}



