package net.maizegenetics.pangenome.hapcollapse;
import com.google.common.collect.*;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.HaplotypeNode.VariantInfo;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.api.VariantUtils;
import net.maizegenetics.pangenome.db_loading.*;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.taxa.distance.DistanceMatrixBuilder;
import net.maizegenetics.taxa.tree.Tree;
import net.maizegenetics.taxa.tree.TreeClusters;
import net.maizegenetics.taxa.tree.UPGMATree;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import javax.swing.*;
import java.awt.*;
import java.io.BufferedReader;
import java.sql.Connection;
import java.util.List;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
*
* This plugin creates consensus haplotypes.
*
* It processes each reference range in a separate thread using Java Futures.
* Data is written to the database via a single thread.
*
* Simple plugin to run the full Haplotype Collapse pipeline. It will do the following steps:
* 1. Loop through each reference range in the graph:
* 1.a Extract the HaplotypeNodes with the VariantInfo data.
* 1.b Cluster haplotypes based on the FindHaplotypeClustersPlugin and a genotype table
* containing only variantId data for sites with SNPs.
* 1.c Export clustered GenotypeTables
* 1.d Determine allele calls
*
* 2. For each consensus haplotype created, upload it to the database.
*
* based on RunHapCollapsePipelinePlugin
*
* NOTE: upgma is no longer supported, but may be added again later. 07/25/22
*
* @author lcj34
* @author pjb39
*
*/
public class RunHapConsensusPipelinePlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(RunHapConsensusPipelinePlugin.class);
public enum CLUSTERING_MODE {upgma, upgma_assembly, kmer_assembly};
private PluginParameter<String> myReference = new PluginParameter.Builder<>("referenceFasta", null, String.class)
.description("Input Reference Fasta")
.inFile()
.build();
private PluginParameter<String> dbConfigFile = new PluginParameter.Builder<>("dbConfigFile", null, String.class)
.description("File holding the DB config information")
.required(true)
.inFile()
.build();
private PluginParameter<String> myCollapseMethod = new PluginParameter.Builder<>("collapseMethod", null, String.class)
.description("Name of the collapse method to be stored in the database")
.required(true)
.build();
private PluginParameter<String> myCollapseMethodDetails = new PluginParameter.Builder<>("collapseMethodDetails", null, String.class)
.description("Details for the collapse method to be stored in the database")
.required(false)
.build();
private PluginParameter<Double> minAlleleFrequency = new PluginParameter.Builder<>("minFreq", 0.5, Double.class)
.guiName("Minimum Allele Frequency")
.description("At each position, if no allele has the minimum frequency, the consensus haplotype allele will be set to missing.")
.required(false)
.build();
private PluginParameter<String> rankingFile = new PluginParameter.Builder<>("rankingFile", null, String.class)
.description("File of rankings for the taxa in the DB. This is used to break ties. A tab-delimited file with 2 columns, no header. The first column is a taxa name, the second is a number. The largest numbers indicate highest priority when breaking ties.")
.required(false)
.inFile()
.build();
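// Example rankingFile contents (two tab-separated columns, no header; higher number = higher
// priority when breaking ties; the taxa names below are placeholders):
//   LineA    10
//   LineB    5
//   LineC    1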
private PluginParameter<Integer> maxNumberOfClusters = new PluginParameter.Builder<>("maxClusters", 30, Integer.class)
.required(false)
.description("The maximum number of clusters that will be created for a reference range. " +
"If mxDiv produces too many clusters then the cut height that produces maxClusters number of clusters will be substituted.")
.guiName("Maximum Cluster Number")
.build();
private PluginParameter<Integer> minSiteForComp = new PluginParameter.Builder<>("minSites", 30, Integer.class)
.required(false)
.description("The minimum number of shared sites that can be used to calculate the distance between two taxa")
.guiName("Minimum Sites with Data ")
.build();
private PluginParameter<Double> minTaxaCoverage = new PluginParameter.Builder<>("minCoverage", 0.1, Double.class)
.required(false)
.description("For each range, any taxon with coverage of less than this amount will not be used" +
" to generate consensus haplotypes and will not be included in any haplotype group for that range.")
.guiName("Minimum Coverage")
.range(Range.closed(0.0,1.0))
.build();
private PluginParameter<Integer> maxThreads = new PluginParameter.Builder<>("maxThreads", 1000, Integer.class)
.description("The maximum number of threads to be used to create consensi. " +
"The actual number of threads used will not be greater than the number of available CPUs - 2.")
.build();
private PluginParameter<Integer> minTaxa = new PluginParameter.Builder<>("minTaxa", 1, Integer.class)
.description("Minimum number of taxa")
.build();
private PluginParameter<Double> mxDiv = new PluginParameter.Builder<>("mxDiv", 0.01, Double.class)
.description("Maximum divergence.")
.build();
private PluginParameter<CLUSTERING_MODE> clusteringMode = new PluginParameter.Builder<>("clusteringMode", CLUSTERING_MODE.upgma_assembly, CLUSTERING_MODE.class)
.description("Clustering mode")
.range(CLUSTERING_MODE.values())
.build();
private PluginParameter<Integer> kmerSize = new PluginParameter.Builder<>("kmerSize", 7, Integer.class)
.description("Kmer size")
.dependentOnParameter(clusteringMode, CLUSTERING_MODE.kmer_assembly)
.build();
private PluginParameter<DistanceCalculation> distanceCalculation = new PluginParameter.Builder<>("distanceCalculation", DistanceCalculation.Euclidean, DistanceCalculation.class)
.description("Distance calculation type")
.dependentOnParameter(clusteringMode, CLUSTERING_MODE.kmer_assembly)
.range(DistanceCalculation.values())
.build();
private PluginParameter<Boolean> isTestMethod = new PluginParameter.Builder<>("isTestMethod", false, Boolean.class)
.description("Indication of whether the data is to be loaded against a test method. Data loaded with test methods are not cached with the PHG ktor server")
.required(false).build();
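// Illustrative sketch (not taken from the PHG documentation) of configuring this plugin
// programmatically through the builder-backed parameters above. File names are placeholders and
// dataSetWithGraph is assumed to be a DataSet containing a single HaplotypeGraph Datum.
//
//   RunHapConsensusPipelinePlugin consensusPlugin = new RunHapConsensusPipelinePlugin(null, false)
//           .dbConfigFile("dbConfig.txt")                      // required
//           .collapseMethod("CONSENSUS")                       // required
//           .clusteringMode(CLUSTERING_MODE.upgma_assembly)
//           .reference("reference.fa")                         // required for upgma_assembly
//           .rankingFile("taxaRanking.txt")                    // required for upgma_assembly / kmer_assembly
//           .mxDiv(0.0001);
//   consensusPlugin.performFunction(dataSetWithGraph);         // performFunction is inherited from AbstractPlugin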
public RunHapConsensusPipelinePlugin() {
super(null, false);
}
public RunHapConsensusPipelinePlugin(Frame parentFrame) {
super(parentFrame, false);
}
public RunHapConsensusPipelinePlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
protected void postProcessParameters() {
if (clusteringMode() == CLUSTERING_MODE.upgma) {
throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: postProcessParameters: clustering mode upgma is not currently supported. Please use upgma_assembly or kmer_assembly");
}
if (clusteringMode() == CLUSTERING_MODE.upgma_assembly && (reference() == null || reference().isEmpty())) {
throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: postProcessParameters: reference fasta must be specified for clustering modes upgma_assembly or upgma");
}
if ((clusteringMode() == CLUSTERING_MODE.upgma_assembly || clusteringMode() == CLUSTERING_MODE.kmer_assembly) && (rankingFile() == null || rankingFile().isEmpty())) {
throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: postProcessParameters: ranking file must be specified for clustering modes upgma_assembly and kmer_assembly");
}
}
@Override
public DataSet processData(DataSet input) {
//Load up the PHG
List<Datum> temp = input.getDataOfType(HaplotypeGraph.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("HapCountBestPathPlugin: processData: must input one HaplotypeGraph: " + temp.size());
}
HaplotypeGraph hapgraph = (HaplotypeGraph) temp.get(0).getData();
runConsensusPipeline(hapgraph, reference());
return null;
}
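// Sketch of how the expected input DataSet could be assembled (mirrors the Datum/DataSet usage in
// runConsensusPipeline below; the graph variable is a placeholder for an already-built HaplotypeGraph):
//
//   HaplotypeGraph graph = ...; // e.g. produced by the graph-building steps of the pipeline
//   List<Datum> inputData = new ArrayList<>();
//   inputData.add(new Datum("HaplotypeGraph", graph, "graph whose haplotypes will be collapsed"));
//   DataSet input = new DataSet(inputData, null);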
/**
* This method will loop through each reference range in the graph and will:
* 1. Merge the List of variant/ref data for a given haplotype method:
* 2. Cluster the haplotypes together into groups
* Then when done with all reference ranges, load data for each cluster to the db.
*
* This is the main function that processes each reference range in the graph
*
* @param graph haplotype graph
* @param referenceFasta reference fasta
*/
public void runConsensusPipeline(HaplotypeGraph graph, String referenceFasta) {
final GenomeSequence referenceSequence;
if (reference() != null && !reference().isEmpty()) {
referenceSequence = GenomeSequenceBuilder.instance(referenceFasta);
} else {
referenceSequence = null;
}
// Get the properties object
Properties dbProperties = loadProperties();
myLogger.info("Starting up the threadpool");
ExecutorService threadpool = ForkJoinPool.commonPool();
int availableCPUs = Runtime.getRuntime().availableProcessors();
if (maxThreads() < availableCPUs - 2 ) {
threadpool = new ForkJoinPool(maxThreads());
}
myLogger.info("Thread Pool started");
BlockingQueue<Future<RunMergeAndCluster>> futures = new LinkedBlockingQueue<>(5100);
// Start thread that processes futures
Future<?> processingFuture = threadpool.submit(new ProcessFutures(futures));
myLogger.info("Loading up the ranking file");
Map<String, Double> rankingMap = new HashMap<>();
if(rankingFile() != null) {
try {
BufferedReader reader = Utils.getBufferedReader(rankingFile());
String currentLine = "";
while ((currentLine = reader.readLine()) != null) {
String[] currentLineSplit = currentLine.split("\t");
rankingMap.put(currentLineSplit[0], Double.parseDouble(currentLineSplit[1]));
}
myLogger.info("runConsensusPipeline: there are " + rankingMap.entrySet().size() + " entries processed from rankingFile:" + rankingFile());
} catch (Exception e) {
myLogger.info("RunHapCollapsePipelinePlugin: runCollapsePipeline: Problem with reading ranking file: " + e.getMessage());
}
//Verify the ranking map
if(!ConsensusProcessingUtils.areGraphTaxaInRankingMap(graph,rankingMap)) {
throw new IllegalStateException("Ranking file does not have all the taxon in the graph. " +
"Please check your ranking file and make sure you have rankings for each taxon.");
}
if(!ConsensusProcessingUtils.areRankingsUnique(rankingMap)) {
myLogger.warn("WARNING found multiple taxon with the same ranking. This has the potential to select incorrect representative haplotypes.");
}
}
else {
myLogger.info("No Ranking file was supplied. Running Consensus in upgma mode ");
}
myLogger.info("Walking through reference ranges and starting up future threads");
boolean normalTermination = true;
//processingFuture is running in a thread. When an exception occurs within processingFuture,
// it prints an error message in the log and dies but does not kill the calling thread.
// That blocks the futures queue and things just hang waiting for completion.
// So, the approach is to check to make sure the processingFuture thread is still alive before adding another reference range
// and, just to be safe, also check to make sure that processingFuture does not die after that check.
for (ReferenceRange referenceRange : graph.referenceRanges()) {
List datumList = new ArrayList<>();
datumList.add(new Datum("graph", graph, "Graph Passed In"));
if (referenceSequence != null) {
datumList.add(new Datum("referenceGenomeSequence", referenceSequence, "GenomeSequence holding the reference"));
}
datumList.add(new Datum("refRange", referenceRange, " Current Reference Range"));
datumList.add(new Datum("dbConfigFile",dbConfigFile(), " Database configuration info")); // lcj added
DataSet input = new DataSet(datumList, null);
//processingFuture.isDone() returns true if processingFuture has terminated for any reason.
// If the futures queue is empty it blocks but does not terminate. It will not terminate
// until it encounters an empty RunMergeAndCluster or if an exception occurs.
if (processingFuture.isDone()) {
myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
myLogger.error("processingFuture was terminated prematurely. Stopped trying to add ref ranges at " + referenceRange.toString());
myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
normalTermination = false;
break;
}
try {
// Add processing of the merging and then clustering to the blocking queue
Future<RunMergeAndCluster> myFuture = threadpool.submit(new RunMergeAndCluster(input,
dbProperties, referenceSequence, rankingMap));
Boolean added = false;
while (!added && !processingFuture.isDone()) {
added = futures.offer(myFuture, 1L, TimeUnit.MINUTES);
}
} catch (Exception e) {
// Catch the error, but finish processing the reference ranges
myLogger.debug(e.getMessage(), e);
myLogger.error("Error Processing refRange: " + referenceRange.intervalString() + "\nError: " + e.getMessage());
}
}
try {
//add an empty RunMergeAndCluster to the processing queue to tell processingFutures to terminate
if (processingFuture.isDone()) {
myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
myLogger.error("processingFuture was terminated prematurely. All reference ranges were added to the processing queue.");
myLogger.error("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
normalTermination = false;
} else {
Future<RunMergeAndCluster> lastFuture = threadpool.submit(new RunMergeAndCluster());
Boolean added = false;
while (!added && !processingFuture.isDone()) {
added = futures.offer(lastFuture, 1L, TimeUnit.MINUTES);
}
}
} catch (InterruptedException e) {
myLogger.error("Error putting empty RunMergeAndCluster in runConsensusPipeline. May interfere with thread shutdown.");
myLogger.debug(e.getMessage(), e);
}
// Wait until processing of all futures is done
try {
processingFuture.get();
} catch (Exception ex) {
myLogger.debug(ex.getMessage(), ex);
}
myLogger.info("Shutting down the threadpool");
threadpool.shutdown();
if (!normalTermination) throw new IllegalArgumentException("The consensus pipeline terminated prematurely");
}
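// The queue handling above follows a producer/consumer pattern with a sentinel ("poison pill"):
// work futures are submitted to the pool and offered to a BlockingQueue, and a final empty
// RunMergeAndCluster tells the consumer thread to stop. A minimal standalone sketch of the same
// idea, with simplified types and hypothetical jobs/process placeholders (assume the enclosing
// method declares throws Exception):
//
//   ExecutorService pool = Executors.newFixedThreadPool(4);
//   BlockingQueue<Future<String>> queue = new LinkedBlockingQueue<>();
//   Future<?> consumer = pool.submit(() -> {
//       try {
//           while (true) {
//               String result = queue.take().get();   // blocks until the next future completes
//               if (result == null) break;            // a null result plays the role of the sentinel
//               // ... write result to the database ...
//           }
//       } catch (Exception e) { throw new IllegalStateException(e); }
//   });
//   for (String job : jobs) queue.put(pool.submit(() -> process(job)));
//   queue.put(CompletableFuture.completedFuture(null)); // the poison pill
//   consumer.get();
//   pool.shutdown();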
private Properties loadProperties() {
Properties configProperties = new Properties();
try {
configProperties.load(Utils.getBufferedReader(dbConfigFile()));
} catch (Exception e) {
myLogger.error("RunHapCollapsePipelinePlugin: loadProperties Failed to Load Properties.", e);
throw new IllegalStateException("RunHapCollapsePipelinePlugin: loadProperties Failed to Load Properties.", e);
}
return configProperties;
}
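// For reference, a sketch of what a dbConfigFile in java.util.Properties format might contain.
// Only minTaxa, mxDiv and clusteringMode are read directly in this class (see getAnnotatedMethodDetails);
// the connection keys shown are placeholders whose exact names depend on DBLoadingUtils:
//
//   host=localHost
//   user=sqlite
//   password=sqlite
//   DB=/path/to/phg.db
//   DBtype=sqlite
//   minTaxa=1
//   mxDiv=0.01
//   clusteringMode=upgma_assembly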
/**
* The ProcessFutures class is the thread that consumes the completed RunMergeAndCluster futures and uploads their results to the db.
* Basically this thread runs until it finds the final RunMergeAndCluster object, blocking until each thread in the queue is done.
* Once we have over 5000 haplotypes to write to the db, we write those haplotypes to the db and start a new batch.
*
*/
private class ProcessFutures implements Runnable {
private final BlockingQueue<Future<RunMergeAndCluster>> myQueue;
public ProcessFutures(BlockingQueue<Future<RunMergeAndCluster>> queue) {
myQueue = queue;
}
@Override
public void run() {
try (Connection dbConnect = DBLoadingUtils.connection(dbConfigFile(), false)) {
PHGDataWriter phg = new PHGdbAccess(dbConnect);
Multimap<Position, Tuple<AnchorDataPHG, List<String>>> consensusDataMap = HashMultimap.create();
DBLoadingUtils.MethodType methodType = DBLoadingUtils.MethodType.CONSENSUS_ANCHOR_SEQUENCE;
if (isTestMethod()) {
methodType = DBLoadingUtils.MethodType.TEST_CONSENSUS_ANCHOR_SEQUENCE;
}
int methodId = phg.putMethod(collapseMethod(), methodType, pluginParameters());
Future<RunMergeAndCluster> future = myQueue.take();
RunMergeAndCluster mergeAndClusterObject = future.get();
while (mergeAndClusterObject == null || !mergeAndClusterObject.myIsFinal) {
if (mergeAndClusterObject == null) {
//Skip over this one and take the next future so the loop cannot spin forever.
future = myQueue.take();
mergeAndClusterObject = future.get();
continue;
}
consensusDataMap.putAll(mergeAndClusterObject.getConsensusDataMap());
ReferenceRange currentRefRange = mergeAndClusterObject.getReferenceRange();
myLogger.debug("RefRange: "+currentRefRange.intervalString());
//TODO make this a parameter
if (consensusDataMap.size() > 5000) {
myLogger.debug("Writing to the database:");
//Load the consensus to the db
Long time = System.nanoTime();
phg.putConsensusSequences(consensusDataMap, methodId);
double totalTime = (System.nanoTime() - time)/1e9;
myLogger.debug("time to write 10000 entries in seconds: " + totalTime);
//Clear out the multimap
consensusDataMap.clear();
}
future = myQueue.take();
mergeAndClusterObject = future.get();
}
//Load the consensus to the db
phg.putConsensusSequences(consensusDataMap, methodId);
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("RunHapCollapsePipelinePlugin: ProcessFutures: problem: " + e.getMessage());
}
}
}
/**
*
* This class takes a reference range, runs through all the
* taxa with their variant lists, and creates a genotype table based on variant_id from the PHG variants table.
* It sends this genotype table through FindHaplotypeClustersPlugin
* and decodes the results from the clusters plugin into actual alleles.
*
*/
public class RunMergeAndCluster implements Callable<RunMergeAndCluster> {
private DataSet graphToMerge;
private ReferenceRange currentRefRange;
private final boolean myIsFinal;
private Multimap<Position, Tuple<AnchorDataPHG, List<String>>> consensusDataMap = HashMultimap.create();
private GenomeSequence referenceGenotype = null;
private final Map<String, Double> rankingMap;
private final Pattern NUC = Pattern.compile("[ACGTR]"); //R denotes a ref nucleotide here
private final Pattern GAM = Pattern.compile(".+_[0-9]+");
public RunMergeAndCluster(DataSet graphToMerge, Properties dbProperties, GenomeSequence referenceSequence) {
this.graphToMerge = graphToMerge;
this.referenceGenotype = referenceSequence;
myIsFinal = false;
this.rankingMap = null;
}
public RunMergeAndCluster(DataSet graphToMerge, Properties dbProperties, GenomeSequence referenceSequence, Map<String, Double> rankingMap) {
this.graphToMerge = graphToMerge;
this.referenceGenotype = referenceSequence;
myIsFinal = false;
this.rankingMap = rankingMap;
}
public RunMergeAndCluster() {
this.graphToMerge = null;
myIsFinal = true;
this.rankingMap = null;
}
@Override
public RunMergeAndCluster call() {
if (myIsFinal) {
return this;
}
try {
if (clusteringMode() == CLUSTERING_MODE.upgma) {
clusterThenMerge();
}
else if (clusteringMode() == CLUSTERING_MODE.upgma_assembly || clusteringMode() == CLUSTERING_MODE.kmer_assembly) {
clusterAssemblies();
}
return this;
} catch (Exception exc) {
//Get out the current ReferenceRange:
ReferenceRange referenceRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
myLogger.error("Error processing ReferenceRange:" + referenceRange.intervalString() + " ErrorMessage:" + exc.getMessage());
myLogger.debug(exc.getMessage(), exc);
return this;
} finally {
graphToMerge = null;
}
}
public void clusterThenMerge() {
currentRefRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
Chromosome chr = currentRefRange.chromosome();
List<Datum> temp = graphToMerge.getDataOfType(HaplotypeGraph.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: clusterThenMerge: must input one Haplotype Graph");
}
HaplotypeGraph graph = (HaplotypeGraph)temp.get(0).getData();
myLogger.info("clusterThenMerge:Loading variants into RangeMap");
//load variants into a RangeMap
Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap = loadTaxaRangeMaps(graph);
DistanceMatrix dm = calculateDistanceMatrix(taxonToVariantInfoMap);
if (dm == null) {
myLogger.info("Not enough data to calculate distances. No consensus haplotypes created for " + currentRefRange.toString());
return;
}
myLogger.debug(String.format("Finished builder distance matrix for ref range %d", currentRefRange.id()));
// debug - show dm
StringBuilder sb = new StringBuilder();
for (double[] row : dm.getDistances()) {
sb.append(Arrays.stream(row).mapToObj(d -> String.format("%1.6f",d)).collect(Collectors.joining(" "))).append("\n");
}
sb.append("-----------");
myLogger.debug(sb.toString());
//cluster and extract groups
UPGMATree myTree = new UPGMATree(dm);
TreeClusters myClusters = new TreeClusters(myTree);
int[] taxaGroups = myClusters.getGroups(mxDiv());
int nclusters = Arrays.stream(taxaGroups).max().orElse(0) + 1;
if (nclusters > maxNumberOfClusters()) {
int oldClusterNumber = nclusters;
taxaGroups = myClusters.getGroups(maxNumberOfClusters());
nclusters = Arrays.stream(taxaGroups).max().orElse(0) + 1;
myLogger.debug(String.format("%d clusters at %s using mxDiv = %1.8f, number of clusters reduced to %d",
oldClusterNumber, currentRefRange.toString(), mxDiv(), nclusters));
}
System.out.println(groupsToTaxaLists(taxaGroups,myTree));
System.out.println(Arrays.toString(taxaGroups));
myLogger.debug(String.format("ref range chr %s pos %d has %d clusters", currentRefRange.chromosome().getName(), currentRefRange.start(), nclusters));
//add consensus haplotypes to the consensusDataMap
//consensusDataMap is a Multimap<Position, Tuple<AnchorDataPHG, List<String>>>
//Position is the start position of currentRefRange
//Each Tuple represents a single haplotype
//AnchorDataPHG is an AnchorDataPHG object for that haplotype
//List<String> is the list of taxa names, each ending in _&lt;haplotype number&gt;
for (List<Taxon> listOfTaxa : groupsToTaxaLists(taxaGroups, myTree)) {
List<VariantInfo> consensusVariants = consensusVariants(taxonToVariantInfoMap, listOfTaxa, currentRefRange);
String consensusSequence = ConsensusProcessingUtils.convertVariantsToSequence(consensusVariants, currentRefRange, referenceGenotype);
// Warning: this will not result in storing a valid gvcf file id in the haplotypes table for these entries
AnchorDataPHG myAnchorData = createAnchorPHG( consensusSequence, currentRefRange);
List<String> gameteNames = taxaToGameteNames(listOfTaxa);
consensusDataMap.put(Position.of(currentRefRange.chromosome(), currentRefRange.start()), new Tuple<>(myAnchorData, gameteNames));
}
}
private List<String> taxaToGameteNames(List<Taxon> taxa) {
//the code that adds the consensus haplotypes to the db expects the taxa names to end in _&lt;haplotype number&gt;
//if the taxa names do not end in that, add _0
List<String> gameteNames = taxa.stream().map(Taxon::getName).map(tn -> {
// if (GAM.matcher(tn).matches()) return tn;
// else return tn + "_0";
return tn + "_0";
}).collect(Collectors.toList());
return gameteNames;
}
private Map<Taxon, RangeMap<Integer, VariantInfo>> loadTaxaRangeMaps(HaplotypeGraph graph) {
List<HaplotypeNode> nodes = graph.nodes(currentRefRange);
Map<Taxon, RangeMap<Integer, VariantInfo>> rangeMaps = new HashMap<>();
int sitesInRef = currentRefRange.end() - currentRefRange.start() + 1;
for (HaplotypeNode node : nodes) {
Taxon myTaxon = node.taxaList().get(0);
Optional<List<VariantInfo>> optInfo = node.variantInfos();
if (optInfo.isPresent()) {
//calculate taxon coverage
int sitesCalled = optInfo.get().stream().mapToInt(var -> var.length()).sum();
double coverage = (double) sitesCalled / sitesInRef;
//add taxon to map if coverage exceeds minTaxaCoverage
if (coverage >= minTaxaCoverage()) {
rangeMaps.put(myTaxon, rangeMapForTaxon(optInfo.get(), myTaxon.getName()));
} else {
myLogger.debug(String.format("%s dropped due to insufficient coverage = %1.3f at %s",
myTaxon.getName(), coverage, currentRefRange.toString()));
}
}
}
return rangeMaps;
}
private RangeMap<Integer, VariantInfo> rangeMapForTaxon(List<VariantInfo> infoList, String taxonName) {
TreeRangeMap<Integer, VariantInfo> infoMap = TreeRangeMap.create();
for (VariantInfo info : infoList) {
//if the genotype call is N do not process it
if (info.genotypeString().equals("N")) continue;
//add the new range after removing any potential overlap
Range<Integer> infoRange = Range.closed(info.start(), info.end());
infoMap.remove(infoRange);
infoMap.put(infoRange, info);
}
return infoMap;
}
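// Small illustration (not from the original source) of the remove-then-put above, using Guava's
// TreeRangeMap with simple String values: the later entry wins for the overlapping positions and
// the existing range is split around it.
//
//   TreeRangeMap<Integer, String> calls = TreeRangeMap.create();
//   calls.put(Range.closed(10, 20), "refBlock");
//   Range<Integer> snpRange = Range.closed(15, 15);
//   calls.remove(snpRange);            // mirrors the defensive remove in rangeMapForTaxon
//   calls.put(snpRange, "A");
//   // calls.get(12) -> "refBlock", calls.get(15) -> "A", calls.get(18) -> "refBlock"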
public DistanceMatrix calculateDistanceMatrix(Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap) {
List<Taxon> taxa = new ArrayList<>(taxonToVariantInfoMap.keySet());
int nTaxaInMap = taxa.size();
double[][] distanceMatrix = new double[nTaxaInMap][nTaxaInMap];
//set the distance values in the matrix
for (int i = 0; i < nTaxaInMap - 1; i++) {
distanceMatrix[i][i] = 0.0; //set the diagonal to 0
for (int j = i + 1; j < nTaxaInMap; j++) {
RangeMap<Integer, VariantInfo> varmapi = taxonToVariantInfoMap.get(taxa.get(i));
RangeMap<Integer, VariantInfo> varmapj = taxonToVariantInfoMap.get(taxa.get(j));
distanceMatrix[i][j] = distanceMatrix[j][i] = distanceFromSNPs(varmapi, varmapj);
}
}
//calculate row maxima (column maxima are the same)
double[] rowMax = new double[nTaxaInMap];
Arrays.fill(rowMax, Double.NaN);
for (int i = 0; i < nTaxaInMap; i++) {
for (int j = 0; j < nTaxaInMap; j++) {
if (!Double.isNaN(distanceMatrix[i][j])) {
if (Double.isNaN(rowMax[i])) rowMax[i] = distanceMatrix[i][j];
else rowMax[i] = Math.max(rowMax[i], distanceMatrix[i][j]);
}
}
}
//delete any rows/columns that are completely missing other than the diagonal
List<Integer> rowsNotMissing = new ArrayList<>();
for (int r = 0; r < nTaxaInMap; r++) {
long missingCount = Arrays.stream(distanceMatrix[r]).filter(d -> Double.isNaN(d)).count();
if (missingCount < nTaxaInMap - 1) rowsNotMissing.add(r);
}
if (rowsNotMissing.size() <= 1) return null;
TaxaListBuilder taxaBuilder = new TaxaListBuilder();
//delete rows/columns, taxa, replace NaN's with row/column max
for (Integer ndx : rowsNotMissing) taxaBuilder.add(taxa.get(ndx));
DistanceMatrixBuilder distanceBuilder = DistanceMatrixBuilder.getInstance(taxaBuilder.build());
int reducedTaxaNumber = rowsNotMissing.size();
for (int i = 0; i < reducedTaxaNumber; i++) {
int originalRow = rowsNotMissing.get(i);
for (int j = i + 1; j < reducedTaxaNumber; j++) {
int originalColumn = rowsNotMissing.get(j);
if (Double.isNaN(distanceMatrix[originalRow][originalColumn]))
distanceBuilder.set(i, j, Math.max(rowMax[originalRow], rowMax[originalColumn]));
else distanceBuilder.set(i, j, distanceMatrix[originalRow][originalColumn]);
}
}
return distanceBuilder.build();
}
public double distanceFromSNPs(RangeMap<Integer, VariantInfo> rangeMap1, RangeMap<Integer, VariantInfo> rangeMap2) {
//this method is designed to work with variant calls from GATK/Sentieon, which separate insertions and deletions
//calls generated from mummer alignments should use the method in createDistanceMatrix()
//compare only positions represented by single nucleotides
//test only the pos for variants because pos+1 could be covered by a different variant due to GVCF overlaps
//except that it is okay to process entire ref blocks
int pos = currentRefRange.start();
int totalSites = 0;
int variantSites = 0;
while (pos <= currentRefRange.end()) {
VariantInfo var1 = rangeMap1.get(pos);
VariantInfo var2 = rangeMap2.get(pos);
if (var1 == null || var2 == null) {
pos++; //do not add this site to counts
} else if(var1.isIndel() || var2.isIndel()) {
pos++; //do not add this site to counts
} else if(var1.genotypeString().equals("N") || var2.genotypeString().equals("N")) {
pos++; //do not add this site to counts
} else if(var1.isVariant() && var2.isVariant()) {
//query position in each variant. If a single nucleotide increment total
//if nucleotides are unequal increment variantSites
String geno1 = var1.genotypeString();
String geno2 = var2.genotypeString();
if ((NUC.matcher(geno1).matches()) && NUC.matcher(geno2).matches()) {
if (!geno1.equals(geno2)) variantSites++;
totalSites++;
}
pos++;
} else if(var1.isVariant()) { // && !var2.isVariant
//var1 is a snp, so just need to check if it is a ref or alt call
totalSites++;
if (var1.genotypeString().equals(var1.altAlleleString())) variantSites++;
pos++;
} else if(var2.isVariant()) { // && !var1.isVariant
//same as for var1 is variant
//var2 is a snp, so just need to check if it is a ref or alt call
totalSites++;
if (var2.genotypeString().equals(var2.altAlleleString())) variantSites++;
pos++;
} else { //both are in ref block
//move to end of shortest ref block and increment totalSites by number of positions moved
//move pos 1 past the end of the block
//do not go beyond the end of currentRefRange
int endpos = Math.min(var1.end(), var2.end());
endpos = Math.min(endpos, currentRefRange.end());
totalSites += endpos - pos + 1;
pos = endpos + 1;
}
}
if (totalSites < minSiteForComp()) return Double.NaN;
return (double) variantSites / totalSites;
}
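// Worked example with hypothetical counts: if two taxa share 40 comparable single-nucleotide
// positions in this range (SNP or ref-block calls; indels and Ns are skipped) and disagree at 4
// of them, the distance is 4 / 40 = 0.10. Had only 25 positions been comparable (less than the
// default minSites of 30), the method would return Double.NaN and the pair would be treated as missing.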
/**
* Method to cluster Assemblies
*
* Steps Taken Here:
* For the current Reference Range, extract out all haplotypes in the graph
* Compute a distance matrix using #SNPS/(#SNPs+#Ref) as the distance. Ignore indels and Ns.
* From this distance matrix, compute a UPGMA tree and cut at parameter mxDiv
* Within each sub-group, choose a single haplotype to be the representative haplotype. This choice is made using the external rankingFile required to run this mode.
* Once the representative haplotypes are chosen, write them to the db with the full Taxa list for the sub-group
*
*/
public void clusterAssemblies() {
myLogger.info("Running Cluster Assemblies.");
if(rankingMap == null) {
throw new IllegalStateException("Unable to run clusterAssemblies as no ranking map:");
}
currentRefRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
Chromosome chr = currentRefRange.chromosome();
List<Datum> temp = graphToMerge.getDataOfType(HaplotypeGraph.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: clusterAssemblies: must input one Haplotype Graph");
}
HaplotypeGraph graph = (HaplotypeGraph)temp.get(0).getData();
myLogger.info("Loading variants into RangeMap");
//load variants into a RangeMap
Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap = loadVariantsIntoRangeMap(graph, chr);
//calculate a distance matrix
int ntaxa = taxonToVariantInfoMap.keySet().size();
TaxaList taxaWithInfo = new TaxaListBuilder().addAll(taxonToVariantInfoMap.keySet()).build();
//Now that we have the clusters we need to run SW on the nodes
//Walk through the taxon groups and convert get the nodes for all the taxon in the group
Map<Taxon, HaplotypeNode> taxonToHapNodeMap = createTaxonToHaplotypeNode(graph, currentRefRange);
//If we just have one haplotype, just write it to the DB immediately
if(taxonToHapNodeMap.size()==1) {
List<String> taxonNames = taxonToHapNodeMap.keySet().stream().map(taxon -> taxon.getName()+"_0").collect(Collectors.toList());
HaplotypeNode currentNode = taxonToHapNodeMap.get(taxonToHapNodeMap.keySet().stream().collect(Collectors.toList()).get(0));
String bestSeq = currentNode.haplotypeSequence().sequence();
consensusDataMap.put(Position.of(currentRefRange.chromosome(), currentRefRange.start()),
new Tuple<>(createAnchorPHG(currentRefRange, bestSeq, currentNode.asmContig(),
currentNode.asmStart(), currentNode.asmEnd(), currentNode.asmStrand(),currentNode.genomeFileID(),currentNode.gvcfFileID()),
taxonNames));
}
else {
DistanceMatrix dm;
switch (clusteringMode()) {
case upgma_assembly:
dm = ConsensusProcessingUtils.createDistanceMatrix(ntaxa, chr, currentRefRange, taxaWithInfo, taxonToVariantInfoMap);
break;
case kmer_assembly:
dm = KmerBasedConsensusUtils.kmerDistanceMatrix(currentRefRange, graph.nodes(currentRefRange), kmerSize(), distanceCalculation());
break;
default:
throw new IllegalArgumentException("RunHapConsensusPipelinePlugin: clusterAssemblies: clustering mode: " + clusteringMode().name() + " not supported.");
}
myLogger.debug(String.format("Finished builder distance matrix for ref range %d", currentRefRange.id()));
// System.out.println("Before SettingNs:");
// System.out.println(dm.getTaxaList().stream().map(taxon -> taxon.getName()).collect(Collectors.joining(",")));
// for (double[] row : dm.getDistances()) {
// System.out.println(Arrays.stream(row).mapToObj(d -> String.format("%1.4f",d)).collect(Collectors.joining(" ")));
// }
// System.out.println("-------");
//Need to set the NaNs to a usable value, otherwise the distance matrix cannot be clustered
dm = ConsensusProcessingUtils.setNsToMax(dm);
// System.out.println(dm.getTaxaList().stream().map(taxon -> taxon.getName()).collect(Collectors.joining(",")));
//Uncomment to show distance matrix
// System.out.println("After SettingNs:");
// for (double[] row : dm.getDistances()) {
// System.out.println(Arrays.stream(row).mapToObj(d -> String.format("%1.4f",d)).collect(Collectors.joining(" ")));
// }
// System.out.println("-------");
//cluster and extract groups
UPGMATree myTree = new UPGMATree(dm);
TreeClusters myClusters = new TreeClusters(myTree);
int[] taxaGroups = myClusters.getGroups(mxDiv());
int nclusters = (int) Arrays.stream(taxaGroups).distinct().count();
if (nclusters > maxNumberOfClusters()) {
int oldClusterNumber = nclusters;
taxaGroups = myClusters.getGroups(maxNumberOfClusters());
nclusters = (int) Arrays.stream(taxaGroups).distinct().count();
myLogger.debug(String.format("%d clusters at %s using mxDiv = %1.8f, number of clusters reduced to %d",
oldClusterNumber, currentRefRange.toString(), mxDiv(), nclusters));
}
List<List<Taxon>> clusters = groupsToTaxaLists(taxaGroups, myTree);
//Display information about the cluster groups
StringBuilder msgBuilder = new StringBuilder();
msgBuilder.append("Cluster groups : ").append(Arrays.toString(taxaGroups));
String groupSizes = clusters.stream().map(group -> Integer.toString(group.size())).collect(Collectors.joining(","));
msgBuilder.append("\nNumber of taxa per group : ").append(groupSizes);
myLogger.info(msgBuilder.toString());
for (List<Taxon> group : clusters) {
//Use the ranking file to choose the best taxon.
double bestRanking = Double.NEGATIVE_INFINITY; // Double.MIN_VALUE is the smallest positive double, not the lowest possible ranking
Taxon finalBestTaxon = null;
//go through the bestTaxon Set and figure out which one is best by external mapping:
for (Taxon currentSubTaxon : group) {
double currentRank = rankingMap.get(currentSubTaxon.getName());
if (currentRank > bestRanking) {
bestRanking = currentRank;
finalBestTaxon = currentSubTaxon;
}
}
//Extract out the long for the representative taxon and create a list of taxa names to be clustered
List<String> taxonNames = group.stream().map(taxon -> taxon.getName() + "_0").collect(Collectors.toList());
HaplotypeNode bestHap = taxonToHapNodeMap.get(finalBestTaxon);
String bestSeq = bestHap.haplotypeSequence().sequence();
//Put the output to the map to be written to the DB
consensusDataMap.put(Position.of(currentRefRange.chromosome(), currentRefRange.start()),
new Tuple<>(createAnchorPHG(currentRefRange, bestSeq,
bestHap.asmContig(),bestHap.asmStart(), bestHap.asmEnd(), bestHap.asmStrand(),bestHap.genomeFileID(),bestHap.gvcfFileID()),
taxonNames));
}
}
}
/**
* Method to load the variants for a given chromosome into a Range map.
*
* @param graph
* @param chr
* @return
*/
private Map<Taxon, RangeMap<Integer, VariantInfo>> loadVariantsIntoRangeMap(HaplotypeGraph graph, Chromosome chr) {
Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap = new HashMap<>();
List<HaplotypeNode> myNodes = graph.nodes(currentRefRange);
for (HaplotypeNode aNode : myNodes) {
Optional<List<VariantInfo>> optVarInfo = aNode.variantInfos();
if (optVarInfo.isPresent()) {
List<VariantInfo> myInfoList = optVarInfo.get();
if (myInfoList.size() > 0) {
ImmutableRangeMap.Builder<Integer, VariantInfo> mapBuilder = new ImmutableRangeMap.Builder<>();
Iterator<VariantInfo> infoIterator = myInfoList.iterator();
//This section of codes tests for overlapping variants with if (info.start() > prevInfo.end())
//If variants overlap the second one is skipped, because including it would cause an error when calling
//mapBuilder.build(). As the distance method only ends up counting SNPs and overlapping variants involve indels
//there is no impact on the distance calculation.
VariantInfo prevInfo = null;
if (infoIterator.hasNext()) {
prevInfo = infoIterator.next();
mapBuilder.put(Range.closed(prevInfo.start(), prevInfo.end()), prevInfo);
}
while (infoIterator.hasNext()) {
VariantInfo info = infoIterator.next();
if (info.start() > prevInfo.end()) {
mapBuilder.put(Range.closed(info.start(), info.end()), info);
prevInfo = info;
}
}
for (Taxon taxon : aNode.taxaList()) taxonToVariantInfoMap.put(taxon, mapBuilder.build());
}
}
}
return taxonToVariantInfoMap;
}
/**
* Function to create a TaxonToHaplotypeNode mapping for use when creating consensus.
* @param graph
* @param referenceRange
* @return
*/
private Map<Taxon, HaplotypeNode> createTaxonToHaplotypeNode(HaplotypeGraph graph, ReferenceRange referenceRange) {
Map<Taxon, HaplotypeNode> taxonHaplotypeNodeMap =
graph.nodes(referenceRange).stream()
.flatMap(node -> {
List<Tuple<Taxon, HaplotypeNode>> intermediateHapNodes = new ArrayList<>();
TaxaList taxaList = node.taxaList();
for(Taxon tx : taxaList) {
intermediateHapNodes.add(new Tuple<>(tx, node));
}
return intermediateHapNodes.stream();
})
.collect(Collectors.toMap(nodeTuple -> nodeTuple.getX(),nodeTuple -> nodeTuple.getY()));
return taxonHaplotypeNodeMap;
}
private List<List<Taxon>> groupsToTaxaLists(int[] groups, Tree tree) {
//the groups array is in the order of the external nodes in tree
//to make groups into taxa lists, the Taxon must be retrieved from each external node using getIdentifier
int maxGroup = Arrays.stream(groups).max().orElse(0);
List<List<Taxon>> listOfLists = new ArrayList<>();
for (int i = 0; i <= maxGroup; i++) listOfLists.add(new ArrayList<>());
for (int ndx = 0; ndx < groups.length; ndx++) {
Taxon taxon = tree.getExternalNode(ndx).getIdentifier();
listOfLists.get(groups[ndx]).add(taxon);
}
//filter listOfLists on group size >= minTaxaPerHaplotype
listOfLists = listOfLists.stream().filter(grp -> grp.size() >= minTaxa()).collect(Collectors.toList());
return listOfLists;
}
public List<VariantInfo> consensusVariants(Map<Taxon, RangeMap<Integer, VariantInfo>> taxonToVariantInfoMap, List<Taxon> taxa, ReferenceRange refRange) {
List<VariantInfo> consensusVariants = new ArrayList<>();
Chromosome chr = refRange.chromosome();
int pos = refRange.start();
boolean inRefBlock = false;
int refBlockStart = 0;
int refBlockDepth = 0;
while (pos <= refRange.end()) {
final int currentPosition = pos;
List<VariantInfo> currentVariants = taxa.stream()
.map(taxon -> taxonToVariantInfoMap.get(taxon))
.filter(rmap -> rmap != null)
.map(rmap -> rmap.get(currentPosition))
.filter(var -> var != null)
.map(var -> {
//convert any variants with a ref call to a ref block covering the same positions
if (var.isVariant() && var.genotypeString().equals(var.refAlleleString())) {
long varLong = ConsensusProcessingUtils.encodeRefBlockToLong(var.length(), var.depth()[0], currentPosition);
return new VariantInfo(chr.getName(), var.start(), var.end(), "REF", "REF", "NON-REF", false, varLong);
} else return var;
})
.collect(Collectors.toList());
if (currentVariants.size() == 0) {
//no variants at this position
//if in a ref block, end the ref block at the previous position and add it to the variant list
if (inRefBlock) {
addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
inRefBlock = false;
}
pos++;
} else {
//there are variants at this position. Are they all ref blocks
long nonRefCount = currentVariants.stream().filter(var -> var.isVariant()).count();
if (nonRefCount == 0) {
//all the variants are ref blocks
//if any taxa are missing, can only increment pos by one as the missingness could end at any time
//all the variants are ref blocks and all taxa have variants find the shortest block
//if in a ref block, reset pos to the end of the shortest block + 1
//if not in a ref block, start one and reset pos
//the block cannot end before it starts, so resetting pos to a value less than currentPos would be problematic
//shouldn't happen but test just in case
int minEnd = currentPosition;
if (currentVariants.size() == taxa.size()) {
minEnd = currentVariants.stream().mapToInt(var -> var.end()).min().orElse(pos);
minEnd = Math.min(minEnd, refRange.end()); //do not go beyond the end of the currentRefRange
if (minEnd < currentPosition) {
String msg = "A ref block ends at " + minEnd + ", which is before the current position " + currentPosition;
throw new IllegalArgumentException(msg);
}
}
//calculate site ref block depth as sum of variant depths
int siteDepth = 0;
for (VariantInfo info : currentVariants) {
siteDepth += info.depth()[0];
}
if (inRefBlock) {
refBlockDepth = Math.min(refBlockDepth, siteDepth);
} else {
inRefBlock = true;
refBlockStart = pos;
refBlockDepth = siteDepth;
}
pos = minEnd + 1;
} else {
//Some variants are not ref blocks, so find the majority variant
//if the majority variant is not at first position add nothing
//if there are two variants with max freq, do not add a variant
//if the max freq < min freq, add nothing
//when adding a variant and in ref block, end the ref block at pos - 1 and add it to the consensus list
Multiset<VariantInfoConsensus> viCounter = HashMultiset.create();
for (VariantInfo info : currentVariants) viCounter.add(new VariantInfoConsensus(info.start(),info.end(),info.genotypeString(),info.isVariant()));
//sort the varids in order of descending count
// the list is an int array with just 2 elements - the varid and the count.
// the sort, then is on the second element (ie the count)
List<Tuple<VariantInfoConsensus, Integer>> varInfoCounts = viCounter.entrySet().stream()
.map(ent -> new Tuple<>(ent.getElement(), ent.getCount())).collect(Collectors.toList());
Collections.sort(varInfoCounts, (first,second) -> second.getY() - first.getY());
int totalNumberOfVariants = currentVariants.size();
double majorFreq = (double) varInfoCounts.get(0).getY() / totalNumberOfVariants;
if (majorFreq >= minAlleleFrequency()
&& ( varInfoCounts.size() == 1 || varInfoCounts.get(0).getY() > varInfoCounts.get(1).getY()) ) {
List<VariantInfo> majorityVariants = currentVariants.stream()
.filter(var -> (new VariantInfoConsensus(var.start(),var.end(),var.genotypeString(),var.isVariant())).equals(varInfoCounts.get(0).getX())).collect(Collectors.toList());
//if the majority variant is a ref call, extend or start a ref block
if (varInfoCounts.get(0).getX().isVariant == false) {
//majority variant is ref, just start/extend a ref block
int siteDepth = 0;
for (VariantInfo info : majorityVariants) {
siteDepth += info.depth()[0];
}
if (inRefBlock) {
refBlockDepth = Math.min(refBlockDepth, siteDepth);
} else {
inRefBlock = true;
refBlockStart = pos;
refBlockDepth = siteDepth;
}
} else {
int majorityStart = majorityVariants.get(0).start();
//only add the majority variant if its start position equals the current position; otherwise it is covered by a previously called variant
if (currentPosition == majorityStart) {
//finally we have a real variant
int[] siteDepth = new int[2];
for (VariantInfo info : majorityVariants) {
siteDepth[0] += info.depth()[0];
siteDepth[1] += info.depth()[1];
}
if (inRefBlock) {
//end the ref block at pos - 1
addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
inRefBlock = false;
}
VariantInfo aVariant = majorityVariants.get(0);
// This isn't stored in the db. When we create the VariantInfos from
// HaplotypeNode.VariantInfos() - this calls ConvertVariantContextToVariantInfo.convertContextToInfo
// And that sets the depths. So the depths is what we need to add here, not a varLong.
VariantInfo consensusInfo = new VariantInfo(chr.getName(), aVariant.start(), aVariant.end(), aVariant.genotypeString(), aVariant.refAlleleString(),
aVariant.altAlleleString(), true, siteDepth);
consensusVariants.add(consensusInfo);
} else {
//did not call a variant at this position. It is probably covered by a previously called indel
//just in case check inRefBlock and end it if true
if (inRefBlock) {
//end the ref block at pos - 1
addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
inRefBlock = false;
}
}
}
} else {
//could not call a variant
//if ref blocks + first position of indels (==ref) is >= minfreq call the position ref
//but first must check insertions to make sure previous variant did not call this a deletion (this happens)
//the following code is kind of complex, but probably does not get used very often, maybe, hopefully
int refcount = 0;
int refDepth = 0;
//have to iterate through the taxa here because may need to get the previous position variant
for (Taxon taxon : taxa) {
VariantInfo info = taxonToVariantInfoMap.get(taxon).get(currentPosition);
if (info != null) {
if (!info.isVariant()) {
refcount++;
refDepth += info.depth()[0];
} else if (info.isIndel() && info.start() == currentPosition) {
if (info.genotypeString().length() > info.refAlleleString().length()) {
//insertion
//if the previous variant for this taxon is a deletion covering this position,
// do not increment refcount
VariantInfo previousInfo = taxonToVariantInfoMap.get(taxon).get(currentPosition - 1);
if (previousInfo != null) {
boolean wasIndel = previousInfo.isIndel();
boolean wasNotDeletion = !wasIndel || previousInfo.genotypeString().length() >= previousInfo.refAlleleString().length();
boolean coversThisPosition = previousInfo.start() < currentPosition && previousInfo.end() >= currentPosition;
if (wasNotDeletion || !coversThisPosition) {
refcount++;
//because this is the alt variant, which has a ref call at its first position
refDepth += info.depth()[1];
}
}
} else if (info.genotypeString().length() < info.refAlleleString().length()) {
//deletion
refcount++;
refDepth += info.depth()[1];
}
}
}
}
if ((double) refcount / totalNumberOfVariants >= minAlleleFrequency()) { //cast avoids integer division truncating the frequency to 0
if (inRefBlock) {
refBlockDepth = Math.min(refBlockDepth, refDepth);
} else {
inRefBlock = true;
refBlockStart = pos;
refBlockDepth = refDepth;
}
} else if (inRefBlock) {
//otherwise end ref block if in one
//end the ref block at pos - 1
addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
inRefBlock = false;
}
}
pos++;
}
}
}
//finished iterating through the ref range positions
//if still in a ref block add it the consensus variants
if (inRefBlock) {
//end the ref block at pos - 1
addRefBlockToConsensusVariants(consensusVariants, chr.getName(), refBlockStart, pos - 1, refBlockDepth);
inRefBlock = false;
}
Collections.sort(consensusVariants, (a,b) -> a.start() - b.start());
//check for and reconcile overlapping variants
//for a pair of overlapping variants, if the first is a ref call the overlap is okay
//if the first is a deletion and the second is an insertion starting at the final base pair, that's okay
//anything else both variants should be deleted, but just report for now
List<VariantInfo> checkedVariants = new ArrayList<>();
Iterator<VariantInfo> iter = consensusVariants.iterator();
VariantInfo previousVariant = null;
if (iter.hasNext()) previousVariant = iter.next();
while (iter.hasNext()) {
VariantInfo currentVariant = iter.next();
if (previousVariant == null) {
previousVariant = currentVariant;
} else {
if (previousVariant.end() >= currentVariant.start()) {
//overlap
if (!previousVariant.isVariant()) {
checkedVariants.add(previousVariant);
previousVariant = currentVariant;
} else if (previousVariant.isIndel() && currentVariant.isIndel()) {
boolean isPreviousDeletion = previousVariant.genotypeString().length() < previousVariant.refAlleleString().length();
boolean isCurrentDeletion = currentVariant.genotypeString().length() < currentVariant.refAlleleString().length();
//deletion followed by indel
//assumes this is alt call. Safe because ref calls were converted to ref blocks earlier in the code.
if (isPreviousDeletion && !isCurrentDeletion) {
//okay to leave as is
checkedVariants.add(previousVariant);
previousVariant = currentVariant;
} else {
//do not know why this happened, print message, skip previous variant, and continue
String taxanames = taxa.stream().map(t -> t.getName()).collect(Collectors.joining(","));
String message = new StringBuilder("OVERLAP for ")
.append(taxanames)
.append("\nvariant at chr ").append(previousVariant.chromosome())
.append(": ").append(previousVariant.start()).append(" to ")
.append(previousVariant.end()).append(" geno ")
.append(previousVariant.genotypeString()).append(" ref ")
.append(previousVariant.refAlleleString()).append(" alt ").append(previousVariant.altAlleleString())
.append("\n").append("overlapped by variant at chr ").append(currentVariant.chromosome())
.append(": ").append(currentVariant.start()).append(" to ")
.append(currentVariant.end()).append(" geno ")
.append(currentVariant.genotypeString()).append(" ref ")
.append(currentVariant.refAlleleString()).append(" alt ").append(currentVariant.altAlleleString()).toString();
myLogger.debug(message);
previousVariant = currentVariant;
}
}
} else {
checkedVariants.add(previousVariant);
previousVariant = currentVariant;
}
}
}
//add the final variant
if (previousVariant != null) checkedVariants.add(previousVariant);
if (checkedVariants.size() == 0) {
String tn = taxa.stream().map(t -> t.getName()).collect(Collectors.joining(","));
myLogger.debug("size of checked consensus variant list = 0 for " + tn + " at " + refRange + ". Size of consensusVariants = " + consensusVariants.size());
}
return checkedVariants;
}
private void addRefBlockToConsensusVariants(List<VariantInfo> consensus, String chrname, int start, int end, int depth) {
int blockLength = end - start + 1;
long varLong = ConsensusProcessingUtils.encodeRefBlockToLong(blockLength, depth, start);
consensus.add(new VariantInfo(chrname, start, end, "REF","REF","NON_REF", false, varLong));
}
public ReferenceRange getReferenceRange() {
return currentRefRange;
}
public Multimap<Position, Tuple<AnchorDataPHG, List<String>>> getConsensusDataMap() {
return consensusDataMap;
}
// No longer need variants passed in. Only the gvcf_file_id, which is determined when
// we load the data.
private AnchorDataPHG createAnchorPHG( ReferenceRange referenceRange, String hapSequence,
String asmContig, int asmStart, int asmEnd, String asmStrand,
int genomeFileID,int gvcfFileId) {
try {
Range<Position> intervalRange = Range.closed(Position.of(referenceRange.chromosome(), referenceRange.start()),
Position.of(referenceRange.chromosome(), referenceRange.end()));
AnchorDataPHG adata = new AnchorDataPHG(intervalRange,
asmContig,asmStart,asmEnd,asmStrand,
"NA",
hapSequence,
genomeFileID,
gvcfFileId
);
return adata;
} catch (Exception exc) {
myLogger.error("RunHapCollapsePipelinePlugin: Error encoding List to byte[];");
throw new IllegalStateException("RunHapCollapsePipelinePlugin: Error encoding List to byte[];", exc);
}
}
/**
* Warning: this function call will not result in a valid gvcfFileId
* parameter if loading to the haplotypes table
* @param sequence the sequence string for this haplotype
* @param referenceRange the ReferenceRange for this haplotype
* @return an AnchorDataPHG for a haplotype
*/
private AnchorDataPHG createAnchorPHG( String sequence, ReferenceRange referenceRange) {
try {
Range<Position> intervalRange = Range.closed(Position.of(referenceRange.chromosome(), referenceRange.start()),
Position.of(referenceRange.chromosome(), referenceRange.end()));
AnchorDataPHG adata = new AnchorDataPHG(intervalRange,
"0",0,0,".", // fix if this is used for assembly consensus
"NA",
sequence,
-1,-1);
return adata;
} catch (Exception exc) {
throw new IllegalStateException("RunHapCollapsePipelinePlugin: Error encoding List to byte[];", exc);
}
}
}
private String getAnnotatedMethodDetails() {
//configProperties comes from code in RunMergeAndCluster
Properties dbProperties = loadProperties();
int minTaxaPerHaplotype = Integer.parseInt(dbProperties.getProperty("minTaxa", "1"));
double mxDiv = Double.parseDouble(dbProperties.getProperty("mxDiv", "0.01"));
CLUSTERING_MODE clusteringMode = CLUSTERING_MODE.valueOf(dbProperties.getProperty("clusteringMode", "upgma"));
StringBuilder detailBuilder = new StringBuilder();
detailBuilder.append(collapseMethodDetails()).append(":");
detailBuilder.append("minTaxa=").append(minTaxaPerHaplotype);
detailBuilder.append(",mxDiv=").append(mxDiv);
detailBuilder.append(",mode=").append(clusteringMode.toString());
detailBuilder.append(",minFreq=").append(minAlleleFrequency());
detailBuilder.append(",maxClusters=").append(maxNumberOfClusters());
detailBuilder.append(",minSites=").append(minSiteForComp());
detailBuilder.append(",minCoverage=").append(minTaxaCoverage());
return detailBuilder.toString();
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return ("Create Consensus Haplotypes");
}
@Override
public String getToolTipText() {
return ("Takes haplotypes for a given method, clusters them based on a divergence parameter, creates a consensus haplotype for each reference range interval");
}
/**
* Input Reference Fasta
*
* @return Ref
*/
public String reference() {
return myReference.value();
}
/**
* Set Ref. Input Reference Fasta
*
* @param value Ref
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin reference(String value) {
myReference = new PluginParameter<>(myReference, value);
return this;
}
/**
* File holding the DB config information
*
* @return Db Config File
*/
public String dbConfigFile() {
return dbConfigFile.value();
}
/**
* Set Db Config File. File holding the DB config information
*
* @param value Db Config File
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin dbConfigFile(String value) {
dbConfigFile = new PluginParameter<>(dbConfigFile, value);
return this;
}
/**
* Name of the collapse method to be stored in the database
*
* @return Collapse Method
*/
public String collapseMethod() {
return myCollapseMethod.value();
}
/**
* Set Collapse Method. Name of the collapse method to
* be stored in the database
*
* @param value Collapse Method
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin collapseMethod(String value) {
myCollapseMethod = new PluginParameter<>(myCollapseMethod, value);
return this;
}
/**
* Details for the collapse method to be stored in the
* database
*
* @return Collapse Method Details
*/
public String collapseMethodDetails() {
return myCollapseMethodDetails.value();
}
/**
* Set Collapse Method Details. Details for the collapse
* method to be stored in the database
*
* @param value Collapse Method Details
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin collapseMethodDetails(String value) {
myCollapseMethodDetails = new PluginParameter<>(myCollapseMethodDetails, value);
return this;
}
/**
* At each position, if no allele has the minimum frequency,
* the consensus haplotype allele will be set to missing.
*
* @return Minimum Allele Frequency
*/
public Double minAlleleFrequency() {
return minAlleleFrequency.value();
}
/**
* Set Minimum Allele Frequency. At each position, if
* no allele has the minimum frequency, the consensus
* haplotype allele will be set to missing.
*
* @param value Minimum Allele Frequency
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin minAlleleFrequency(Double value) {
minAlleleFrequency = new PluginParameter<>(minAlleleFrequency, value);
return this;
}
/**
* File of rankings for the taxa in the DB. This is
* used to break ties.
*
* @return Ranking File
*/
public String rankingFile() {
return rankingFile.value();
}
/**
* Set Ranking File. File of rankings for the taxa in
* the DB. This is used to break ties.
*
* @param value Ranking File
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin rankingFile(String value) {
rankingFile = new PluginParameter<>(rankingFile, value);
return this;
}
/**
* The maximum number of clusters that will be created
* for a reference range. If mxDiv produces too many clusters
* then the cut height that produces maxClusters number
* of clusters will be substituted.
*
* @return Maximum Cluster Number
*/
public Integer maxNumberOfClusters() {
return maxNumberOfClusters.value();
}
/**
* Set Maximum Cluster Number. The maximum number of clusters
* that will be created for a reference range. If mxDiv
* produces too many clusters then the cut height that
* produces maxClusters number of clusters will be substituted.
*
* @param value Maximum Cluster Number
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin maxNumberOfClusters(Integer value) {
maxNumberOfClusters = new PluginParameter<>(maxNumberOfClusters, value);
return this;
}
/**
* The minimum number of shared sites that can be used
* to calculate the distance between two taxa
*
* @return Minimum Sites with Data
*/
public Integer minSiteForComp() {
return minSiteForComp.value();
}
/**
* Set Minimum Sites with Data . The minimum number of
* shared sites that can be used to calculate the distance
* between two taxa
*
* @param value Minimum Sites with Data
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin minSiteForComp(Integer value) {
minSiteForComp = new PluginParameter<>(minSiteForComp, value);
return this;
}
/**
* Minimum Coverage
*
* @return Minimum Coverage
*/
public Double minTaxaCoverage() {
return minTaxaCoverage.value();
}
/**
* Set Minimum Coverage. Minimum Coverage
*
* @param value Minimum Coverage
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin minTaxaCoverage(Double value) {
minTaxaCoverage = new PluginParameter<>(minTaxaCoverage, value);
return this;
}
/**
* The maximum number of threads to be used to create
* consensi. The actual number of threads used will not
* be greater than number of available CPU's - 2.
*
* @return Max Threads
*/
public Integer maxThreads() {
return maxThreads.value();
}
/**
* Set Max Threads. The maximum number of threads to be
* used to create consensi. The actual number of threads
* used will not be greater than number of available CPU's
* - 2.
*
* @param value Max Threads
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin maxThreads(Integer value) {
maxThreads = new PluginParameter<>(maxThreads, value);
return this;
}
/**
* Minimum number of taxa
*
* @return Min Taxa
*/
public Integer minTaxa() {
return minTaxa.value();
}
/**
* Set Min Taxa. Minimum number of taxa
*
* @param value Min Taxa
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin minTaxa(Integer value) {
minTaxa = new PluginParameter<>(minTaxa, value);
return this;
}
/**
* Maximum divergence.
*
* @return Mx Div
*/
public Double mxDiv() {
return mxDiv.value();
}
/**
* Set Mx Div. Maximum divergence.
*
* @param value Mx Div
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin mxDiv(Double value) {
mxDiv = new PluginParameter<>(mxDiv, value);
return this;
}
/**
* Clustering mode
*
* @return Clustering Mode
*/
public CLUSTERING_MODE clusteringMode() {
return clusteringMode.value();
}
/**
* Set Clustering Mode. Clustering mode
*
* @param value Clustering Mode
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin clusteringMode(CLUSTERING_MODE value) {
clusteringMode = new PluginParameter<>(clusteringMode, value);
return this;
}
/**
* Kmer size
*
* @return Kmer Size
*/
public Integer kmerSize() {
return kmerSize.value();
}
/**
* Set Kmer Size. Kmer size
*
* @param value Kmer Size
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin kmerSize(Integer value) {
kmerSize = new PluginParameter<>(kmerSize, value);
return this;
}
/**
* Distance calculation type
*
* @return Distance Calculation
*/
public DistanceCalculation distanceCalculation() {
return distanceCalculation.value();
}
/**
* Set Distance Calculation. Distance calculation type
*
* @param value Distance Calculation
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin distanceCalculation(DistanceCalculation value) {
distanceCalculation = new PluginParameter<>(distanceCalculation, value);
return this;
}
/**
* Indication if the data is to be loaded against a test
* method. Data loaded with test methods are not cached
* with the PHG ktor server
*
* @return Is Test Method
*/
public Boolean isTestMethod() {
return isTestMethod.value();
}
/**
* Set Is Test Method. Indication if the data is to be
* loaded against a test method. Data loaded with test
* methods are not cached with the PHG ktor server
*
* @param value Is Test Method
*
* @return this plugin
*/
public RunHapConsensusPipelinePlugin isTestMethod(Boolean value) {
isTestMethod = new PluginParameter<>(isTestMethod, value);
return this;
}
// Public class used to create/compare variant counts
// Do we need "end" or is start + genotypeString enough?
public class VariantInfoConsensus implements Comparable<VariantInfoConsensus> {
private final int start;
private final int end;
private final String genotypeString;
private final boolean isVariant;
public VariantInfoConsensus( int startPos, int endPos, String genotype, boolean variant) {
start = startPos;
end = endPos;
genotypeString = genotype;
isVariant = variant;
}
@Override
public int compareTo( VariantInfoConsensus other) {
if (this == other) {
return 0;
}
int result = Integer.compare(start,other.start);
if (result != 0) {
return result;
}
result = Integer.compare(end,other.end);
if (result != 0) {
return result;
}
return genotypeString.compareTo(other.genotypeString);
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof VariantInfoConsensus)) {
return false;
}
VariantInfoConsensus other = (VariantInfoConsensus) obj;
if (this.isVariant != other.isVariant) {
return false;
}
return (compareTo(other) == 0);
}
@Override
public int hashCode() {
//required so that equal instances land in the same HashMultiset bucket when counting variants
return Objects.hash(start, end, genotypeString, isVariant);
}
}
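// Note (sketch): VariantInfoConsensus instances are counted with a HashMultiset in consensusVariants(),
// and HashMultiset buckets elements by hashCode before testing equals, so logically equal keys must
// also hash identically for their counts to merge:
//
//   Multiset<VariantInfoConsensus> counts = HashMultiset.create();
//   counts.add(new VariantInfoConsensus(100, 100, "A", true));
//   counts.add(new VariantInfoConsensus(100, 100, "A", true));
//   // counts.count(new VariantInfoConsensus(100, 100, "A", true)) == 2 requires hashCode to be
//   // consistent with equals (see the hashCode override above)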
}