net.maizegenetics.pangenome.hapcollapse.RunHapCollapsePipelinePlugin
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapcollapse;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.analysis.distance.KinshipPlugin;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.io.VCFUtil;
import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.db_loading.AnchorDataPHG;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.pangenome.db_loading.PHGDataWriter;
import net.maizegenetics.pangenome.db_loading.PHGdbAccess;
import net.maizegenetics.pangenome.fastaExtraction.ExtractFastaUtils;
import net.maizegenetics.pangenome.hapcollapse.FindHaplotypeClustersPlugin.CLUSTER_METHOD;
import net.maizegenetics.pangenome.hapcollapse.FillIndelsIntoConsensus.INDEL_MERGE_RULE;
import net.maizegenetics.plugindef.*;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.net.URL;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.*;
import java.util.stream.Collectors;
/**
 * Simple plugin to run the full haplotype collapse pipeline. It performs the following steps:
 * 1. Loop through each reference range in the graph:
 *    1.a Extract the HaplotypeNodes with their VariantContexts (we assume the user has not pulled these yet, for memory reasons)
 *    1.b Merge all the VariantContext records for each bp of the reference range
 *    1.c Export a GenotypeTable containing each bp
 *    1.d Run the haplotype clustering algorithm on this GenotypeTable
 * 2. For each VCF file exported, upload it to the DB as a consensus
 *
 * Created by zrm22 on 11/8/17.
 */
@Deprecated
public class RunHapCollapsePipelinePlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(RunHapCollapsePipelinePlugin.class);
private PluginParameter<String> myReference = new PluginParameter.Builder<>("ref", null, String.class)
.description("Input Reference Fasta")
.required(true)
.inFile()
.build();
private PluginParameter<String> dbConfigFile = new PluginParameter.Builder<>("dbConfigFile", null, String.class)
.description("File holding the DB config information")
.required(true)
.inFile()
.build();
private PluginParameter<String> myConsensusVCFOutputDir = new PluginParameter.Builder<>("consensusVCFOutputDir", null, String.class)
.description("Directory where you want to store the output VCFs from the consensus process")
.required(true)
.outDir()
.guiName("Consensus VCF Output Dir")
.build();
private PluginParameter<String> myConsensusFastaOutputDir = new PluginParameter.Builder<>("consensusFastaOutputDir", null, String.class)
.description("Directory where you want to store the output fastas from the consensus process")
.required(true)
.outDir()
.build();
private PluginParameter<String> myCollapseMethod = new PluginParameter.Builder<>("collapseMethod", null, String.class)
.description("Name of the collapse method to be stored in the database")
.required(true)
.build();
private PluginParameter<String> myCollapseMethodDetails = new PluginParameter.Builder<>("collapseMethodDetails", null, String.class)
.description("Details for the collapse method to be stored in the database")
.required(true)
.build();
private PluginParameter<CLUSTER_METHOD> clusterMethod = new PluginParameter.Builder<>("method", CLUSTER_METHOD.coverage, CLUSTER_METHOD.class)
.guiName("Cluster Method")
.description("The method used to cluster taxa. Coverage seeds the first cluster with the highest coverage taxon. UPGMA builds a UPGMA tree then cuts it at maxDistance.")
.required(false)
.build();
private PluginParameter<INDEL_MERGE_RULE> mergeRule = new PluginParameter.Builder<>("indelMergeRule", INDEL_MERGE_RULE.setToN, INDEL_MERGE_RULE.class)
.guiName("Indel Merge Rule")
.description("The rule in which to resolve the conflicting Indels after consensus has been found.")
.required(false)
.build();
public RunHapCollapsePipelinePlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
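// A minimal usage sketch (the paths and parameter values below are hypothetical).
// It assumes a HaplotypeGraph has already been built elsewhere, without variant
// contexts, and is wrapped in a DataSet exactly as processData() below expects:
//
// HaplotypeGraph graph = ...; // built without variant contexts, for memory reasons
// List<Datum> datumList = new ArrayList<>();
// datumList.add(new Datum("graph", graph, "Graph Passed In"));
// DataSet graphDataSet = new DataSet(datumList, null);
// new RunHapCollapsePipelinePlugin(null, false)
//         .reference("/path/to/reference.fa")
//         .dbConfigFile("/path/to/dbConfig.txt")
//         .consensusVCFOutputDir("/path/to/consensusVCFs/")
//         .consensusFastaOutputDir("/path/to/consensusFastas/")
//         .collapseMethod("CONSENSUS")
//         .collapseMethodDetails("mxDiv=0.01,method=coverage")
//         .performFunction(graphDataSet);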
@Override
public DataSet processData(DataSet input) {
//Steps
// Build a PHG without the variant contexts (for space)
// We will need to pull each variant context object later
// Pass the PHG to MergeGVCFPlugin with a temp output directory
// Loop through the VCFs in the temp directory and run FindHaplotypeClustersPlugin on each, exporting to a new VCF directory
// Load those VCFs and fastas into the DB using LoadConsensusAnchorSequencesPlugin
//Load up the PHG
List<Datum> temp = input.getDataOfType(HaplotypeGraph.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("HapCountBestPathPlugin: processData: must input one HaplotypeGraph: " + temp.size());
}
HaplotypeGraph hapgraph = (HaplotypeGraph) temp.get(0).getData();
runCollapsePipeline(hapgraph, reference());
return null;
}
/**
 * This is the main method of the pipeline. For each reference range in the graph it will:
 * 1. Merge all the gvcf records for a given haplotype method
 * 2. Cluster the haplotypes together into groups
 * When all reference ranges are done, it loads the exported gvcfs and fastas into the db.
 *
 * @param graph haplotype graph
 * @param referenceFasta reference fasta
 */
public void runCollapsePipeline(HaplotypeGraph graph, String referenceFasta) {
final GenomeSequence referenceSequence = GenomeSequenceBuilder.instance(referenceFasta);
// Get the properties object
Properties dbProperties = loadProperties();
ExecutorService threadpool = ForkJoinPool.commonPool();
// BlockingQueue<Future<RunMergeAndCluster>> futures = new LinkedBlockingQueue<>(Integer.parseInt(dbProperties.getProperty("queueSize","10")));
BlockingQueue<Future<RunMergeAndCluster>> futures = new LinkedBlockingQueue<>();
// Start thread that processes futures
Future<?> processingFuture = threadpool.submit(new ProcessFutures(futures));
// int batchSize = Integer.parseInt(dbProperties.getProperty("batchSize","1"));
// List<ReferenceRange> rangeBatch = new ArrayList<>();
// for(ReferenceRange referenceRange : graph.referenceRanges()) {
// rangeBatch.add(referenceRange);
// if(rangeBatch.size() >= batchSize) {
// HaplotypeGraph filteredGraph = new FilterGraphPlugin(null, false)
// .refRanges(rangeBatch)
// .filter(graph);
//
// //Extract out the variantContexts for the nodes in the graph
// HaplotypeGraph filteredGraphWithVariants = filteredGraph;
// for(ReferenceRange refRangeWithVariants : filteredGraphWithVariants.referenceRanges()) {
// List<Datum> datumList = new ArrayList<>();
// datumList.add(new Datum("graph", filteredGraphWithVariants, "Graph Passed In"));
// datumList.add(new Datum("referenceGenomeSequence", referenceSequence, "GenomeSequence holding the reference"));
// datumList.add(new Datum("refRange", refRangeWithVariants, " Current Reference Range"));
//
// DataSet input = new DataSet(datumList, null);
//
// try {
// // Add processing of the merging and then clustering to the blocking queue
// futures.add(threadpool.submit(new RunMergeAndCluster(input, referenceFasta,
// dbProperties, referenceSequence,
// dbProperties.getProperty("exportMergedVCF", ""))));
// } catch (Exception e) {
// // Catch the error, but finish processing the reference ranges
// myLogger.debug(e.getMessage(), e);
// myLogger.error("Error Processing refRange: " + refRangeWithVariants.intervalString() + "\nError: " + e.getMessage());
// }
// }
// rangeBatch = new ArrayList<>();
// }
// }
for (ReferenceRange referenceRange : graph.referenceRanges()) {
List<Datum> datumList = new ArrayList<>();
datumList.add(new Datum("graph", graph, "Graph Passed In"));
datumList.add(new Datum("referenceGenomeSequence", referenceSequence, "GenomeSequence holding the reference"));
datumList.add(new Datum("refRange", referenceRange, " Current Reference Range"));
DataSet input = new DataSet(datumList, null);
try {
// Add processing of the merging and then clustering to the blocking queue
futures.add(threadpool.submit(new RunMergeAndCluster(input, referenceFasta,
dbProperties, referenceSequence,
dbProperties.getProperty("exportMergedVCF", ""))));
} catch (Exception e) {
// Catch the error, but finish processing the reference ranges
myLogger.debug(e.getMessage(), e);
myLogger.error("Error Processing refRange: " + referenceRange.intervalString() + "\nError: " + e.getMessage());
}
}
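// Submit an empty RunMergeAndCluster as a sentinel: its no-arg constructor sets myIsFinal,
// which tells ProcessFutures that no more reference ranges are coming.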
futures.add(threadpool.submit(new RunMergeAndCluster()));
// Wait until processing of all futures is done
try {
processingFuture.get();
} catch (Exception ex) {
myLogger.debug(ex.getMessage(), ex);
throw new IllegalStateException("RunHapCollapsePipelinePlugin: runCollapsePipeline: Problem with thread processing futures: " + ex.getMessage());
}
threadpool.shutdown();
}
private FindHaplotypeClustersPlugin setupFindHaplotypeClustersPlugin(Properties dbProperties, GenomeSequence referenceSequence) {
return new FindHaplotypeClustersPlugin(null, false)
.outFile(consensusVCFOutputDir())
.sequenceOutDir(consensusFastaOutputDir())
.referenceSequence(referenceSequence)
.clusterMethod(clusterMethod())
.maxDistFromFounder(Double.parseDouble(dbProperties.getProperty("mxDiv", "0.01")))
.seqErrorRate(Double.parseDouble(dbProperties.getProperty("seqErr", "0.01")))
.minSiteForComp(Integer.parseInt(dbProperties.getProperty("minSites", "20")))
.minTaxaInGroup(Integer.parseInt(dbProperties.getProperty("minTaxa", "2")))
.replaceNsWithMajor(Boolean.parseBoolean(dbProperties.getProperty("replaceNsWithMajor","false")))
.maxError(Double.parseDouble(dbProperties.getProperty("maxError",".2")))
.useDepthForCalls(Boolean.parseBoolean(dbProperties.getProperty("useDepth","true")))
.clusterMethod(CLUSTER_METHOD.valueOf(dbProperties.getProperty("method", clusterMethod().toString())));
}
private Properties loadProperties() {
Properties configProperties = new Properties();
try {
configProperties.load(Utils.getBufferedReader(dbConfigFile()));
} catch (Exception e) {
myLogger.error("RunHapCollapsePipelinePlugin: loadProperties Failed to Load Properties.", e);
throw new IllegalStateException("RunHapCollapsePipelinePlugin: loadProperties Failed to Load Properties.", e);
}
return configProperties;
}
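// The same config file is used both for the DB connection (via DBLoadingUtils.connection)
// and for the consensus parameters read by setupFindHaplotypeClustersPlugin() above.
// A minimal sketch of the consensus-related keys with the defaults referenced in this
// class (the DB connection entries are omitted, and the exportMergedVCF path is hypothetical):
//
//   mxDiv=0.01
//   seqErr=0.01
//   minSites=20
//   minTaxa=2
//   replaceNsWithMajor=false
//   maxError=0.2
//   useDepth=true
//   method=coverage
//   exportMergedVCF=/path/to/mergedVCFs/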
/**
 * ProcessFutures is the thread that consumes the RunMergeAndCluster futures and uploads their results to the db.
 * It runs until it takes the final (sentinel) RunMergeAndCluster object, blocking on each future in the queue until it is done.
 * Once more than 1000 haplotypes have accumulated, they are written to the db and the buffer is cleared.
 */
private class ProcessFutures implements Runnable {
private final BlockingQueue<Future<RunMergeAndCluster>> myQueue;
public ProcessFutures(BlockingQueue<Future<RunMergeAndCluster>> queue) {
myQueue = queue;
}
@Override
public void run() {
try (Connection dbConnect = DBLoadingUtils.connection(dbConfigFile(), false)) {
PHGDataWriter phg = new PHGdbAccess(dbConnect);
Multimap<Position, Tuple<AnchorDataPHG, List<String>>> consensusDataMap = HashMultimap.create();
// Deprecated, so not adding changes to allow for test method type
int methodId = phg.putMethod(collapseMethod(), DBLoadingUtils.MethodType.CONSENSUS_ANCHOR_SEQUENCE, pluginParameters());
Future<RunMergeAndCluster> future = myQueue.take();
RunMergeAndCluster mergeAndClusterObject = future.get();
while (!mergeAndClusterObject.myIsFinal) {
if (mergeAndClusterObject == null) {
//Skip over this one.
continue;
}
consensusDataMap.putAll(mergeAndClusterObject.getConsensusDataMap());
ReferenceRange currentRefRange = mergeAndClusterObject.getReferenceRange();
myLogger.debug("RefRange: "+currentRefRange.intervalString());
//TODO make this a parameter
if (consensusDataMap.size() > 1000) {
myLogger.debug("Writing to the database:");
//Load the consensus to the db
phg.putConsensusSequences(consensusDataMap, methodId);
//Clear out the multimap
consensusDataMap.clear();
}
future = myQueue.take();
mergeAndClusterObject = future.get();
}
//Load the consensus to the db
phg.putConsensusSequences(consensusDataMap, methodId);
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("RunHapCollapsePipelinePlugin: ProcessFutures: problem: " + e.getMessage());
}
}
}
/**
 * Class to run the merging and clustering part of the pipeline.
 * It sends a list of HaplotypeNodes to the merge plugin to get a merged GenotypeTable,
 * then clusters the taxa in that GenotypeTable and exports a set of GenotypeTables, each containing one combined taxon.
 * Finally, the thread creates the objects needed to load into the DB.
 */
private class RunMergeAndCluster implements Callable<RunMergeAndCluster> {
private DataSet graphToMerge;
private MergeGVCFPlugin mergePlugin;
private FindHaplotypeClustersPlugin clustersPlugin;
private final String exportMergedVCFPath;
private ReferenceRange currentRefRange;
private final boolean myIsFinal;
private Multimap<Position, Tuple<AnchorDataPHG, List<String>>> consensusDataMap = HashMultimap.create();
private GenomeSequence referenceGenotype = null;
public RunMergeAndCluster(DataSet graphToMerge, String referenceFasta, Properties dbProperties, GenomeSequence referenceSequence, String exportMergedVCFPath) {
this.graphToMerge = graphToMerge;
this.mergePlugin = new MergeGVCFPlugin(null, false).dBConfig(dbConfigFile()).referenceFile(referenceFasta);
this.clustersPlugin = setupFindHaplotypeClustersPlugin(dbProperties, referenceSequence);
this.exportMergedVCFPath = exportMergedVCFPath;
this.referenceGenotype = referenceSequence;
myIsFinal = false;
}
public RunMergeAndCluster() {
this.graphToMerge = null;
this.mergePlugin = null;
this.clustersPlugin = null;
this.exportMergedVCFPath = null;
myIsFinal = true;
}
@Override
public RunMergeAndCluster call() {
if (myIsFinal) {
return this;
}
try {
DataSet ds = mergePlugin.performFunction(graphToMerge);
currentRefRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
if (ds.getDataSet().size() == 0) {
return this;
}
if (!exportMergedVCFPath.equals("")) {
GenotypeTable genotypeTable = (GenotypeTable) ds.getDataOfType(GenotypeTable.class).get(0).getData();
String exportGTFileName = new StringBuilder().append(exportMergedVCFPath).append("chr")
.append(currentRefRange.chromosome().getName())
.append("_stPos")
.append(currentRefRange.start())
.append("_merged.vcf")
.toString();
ExportUtils.writeToVCF(genotypeTable, exportGTFileName, true);
}
DataSet clusteredGenotypes = clustersPlugin.performFunction(ds);
//Convert the consensus GenotypeTables to List<VariantContext> so we can re-add the indels easily
List<List<VariantContext>> consensusSequences = clusteredGenotypes.getDataOfType(GenotypeTable.class)
.stream()
.map(datum -> (GenotypeTable)datum.getData())
.map(genotypeTable -> VCFUtil.convertGenotypeTableToVariantContextList(genotypeTable))
.collect(Collectors.toList());
//Get the original raw haplotypes with variants
List<HaplotypeNode> rawHapsWithVariants = (List<HaplotypeNode>) ds.getDataOfType(List.class).get(0).getData();
List<List<VariantContext>> rawVariants = rawHapsWithVariants.stream().map(haplotypeNode -> haplotypeNode.variantContexts())
.filter(variantContexts -> variantContexts.isPresent())
.map(variantContexts -> variantContexts.get())
.filter(variantContexts -> variantContexts.size()>0)
.collect(Collectors.toList());
List<List<VariantContext>> consensusSequenceWithIndels = FillIndelsIntoConsensus.addInIndels(currentRefRange, referenceGenotype, consensusSequences, rawVariants, 2, mergeRule());
//Loop through the indel re-added consensus sequences and make new AnchorDataPHG objects.
for (List<VariantContext> currentConsensus : consensusSequenceWithIndels) {
//Loop through the clusteredGenotypes and pull out the phg record and add to a map
consensusDataMap.put(Position.of(currentRefRange.chromosome(), currentRefRange.start()),
new Tuple<>(createAnchorPHG(currentConsensus, currentRefRange),
extractTaxaNames(currentConsensus)));
}
return this;
} catch (Exception e) {
//Get out the current ReferenceRange:
ReferenceRange referenceRange = (ReferenceRange) graphToMerge.getDataOfType(ReferenceRange.class).get(0).getData();
myLogger.error("Error processing ReferenceRange:" + referenceRange.intervalString() + " ErrorMessage:" + e.getMessage());
return this;
} finally {
graphToMerge = null;
mergePlugin = null;
clustersPlugin = null;
}
}
public ReferenceRange getReferenceRange() {
return currentRefRange;
}
public Multimap<Position, Tuple<AnchorDataPHG, List<String>>> getConsensusDataMap() {
return consensusDataMap;
}
/**
 * Method to create the AnchorDataPHG object which is written to the database
 * @param variants consensus variants for the current reference range
 * @param referenceRange current reference range
 * @return AnchorDataPHG record holding the consensus sequence for this range
 */
private AnchorDataPHG createAnchorPHG(List<VariantContext> variants, ReferenceRange referenceRange) {
try {
Range<Position> intervalRange = Range.closed(Position.of(referenceRange.chromosome(), referenceRange.start()),
Position.of(referenceRange.chromosome(), referenceRange.end()));
AnchorDataPHG adata = new AnchorDataPHG(intervalRange,
"0",0,0,".", // asm values - all null here
null,
ExtractFastaUtils.extractFastaSequence(variants, referenceRange), -1,-1);
return adata;
} catch (Exception exc) {
myLogger.error("RunHapCollapsePipelinePlugin: Error encoding List to byte[];");
throw new IllegalStateException("RunHapCollapsePipelinePlugin: Error encoding List to byte[];", exc);
}
}
// Consensus sample names encode the member taxa joined by ":"; split them back out and sort.
private List<String> extractTaxaNames(List<VariantContext> vcs) {
return Arrays.stream(vcs.get(0).getSampleNamesOrderedByName().get(0).split(":")).sorted().collect(Collectors.toList());
}
}
@Override
public String getButtonName() {
return "RunHapCollapsePipeline";
}
@Override
public String getToolTipText() {
return "Run the full Haplotype Collapse Pipeline.";
}
@Override
public String pluginDescription() {
return "This Plugin is used to generate and upload a set of consensus haplotypes to a PHG DB.";
}
@Override
public ImageIcon getIcon() {
URL imageURL = KinshipPlugin.class.getResource("/net/maizegenetics/analysis/images/missing.gif");
if (imageURL == null) {
return null;
} else {
return new ImageIcon(imageURL);
}
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(RunHapCollapsePipelinePlugin.class);
// }
// /**
// * Convenience method to run plugin with one return object.
// */
// // TODO: Replace <Type> with specific type.
// public <Type> runPlugin(DataSet input) {
//     return (<Type>) performFunction(input).getData(0).getData();
// }
/**
* Input Reference Fasta
*
* @return Ref
*/
public String reference() {
return myReference.value();
}
/**
* Set Ref. Input Reference Fasta
*
* @param value Ref
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin reference(String value) {
myReference = new PluginParameter<>(myReference, value);
return this;
}
/**
* File holding the DB config information
*
* @return Db Config File
*/
public String dbConfigFile() {
return dbConfigFile.value();
}
/**
* Set Db Config File. File holding the DB config information
*
* @param value Db Config File
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin dbConfigFile(String value) {
dbConfigFile = new PluginParameter<>(dbConfigFile, value);
return this;
}
/**
* Directory where you want to store the output VCFs from
* the consensus process
*
* @return Consensus VCF Output Dir
*/
public String consensusVCFOutputDir() {
return myConsensusVCFOutputDir.value();
}
/**
* Set Consensus VCF Output Dir. Directory where you want
* to store the output VCFs from the consensus process
*
* @param value Consensus VCF Output Dir
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin consensusVCFOutputDir(String value) {
myConsensusVCFOutputDir = new PluginParameter<>(myConsensusVCFOutputDir, value);
return this;
}
/**
* Directory where you want to store the output fastas
* from the consensus process
*
* @return Consensus Fasta Output Dir
*/
public String consensusFastaOutputDir() {
return myConsensusFastaOutputDir.value();
}
/**
* Set Consensus Fasta Output Dir. Directory where you
* want to store the output fastas from the consensus
* process
*
* @param value Consensus Fasta Output Dir
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin consensusFastaOutputDir(String value) {
myConsensusFastaOutputDir = new PluginParameter<>(myConsensusFastaOutputDir, value);
return this;
}
/**
* Name of the collapse method to be stored in the database
*
* @return Collapse Method
*/
public String collapseMethod() {
return myCollapseMethod.value();
}
/**
* Set Collapse Method. Name of the collapse method to
* be stored in the database
*
* @param value Collapse Method
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin collapseMethod(String value) {
myCollapseMethod = new PluginParameter<>(myCollapseMethod, value);
return this;
}
/**
* Details for the collapse method to be stored in the
* database
*
* @return Collapse Method Details
*/
public String collapseMethodDetails() {
return myCollapseMethodDetails.value();
}
/**
* Set Collapse Method Details. Details for the collapse
* method to be stored in the database
*
* @param value Collapse Method Details
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin collapseMethodDetails(String value) {
myCollapseMethodDetails = new PluginParameter<>(myCollapseMethodDetails, value);
return this;
}
/**
* The method used to cluster taxa. Coverage seeds the
* first cluster with the highest coverage taxon. UPGMA
* builds a UPGMA tree then cuts it at maxDistance.
*
* @return Cluster Method
*/
public CLUSTER_METHOD clusterMethod() {
return clusterMethod.value();
}
/**
* Set Cluster Method. The method used to cluster taxa.
* Coverage seeds the first cluster with the highest coverage
* taxon. UPGMA builds a UPGMA tree then cuts it at maxDistance.
*
* @param value Cluster Method
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin clusterMethod(CLUSTER_METHOD value) {
clusterMethod = new PluginParameter<>(clusterMethod, value);
return this;
}
/**
* The rule used to resolve conflicting indels after
* the consensus has been found.
*
* @return Indel Merge Rule
*/
public INDEL_MERGE_RULE mergeRule() {
return mergeRule.value();
}
/**
* Set Indel Merge Rule. The rule used to resolve
* conflicting indels after the consensus has been found.
*
* @param value Indel Merge Rule
*
* @return this plugin
*/
public RunHapCollapsePipelinePlugin mergeRule(INDEL_MERGE_RULE value) {
mergeRule = new PluginParameter<>(mergeRule, value);
return this;
}
}