package net.maizegenetics.pangenome.hapcollapse;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.sql.Connection;
import java.util.*;
import java.util.List;
/**
* Created by zrm22 on 11/6/17.
*
* Plugin that adapts GATK's approach to merging GVCF files.
* It takes all of the VariantContext records for every taxon at a given reference range and merges the calls and depth information into a single VariantContext record.
* For now it exports one VCF per reference range interval; these can then be read by FindHaplotypeClustersPlugin.
* (A usage sketch follows the constructor below.)
*
* The algorithm is as follows:
* Take all the HaplotypeNodes at the reference range
* Create a GenotypeTable which has all indels expanded out to individual positions
* Filter out the positions where there is an indel from the GenotypeTable
* If outputDir is specified, export the GenotypeTable using the TASSEL VCF exporter,
* otherwise return the GenotypeTable in a DataSet
*/
@Deprecated
public class MergeGVCFPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(MergeGVCFPlugin.class);
private PluginParameter<String> myOutputDir = new PluginParameter.Builder<>("outputDir", null, String.class)
.description("Directory where you want to store the output VCFs")
.outDir()
.build();
private PluginParameter<String> myReferenceFile = new PluginParameter.Builder<>("referenceFile", null, String.class)
.description("Reference Input file")
.inFile()
.required(true)
.build();
private PluginParameter<String> myDBConfig = new PluginParameter.Builder<>("dbConfig", null, String.class)
.description("Config file used to set up the db connection")
.required(true)
.inFile()
.build();
public MergeGVCFPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
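// A minimal usage sketch (not part of the original source): one way this deprecated
// plugin could be driven from a pipeline. The objects graph, refSequence, and range are
// assumed to have been built upstream; the file paths and Datum names are illustrative
// only, since processData() looks datums up by type, not by name.
//
// DataSet input = new DataSet(Arrays.asList(
//         new Datum("HaplotypeGraph", graph, "graph for the ranges of interest"),
//         new Datum("ReferenceSequence", refSequence, "reference GenomeSequence"),
//         new Datum("ReferenceRange", range, "single reference range to merge")), null);
//
// DataSet merged = new MergeGVCFPlugin(null, false)
//         .referenceFile("/path/to/reference.fa")
//         .dBConfig("/path/to/db.config")
//         .outputDir("/path/to/outputDir")   // optional; omit to skip the per-range VCF export
//         .performFunction(input);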
@Override
public DataSet processData(DataSet input) {
List<Datum> temp = input.getDataOfType(HaplotypeGraph.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("MergeGVCFPlugin: processData: must input one HaplotypeGraph");
}
HaplotypeGraph graph = (HaplotypeGraph) temp.get(0).getData();
temp = input.getDataOfType(GenomeSequence.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("MergeGVCFPlugin: processData: must input one reference GenomeSequence");
}
GenomeSequence genomeSequence = (GenomeSequence) temp.get(0).getData();
temp = input.getDataOfType(ReferenceRange.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("MergeGVCFPlugin: processData: must input one ReferenceRange object");
}
ReferenceRange range = (ReferenceRange) temp.get(0).getData();
myLogger.info("Starting to merge variants:");
//Converting the GVCFs to a Genotype table marking indels as + or -
try {
List<HaplotypeNode> nodesWithVariants = extractNodesWithVariants(graph.nodes(range), range);
GenotypeTable mergedGTWithIndels = MergeGVCFUtils.createGenotypeTableFromHaplotypeNodes(range, nodesWithVariants, genomeSequence);
GenotypeTable mergedGTNoIndels = MergeGVCFUtils.removeIndels(mergedGTWithIndels);
if (outputDir() != null) {
ExportUtils.writeToVCF(mergedGTNoIndels, outputDir() + "/outputMergedVCF_chr" + range.chromosome().getName() + "_stPos" + range.start() + ".vcf", true);
}
//Return the merged GenotypeTable, the reference range, and the original nodes with the variants.
//Each of these pieces of information is required by later steps in the pipeline.
return new DataSet(Arrays.asList(new Datum("MergedGenotypeTable", mergedGTNoIndels, "Genotype Table after merging process"),
new Datum("ReferenceRange", range, "Reference Range Currently Processed"),
new Datum("RawNodesWithVariants", nodesWithVariants, "Raw HaplotypeNodes with the VariantContexts")), null);
}
catch(Exception e) {
myLogger.warn("MergeGVCFPlugin WARNING: error converting VariantList to GenotypeTable. MergeGVCFPlugin will return an empty Dataset. Reference Range: "+range.intervalString()+"\n"+e.getMessage()+"\n");
return new DataSet(new ArrayList<>(),null);
}
}
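// A hedged sketch of how a downstream step (e.g. preparing input for
// FindHaplotypeClustersPlugin) might unpack the DataSet returned above. Variable names
// are illustrative; the empty DataSet produced by the catch block must be handled, and
// the List cast is unchecked.
//
// DataSet merged = mergePlugin.performFunction(input);
// if (!merged.getDataOfType(GenotypeTable.class).isEmpty()) {
//     GenotypeTable table = (GenotypeTable) merged.getDataOfType(GenotypeTable.class).get(0).getData();
//     ReferenceRange mergedRange = (ReferenceRange) merged.getDataOfType(ReferenceRange.class).get(0).getData();
//     List<HaplotypeNode> rawNodes = (List<HaplotypeNode>) merged.getData(2).getData();
//     // ... hand table, mergedRange, and rawNodes to the clustering step
// }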
private List<HaplotypeNode> extractNodesWithVariants(List<HaplotypeNode> nodesAtRefRange, ReferenceRange referenceRange) {
List<HaplotypeNode> nodesWithVariantContexts = null;
//Check to see if the nodesAtRefRange already have variantContexts
boolean hasVariants = false;
for(HaplotypeNode node : nodesAtRefRange) {
if(node.variantContexts().isPresent()) {
hasVariants = true;
break;
}
}
//If the nodes do not have variantContexts, we need to get them from the Database
if(hasVariants) {
nodesWithVariantContexts = nodesAtRefRange;
}
else {
//Query the database to get the VariantContexts. The full graph is likely too large to fit into memory if we do not pull the VariantContexts out one reference range at a time.
//Ideally a single Connection would be opened once by the caller (e.g. by the graph builder) and passed in, rather than opening one per reference range (see the sketch after this method).
try (Connection connection = CreateGraphUtils.connection(dBConfig())) {
nodesWithVariantContexts = CreateGraphUtils.createHaplotypeNodesWithVariants(connection, new HashSet<>(nodesAtRefRange)).get(referenceRange);
} catch (Exception e) {
myLogger.debug("MergeGVCFPlugin. Error setting up the connection/retrieving VCF records from DB:"+e.getMessage());
throw new IllegalStateException("Merge GVCF Plugin. Error setting up the connection/retrieving VCF records from DB.",e);
}
}
return nodesWithVariantContexts;
}
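// A sketch of the refactor suggested in the comment above (an assumption, not existing
// behavior): if many reference ranges are processed in one run, the caller could open a
// single Connection and fetch the VariantContexts for all nodes at once, rather than
// reconnecting per range. The Map-shaped return type is assumed, consistent with the
// .get(referenceRange) call used above; allNodes is an illustrative name.
//
// try (Connection connection = CreateGraphUtils.connection(dBConfig())) {
//     Map<ReferenceRange, List<HaplotypeNode>> nodesByRange =
//             CreateGraphUtils.createHaplotypeNodesWithVariants(connection, new HashSet<>(allNodes));
//     for (Map.Entry<ReferenceRange, List<HaplotypeNode>> entry : nodesByRange.entrySet()) {
//         // ... merge entry.getValue() for entry.getKey()
//     }
// }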
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "Merge GVCFs";
}
@Override
public String getToolTipText() {
return "Merge GVCFs";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(MergeGVCFPlugin.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace with specific type.
// public runPlugin(DataSet input) {
// return () performFunction(input).getData(0).getData();
// }
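// A plausible completion of the generated stub above (an assumption; the TODO was never
// resolved): the first Datum returned by processData is the merged GenotypeTable, so the
// convenience method would look roughly like:
//
// public GenotypeTable runPlugin(DataSet input) {
//     return (GenotypeTable) performFunction(input).getData(0).getData();
// }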
/**
* Directory where you want to store the output VCFs
*
* @return Output Dir
*/
public String outputDir() {
return myOutputDir.value();
}
/**
* Set Output Dir. Directory where you want to store the
* output VCFs
*
* @param value Output Dir
*
* @return this plugin
*/
public MergeGVCFPlugin outputDir(String value) {
myOutputDir = new PluginParameter<>(myOutputDir, value);
return this;
}
/**
* Reference Input file
*
* @return Reference File
*/
public String referenceFile() {
return myReferenceFile.value();
}
/**
* Set Reference File. Reference Input file
*
* @param value Reference File
*
* @return this plugin
*/
public MergeGVCFPlugin referenceFile(String value) {
myReferenceFile = new PluginParameter<>(myReferenceFile, value);
return this;
}
/**
* Config file used to set up the db connection
*
* @return Db Config
*/
public String dBConfig() {
return myDBConfig.value();
}
/**
* Set Db Config. Config file used to set up the db connection
*
* @param value Db Config
*
* @return this plugin
*/
public MergeGVCFPlugin dBConfig(String value) {
myDBConfig = new PluginParameter<>(myDBConfig, value);
return this;
}
}