package net.maizegenetics.pangenome.db_loading;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.ConvertVariantContextToVariantInfo;
import net.maizegenetics.pangenome.fastaExtraction.GVCFSequence;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.File;
import java.sql.Connection;
import java.util.*;
import java.util.stream.Collectors;
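/*
 * A minimal usage sketch (file paths are hypothetical; in practice this deprecated plugin
 * is normally driven through the TASSEL/PHG pipeline rather than instantiated directly):
 *
 * DataSet result = new LoadHapSequencesFromGVCFPlugin(null, false)
 *         .inputFile("/path/to/sample.g.vcf.gz")
 *         .reference("/path/to/reference.fa")
 *         .dbConfigFile("/path/to/db.config")
 *         .loadDataFile("/path/to/load_data.txt")
 *         .bedFile("/path/to/ranges.bed")
 *         .performFunction(null);
 */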
/**
 * Plugin that uploads a GVCF file to the DB. The GVCF may be filtered or unfiltered; either way its records are stored as haplotypes in the database.
 * This plugin should work with GATK/Sentieon GVCFs as well as assembly GVCFs. It currently will not work with consensus haplotypes.
 * Created by Zack Miller [email protected] June 25th, 2018
 */
@Deprecated
public class LoadHapSequencesFromGVCFPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(LoadHapSequencesFromGVCFPlugin.class);
private PluginParameter<String> myInputFile = new PluginParameter.Builder<>("inputGVCFFile", null, String.class)
.description("GVCF file to be loaded as haplotypes.")
.inFile()
.required(true)
.build();
private PluginParameter<String> myReference = new PluginParameter.Builder<>("ref", null, String.class)
.description("Input Reference Fasta")
.required(true)
.inFile()
.build();
private PluginParameter<String> dbConfigFile = new PluginParameter.Builder<>("dbConfigFile", null, String.class)
.description("File holding the DB config information")
.required(true)
.inFile()
.build();
private PluginParameter<String> loadDataFile = new PluginParameter.Builder<>("loadDataFile", null, String.class)
.description("File holding the genotype load data (line name, ploidy, haplotype number, method, etc.)")
.required(true)
.inFile()
.build();
private PluginParameter<String> bedFile = new PluginParameter.Builder<>("bedFile", null, String.class)
.description("File holding the Reference Range Information")
.required(true)
.inFile()
.build();
public LoadHapSequencesFromGVCFPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
//Steps:
//Walk through the BED file to get the list of regions we want to extract
RangeSet<Position> focusRanges = parseBedFileIntoRangeSet(bedFile());
//Open up the loadDataFile and parse out the needed fields
Map<String, String> loadDataMap = parseLoadData(loadDataFile());
Properties configProperties = new Properties();
try {
configProperties.load(Utils.getBufferedReader(dbConfigFile()));
} catch (Exception e) {
throw new IllegalStateException("Error reading the DB config file: " + dbConfigFile(), e);
}
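/*
 * The DB config file is a standard Java properties file. A minimal sketch, with key
 * names assumed from the usual PHG conventions (the values are hypothetical):
 *
 * host=localHost
 * user=sqlite
 * password=sqlite
 * DB=/path/to/phg.db
 * DBtype=sqlite
 */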
GenomeSequence gs = GVCFSequence.instance(reference(),inputFile());
myLogger.info("Begin LoadHapSequencesFromGVCFPlugin ...");
try (Connection dbConnect = DBLoadingUtils.connection(dbConfigFile(), false)) {
//Set up the objects needed when we start iterating through the BED file
PHGDataWriter phg = new PHGdbAccess(dbConnect);
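// requireIndex=false lets the reader open without checking for an index up front, but the
// query() calls below still need a tabix (.tbi) or tribble (.idx) index next to the GVCF.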
VCFFileReader gvcfFile = new VCFFileReader(new File(inputFile()),false);
Map<Integer, AnchorDataPHG> anchorSequences = new HashMap<>();
RangeMap<Position, Integer> chromAnchorStartEnd = null;
// First load method, genotype, gametes, gamete_groups tables
GenoHaploData ghd = new GenoHaploData(Integer.parseInt(loadDataMap.get("ploidy")),
Boolean.parseBoolean(loadDataMap.get("is_ref")),
loadDataMap.get("line"),
loadDataMap.get("line_data"),
Boolean.parseBoolean(loadDataMap.get("genesPhased")),
Boolean.parseBoolean(loadDataMap.get("chromsPhased")),
Integer.parseInt(loadDataMap.get("hapNumber")),
Float.parseFloat(loadDataMap.get("conf")));
phg.putGenoAndHaploTypeData(ghd);
// Load the gamete_groups and gamete_haplotypes table
String nameWithHap = loadDataMap.get("line") + "_" + loadDataMap.get("hapNumber"); // ref line is always hap1
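// e.g. line "B73" with hapNumber 1 yields the gamete name "B73_1"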
List<String> gameteGroupList = new ArrayList<>();
gameteGroupList.add(nameWithHap);
phg.putGameteGroupAndHaplotypes(gameteGroupList);
// Put the method data - identifies for each haplotype how the anchors were created
// anchor and inter-anchor are both method_type "anchor_haplotypes"
Map<String, String> pluginParams = pluginParameters();
pluginParams.put("notes",loadDataMap.get("method_details"));
int method_id = phg.putMethod(loadDataMap.get("method"), DBLoadingUtils.MethodType.ANCHOR_HAPLOTYPES,pluginParams);
// Add info to group tables
int gamete_grp_id = phg.getGameteGroupIDFromTaxaList(gameteGroupList);
//For each range in the bed file, process
String currentChromosome = "";
String prevChromosome = "";
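// Haplotypes are accumulated in anchorSequences one chromosome at a time; when the BED
// ranges cross into a new chromosome, the previous chromosome's data is flushed to the DB.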
for (Range<Position> currentRange : focusRanges.asRanges()) {
if (currentRange.lowerEndpoint().getChromosome() == null || !currentRange.lowerEndpoint().getChromosome().getName().equals(currentChromosome)) {
chromAnchorStartEnd = phg.getIntervalRangesWithIDForChrom(currentRange.lowerEndpoint().getChromosome().getName());
prevChromosome = currentChromosome;
currentChromosome = chromAnchorStartEnd.asMapOfRanges()
.keySet()
.iterator()
.next()
.lowerEndpoint()
.getChromosome()
.getName();
if (anchorSequences.size() > 0) { // load previous chromosome's data
myLogger.info("\ncalling putHaplotypes for chrom " + prevChromosome);
phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, prevChromosome);
anchorSequences.clear(); // these have been processed, clear the map, start fresh with next chrom
}
}
//Extract out the List<VariantContext> of GVCF records overlapping this range
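//Note: VCFFileReader.query expects 1-based, inclusive coordinates, which is what Position stores.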
List<VariantContext> gvcfContexts = gvcfFile.query(currentRange.lowerEndpoint().getChromosome().getName(),
currentRange.lowerEndpoint().getPosition(),
currentRange.upperEndpoint().getPosition())
.stream()
.collect(Collectors.toList());
//Extract out the fasta sequence:
// This looks wrong - shouldn't it be coordinates of the hap-line, using hap fasta?
String sequence = gs.genotypeAsString(currentRange.lowerEndpoint().getChromosome(),
currentRange.lowerEndpoint().getPosition(),
currentRange.upperEndpoint().getPosition());
//Add the sequences and the encoded GVCF to the datastructure
Map.Entry<Range<Position>, Integer> mapEntry
= chromAnchorStartEnd.getEntry(currentRange.lowerEndpoint());
int anchorId = mapEntry.getValue();
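// AnchorDataPHG bundles the data stored per reference range: the range itself, the source
// GVCF path, the encoded variant list, and the haplotype sequence; the null arguments are
// fields this plugin does not populate.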
AnchorDataPHG aData = new AnchorDataPHG(currentRange, null, inputFile(), null,
ConvertVariantContextToVariantInfo.convertVCListToVariantInfoList(gvcfContexts, loadDataMap.get("line"), Integer.parseInt(loadDataMap.get("hapNumber"))),
sequence);
anchorSequences.put(anchorId, aData);
}
if (anchorSequences.size() > 0) {
// load the data on per-chrom basis
myLogger.info("\ncalling putHaplotypes for chrom " + currentChromosome);
phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, currentChromosome);
}
((PHGdbAccess)phg).close();
} catch (Exception exc) {
myLogger.debug(exc.getMessage(), exc);
throw new IllegalStateException("Error writing haplotypes to the DB: ", exc);
}
return null;
}
/**
 * Simple method to parse a BED file into a RangeSet of Positions for use in the DB uploading.
 * @param bedFile path to the BED file of reference ranges
 * @return the set of 1-based, inclusive ranges to load
 */
private RangeSet<Position> parseBedFileIntoRangeSet(String bedFile) {
RangeSet<Position> focusRanges = TreeRangeSet.create();
try(BufferedReader reader = Utils.getBufferedReader(bedFile)) {
String currentLine = "";
while((currentLine = reader.readLine()) != null) {
//Split the bed file by tabs
String[] bedRecordSplit = currentLine.split("\t");
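// BED coordinates are 0-based and half-open; Position is 1-based and inclusive,
// so shift the start by +1 and keep the end as-is.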
Position startPos = Position.of(bedRecordSplit[0], Integer.parseInt(bedRecordSplit[1]) + 1);
Position endPos = Position.of(bedRecordSplit[0], Integer.parseInt(bedRecordSplit[2]));
focusRanges.add(Range.closed(startPos, endPos));
}
}
catch (Exception exc) {
myLogger.debug("Error loading the BED file", exc);
throw new IllegalStateException("Error loading the BED file: ", exc);
}
return focusRanges;
}
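/*
 * A sketch of the expected load-data file: tab-delimited, one header line plus one data
 * line (shown here with spaces for readability; the values are hypothetical):
 *
 * Genotype  Hapnumber  Dataline   Ploidy  Reference  GenesPhased  ChromsPhased  Confidence  Method     MethodDetails
 * B73       1          gvcf_load  1       false      true         true          1           GATK_GVCF  haplotypes from a GATK gvcf
 */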
/**
 * Method to parse and verify the load data file input to this plugin.
 * @param loadDataFile path to the tab-delimited genotype metadata file
 * @return a map from field name (e.g. "line", "ploidy", "method") to its value
 */
private Map<String, String> parseLoadData(String loadDataFile) {
myLogger.info("parseLoadData: reading genomeDataFile: " + loadDataFile);
Map<String, String> loadDataMap = new HashMap<>();
try (BufferedReader br = Utils.getBufferedReader(loadDataFile)) {
// parse the header line to locate the required columns
String headers = br.readLine(); // read the header line
int lineIndex = -1;
int lineDataIndex = -1;
int ploidyIndex = -1;
int hapNumberIndex = -1;
int isRefIndex = -1;
int genesPhasedIndex = -1;
int chromPhasedIndex = -1;
int confIndex = -1;
int methodIndex = -1;
int methodDetailsIndex = -1;
int idx = 0;
myLogger.info("GenomeFile header line: " + headers);
for (String header : headers.split("\\t")) {
if (header.equalsIgnoreCase("Genotype")) {
lineIndex = idx;
} else if (header.equalsIgnoreCase("Hapnumber")) {
hapNumberIndex = idx;
} else if (header.equalsIgnoreCase("Dataline")) {
lineDataIndex = idx;
} else if (header.equalsIgnoreCase("ploidy")) {
ploidyIndex = idx;
} else if (header.equalsIgnoreCase("reference")) {
isRefIndex = idx;
} else if (header.equalsIgnoreCase("genesPhased")) {
genesPhasedIndex = idx;
} else if (header.equalsIgnoreCase("chromsPhased")) {
chromPhasedIndex = idx;
} else if (header.equalsIgnoreCase("confidence")) {
confIndex = idx;
} else if (header.equalsIgnoreCase("Method")) {
methodIndex = idx;
} else if (header.equalsIgnoreCase("MethodDetails")) {
methodDetailsIndex = idx;
}
idx++;
}
if (lineIndex == -1 || lineDataIndex == -1 || ploidyIndex == -1 ||
hapNumberIndex == -1 || isRefIndex == -1 || genesPhasedIndex == -1 ||
chromPhasedIndex == -1 || confIndex == -1 || methodIndex == -1 ||
methodDetailsIndex == -1 ) {
myLogger.error("ERROR - Genotype datafile does not contain the required 10 fields");
myLogger.error("Please check your file for the tab delimited, case-insensistive headers: ");
myLogger.error(" Genotype Hapnumber Dataline Ploidy Reference GenePhased ChromPhased Confidence Method MethodDetails RefVersion");
throw new IllegalArgumentException("Wrong number of header columns in genome data file");
}
// All headers are present - check there is data for the required columns
String dataLine = br.readLine();
String[] dataTokens = dataLine.split("\\t");
if (dataTokens.length != 10) {
myLogger.error("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
throw new IllegalArgumentException("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
}
loadDataMap.put("line",dataTokens[lineIndex]);
loadDataMap.put("line_data", dataTokens[lineDataIndex]);
loadDataMap.put("ploidy",dataTokens[ploidyIndex]);
loadDataMap.put("hapNumber",dataTokens[hapNumberIndex]);
loadDataMap.put("is_ref", (dataTokens[isRefIndex].equalsIgnoreCase("true")) ? "true" : "false");
loadDataMap.put("genesPhased",(dataTokens[genesPhasedIndex].equalsIgnoreCase("true")) ? "true" : "false");
loadDataMap.put("chromsPhased", (dataTokens[chromPhasedIndex].equalsIgnoreCase("true")) ? "true" : "false");
loadDataMap.put("conf", dataTokens[confIndex]);
loadDataMap.put("method", dataTokens[methodIndex]);
loadDataMap.put("method_details", dataTokens[methodDetailsIndex]);
} catch (Exception exc) {
myLogger.error("ERROR parsing fields in genomeData file");
myLogger.error(" Expecting a tab-delimited file with these columns: " );
myLogger.error(" Genotype/Hapnumber/Dataline/ploidy/reference/genePhased/chromPhased/confidence/Method/MethodDetails/RefVersion");
myLogger.error("Please fix the file and try again - aborting the run");
throw new IllegalArgumentException("ERROR parsing genomeData file");
}
return loadDataMap;
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "Load GVCF file to DB";
}
@Override
public String getToolTipText() {
return "Load GVCF file to DB";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(LoadHapSequencesFromGVCFPlugin.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace <Type> with specific type.
// public <Type> runPlugin(DataSet input) {
// return (<Type>) performFunction(input).getData(0).getData();
// }
/**
* GVCF file to be loaded as haplotypes.
*
* @return Input GVCF File
*/
public String inputFile() {
return myInputFile.value();
}
/**
* Set Input GVCF File. GVCF file to be loaded as haplotypes.
*
* @param value Input GVCF File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin inputFile(String value) {
myInputFile = new PluginParameter<>(myInputFile, value);
return this;
}
/**
* Input Reference Fasta
*
* @return Ref
*/
public String reference() {
return myReference.value();
}
/**
* Set Ref. Input Reference Fasta
*
* @param value Ref
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin reference(String value) {
myReference = new PluginParameter<>(myReference, value);
return this;
}
/**
* File holding the DB config information
*
* @return Db Config File
*/
public String dbConfigFile() {
return dbConfigFile.value();
}
/**
* Set Db Config File. File holding the DB config information
*
* @param value Db Config File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin dbConfigFile(String value) {
dbConfigFile = new PluginParameter<>(dbConfigFile, value);
return this;
}
/**
* File holding the genotype load data (line name, ploidy, haplotype number, method, etc.)
*
* @return Load Data File
*/
public String loadDataFile() {
return loadDataFile.value();
}
/**
* Set Load Data File. File holding the genotype load data (line name, ploidy, haplotype number, method, etc.)
*
* @param value Load Data File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin loadDataFile(String value) {
loadDataFile = new PluginParameter<>(loadDataFile, value);
return this;
}
/**
* File holding the Reference Range Information
*
* @return Bed File
*/
public String bedFile() {
return bedFile.value();
}
/**
* Set Bed File. File holding the Reference Range Information
*
* @param value Bed File
*
* @return this plugin
*/
public LoadHapSequencesFromGVCFPlugin bedFile(String value) {
bedFile = new PluginParameter<>(bedFile, value);
return this;
}
}