net.maizegenetics.pangenome.db_loading.LoadHapSequencesToDBPlugin
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.db_loading;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.File;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.swing.ImageIcon;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.pangenome.api.ConvertVariantContextToVariantInfo;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import org.apache.log4j.Logger;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
/**
* This plugin takes data processed through the GATK haplotype caller, or through
* processAssemblyGenomes.Minimap2PipelinePlugin. In either case, the data is
* subsequently run through ComputeMedianGVCFAndFilter, which creates a fasta for
* loading into the DB via this plugin.
*
* The ID line in the fasta is used to associate each sequence with a DB
* genome interval id. Entries are added to the following tables for each haplotype:
* - genotypes
* - gametes
* - gamete_groups (only the single taxa/chromosome in each group)
* - gamete_haplotypes (associates gamete_grp_id to the gamete_id)
* - haplotypes (sequence related data for each genome_interval for the specified gamete is added)
*
* INPUT:
* 1. configuration file indicating db connection data
* 2. a fasta file with haplotype anchor sequences. Fasta ID lines look as below:
* >refChrom:refStart:refEnd gvcfFileName
* 3. a tab-delimited file containing parameters specific to this haplotype, columns are:
* Genotype/Hapnumber/Dataline/ploidy/reference/genesPhased/chromsPhased/confidence/Method/MethodDetails/RefVersion
* 4. Path to directory containing the gvcf file
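*
* Example fasta ID line and genome data row (hypothetical values, for illustration only;
* the genome data row is tab-delimited, tabs shown here as "/"):
* >1:1:3500 LineA.g.vcf
* LineA/1/inbred line/1/false/true/true/1.0/GATK_PIPELINE/GATK haplotype caller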
*
*
* OUTPUT:
* Data is written to the db tables listed above.
*
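* Usage sketch (hypothetical paths; the caller supplies an open JDBC Connection
* wrapped in a DataSet, which processData() reads via input.getData(0)):
*
*   Connection conn = ...; // connection obtained from the PHG config file
*   DataSet input = new DataSet(new Datum("DBConnection", conn, null), null);
*   new LoadHapSequencesToDBPlugin(null, false)
*       .fasta("/path/to/haplotypes.fa")
*       .genomeData("/path/to/genome_data.txt")
*       .gvcf("/path/to/gvcf_dir/")
*       .performFunction(input);
*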
* @author lcj34
*
*/
@Deprecated
public class LoadHapSequencesToDBPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(LoadHapSequencesToDBPlugin.class);
private PluginParameter<String> fasta = new PluginParameter.Builder<>("fasta", null, String.class).guiName("Fasta File").required(true).inFile()
.description("Fasta file containing haplotype sequences").build();
private PluginParameter<String> genomeData = new PluginParameter.Builder<>("genomeData", null, String.class).guiName("Genome Data File").required(true).inFile()
.description("Path to tab-delimited file containing genome specific data with header line:\nGenotype Hapnumber Dataline Ploidy Reference GenesPhased ChromsPhased Confidence Method MethodDetails RefVersion")
.build();
private PluginParameter<String> gvcf = new PluginParameter.Builder<>("gvcf", null, String.class).guiName("GVCF Directory").required(true)
.description("Directory containing the GVCF file used to create the haplotype fasta file. Directory path only, including trailing /").build();
// This data is populated from the genomeData file
private String line; // refName to be stored as line_name in genotypes table, e.g. B73Ref
private String line_data;
private int ploidy;
private int hapNumber;
private boolean genesPhased;
private boolean chromsPhased;
private float conf;
private String method;
private String method_details;
boolean is_ref = false;
public LoadHapSequencesToDBPlugin() {
super(null, false);
}
public LoadHapSequencesToDBPlugin(Frame parentFrame) {
super(parentFrame, false);
}
public LoadHapSequencesToDBPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
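// Counters for fasta entries that did / did not match a DB genome interval (see addSequenceToMap)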
private static int numMapped = 0;
private static int notMapped = 0;
@Override
public void postProcessParameters() {
myLogger.info("postProcessParameters: reading genomeDataFile: " + genomeData());
try (BufferedReader br = Utils.getBufferedReader(genomeData())){
// parse input file to find arguments
String headers = br.readLine(); // read the header line
int lineIndex = -1;
int lineDataIndex = -1;
int ploidyIndex = -1;
int hapNumberIndex = -1;
int isRefIndex = -1;
int genesPhasedIndex = -1;
int chromPhasedIndex = -1;
int confIndex = -1;
int methodIndex = -1;
int methodDetailsIndex = -1;
int idx = 0;
myLogger.info("GenomeFile header line: " + headers);
for (String header : headers.split("\\t")) {
if (header.equalsIgnoreCase("Genotype")) {
lineIndex = idx;
} else if (header.equalsIgnoreCase("Hapnumber")) {
hapNumberIndex = idx;
} else if (header.equalsIgnoreCase("Dataline")) {
lineDataIndex = idx;
} else if (header.equalsIgnoreCase("ploidy")) {
ploidyIndex = idx;
} else if (header.equalsIgnoreCase("reference")) {
isRefIndex = idx;
} else if (header.equalsIgnoreCase("genesPhased")) {
genesPhasedIndex = idx;
} else if (header.equalsIgnoreCase("chromsPhased")) {
chromPhasedIndex = idx;
} else if (header.equalsIgnoreCase("confidence")) {
confIndex = idx;
} else if (header.equalsIgnoreCase("Method")) {
methodIndex = idx;
} else if (header.equalsIgnoreCase("MethodDetails")) {
methodDetailsIndex = idx;
}
idx++;
}
if (lineIndex == -1 || lineDataIndex == -1 || ploidyIndex == -1 ||
hapNumberIndex == -1 || isRefIndex == -1 || genesPhasedIndex == -1 ||
chromPhasedIndex == -1 || confIndex == -1 || methodIndex == -1 ||
methodDetailsIndex == -1 ) {
myLogger.error("ERROR - Genotype datafile does not contain the required 10 fields");
myLogger.error("Please check your file for the tab delimited, case-insensistive headers: ");
myLogger.error(" Genotype Hapnumber Dataline Ploidy Reference GenePhased ChromPhased Confidence Method MethodDetails");
throw new IllegalArgumentException("Wrong number of header columns in genome data file");
}
// All headers are present - check there is data for the required columns
String dataLine = br.readLine();
String[] dataTokens = dataLine.split("\\t");
if (dataTokens.length != 10) {
myLogger.error("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
throw new IllegalArgumentException("ERROR - wrong number of data items in genotype datafile, expecting 10, found " + dataTokens.length);
}
line = dataTokens[lineIndex];
line_data = dataTokens[lineDataIndex];
ploidy = Integer.parseInt(dataTokens[ploidyIndex]);
hapNumber = Integer.parseInt(dataTokens[hapNumberIndex]);
is_ref = dataTokens[isRefIndex].equalsIgnoreCase("true");
genesPhased = dataTokens[genesPhasedIndex].equalsIgnoreCase("true");
chromsPhased = dataTokens[chromPhasedIndex].equalsIgnoreCase("true");
conf = Float.parseFloat(dataTokens[confIndex]);
method = dataTokens[methodIndex];
method_details = dataTokens[methodDetailsIndex];
} catch (Exception exc) {
myLogger.error("ERROR parsing fields in genomeData file");
myLogger.error(" Expecting a tab-delimited file with these columns: " );
myLogger.error(" Genotype/Hapnumber/Dataline/ploidy/reference/genePhased/chromPhased/confidence/Method/MethodDetails/RefVersion");
myLogger.error("Please fix the file and try again - aborting the run");
throw new IllegalArgumentException("ERROR parsing genomeData file");
}
}
@Override
public DataSet processData(DataSet input) {
long totalTime = System.nanoTime();
long time=System.nanoTime();
RangeMap<Position, Integer> chromPosIDMap = null;
// Map of anchorId -> AnchorDataPHG for the current chromosome
Map<Integer, AnchorDataPHG> anchorSequences = new HashMap<>();
Connection dbConnect = (Connection)input.getData(0).getData();
if (dbConnect == null) {
myLogger.error("LoadHapSequencesToDBPlugin: No DB CONNECTION supplied !");
return null;
}
myLogger.info("LoadHapSequencesToDBPlugin: have connection, create PHGdbAccess object");
PHGDataWriter phg = new PHGdbAccess(dbConnect);
// Read the fasta file, pulling anchors on a per-chromosome basis
try (BufferedReader fastaBR = Utils.getBufferedReader(fasta())){
String fLine;
String oldChrom = "-1";
String refData = "none";
String refChr = "none";
StringBuilder seqSB = new StringBuilder();
int idlineCount = 0;
int fastaSeqCount = 0;
int numUnknownSeq = 0;
// Load initial data to genotypes, gametes, gamete_groups, method
// add the genotype/haplotype info to DB
GenoHaploData ghd = new GenoHaploData(ploidy,is_ref,line,line_data, genesPhased, chromsPhased, hapNumber, conf);
phg.putGenoAndHaploTypeData(ghd);
// Load the gamete_groups and gamete_haplotypes table
String nameWithHap = line + "_" + hapNumber; // ref line is always hap1
List<String> gameteGroupList = new ArrayList<>();
gameteGroupList.add(nameWithHap);
phg.putGameteGroupAndHaplotypes(gameteGroupList);
// Put the method data - identifies for each haplotype how the anchors were created
// anchor and inter-anchor are both method_type "anchor_haplotypes"
int method_id = phg.putMethod(method, DBLoadingUtils.MethodType.ANCHOR_HAPLOTYPES,pluginParameters());
// Add info to group tables
int gamete_grp_id = phg.getGameteGroupIDFromTaxaList(gameteGroupList);
while ((fLine = fastaBR.readLine()) != null) {
if (fLine.startsWith(">")) {
// id line: >refChrom:refstart:refEnd gvcfFileName
idlineCount++;
if (seqSB.length() > 0) { // process previous fasta sequence
fastaSeqCount++;
// look for anchorID.
addSequenceToMap(chromPosIDMap, anchorSequences,seqSB.toString(), refData);
seqSB.setLength(0); // clear the previous sequence
}
refData = fLine.replace(">","");
refChr = refData.split(":")[0]; // 8:09890:98900 = chr/start/end
if (!refChr.equals(oldChrom)) {
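// Chromosome boundary: fetch the anchor ranges for the new chromosome, then flush
// the sequences accumulated for the previous chromosome to the DB.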
myLogger.info("Time to process chrom " + oldChrom + ": " + (System.nanoTime() - time) / 1e9 + " seconds");
time = System.nanoTime();
// In the DB, the chromosome name is stored from the chromosome object.
// The Chromosome class will parse off CHR or CHROMOSOME before creating the name
// Pass that here.
String refChrName = Chromosome.instance(refChr).getName();
myLogger.info("LoadHapSequencesToDBPlugin: getting anchors for chrom " + refChr + " as " + refChrName);
// Position object is key, anchorID is value
chromPosIDMap = phg.getIntervalRangesWithIDForChrom(refChrName);
myLogger.info("Number of anchors for chrom " + refChr + " is " + chromPosIDMap.asMapOfRanges().keySet().size());
time = System.nanoTime();
if (anchorSequences.size() > 0) {
// With the variants table schema, we must load on a per-chrom basis
processGVCFVariants(anchorSequences);
myLogger.info("\ncalling putHaplotypes for chrom " + oldChrom + " with list of size: " + anchorSequences.size());
phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, oldChrom);
anchorSequences.clear(); // start fresh
}
oldChrom = refChr; // previous chromosome's data has been flushed; track the new one
}
} else {
seqSB.append(fLine);
}
}
// Process last fasta entry, load last chromosome
if (seqSB.length() > 0) {
fastaSeqCount++;
// look for anchorID, add to map
addSequenceToMap( chromPosIDMap, anchorSequences, seqSB.toString(), refData);
processGVCFVariants(anchorSequences);
myLogger.info("\ncalling putHaplotypes for chrom " + refChr + " with list of size: " + anchorSequences.size());
phg.putHaplotypesData(gamete_grp_id, method_id, anchorSequences, refChr);
seqSB.setLength(0); // clear the previous sequence
}
((PHGdbAccess)phg).close();
myLogger.info("Time to load sequences into DB: " + (System.nanoTime() - time) / 1e9 + " seconds");
myLogger.info("\nTotal time for LoadHapSequencesToPHGdb: " + (System.nanoTime() - totalTime) / 1e9 + " seconds");
} catch (Exception exc) {
throw new IllegalStateException("LoadHapSequences: error procesing data: " + exc.getMessage());
}
return null;
}
// Create all the variant BLOBs for loading into the DB.
private void processGVCFVariants(Map<Integer, AnchorDataPHG> anchorSequences) {
anchorSequences.entrySet().parallelStream().forEach(entry -> {
AnchorDataPHG aData = entry.getValue();
// Encode the gvcf file into variant list bytes. The boolean fields indicate whether
// only variants should be pulled from the gvcf file and whether refRanges should be merged.
// For individual haplotypes (vs consensus) the full gvcf file is passed.
// Only store in the variant list the entries related to this genome_interval region.
String gvcf = aData.gvcf();
try (VCFFileReader vcfReader = new VCFFileReader(new File(gvcf), false)) { // close the reader when done
Range<Position> interval = aData.intervalCoordinates();
CloseableIterator<VariantContext> vc = vcfReader.query(interval.lowerEndpoint().getChromosome().getName(),
interval.lowerEndpoint().getPosition(), interval.upperEndpoint().getPosition());
List<HaplotypeNode.VariantInfo> variants = vc.stream()
.map(variantContext -> ConvertVariantContextToVariantInfo.convertContextToInfo(variantContext, line, hapNumber))
.collect(Collectors.toList());
int lastSlash = gvcf.lastIndexOf("/");
gvcf = gvcf.substring(lastSlash + 1); // no longer need the directory path, just store file name
AnchorDataPHG newAdata = new AnchorDataPHG(aData.intervalCoordinates(), aData.geneCoordinates(),
gvcf, null, variants, aData.sequence());
entry.setValue(newAdata); // setValue() does not incur a concurrent modification exception
} catch (Exception exc) {
throw new IllegalStateException("Error processing GVCFVariants." , exc);
}
});
}
private String getGVCFFileName(String[] idline) {
// gvcf file is last item on idline, first item after the space
// >refChrom:refStartPos:refEndPos gvcfFileName
String gvcfFileNameOnly;
if (idline.length < 2) {
// Old version of fasta idlines don't have the gvcf name. Infer it from
// the fasta file name
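// e.g. a fasta named "LineA_haplotypes.fa" is assumed to pair with "LineA_haplotypes.g.vcf"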
String fastaName = fasta();
int lastSlash = fasta().lastIndexOf("/");
if (lastSlash != -1) {
fastaName = fasta().substring(lastSlash+1);
}
int lastPeriod = fastaName.lastIndexOf(".");
gvcfFileNameOnly = (lastPeriod == -1 ? fastaName : fastaName.substring(0, lastPeriod)) + ".g.vcf"; // handle fasta names lacking an extension
} else {
gvcfFileNameOnly = idline[1];
}
return gvcfFileNameOnly;
}
// look for match from genome intervals table. There should always be a match
private void addSequenceToMap(RangeMap<Position, Integer> chromAnchorStartEnd,
Map<Integer, AnchorDataPHG> anchorSequences, String seqString, String refData) {
int anchorId = -1;
String[] idlineData = refData.split(" ");
String refInterval = idlineData[0];
String refChr = refInterval.split(":")[0]; // 8:09890:98900 = chr/start/end
String gvcfFileNameOnly = getGVCFFileName(idlineData);
int refStart = Integer.parseInt(refInterval.split(":")[1]);
Position startPos = Position.of(refChr, refStart);
Map.Entry<Range<Position>, Integer> mapEntry
= chromAnchorStartEnd.getEntry(startPos);
if (mapEntry != null) {
// refStart pos must not only fall within a Position range, it must equal the start position
if (mapEntry.getKey().lowerEndpoint().getPosition() == refStart) {
anchorId = mapEntry.getValue();
int refEnd = Integer.parseInt(refInterval.split(":")[2]);
Position endPos = Position.of(refChr, refEnd);
Range<Position> intervalRange = Range.closed(startPos, endPos);
Position geneStart = Position.of(refChr, 0);
Position geneEnd = Position.of(refChr, 0);
String gvcfFileWithPath = gvcf() + gvcfFileNameOnly;
Range<Position> geneRange = Range.closed(geneStart, geneEnd); // not relevant for non-ref haplotypes
AnchorDataPHG adata = new AnchorDataPHG(intervalRange, geneRange, gvcfFileWithPath, null, seqString);
anchorSequences.put(anchorId, adata);
numMapped++;
} else {
notMapped++;
myLogger.warn("Anchor not found for chr " + refChr + " with start pos " + refStart);
}
} else {
notMapped++;
myLogger.warn("Anchor not found for chr " + refChr + " with start pos " + refStart);
}
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return ("Load haplotype sequences to database");
}
@Override
public String getToolTipText() {
return ("Load haplotype sequences to database");
}
// public static void main(String[] args) {
// GeneratePluginCode.generate(LoadHapSequencesToPHGdb.class);
// }
/**
* Fasta file containing haplotype sequences
*
* @return Fasta File
*/
public String fasta() {
return fasta.value();
}
/**
* Set Fasta File. Fasta file containing haplotype sequences
*
*
* @param value Fasta File
*
* @return this plugin
*/
public LoadHapSequencesToDBPlugin fasta(String value) {
fasta = new PluginParameter<>(fasta, value);
return this;
}
/**
* Path to tab-delimited file containing genome specific
* data with header line:
* Genotype Hapnumber Dataline Ploidy Reference GenesPhased
* ChromsPhased Confidence Method MethodDetails RefVersion
*
* @return Genome Data File
*/
public String genomeData() {
return genomeData.value();
}
/**
* Set Genome Data File. Path to tab-delimited file containing
* genome specific data with header line:
* Genotype Hapnumber Dataline Ploidy Reference GenesPhased
* ChromsPhased Confidence Method MethodDetails RefVersion
*
* @param value Genome Data File
*
* @return this plugin
*/
public LoadHapSequencesToDBPlugin genomeData(String value) {
genomeData = new PluginParameter<>(genomeData, value);
return this;
}
/**
* Directory containing the GVCF file used to create the
* haplotype fasta file
*
* @return GVCF Directory
*/
public String gvcf() {
return gvcf.value();
}
/**
* Set GVCF Directory. Directory containing the GVCF file
* used to create the haplotype fasta file
*
* @param value GVCF Directory
*
* @return this plugin
*/
public LoadHapSequencesToDBPlugin gvcf(String value) {
gvcf = new PluginParameter<>(gvcf, value);
return this;
}
}