// net.maizegenetics.pangenome.hapCalling.HapCallingUtils
// Part of PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapCalling;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.*;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.*;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.util.DirectoryCrawler;
import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
/**
* Created by zrm22 on 8/29/17.
*/
public class HapCallingUtils {
/**
 * Extracts the VCF VariantContexts from the HaplotypePath.
 * TODO remove if we get rid of HaplotypePath
 *
 * @param bestPath HaplotypePath containing one single path through the PHG.
 * @return list of VariantContext objects which represent the concatenated vcf records for each node on the path
 */
public static List<VariantContext> getVariantContextFromHaplotypePath(HaplotypePath bestPath) {
    //Convert the path to its node list and delegate to the node-list overload.
    return getVariantContextFromHaplotypeNodeList(bestPath.nodes());
}
/**
 * Extracts the VCF VariantContexts from a List of HaplotypeNodes.
 *
 * @param nodeList List of HaplotypeNodes which we want the variantContexts from
 * @return List of VariantContext objects which we can then export to a vcf, in node order
 */
public static List<VariantContext> getVariantContextFromHaplotypeNodeList(List<HaplotypeNode> nodeList) {
    //Each node exposes an Optional of its variant contexts; concatenate them in node order.
    //NOTE(review): variantContexts().get() throws if a node carries no variant contexts —
    //confirm callers only pass nodes loaded with variants.
    return nodeList.stream()
            .flatMap(haplotypeNode -> haplotypeNode.variantContexts().get().stream())
            .collect(Collectors.toList());
}
/**
 * Writes the list of VariantContexts out to a VCF file with one sample column per taxon.
 * If referenceFileName is non-null, its sequence dictionary is attached so the file can be
 * indexed on the fly; otherwise on-the-fly indexing is disabled.
 *
 * @param variantContextList list of VariantContext objects to write
 * @param exportVCFFileName  file name of the exported VCF
 * @param referenceFileName  reference fasta file (needs a .fai index); null disables indexing
 * @param taxa               sample names placed in the VCF header
 */
public static void writeVariantContextsToVCF(List<VariantContext> variantContextList, String exportVCFFileName, String referenceFileName, List<String> taxa) {
    try {
        VariantContextWriterBuilder vcfOutputWriterBuilder = new VariantContextWriterBuilder();
        if (referenceFileName == null) {
            //remove indexing on the fly as we do not have a reference sequence provided
            vcfOutputWriterBuilder = vcfOutputWriterBuilder.unsetOption(Options.INDEX_ON_THE_FLY);
        } else {
            //close the fasta reader once the dictionary is extracted (it was leaked before)
            try (IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(referenceFileName))) {
                vcfOutputWriterBuilder = vcfOutputWriterBuilder.setReferenceDictionary(fasta.getSequenceDictionary());
            }
        }
        //try-with-resources guarantees the writer is closed even if a record fails to write
        try (VariantContextWriter vcfOutputWriter = vcfOutputWriterBuilder.setOutputFile(exportVCFFileName)
                .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                .build()) {
            vcfOutputWriter.writeHeader(createGenericHeader(taxa));
            for (VariantContext currentContext : variantContextList) {
                vcfOutputWriter.add(currentContext);
            }
        }
    } catch (Exception e) {
        //best-effort behavior preserved: log the failure rather than propagate
        e.printStackTrace();
    }
}
/**
 * Method to write the list of VariantContexts out to a vcf file. If the referenceFileName is not null, it will also index the file on the fly.
 * TODO when we convert TASSEL's VCF reader to use htsjdk, rework this method into ExportUtils
 * TODO use an optional for the referenceFileName
 *
 * @param variantContextList list of VariantContext objects that need to be written to the vcf file
 * @param exportVCFFileName String file name of the exported VCF file
 * @param referenceFileName String name of the reference fasta file name. If null, will not index the vcf. Otherwise you need a fasta index as well to make it work
 * @param taxonName Taxon name used to re-label every genotype (prevents ./. calls)
 */
public static void writeVariantContextsToVCF(List<VariantContext> variantContextList, String exportVCFFileName, String referenceFileName, String taxonName) {
    try {
        VariantContextWriterBuilder vcfOutputWriterBuilder = new VariantContextWriterBuilder();
        if (referenceFileName == null) {
            //remove indexing on the fly as we do not have a reference sequence provided
            vcfOutputWriterBuilder = vcfOutputWriterBuilder.unsetOption(Options.INDEX_ON_THE_FLY);
        } else {
            //close the fasta reader once the dictionary is extracted (it was leaked before)
            try (IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(referenceFileName))) {
                vcfOutputWriterBuilder = vcfOutputWriterBuilder.setReferenceDictionary(fasta.getSequenceDictionary());
            }
        }
        //try-with-resources guarantees the writer is closed even if a record fails to write
        try (VariantContextWriter vcfOutputWriter = vcfOutputWriterBuilder.setOutputFile(exportVCFFileName)
                .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                .build()) {
            vcfOutputWriter.writeHeader(createGenericHeader(taxonName));
            for (VariantContext currentContext : variantContextList) {
                //fix up the END annotation and the genotype sample name before writing
                vcfOutputWriter.add(correctENDAndGTVariantContexts(currentContext, taxonName));
            }
        }
    } catch (Exception e) {
        //best-effort behavior preserved: log the failure rather than propagate
        e.printStackTrace();
    }
}
/**
 * Method to pass the END annotation and the GT call correctly to the output VariantContext.
 *
 * @param currentContext Current VariantContext which needs to be corrected
 * @param taxonName New name for the Taxon being called. Need to do this to prevent ./. calls
 * @return new VariantContext object with the changes made
 */
private static VariantContext correctENDAndGTVariantContexts(VariantContext currentContext, String taxonName) {
    VariantContextBuilder contextBuilder = new VariantContextBuilder(currentContext);
    //Only non-variant (reference block) records get the END annotation written out;
    //for variant sites the end position can be inferred from the reference alleles.
    boolean isReferenceBlock = !currentContext.isVariant();
    if (isReferenceBlock) {
        contextBuilder.attribute("END", currentContext.getEnd());
    }
    //Re-label the first genotype with the supplied taxon name.
    GenotypeBuilder renamedGenotype = new GenotypeBuilder(currentContext.getGenotype(0)).name(taxonName);
    contextBuilder.genotypes(renamedGenotype.make());
    return contextBuilder.make();
}
/**
 * Convenience wrapper: builds the haplotype-id-to-file mapping for the graph, then
 * calls SNPs for every node on the supplied best path.
 */
public static void callSNPsFromHaplotypePath(HaplotypeGraph graph, HaplotypePath bestPath, String vcfFileDir, String exportVCFFileName, String referenceFileName, String taxonName) {
    HashMap hapIdToFileName = createHapIdToFileNameMapping(graph, vcfFileDir);
    callSNPsFromHaplotypeNodes(bestPath.nodes(), hapIdToFileName, vcfFileDir, exportVCFFileName, referenceFileName, taxonName);
}
/**
 * Walks every *.vcf file directly under inputVCFFileDir and appends the variant records
 * from each one into a single output VCF with the given sample name.
 *
 * @param inputVCFFileDir   directory crawled for *.vcf files
 * @param exportVCFFileName file name for the combined output VCF
 * @param referenceFileName currently unused (indexing is disabled via clearOptions);
 *                          kept for signature compatibility
 * @param taxonName         sample name for the output VCF header
 */
public static void callSNPsFromSomeVCFs(String inputVCFFileDir, String exportVCFFileName, String referenceFileName, String taxonName) {
    try {
        //try-with-resources guarantees the writer is closed even if a record fails to write
        try (VariantContextWriter vcfOutputWriter = new VariantContextWriterBuilder()
                .clearOptions()
                .setOutputFile(exportVCFFileName)
                .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                .build()) {
            vcfOutputWriter.writeHeader(createGenericHeader(taxonName));
            for (Path vcfPath : DirectoryCrawler.listPaths("glob:*.vcf", Paths.get(inputVCFFileDir))) {
                //close each reader once its records are copied (readers were leaked before)
                try (VCFFileReader reader = new VCFFileReader(vcfPath.toFile(), false)) {
                    writeVCFRecordsToFile(vcfOutputWriter, reader);
                }
            }
        }
    } catch (Exception e) {
        //best-effort behavior preserved: log the failure rather than propagate
        e.printStackTrace();
    }
}
/**
 * Calls SNPs for a list of haplotype nodes by concatenating the per-node consensus VCFs.
 * Expected per-node file-name pattern (looked up via hapIdToFileNameMapping):
 *   consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz
 * TODO final solution is to query the db using the hapid to get the file path.
 *
 * @param listOfNodes            nodes on the chosen path, in order
 * @param hapIdToFileNameMapping node id -> consensus vcf file name
 * @param vcfFileDir             directory containing the consensus vcf files
 * @param exportVCFFileName      file name of the exported VCF
 * @param referenceFileName      reference fasta (needs a .fai index) used for on-the-fly indexing
 * @param taxonName              sample name for the output VCF header
 */
public static void callSNPsFromHaplotypeNodes(List<HaplotypeNode> listOfNodes, HashMap<Integer, String> hapIdToFileNameMapping, String vcfFileDir, String exportVCFFileName, String referenceFileName, String taxonName) {
    try {
        VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
        //close the fasta reader once the dictionary is extracted (it was leaked before)
        try (IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(referenceFileName))) {
            builder = builder.setReferenceDictionary(fasta.getSequenceDictionary());
        }
        //try-with-resources guarantees the writer is closed even if a record fails to write
        try (VariantContextWriter vcfOutputWriter = builder
                .setOutputFile(exportVCFFileName)
                .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                .build()) {
            vcfOutputWriter.writeHeader(createGenericHeader(taxonName));
            for (HaplotypeNode currentNode : listOfNodes) {
                String fileName = hapIdToFileNameMapping.get(currentNode.id());
                //close each reader once its records are copied (readers were leaked before)
                try (VCFFileReader vcfReader = new VCFFileReader(new File(vcfFileDir, fileName))) {
                    writeVCFRecordsToFile(vcfOutputWriter, vcfReader);
                }
            }
        }
    } catch (Exception e) {
        //best-effort behavior preserved: log the failure rather than propagate
        e.printStackTrace();
    }
}
/**
 * Copies every variant record from the reader into the writer.
 * Records where isVariant() is false (e.g. GVCF reference blocks) are skipped.
 *
 * @param writer    open destination writer (not closed here — caller owns it)
 * @param vcfReader open source reader (its iterator is closed here; caller owns the reader)
 */
private static void writeVCFRecordsToFile(VariantContextWriter writer, VCFFileReader vcfReader) {
    //try-with-resources closes the iterator even if writing fails
    try (CloseableIterator<VariantContext> vcfIterator = vcfReader.iterator()) {
        while (vcfIterator.hasNext()) {
            VariantContext vcfRecord = vcfIterator.next();
            if (vcfRecord.isVariant()) {
                writer.add(vcfRecord);
            }
        }
    } catch (Exception e) {
        //best-effort behavior preserved: log the failure rather than propagate
        e.printStackTrace();
    }
}
/**
 * Builds a mapping of haplotype node id -> consensus vcf file name for every node in the graph.
 * Expected file-name pattern:
 *   consensus_chr{chr}_stPos{start}_endPos{end}_{taxon}.vcf(.gz)
 * A Multimap caches every candidate file seen for a reference range so the directory is only
 * crawled once per range; subsequent nodes in the same range match against the cache.
 *
 * @param graph        the HaplotypeGraph whose nodes need file assignments
 * @param pangenomeDir directory containing the per-range consensus vcf files
 * @return map of node id to matching file name (nodes with no matching taxon file are absent)
 */
public static HashMap<Integer, String> createHapIdToFileNameMapping(HaplotypeGraph graph, String pangenomeDir) {
    HashMap<Integer, String> hapIdToFileNameMap = new HashMap<>();
    Multimap<ReferenceRange, String> referenceRangeToFileNameMap = HashMultimap.create();
    Set<ReferenceRange> referenceRanges = graph.referenceRanges();
    for (ReferenceRange range : referenceRanges) {
        List<HaplotypeNode> nodes = graph.nodes(range);
        for (HaplotypeNode currentNode : nodes) {
            //glob matching every taxon's consensus file for this node's reference range
            StringBuilder globStringBuilder = new StringBuilder();
            globStringBuilder.append("glob:consensus_chr")
                    .append(currentNode.referenceRange().chromosome().getName())
                    .append("_stPos")
                    .append(currentNode.referenceRange().start())
                    .append("_endPos")
                    .append(currentNode.referenceRange().end())
                    .append("_*");
            TaxaList taxaList = currentNode.taxaList();
            if (referenceRangeToFileNameMap.containsKey(currentNode.referenceRange())) {
                //cache hit: scan the cached candidates for a file whose taxon is in this node's taxa list
                for (String currentFile : referenceRangeToFileNameMap.get(currentNode.referenceRange())) {
                    String taxaName = getTaxaFromFileName(currentFile);
                    if (taxaList.indexOf(taxaName) != -1) {
                        hapIdToFileNameMap.put(currentNode.id(), currentFile);
                        break;
                    }
                }
            } else {
                //cache miss: crawl the directory once, recording every candidate for later lookups
                for (Path fastaPath : DirectoryCrawler.listPaths(globStringBuilder.toString(), Paths.get(pangenomeDir))) {
                    String taxaName = getTaxaFromFileName(fastaPath.toString());
                    if (taxaList.indexOf(taxaName) != -1) {
                        hapIdToFileNameMap.put(currentNode.id(), fastaPath.toString());
                    }
                    //Add it to the cache for easy lookup later
                    referenceRangeToFileNameMap.put(currentNode.referenceRange(), fastaPath.toString());
                }
            }
        }
    }
    return hapIdToFileNameMap;
}
// public static HashMap createHapIdToFileNameMappingV2(HaplotypeGraph graph, String pangenomeDir) {
// HashMap hapIdToFileNameMap = new HashMap<>();
//
// //preload the referenceRangeToFileMap so it is easier to compute when we compare the nodes
// Multimap referenceRangeToFileNameMap = createRefRangeToFileMap(pangenomeDir,);
//
// Set referenceRanges = graph.referenceRanges();
// for(ReferenceRange range: referenceRanges) {
// List nodes = graph.nodes(range);
//
// for(HaplotypeNode currentNode : nodes) {
// StringBuilder globStringBuilder = new StringBuilder();
// globStringBuilder.append("glob:consensus_chr")
// .append(currentNode.referenceRange().chromosome().getName())
// .append("_stPos")
// .append(currentNode.referenceRange().start())
// .append("_endPos")
// .append(currentNode.referenceRange().end())
// .append("_*");
// TaxaList taxaList = currentNode.taxaList();
//
//
// if(referenceRangeToFileNameMap.containsKey(currentNode.referenceRange())) {
// //loop through the entry and find the name that is correct
// for(String currentFile : referenceRangeToFileNameMap.get(currentNode.referenceRange())) {
// String taxaName = getTaxaFromFileName(currentFile);
//
// if(taxaList.indexOf(taxaName) != -1) {
// hapIdToFileNameMap.put(currentNode.id(),currentFile);
// break;
// }
// }
// }
// else {
// //loop through each file
// for (Path fastaPath : DirectoryCrawler.listPaths(globStringBuilder.toString(), Paths.get(pangenomeDir))) {
// String taxaName = getTaxaFromFileName(fastaPath.toString());
//
// if(taxaList.indexOf(taxaName)!=-1) {
// hapIdToFileNameMap.put(currentNode.id(),fastaPath.toString());
// }
// //Add it to the cache for easy lookup later
// referenceRangeToFileNameMap.put(currentNode.referenceRange(),fastaPath.toString());
// }
// }
// }
// }
//
// return hapIdToFileNameMap;
// }
/**
 * Incomplete helper: intended to pre-build a reference-range -> vcf-file multimap by parsing
 * the chromosome and start coordinate out of each file name in inputVCFDirectory.
 * NOTE(review): the body parses chr/stPos but never builds a Position nor adds anything to the
 * multimap, so this currently always returns an empty map — finish or remove before use.
 * NOTE(review): positionToRefRangeMap is never read; presumably it was meant to translate the
 * parsed Position into a ReferenceRange key — confirm intent.
 */
private static Multimap createRefRangeToFileMap(String inputVCFDirectory, HashMap positionToRefRangeMap) {
    Multimap referenceRangeToFileNameMap = HashMultimap.create();
    for (Path fastaPath : DirectoryCrawler.listPaths("glob:*.vcf", Paths.get(inputVCFDirectory))) {
        //Extract the chr and position from the file name
        //consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz
        //NOTE(review): splitting the FULL path on "_" means these indices break if the
        //directory name contains underscores — consider splitting only the file name.
        String[] fileNameSplit = fastaPath.toString().split("_");
        //substring(3) drops the "chr" prefix (3 chars) — e.g. "chr10" -> "10"
        String chr = fileNameSplit[1].substring(3);
        //NOTE(review): "stPos" is only 5 characters, so substring(7) drops the first two
        //digits of the position (e.g. "stPos100055301" -> "0055301") — looks like an
        //off-by-two; substring(5) was probably intended. Confirm before finishing this method.
        String stPos = fileNameSplit[2].substring(7);
        //Create a Position object
        //add this Entry to the multimap
    }
    return referenceRangeToFileNameMap;
}
/**
 * Pulls the taxon name out of a consensus vcf file name or path, e.g.
 * consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz -> ZEAxppRCQDIAAPEI-11
 * (the last underscore-delimited token, truncated at its first '.').
 */
private static String getTaxaFromFileName(String fileName) {
    //consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz
    String[] pieces = fileName.split("_");
    String lastPiece = pieces[pieces.length - 1];
    int firstDot = lastPiece.indexOf('.');
    //no dot -> whole token, matching the original split("\\.")[0] behavior
    return firstDot < 0 ? lastPiece : lastPiece.substring(0, firstDot);
}
/**
 * Creates a generic VCF header for a single taxon by delegating to the list overload.
 *
 * @param taxon the single sample name to place in the header
 * @return VCFHeader with the standard PHG header lines and one sample column
 */
private static VCFHeader createGenericHeader(String taxon) {
    List<String> taxaNames = new ArrayList<>();
    taxaNames.add(taxon);
    return createGenericHeader(taxaNames);
}
/**
 * Creates a generic VCF header holding the standard PHG INFO/FORMAT lines plus the given
 * sample names.
 *
 * @param taxaNames sample names to place in the header
 * @return VCFHeader ready to be written by a VariantContextWriter
 */
public static VCFHeader createGenericHeader(List<String> taxaNames) {
    Set<VCFHeaderLine> headerLines = createGenericHeaderLineSet();
    return new VCFHeader(headerLines, taxaNames);
}
/**
 * Builds the base set of VCF header lines (FORMAT + INFO) shared by all PHG-exported VCFs.
 * NOTE(review): per the VCF 4.2 spec AF is normally Number=A,Type=Float and AD Number=R;
 * the values here (3/Integer) are kept as-is for compatibility with existing PHG files —
 * confirm before changing.
 *
 * @return mutable set of the standard header lines
 */
public static Set<VCFHeaderLine> createGenericHeaderLineSet() {
    Set<VCFHeaderLine> headerLines = new HashSet<>();
    //FORMAT (per-genotype) fields
    headerLines.add(new VCFFormatHeaderLine("AD", 3, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed"));
    headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
    headerLines.add(new VCFFormatHeaderLine("GQ", 1, VCFHeaderLineType.Integer, "Genotype Quality"));
    headerLines.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
    headerLines.add(new VCFFormatHeaderLine("PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
    //INFO (per-site) fields, including the PHG assembly-coordinate annotations
    headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total Depth"));
    headerLines.add(new VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Number of Samples With Data"));
    headerLines.add(new VCFInfoHeaderLine("AF", 3, VCFHeaderLineType.Integer, "Allele Frequency"));
    headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
    headerLines.add(new VCFInfoHeaderLine("ASM_Chr", 1, VCFHeaderLineType.String, "Assembly chromosome"));
    headerLines.add(new VCFInfoHeaderLine("ASM_Start", 1, VCFHeaderLineType.Integer, "Assembly start position"));
    headerLines.add(new VCFInfoHeaderLine("ASM_End", 1, VCFHeaderLineType.Integer, "Assembly end position"));
    headerLines.add(new VCFInfoHeaderLine("ASM_Strand", 1, VCFHeaderLineType.String, "Assembly strand"));
    return headerLines;
}
/**
 * Creates a GATK-GVCF-compatible header: the generic PHG header lines plus the
 * ALT/FILTER/FORMAT/INFO lines and GVCFBlock bands that GATK writes into its gvcf output.
 *
 * @param taxaNames sample names to place in the header
 * @return VCFHeader with GATK GVCF specific lines included
 */
public static VCFHeader createGATKGVCFHeader(List<String> taxaNames) {
    //Get the initial batch of header lines
    Set<VCFHeaderLine> headerLines = createGenericHeaderLineSet();
    //Add in the GATK GVCF specific headers, i.e.
    //##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
    headerLines.add(new VCFSimpleHeaderLine("ALT", "NON_REF", "Represents any possible alternative allele not already represented at this location by REF and ALT"));
    headerLines.add(new VCFFilterHeaderLine("LowQual", "Low quality"));
    headerLines.add(new VCFFormatHeaderLine("MIN_DP", 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block"));
    headerLines.add(new VCFFormatHeaderLine("PGT", 1, VCFHeaderLineType.String, "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles"));
    headerLines.add(new VCFFormatHeaderLine("PID", 1, VCFHeaderLineType.String, "Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group"));
    headerLines.add(new VCFFormatHeaderLine("PS", 1, VCFHeaderLineType.Integer, "Phasing set (typically the position of the first variant in the set)"));
    headerLines.add(new VCFFormatHeaderLine("SB", 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."));
    //Adding in the GVCF Block bands
    addInGQBlockHeader(headerLines);
    headerLines.add(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"));
    headerLines.add(new VCFInfoHeaderLine("ExcessHet", 1, VCFHeaderLineType.Float, "Phred-scaled p-value for exact test of excess heterozygosity"));
    headerLines.add(new VCFInfoHeaderLine("InbreedingCoeff", 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
    headerLines.add(new VCFInfoHeaderLine("MLEAC", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
    headerLines.add(new VCFInfoHeaderLine("MLEAF", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));
    headerLines.add(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"));
    headerLines.add(new VCFInfoHeaderLine("RAW_MQandDP", 2, VCFHeaderLineType.Float, "Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation."));
    headerLines.add(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"));
    return new VCFHeader(headerLines, taxaNames);
}
/**
 * Adds the standard GATK GVCFBlock band header lines to the given set:
 * single-unit bands from 1 up to 60, then 10-unit bands up to 100.
 * NOTE(review): GATK itself also emits a GVCFBlock0-1 band; this loop starts at 1 —
 * confirm the missing 0-1 band is intentional.
 *
 * @param headerLines mutable set the band lines are added to
 */
public static void addInGQBlockHeader(Set<VCFHeaderLine> headerLines) {
    //single-GQ bands: 1-2, 2-3, ..., 59-60
    for (int i = 1; i < 60; i++) {
        headerLines.add(createGVCFBlockHeaderLine(i, i + 1));
    }
    //coarser 10-GQ bands above 60
    headerLines.add(createGVCFBlockHeaderLine(60, 70));
    headerLines.add(createGVCFBlockHeaderLine(70, 80));
    headerLines.add(createGVCFBlockHeaderLine(80, 90));
    headerLines.add(createGVCFBlockHeaderLine(90, 100));
}
/**
 * Builds a single GVCFBlock band header line, e.g. key "GVCFBlock55-56" with value
 * "minGQ=55(inclusive),maxGQ=56(exclusive)".
 *
 * @param start        inclusive minimum GQ of the band
 * @param exclusiveEnd exclusive maximum GQ of the band
 * @return the assembled header line
 */
public static VCFHeaderLine createGVCFBlockHeaderLine(int start, int exclusiveEnd) {
    String key = "GVCFBlock" + start + "-" + exclusiveEnd;
    String description = "minGQ=" + start + "(inclusive),maxGQ=" + exclusiveEnd + "(exclusive)";
    return new VCFHeaderLine(key, description);
}
}