net.maizegenetics.pangenome.hapcollapse.MergeGVCFUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapcollapse;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Created by zrm22 on 11/7/17.
* This class borrows code from GATK. Eventually we should just bring in the GATK jar and use the method directly.
* Some of this will need to be cleaned up as it uses a lot of extra methods. I just copied them till the class will compile.
* TODO Clean up this class as it is thrown together to make it work.
*/
@Deprecated
public class MergeGVCFUtils {
private static final Logger myLogger = Logger.getLogger(MergeGVCFUtils.class);
private static final Allele nonRefAllele = Allele.create("", false);
/**
* Method to make a genotypeTable from a list of Haplotype Nodes.
* This method will simply make the GenotypeTable encoding indels as either + or - depending on what is actually stored in the HaplotypeNode
* In other steps, the indels will be filtered out and then will be added back in when consensus is made.
* @param refRange
* @param nodesWithVariantContexts
* @param referenceSequence
* @return
*/
public static GenotypeTable createGenotypeTableFromHaplotypeNodes(ReferenceRange refRange, List nodesWithVariantContexts, GenomeSequence referenceSequence) {
try {
ArrayListMultimap taxonToVariantListMap = ArrayListMultimap.create();
//Using a loop to fix when we have empty lists of variantContexts stored in the db
for(HaplotypeNode currentNodeWithVariants : nodesWithVariantContexts) {
Optional> variantContextsOptional = currentNodeWithVariants.variantContexts();
if(variantContextsOptional.isPresent()) {
//Map the variantContexts to the taxon
taxonToVariantListMap.putAll(new Taxon.Builder(removeHaplotypeCaller(currentNodeWithVariants.taxaList().get(0).getName())).build(),variantContextsOptional.get());
}
else {
//add in an empty list, these should all be here.
taxonToVariantListMap.putAll(currentNodeWithVariants.taxaList().get(0), new ArrayList());
}
}
return createGenotypeTableFromGVCFs(refRange, taxonToVariantListMap, referenceSequence);
}
catch(Exception e) {
myLogger.error("Exception thrown in createGenotypeTableFromHaplotypes"+e);
e.printStackTrace();
}
return null;
}
/**
* Simple method to remove the _Haplotype_Caller from the taxon name
* @param taxonName
* @return
*/
public static String removeHaplotypeCaller(String taxonName) {
if(taxonName.endsWith("_Haplotype_Caller")) {
return taxonName.substring(0,taxonName.length()-17);
}
else {
return taxonName;
}
}
/**
* This method takes all the VariantContexts for all of the Taxon we need in our new GenotypeTable and will create a correctly encoded GenotypeTable
* @param refRange
* @param taxonToVariantListMap
* @param referenceSequence
* @return
*/
public static GenotypeTable createGenotypeTableFromGVCFs(ReferenceRange refRange, Multimap taxonToVariantListMap, GenomeSequence referenceSequence) {
//Setup the TaxaList
TaxaList taxa = new TaxaListBuilder().addAll(taxonToVariantListMap.keySet()).build();
//Setup the positionList
PositionList positions = buildPositionsFromRefRange(refRange);
//Need to make a Position->KnownVariant list
Multimap positionToKnownVariants = buildInitialPositionToKnownVariantMap(positions,referenceSequence);
//Setup the callTable
GenotypeCallTableBuilder genotypeCallTableBuilder = GenotypeCallTableBuilder.getInstance(taxa.size(),positions.size());
AlleleDepthBuilder depthBuilder = AlleleDepthBuilder.getInstance(taxa.numberOfTaxa(), positions.size(), taxa);
//Loop through each taxa
for (Taxon taxon : taxa) {
int taxonIndex = taxa.indexOf(taxon);
byte[][] taxonDepths = new byte[6][positions.size()];
//loop through each variant context
for (VariantContext vc : taxonToVariantListMap.get(taxon)) {
if (vc.getGenotype(0) == null) { // LCJ added for consensus problem july 17, 2018
throw new IllegalStateException("null genotype for taxon : " + taxon + ", vc: " + vc.toString());
}
//Get the start site and make sure its in the reference range:
int startSite = (vc.getStart() < positions.get(0).getPosition() && vc.getEnd() >= positions.get(0).getPosition()) ? 0 : positions.indexOf(Position.of(refRange.chromosome(), vc.getStart()));
if (startSite < 0) {
throw new IllegalStateException("ERROR VariantContext does not overlap a position in the reference range.");
}
//Check to see if variantContext is reference range
if (isRefBlock(vc) || vc.getAlternateAlleles().size() == 0) {
//If so grab a reference allele for each position
for (int i = vc.getStart(); i <= vc.getEnd() && startSite < positions.size(); i++) {
byte refAllele = referenceSequence.genotype(refRange.chromosome(), i);
genotypeCallTableBuilder.setBase(taxonIndex, startSite, (byte) (refAllele << 4 | refAllele));
//These should be in the map already, but we need to add in just in case:
positionToKnownVariants.put(positions.get(startSite), NucleotideAlignmentConstants.getHaplotypeNucleotide(refAllele));
//Add depths DP is safe as it returns an int which is not a null.
try {
if (refAllele < 6 && refAllele >=0) {
taxonDepths[refAllele][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getDP());
}
} catch (Exception exc) {
myLogger.error("Exception MergeGVCFUtils for taxon " + taxon.getName() + ", refAllele=" + refAllele + ", startSite=" + startSite);
myLogger.error(" refRange chrom:" + refRange.chromosome().getName() + " Coordinates: " + refRange.start() + ":" + refRange.end());
myLogger.error(" vcStart: " + vc.getStart() + ", vcEnd: " + vc.getEnd() + " loop index: " + i);
throw exc;
}
startSite++;
}
} else if (isDeletion(vc)) {
//Deletions conserve the first few bps
String allele = vc.getGenotype(taxon.getName()).getGenotypeString();
//We do a loop here in case there are multiple alleles covered:
//Usually these SNPs would be split into separate records, but it is permitted
//Ex: Ref is AAA and ALT is TTT
for (int i = 0; i < allele.length() && startSite < positions.size(); i++) {
String diploid = allele.charAt(i) + "" + allele.charAt(i);
genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.getNucleotideDiploidByte(diploid));
positionToKnownVariants.put(positions.get(startSite), "" + allele.charAt(i));
if(vc.getGenotype(0).getAD() != null ) {
byte altAllele = NucleotideAlignmentConstants.getNucleotideAlleleByte(allele.charAt(i));
if (altAllele >= 0 && altAllele < 6) {
taxonDepths[altAllele][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
}
}
startSite++;
}
//If we have additional positions which we do not have enough base pairs for, we insert the gap allele
for (int i = vc.getStart() + allele.length(); i <= vc.getEnd() && startSite < positions.size(); i++) {
genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE);
if(vc.getGenotype(0).getAD() != null ) {
taxonDepths[NucleotideAlignmentConstants.GAP_ALLELE][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
}
startSite++;
}
} else if (isInsertion(vc)) {
//process insertion
for (int i = vc.getStart(); i <= vc.getEnd() && startSite < positions.size(); i++) {
genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.INSERT_DIPLOID_ALLELE);
if(vc.getGenotype(0).getAD() != null ) {
taxonDepths[NucleotideAlignmentConstants.INSERT_ALLELE][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
}
startSite++;
}
} else {
//Process the variantContext as a SNP or non INDEL multi-basepair variant
//get the genotype allele string
String allele = vc.getGenotype(taxon.getName()).getGenotypeString();
//We do a loop here in case there are multiple alleles covered:
//Usually these SNPs would be split into separate records, but it is permitted
//Ex: Ref is AAA and ALT is TTT
for (int i = 0; i < allele.length() && startSite < positions.size(); i++) {
String diploid = allele.charAt(i) + "" + allele.charAt(i);
genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.getNucleotideDiploidByte(diploid));
positionToKnownVariants.put(positions.get(startSite), "" + allele.charAt(i));
if(vc.getGenotype(0).getAD() != null ) {
byte altAllele = NucleotideAlignmentConstants.getNucleotideAlleleByte(allele.charAt(i));
if (altAllele >= 0 && altAllele < 6) {
taxonDepths[altAllele][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
}
}
startSite++;
}
}
}
depthBuilder.addTaxon(taxonIndex, taxonDepths);
}
//Now rebuild the positionList with the knownVariants
PositionListBuilder posListBuilder = new PositionListBuilder();
for(Position position:positions) {
List storedKnownVar = positionToKnownVariants.get(position).stream().collect(Collectors.toList());
String[] knownVariants = new String[storedKnownVar.size()];
for(int i = 0; i < knownVariants.length; i++) {
knownVariants[i] = storedKnownVar.get(i);
}
Position newPosition = Position.builder(position.getChromosome().getName(),position.getPosition())
.allele(WHICH_ALLELE.Reference,referenceSequence.genotype(position.getChromosome(),position.getPosition())) //Set Reference as the actual reference passed in from the GenomeSequence
.knownVariants(knownVariants)
.build();
posListBuilder.add(newPosition);
}
//Update the TaxaList to contain the _HaplotypeCaller so that the names will be consistent with the DB
TaxaList taxaListWithUpdatedName = taxa.stream().map(taxon -> new Taxon(taxon.getName())).collect(TaxaList.collect());
//Build the GenotypeTable and return
return GenotypeTableBuilder.getInstance(genotypeCallTableBuilder.build(), posListBuilder.build(), taxaListWithUpdatedName, depthBuilder.build());
}
/**
* Simple method containing a stream to build a PositionList for each Position in the Reference Range. We need to do this to create the correctly sized GenotypeTable
* @param referenceRange
* @return
*/
private static PositionList buildPositionsFromRefRange(ReferenceRange referenceRange) {
return IntStream.rangeClosed(referenceRange.start(),referenceRange.end())
.mapToObj(positionNumber -> Position.of(referenceRange.chromosome(),positionNumber))
.collect(PositionList.collectValidateOrder());
}
/**
* Simple method to associate the reference call as a known variant
* @param positions
* @param referenceSequence
* @return
*/
private static Multimap buildInitialPositionToKnownVariantMap(PositionList positions, GenomeSequence referenceSequence) {
Multimap knownVariants = HashMultimap.create();
for(Position position : positions) {
knownVariants.put(position,referenceSequence.genotypeAsString(position.getChromosome(),position.getPosition()));
}
return knownVariants;
}
/**
* Simple method to check to see if the Variant is a reference block
* @param vc
* @return
*/
public static boolean isRefBlock(VariantContext vc) {
//if only 1 allele in reference but stop position is higher than start position
if(vc.getReference().getBaseString().length()==1 && vc.getEnd() - vc.getStart() >0) {
return true;
}
else if(vc.getReference().getBaseString().length()==1 && vc.getAlternateAlleles().size()>0 && vc.getAlternateAllele(0).getBaseString().length() ==0) {
return true;
}
else {
return false;
}
}
/**
* Simple method to check to see if the variant context is a deletion
* @param vc
* @return
*/
private static boolean isDeletion(VariantContext vc) {
//if # of reference alleles is greater than 1 and number of reference alleles is less than #of alts
if(vc.getReference().getBaseString().length()>vc.getAlternateAllele(0).getBaseString().length()) {
return true;
}
else {
return false;
}
}
/**
* Simple method to see if the variant context is an insertion.
* @param vc
* @return
*/
private static boolean isInsertion(VariantContext vc) {
//if # of reference alleles is less than number of alts
if(vc.getReference().getBaseString().length()