All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.hapcollapse.MergeGVCFUtils Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.hapcollapse;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import org.apache.log4j.Logger;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Created by zrm22 on 11/7/17.
 * This class borrows code from GATK.  Eventually we should just bring in the GATK jar and use the method directly.
 * Some of this will need to be cleaned up as it uses a lot of extra methods.  I just copied them till the class will compile.
 * TODO Clean up this class as it is thrown together to make it work.
 */
@Deprecated
public class MergeGVCFUtils {


    private static final Logger myLogger = Logger.getLogger(MergeGVCFUtils.class);
    private static final Allele nonRefAllele = Allele.create("", false);


    /**
     * Method to make a genotypeTable from a list of Haplotype Nodes.
     * This method will simply make the GenotypeTable encoding indels as either + or - depending on what is actually stored in the HaplotypeNode
     * In other steps, the indels will be filtered out and then will be added back in when consensus is made.
     * @param refRange
     * @param nodesWithVariantContexts
     * @param referenceSequence
     * @return
     */
    public static GenotypeTable createGenotypeTableFromHaplotypeNodes(ReferenceRange refRange, List nodesWithVariantContexts, GenomeSequence referenceSequence) {
        try {
            ArrayListMultimap taxonToVariantListMap = ArrayListMultimap.create();
            //Using a loop to fix when we have empty lists of variantContexts stored in the db
            for(HaplotypeNode currentNodeWithVariants : nodesWithVariantContexts) {
                Optional> variantContextsOptional = currentNodeWithVariants.variantContexts();
                if(variantContextsOptional.isPresent()) {
                    //Map the variantContexts to the taxon
                    taxonToVariantListMap.putAll(new Taxon.Builder(removeHaplotypeCaller(currentNodeWithVariants.taxaList().get(0).getName())).build(),variantContextsOptional.get());
                }
                else {
                    //add in an empty list, these should all be here.
                    taxonToVariantListMap.putAll(currentNodeWithVariants.taxaList().get(0), new ArrayList());
                }
            }

            return createGenotypeTableFromGVCFs(refRange, taxonToVariantListMap, referenceSequence);
        }
        catch(Exception e) {
            myLogger.error("Exception thrown in createGenotypeTableFromHaplotypes"+e);
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Simple method to remove the _Haplotype_Caller from the taxon name
     * @param taxonName
     * @return
     */
    public static String removeHaplotypeCaller(String taxonName) {
        if(taxonName.endsWith("_Haplotype_Caller")) {
            return taxonName.substring(0,taxonName.length()-17);
        }
        else {
            return taxonName;
        }
    }

    /**
     * This method takes all the VariantContexts for all of the Taxon we need in our new GenotypeTable and will create a correctly encoded GenotypeTable
     * @param refRange
     * @param taxonToVariantListMap
     * @param referenceSequence
     * @return
     */
    public static GenotypeTable createGenotypeTableFromGVCFs(ReferenceRange refRange, Multimap taxonToVariantListMap, GenomeSequence referenceSequence) {
        //Setup the TaxaList
        TaxaList taxa = new TaxaListBuilder().addAll(taxonToVariantListMap.keySet()).build();
        //Setup the positionList
        PositionList positions = buildPositionsFromRefRange(refRange);

        //Need to make a Position->KnownVariant list
        Multimap positionToKnownVariants = buildInitialPositionToKnownVariantMap(positions,referenceSequence);
        //Setup the callTable
        GenotypeCallTableBuilder genotypeCallTableBuilder = GenotypeCallTableBuilder.getInstance(taxa.size(),positions.size());

        AlleleDepthBuilder depthBuilder = AlleleDepthBuilder.getInstance(taxa.numberOfTaxa(), positions.size(), taxa);
        //Loop through each taxa
        for (Taxon taxon : taxa) {
            int taxonIndex = taxa.indexOf(taxon);
            byte[][] taxonDepths = new byte[6][positions.size()];
            //loop through each variant context
            for (VariantContext vc : taxonToVariantListMap.get(taxon)) {

                if (vc.getGenotype(0) == null) { // LCJ added for consensus problem july 17, 2018
                    throw new IllegalStateException("null genotype for taxon : " + taxon + ", vc: " + vc.toString());
                }
                //Get the start site and make sure its in the reference range:
                int startSite = (vc.getStart() < positions.get(0).getPosition() && vc.getEnd() >= positions.get(0).getPosition()) ? 0 : positions.indexOf(Position.of(refRange.chromosome(), vc.getStart()));
                if (startSite < 0) {
                    throw new IllegalStateException("ERROR VariantContext does not overlap a position in the reference range.");
                }

                //Check to see if variantContext is reference range
                if (isRefBlock(vc) || vc.getAlternateAlleles().size() == 0) {
                    //If so grab a reference allele for each position
                    for (int i = vc.getStart(); i <= vc.getEnd() && startSite < positions.size(); i++) {
                        byte refAllele = referenceSequence.genotype(refRange.chromosome(), i);
                        genotypeCallTableBuilder.setBase(taxonIndex, startSite, (byte) (refAllele << 4 | refAllele));
                        //These should be in the map already, but we need to add in just in case:
                        positionToKnownVariants.put(positions.get(startSite), NucleotideAlignmentConstants.getHaplotypeNucleotide(refAllele));
                        //Add depths DP is safe as it returns an int which is not a null.
                        try {
                            if (refAllele < 6 && refAllele >=0) {
                                taxonDepths[refAllele][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getDP());
                            }
                        
                        } catch (Exception exc) {
                            myLogger.error("Exception MergeGVCFUtils for taxon " + taxon.getName() + ", refAllele=" + refAllele + ", startSite=" + startSite);
                            myLogger.error("  refRange chrom:" + refRange.chromosome().getName() + " Coordinates: " + refRange.start() + ":" + refRange.end());
                            myLogger.error("  vcStart: " + vc.getStart() + ", vcEnd: " + vc.getEnd() + " loop index: " + i);
                            throw exc;
                        }
                        startSite++;
                    }
                } else if (isDeletion(vc)) {
                    //Deletions conserve the first few bps
                    String allele = vc.getGenotype(taxon.getName()).getGenotypeString();

                    //We do a loop here in case there are multiple alleles covered:
                    //Usually these SNPs would be split into separate records, but it is permitted
                    //Ex: Ref is AAA and ALT is TTT
                    for (int i = 0; i < allele.length() && startSite < positions.size(); i++) {
                        String diploid = allele.charAt(i) + "" + allele.charAt(i);
                        genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.getNucleotideDiploidByte(diploid));
                        positionToKnownVariants.put(positions.get(startSite), "" + allele.charAt(i));
                        if(vc.getGenotype(0).getAD() != null ) {
                            byte altAllele = NucleotideAlignmentConstants.getNucleotideAlleleByte(allele.charAt(i));
                            if (altAllele >= 0 && altAllele < 6) {
                                taxonDepths[altAllele][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
                            }
                            
                        }
                        startSite++;
                    }
                    //If we have additional positions which we do not have enough base pairs for, we insert the gap allele
                    for (int i = vc.getStart() + allele.length(); i <= vc.getEnd() && startSite < positions.size(); i++) {
                        genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE);
                        if(vc.getGenotype(0).getAD() != null ) {
                            taxonDepths[NucleotideAlignmentConstants.GAP_ALLELE][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
                        }
                        startSite++;
                    }
                } else if (isInsertion(vc)) {
                    //process insertion
                    for (int i = vc.getStart(); i <= vc.getEnd() && startSite < positions.size(); i++) {
                        genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.INSERT_DIPLOID_ALLELE);
                        if(vc.getGenotype(0).getAD() != null ) {
                            taxonDepths[NucleotideAlignmentConstants.INSERT_ALLELE][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
                        }
                        startSite++;
                    }
                } else {
                    //Process the variantContext as a SNP or non INDEL multi-basepair variant
                    //get the genotype allele string
                    String allele = vc.getGenotype(taxon.getName()).getGenotypeString();
                    //We do a loop here in case there are multiple alleles covered:
                    //Usually these SNPs would be split into separate records, but it is permitted
                    //Ex: Ref is AAA and ALT is TTT
                    for (int i = 0; i < allele.length() && startSite < positions.size(); i++) {
                        String diploid = allele.charAt(i) + "" + allele.charAt(i);
                        genotypeCallTableBuilder.setBase(taxonIndex, startSite, NucleotideAlignmentConstants.getNucleotideDiploidByte(diploid));
                        positionToKnownVariants.put(positions.get(startSite), "" + allele.charAt(i));
                        if(vc.getGenotype(0).getAD() != null ) {
                            byte altAllele = NucleotideAlignmentConstants.getNucleotideAlleleByte(allele.charAt(i));
                            if (altAllele >= 0 && altAllele < 6) {
                                taxonDepths[altAllele][startSite] = AlleleDepthUtil.depthIntToByte(vc.getGenotype(0).getAD()[1]);
                            }                              
                        }
                        startSite++;
                    }
                }
            }
            depthBuilder.addTaxon(taxonIndex, taxonDepths);
        }



        //Now rebuild the positionList with the knownVariants
        PositionListBuilder posListBuilder = new PositionListBuilder();
        for(Position position:positions) {
            List storedKnownVar = positionToKnownVariants.get(position).stream().collect(Collectors.toList());
            String[] knownVariants = new String[storedKnownVar.size()];

            for(int i = 0; i < knownVariants.length; i++) {
                knownVariants[i] = storedKnownVar.get(i);
            }

            Position newPosition = Position.builder(position.getChromosome().getName(),position.getPosition())
                    .allele(WHICH_ALLELE.Reference,referenceSequence.genotype(position.getChromosome(),position.getPosition())) //Set Reference as the actual reference passed in from the GenomeSequence
                    .knownVariants(knownVariants)
                    .build();
            posListBuilder.add(newPosition);
        }

        //Update the TaxaList to contain the _HaplotypeCaller so that the names will be consistent with the DB
        TaxaList taxaListWithUpdatedName = taxa.stream().map(taxon -> new Taxon(taxon.getName())).collect(TaxaList.collect());

        //Build the GenotypeTable and return
        return GenotypeTableBuilder.getInstance(genotypeCallTableBuilder.build(), posListBuilder.build(), taxaListWithUpdatedName, depthBuilder.build());
    }


    /**
     * Simple method containing a stream to build a PositionList for each Position in the Reference Range.  We need to do this to create the correctly sized GenotypeTable
     * @param referenceRange
     * @return
     */
    private static PositionList buildPositionsFromRefRange(ReferenceRange referenceRange) {
        return IntStream.rangeClosed(referenceRange.start(),referenceRange.end())
                .mapToObj(positionNumber -> Position.of(referenceRange.chromosome(),positionNumber))
                .collect(PositionList.collectValidateOrder());
    }

    /**
     * Simple method to associate the reference call as a known variant
     * @param positions
     * @param referenceSequence
     * @return
     */
    private static Multimap buildInitialPositionToKnownVariantMap(PositionList positions, GenomeSequence referenceSequence) {
        Multimap knownVariants = HashMultimap.create();

        for(Position position : positions) {
            knownVariants.put(position,referenceSequence.genotypeAsString(position.getChromosome(),position.getPosition()));
        }
        return knownVariants;
    }

    /**
     * Simple method to check to see if the Variant is a reference block
     * @param vc
     * @return
     */
    public static boolean isRefBlock(VariantContext vc) {
        //if only 1 allele in reference but stop position is higher than start position
        if(vc.getReference().getBaseString().length()==1 && vc.getEnd() - vc.getStart() >0) {
            return true;
        }
        else if(vc.getReference().getBaseString().length()==1 && vc.getAlternateAlleles().size()>0 && vc.getAlternateAllele(0).getBaseString().length() ==0) {
            return true;
        }
        else {
            return false;
        }
    }

    /**
     * Simple method to check to see if the variant context is a deletion
     * @param vc
     * @return
     */
    private static boolean isDeletion(VariantContext vc) {
        //if # of reference alleles is greater than 1 and number of reference alleles is less than #of alts
        if(vc.getReference().getBaseString().length()>vc.getAlternateAllele(0).getBaseString().length()) {
            return true;
        }
        else {
            return false;
        }
    }

    /**
     * Simple method to see if the variant context is an insertion.
     * @param vc
     * @return
     */
    private static boolean isInsertion(VariantContext vc) {
        //if # of reference alleles is less than number of alts
        if(vc.getReference().getBaseString().length()




© 2015 - 2024 Weber Informatics LLC | Privacy Policy