package net.maizegenetics.pangenome.hapCalling;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.*;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.*;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.util.DirectoryCrawler;

import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Created by zrm22 on 8/29/17.
 */
public class HapCallingUtils {

    /**
     * Method to extract the VCF VariantContexts from the HaplotypePath.
     * TODO remove if we get rid of HaplotypePath
     * @param bestPath HaplotypePath containing one single path through the PHG.
     * @return list of VariantContext objects which represent the concatenated vcf records for each node on the path
     */
    public static List<VariantContext> getVariantContextFromHaplotypePath(HaplotypePath bestPath) {
        //convert the path to a list of nodes and get the List of VariantContexts
        return getVariantContextFromHaplotypeNodeList(bestPath.nodes());
    }
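
    //Usage sketch (hypothetical variable and file names): extract the variants along a best path
    //and export them to a single-taxon vcf without on-the-fly indexing:
    //
    //  List<VariantContext> pathVariants = HapCallingUtils.getVariantContextFromHaplotypePath(bestPath);
    //  HapCallingUtils.writeVariantContextsToVCF(pathVariants, "/path/to/output.vcf", null, "LineA");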

    /**
     * Method to extract the VCF VariantContexts from a List of HaplotypeNodes
     * @param nodeList List of HaplotypeNodes from which to extract the VariantContexts
     * @return List of VariantContext objects which we can then export to a vcf
     */
    public static List<VariantContext> getVariantContextFromHaplotypeNodeList(List<HaplotypeNode> nodeList) {
        //Note: variantContexts() returns an Optional; calling get() assumes the graph was built with variant contexts loaded
        return nodeList.stream()
                .flatMap(haplotypeNode -> haplotypeNode.variantContexts().get().stream())
                .collect(Collectors.toList());
    }

    /**
     * Method to write the list of VariantContexts out to a vcf file with one genotype column per taxon.
     *
     * @param variantContextList list of VariantContext objects that need to be written to the vcf file
     * @param exportVCFFileName String file name of the exported VCF file
     * @param referenceFileName String name of the reference fasta file.  If null, the vcf will not be indexed
     * @param taxa list of taxa names to include in the header
     */
    public static void writeVariantContextsToVCF(List<VariantContext> variantContextList, String exportVCFFileName, String referenceFileName, List<String> taxa) {

        try {
            VariantContextWriterBuilder vcfOutputWriterBuilder = new VariantContextWriterBuilder();

            if (referenceFileName == null) {
                //remove indexing on the fly as we do not have a reference sequence provided
                vcfOutputWriterBuilder = vcfOutputWriterBuilder.unsetOption(Options.INDEX_ON_THE_FLY);
            } else {
                vcfOutputWriterBuilder = vcfOutputWriterBuilder.setReferenceDictionary(new IndexedFastaSequenceFile(new File(referenceFileName)).getSequenceDictionary());
            }

            VariantContextWriter vcfOutputWriter = vcfOutputWriterBuilder.setOutputFile(exportVCFFileName)
                    .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                    .build();

            vcfOutputWriter.writeHeader(createGenericHeader(taxa));

            for (VariantContext currentContext : variantContextList) {
                vcfOutputWriter.add(currentContext);
            }

            vcfOutputWriter.close();

        } catch (Exception e) {
            e.printStackTrace();
        }

    }
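    //Usage sketch (hypothetical names): the overload above writes multi-taxon records, indexing
    //on the fly when a reference fasta (with its accompanying index files) is supplied:
    //
    //  HapCallingUtils.writeVariantContextsToVCF(mergedContexts, "/path/to/merged.vcf.gz", "/path/to/ref.fa", Arrays.asList("LineA", "LineB"));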

    /**
     * Method to write the list of VariantContexts out to a vcf file.  If the referenceFileName is not null, it will also index the file on the fly.
     * TODO when we convert TASSEL's VCF reader to use htsjdk, rework this method into ExportUtils
     * TODO use an optional for the referenceFileName
     *
     * @param variantContextList list of VariantContext objects that need to be written to the vcf file
     * @param exportVCFFileName String file name of the exported VCF file
     * @param referenceFileName String name of the reference fasta file.  If null, the vcf will not be indexed; otherwise a fasta index must also exist for indexing to work
     * @param taxonName Taxon name
     */
    public static void writeVariantContextsToVCF(List<VariantContext> variantContextList, String exportVCFFileName, String referenceFileName, String taxonName) {
        try {
            VariantContextWriterBuilder vcfOutputWriterBuilder = new VariantContextWriterBuilder();

            if(referenceFileName == null) {
                //remove indexing on the fly as we do not have a reference sequence provided
                vcfOutputWriterBuilder = vcfOutputWriterBuilder.unsetOption(Options.INDEX_ON_THE_FLY);
            }
            else {
                vcfOutputWriterBuilder = vcfOutputWriterBuilder.setReferenceDictionary(new IndexedFastaSequenceFile(new File(referenceFileName)).getSequenceDictionary());
            }

            VariantContextWriter vcfOutputWriter = vcfOutputWriterBuilder.setOutputFile(exportVCFFileName)
                                                                        .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                                                                        .build();

            vcfOutputWriter.writeHeader(createGenericHeader(taxonName));

            for(VariantContext currentContext : variantContextList) {
                vcfOutputWriter.add(correctENDAndGTVariantContexts(currentContext, taxonName));
            }

            vcfOutputWriter.close();

        }
        catch(Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Method to pass the END annotation and the GT call correctly to the output VariantContext
     * @param currentContext Current VariantContext which needs to be corrected
     * @param taxonName New name for the Taxon being called.  Need to do this to prevent ./. calls
     * @return new VariantContext Object with the changes made.
     */
    private static VariantContext correctENDAndGTVariantContexts(VariantContext currentContext, String taxonName) {
        VariantContextBuilder vcb = new VariantContextBuilder(currentContext);

        //We only want to export the END annotation if it is not a variant.
        //Otherwise it can be inferred from the alleles in the reference
        if(!currentContext.isVariant()) {
            vcb.attribute("END", currentContext.getEnd());
        }

        //rename the Genotype to the taxonName
        GenotypeBuilder gtBuilder = new GenotypeBuilder(currentContext.getGenotype(0)).name(taxonName);
        vcb.genotypes(gtBuilder.make());

        return vcb.make();
    }
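
    //Worked example of the correction above (illustrative record): a non-variant reference block
    //such as
    //  chr1  101  .  A  .  .  .  .  GT  0/0
    //gains an explicit END annotation covering the block,
    //  chr1  101  .  A  .  .  .  END=250  GT  0/0
    //and the single Genotype is renamed to taxonName so the call is attributed to the taxon
    //being processed instead of appearing as missing (./.).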



    /**
     * Method to call SNPs from a single HaplotypePath by exporting the vcf records for each node on the path.
     */
    public static void callSNPsFromHaplotypePath(HaplotypeGraph graph, HaplotypePath bestPath, String vcfFileDir, String exportVCFFileName, String referenceFileName, String taxonName) {
        callSNPsFromHaplotypeNodes(bestPath.nodes(), createHapIdToFileNameMapping(graph, vcfFileDir), vcfFileDir, exportVCFFileName, referenceFileName, taxonName);
    }

    /**
     * Method to export the variant records from every vcf file in a directory to a single vcf file.
     * Only records that contain an actual variant call are written.
     */
    public static void callSNPsFromSomeVCFs(String inputVCFFileDir, String exportVCFFileName, String referenceFileName, String taxonName) {
        try {
            VariantContextWriter vcfOutputWriter = new VariantContextWriterBuilder()
//                    .setReferenceDictionary(new IndexedFastaSequenceFile(new File(referenceFileName)).getSequenceDictionary())
                    .clearOptions()
                    .setOutputFile(exportVCFFileName)
                    .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                    .build();

            vcfOutputWriter.writeHeader(createGenericHeader(taxonName));

            for (Path vcfPath : DirectoryCrawler.listPaths("glob:*.vcf", Paths.get(inputVCFFileDir))) {
                VCFFileReader reader = new VCFFileReader(vcfPath.toFile(), false);
                writeVCFRecordsToFile(vcfOutputWriter, reader);
                reader.close();
            }
            vcfOutputWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void callSNPsFromHaplotypeNodes(List<HaplotypeNode> listOfNodes, HashMap<Integer, String> hapIdToFileNameMapping, String vcfFileDir, String exportVCFFileName, String referenceFileName, String taxonName) {
        //load up the vcfs for each node.  Final solution is to query the db using the hapid and get the file path.
        //For now though we can hack it together
        //consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz
        try {
            VariantContextWriter vcfOutputWriter = new VariantContextWriterBuilder()
                    .setReferenceDictionary(new IndexedFastaSequenceFile(new File(referenceFileName)).getSequenceDictionary())
                    .setOutputFile(exportVCFFileName)
                    .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                    .build();

            vcfOutputWriter.writeHeader(createGenericHeader(taxonName));

            for (HaplotypeNode currentNode : listOfNodes) {
                String fileName = hapIdToFileNameMapping.get(currentNode.id());
                VCFFileReader vcfReader = new VCFFileReader(new File(vcfFileDir + "/" + fileName));

                writeVCFRecordsToFile(vcfOutputWriter, vcfReader);
                vcfReader.close();
            }

            vcfOutputWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static void writeVCFRecordsToFile(VariantContextWriter writer, VCFFileReader vcfReader) {
        try (CloseableIterator<VariantContext> vcfIterator = vcfReader.iterator()) {
            while (vcfIterator.hasNext()) {
                VariantContext vcfRecord = vcfIterator.next();
                //only export records that actually contain a variant call
                if (vcfRecord.isVariant()) {
                    writer.add(vcfRecord);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static HashMap<Integer, String> createHapIdToFileNameMapping(HaplotypeGraph graph, String pangenomeDir) {
        HashMap<Integer, String> hapIdToFileNameMap = new HashMap<>();

        Multimap<ReferenceRange, String> referenceRangeToFileNameMap = HashMultimap.create();

        Set<ReferenceRange> referenceRanges = graph.referenceRanges();
        for (ReferenceRange range : referenceRanges) {
            List<HaplotypeNode> nodes = graph.nodes(range);

            for(HaplotypeNode currentNode : nodes) {
                StringBuilder globStringBuilder = new StringBuilder();
                globStringBuilder.append("glob:consensus_chr")
                        .append(currentNode.referenceRange().chromosome().getName())
                        .append("_stPos")
                        .append(currentNode.referenceRange().start())
                        .append("_endPos")
                        .append(currentNode.referenceRange().end())
                        .append("_*");
                TaxaList taxaList = currentNode.taxaList();


                if(referenceRangeToFileNameMap.containsKey(currentNode.referenceRange())) {
                    //loop through the entry and find the name that is correct
                    for(String currentFile : referenceRangeToFileNameMap.get(currentNode.referenceRange())) {
                        String taxaName = getTaxaFromFileName(currentFile);

                        if(taxaList.indexOf(taxaName) != -1) {
                            hapIdToFileNameMap.put(currentNode.id(),currentFile);
                            break;
                        }
                    }
                }
                else {
                    //loop through each file
                    for (Path fastaPath : DirectoryCrawler.listPaths(globStringBuilder.toString(), Paths.get(pangenomeDir))) {
                        String taxaName = getTaxaFromFileName(fastaPath.toString());

                        if(taxaList.indexOf(taxaName)!=-1) {
                            hapIdToFileNameMap.put(currentNode.id(),fastaPath.toString());
                        }
                        //Add it to the cache for easy lookup later
                        referenceRangeToFileNameMap.put(currentNode.referenceRange(),fastaPath.toString());
                    }
                }
            }
        }

        return hapIdToFileNameMap;
    }
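
    //Usage sketch (hypothetical directory and file names): resolve each node's hapid to its
    //consensus vcf, then export the variant calls along a path:
    //
    //  HashMap<Integer, String> hapIdToFile = HapCallingUtils.createHapIdToFileNameMapping(graph, "/path/to/pangenomeDir");
    //  HapCallingUtils.callSNPsFromHaplotypeNodes(bestPath.nodes(), hapIdToFile, "/path/to/pangenomeDir", "out.vcf", "ref.fa", "LineA");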


    /**
     * Method to preload a ReferenceRange to file name multimap by parsing the consensus vcf file names in a directory.
     * Note: the body below is a hedged completion of the original stub; it assumes Position equality
     * is based on (chromosome, position) so that coordinates parsed from file names match the map keys.
     */
    private static Multimap<ReferenceRange, String> createRefRangeToFileMap(String inputVCFDirectory, HashMap<Position, ReferenceRange> positionToRefRangeMap) {
        Multimap<ReferenceRange, String> referenceRangeToFileNameMap = HashMultimap.create();

        for (Path vcfPath : DirectoryCrawler.listPaths("glob:*.vcf", Paths.get(inputVCFDirectory))) {
            //Extract the chr and position from the file name
            //consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz
            String[] fileNameSplit = vcfPath.getFileName().toString().split("_");
            String chr = fileNameSplit[1].substring(3);     //strip the "chr" prefix
            String stPos = fileNameSplit[2].substring(5);   //strip the "stPos" prefix

            //Create a Position object for the start coordinate parsed from the file name
            Position startPosition = new GeneralPosition.Builder(new Chromosome(chr), Integer.parseInt(stPos)).build();

            //add this Entry to the multimap when the start position maps to a known ReferenceRange
            ReferenceRange range = positionToRefRangeMap.get(startPosition);
            if (range != null) {
                referenceRangeToFileNameMap.put(range, vcfPath.toString());
            }
        }
        return referenceRangeToFileNameMap;
    }

    private static String getTaxaFromFileName(String fileName) {
        //consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz
        String[] fileNameSplit = fileName.split("_");
        String justTaxaName = fileNameSplit[fileNameSplit.length-1].split("\\.")[0];

        return justTaxaName;
    }
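
    //Worked example: "consensus_chr10_stPos100055301_endPos100060684_ZEAxppRCQDIAAPEI-11.vcf.gz"
    //splits on "_" and the last element, taken up to the first ".", yields "ZEAxppRCQDIAAPEI-11".
    //Note this assumes taxa names contain no underscores or dots; a name like "W22_Ref" would be
    //truncated to "Ref" by the split.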

    private static VCFHeader createGenericHeader(String taxon) {
        List<String> taxaNames = new ArrayList<>();
        taxaNames.add(taxon);
        return createGenericHeader(taxaNames);
    }

    public static VCFHeader createGenericHeader(List<String> taxaNames) {
        Set<VCFHeaderLine> headerLines = createGenericHeaderLineSet();
        return new VCFHeader(headerLines, taxaNames);
    }


    public static Set<VCFHeaderLine> createGenericHeaderLineSet() {
        Set<VCFHeaderLine> headerLines = new HashSet<>();

        headerLines.add(new VCFFormatHeaderLine("AD",3,VCFHeaderLineType.Integer,"Allelic depths for the ref and alt alleles in the order listed"));
        headerLines.add(new VCFFormatHeaderLine("DP",1,VCFHeaderLineType.Integer,"Read Depth (only filtered reads used for calling)"));
        headerLines.add(new VCFFormatHeaderLine("GQ",1,VCFHeaderLineType.Integer,"Genotype Quality"));
        headerLines.add(new VCFFormatHeaderLine("GT",1,VCFHeaderLineType.String,"Genotype"));
        headerLines.add(new VCFFormatHeaderLine("PL",VCFHeaderLineCount.G,VCFHeaderLineType.Integer,"Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));

        headerLines.add(new VCFInfoHeaderLine("DP",1,VCFHeaderLineType.Integer,"Total Depth"));
        headerLines.add(new VCFInfoHeaderLine("NS",1,VCFHeaderLineType.Integer,"Number of Samples With Data"));
        headerLines.add(new VCFInfoHeaderLine("AF",3,VCFHeaderLineType.Integer,"Allele Frequency"));
        headerLines.add(new VCFInfoHeaderLine("END",1,VCFHeaderLineType.Integer,"Stop position of the interval"));
        headerLines.add(new VCFInfoHeaderLine("ASM_Chr",1,VCFHeaderLineType.String,"Assembly chromosome"));
        headerLines.add(new VCFInfoHeaderLine("ASM_Start",1,VCFHeaderLineType.Integer,"Assembly start position"));
        headerLines.add(new VCFInfoHeaderLine("ASM_End",1,VCFHeaderLineType.Integer,"Assembly end position"));
        headerLines.add(new VCFInfoHeaderLine("ASM_Strand",1,VCFHeaderLineType.String,"Assembly strand"));

        return headerLines;
    }
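
    //For reference, the header lines above render in the output vcf as, e.g.:
    //  ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
    //  ##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">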
    public static VCFHeader createGATKGVCFHeader(List<String> taxaNames) {
        //Get the initial batch of header lines
        Set<VCFHeaderLine> headerLines = createGenericHeaderLineSet();


        //Add in the GATK GVCF specific headers
        /*
        ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
         */
//        headerLines.add(new VCFAltHeaderLine("ID=NON_REF,Description=\"Represents any possible alternative allele not already represented at this location by REF and ALT\"",VCFHeaderVersion.VCF4_2));
        headerLines.add(new VCFSimpleHeaderLine("ALT","NON_REF","Represents any possible alternative allele not already represented at this location by REF and ALT"));
        headerLines.add(new VCFFilterHeaderLine("LowQual","Low quality"));
        headerLines.add(new VCFFormatHeaderLine("MIN_DP",1,VCFHeaderLineType.Integer,"Minimum DP observed within the GVCF block"));
        headerLines.add(new VCFFormatHeaderLine("PGT",1,VCFHeaderLineType.String,"Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles"));
        headerLines.add(new VCFFormatHeaderLine("PID",1,VCFHeaderLineType.String,"Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group"));
        headerLines.add(new VCFFormatHeaderLine("PS",1,VCFHeaderLineType.Integer,"Phasing set (typically the position of the first variant in the set)"));
        headerLines.add(new VCFFormatHeaderLine("SB",4,VCFHeaderLineType.Integer,"Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."));

        //Adding in the GVCF Block bands
        addInGQBlockHeader(headerLines);

        headerLines.add(new VCFInfoHeaderLine("BaseQRankSum",1,VCFHeaderLineType.Float,"Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"));
        headerLines.add(new VCFInfoHeaderLine("ExcessHet",1,VCFHeaderLineType.Float,"Phred-scaled p-value for exact test of excess heterozygosity"));
        headerLines.add(new VCFInfoHeaderLine("InbreedingCoeff",1,VCFHeaderLineType.Float,"Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
        headerLines.add(new VCFInfoHeaderLine("MLEAC",VCFHeaderLineCount.A,VCFHeaderLineType.Integer,"Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
        headerLines.add(new VCFInfoHeaderLine("MLEAF",VCFHeaderLineCount.A,VCFHeaderLineType.Float,"Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));
        headerLines.add(new VCFInfoHeaderLine("MQRankSum",1,VCFHeaderLineType.Float,"Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"));
        headerLines.add(new VCFInfoHeaderLine("RAW_MQandDP",2,VCFHeaderLineType.Float,"Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation."));
        headerLines.add(new VCFInfoHeaderLine("ReadPosRankSum",1,VCFHeaderLineType.Float,"Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"));


        return new VCFHeader(headerLines,taxaNames);
    }

    public static void addInGQBlockHeader(Set<VCFHeaderLine> headerLines) {
        //bands are 1 GQ wide from 1 up to 60, then 10 wide up to 100
        for (int i = 1; i < 60; i++) {
            headerLines.add(createGVCFBlockHeaderLine(i, i + 1));
        }
        headerLines.add(createGVCFBlockHeaderLine(60,70));
        headerLines.add(createGVCFBlockHeaderLine(70,80));
        headerLines.add(createGVCFBlockHeaderLine(80,90));
        headerLines.add(createGVCFBlockHeaderLine(90,100));
    }
    public static VCFHeaderLine createGVCFBlockHeaderLine(int start, int exclusiveEnd) {
        StringBuilder nameSb = new StringBuilder();
        nameSb.append("GVCFBlock");
        nameSb.append(start);
        nameSb.append("-");
        nameSb.append(exclusiveEnd);

        StringBuilder descriptionSb = new StringBuilder();
        descriptionSb.append("minGQ=");
        descriptionSb.append(start);
        descriptionSb.append("(inclusive),maxGQ=");
        descriptionSb.append(exclusiveEnd);
        descriptionSb.append("(exclusive)");

        return new VCFHeaderLine(nameSb.toString(), descriptionSb.toString());
//        return new VCFSimpleHeaderLine(nameSb.toString(),nameSb.toString(),descriptionSb.toString());
    }
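
    //Worked example: createGVCFBlockHeaderLine(60, 70) renders in the output header as
    //  ##GVCFBlock60-70=minGQ=60(inclusive),maxGQ=70(exclusive)
    //matching the GVCF band lines that GATK emits.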

}