All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.dna.map.GVCFGenomeSequenceBuilder Maven / Gradle / Ivy

package net.maizegenetics.dna.map;

import com.google.common.base.Splitter;
import com.google.common.collect.*;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.util.*;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.OpenBitSet;


/**
 * Created by zrm22 on 3/27/17.
 */
public class GVCFGenomeSequenceBuilder extends GenomeSequenceBuilder {
    private static final Pattern TAB_PATTERN = Pattern.compile("[\\t]+");

    /**
     * Builds GenomeSequence from a fasta file and a GVCF file.
     *
     * @param fastaFileName full path to fasta file
     * @return GenomeSequence object
     */
    public static GenomeSequence instance(String fastaFileName, String gvcfFileName) throws Exception{
        Function charConversion = (c) -> c;
        return instance(fastaFileName, charConversion, gvcfFileName);
    }

    /**
     * Builds GenomeSequence from a fasta file.  The char conversion provide a mechanism to convert upper and lower case
     * or convert one case to N.  This is useful if a case if used to define a certain class of bases
     *
     * @param fastaFileName  full path to fasta file
     * @param charConversion lambda Function to convert characters
     * @return GenomeSequence object
     */
    public static GenomeSequence instance(String fastaFileName, Function charConversion, String gvcfFileName) throws Exception{
        long chrPosMapStartTime = System.currentTimeMillis();
        Map chromPositionMap = readReferenceGenomeChr(fastaFileName, charConversion);
        System.out.println("Done Reading in Reference: Total Time Taken: "+(System.currentTimeMillis()-chrPosMapStartTime)+"ms");
        //need to create a Map>>
//        Map> gvcfAnnotationsAndCalls = readGVCFFile(gvcfFileName);
        System.out.println("Generating Position List");
        long startTime = System.currentTimeMillis();
        PositionList gvcfPositionsAndAnnotations = readGVCFFilePositionList(gvcfFileName);
        System.out.println("Done Creating Pos list. Total Time Taken: "+(System.currentTimeMillis() - startTime)+"ms");
        return new HalfByteGenomeSequenceGVCF(chromPositionMap,gvcfPositionsAndAnnotations);
    }

    public static GenomeSequence instance(GVCFGenomeSequence base, BitSet maskedBitSet, BitSet filteredBitSet) throws Exception{
        return new HalfByteGenomeSequenceGVCF(base.getChrPosMap(), base.getGVCFPositions(), maskedBitSet, filteredBitSet);

//        Map chromPositionMap = readReferenceGenomeChr(fastaFileName, charConversion);
//        //need to create a Map>>
//        Map> gvcfAnnotationsAndCalls = readGVCFFile(gvcfFileName);
//        PositionList gvcfPositionsAndAnnotations = readGVCFFilePositionList(gvcfFileName);
//        return new HalfByteGenomeSequenceGVCF(chromPositionMap,gvcfPositionsAndAnnotations);
    }


    private static Map> readGVCFFile(String gvcfFileName) {
        HashMap> chromosomeRangeMapHashMap = new HashMap<>();
        //TODO multithread using similar code to BuilderFromVCF

        return chromosomeRangeMapHashMap;
    }


    private static PositionList readGVCFFilePositionList(String gvcfFileName) throws Exception{
        ArrayList positionArrayList = new ArrayList<>();

//        BufferedReader gvcfFileReader = new BufferedReader(new FileReader(gvcfFileName));
        BufferedReader gvcfFileReader = Utils.getBufferedReader(gvcfFileName,-1);
        //Loop through the headers
        String currentLine = "";
        while(!(currentLine = gvcfFileReader.readLine()).startsWith("#CHROM")) {

        }
        //parse the header
        String[] header = TAB_PATTERN.split(currentLine);
        HeaderPositions hp= new HeaderPositions(header);
        GVCFPositionRecord gvcfPositionRecord = new GVCFPositionRecord(hp);

        while((currentLine = gvcfFileReader.readLine())!=null) {
            positionArrayList.add(gvcfPositionRecord.parseGVCFRecords(currentLine));
        }
        PositionList instance = new PositionArrayList(positionArrayList, "AGPv4");
        return instance;
    }
}
class HeaderPositions {
    final int NUM_HAPMAP_NON_TAXA_HEADERS;
    final int GENOIDX;
    final int SNPID_INDEX;
    //  final int VARIANT_INDEX;
    final int FILTER_INDEX;
    final int QUAL_INDEX;
    final int CHROMOSOME_INDEX;
    final int POSITION_INDEX;
    final int REF_INDEX;
    final int ALT_INDEX;
    final int INFO_INDEX;
    final int FORMAT_INDEX;

    public HeaderPositions(String[] header){
        int chrIdx=firstEqualIndex(header,"#CHROM");
        if(chrIdx<0) chrIdx=firstEqualIndex(header,"#CHR");
        CHROMOSOME_INDEX=chrIdx;
        POSITION_INDEX=firstEqualIndex(header,"POS");
        SNPID_INDEX=firstEqualIndex(header,"ID");
        REF_INDEX=firstEqualIndex(header,"REF");
        ALT_INDEX=firstEqualIndex(header,"ALT");
        QUAL_INDEX=firstEqualIndex(header,"QUAL");
        FILTER_INDEX=firstEqualIndex(header,"FILTER");
        INFO_INDEX=firstEqualIndex(header,"INFO");
        FORMAT_INDEX=firstEqualIndex(header,"FORMAT");

        NUM_HAPMAP_NON_TAXA_HEADERS=Math.max(INFO_INDEX,FORMAT_INDEX)+1;
        GENOIDX=NUM_HAPMAP_NON_TAXA_HEADERS;
    }

    private static int firstEqualIndex(String[] sa, String match) {
        for (int i=0; i knownVariants = new ArrayList<>();
    //the depth
    int depth = 0;
    //GQ score
    int gqScore = 0;
    //GeneralAnnotations for any other in INFO tag
    GeneralAnnotationStorage generalAnnotationStorage;


}

class GVCFPositionRecord {
    GeneralPosition ap= new GeneralPosition.Builder(new Chromosome("1"),1232)
            .maf(0.05f)
            .build();
    HeaderPositions hp;

    public GVCFPositionRecord(HeaderPositions hp) {
        this.hp = hp;

    }
    public Position parseGVCFRecords(String input) {
        //Figure out the tab positioning for the header columns
        int[] tabPos=new int[hp.NUM_HAPMAP_NON_TAXA_HEADERS+1];
        int tabIndex=0;
        int len=input.length();
        for (int i=0; (tabIndex0) snpID=input.substring(tabPos[hp.SNPID_INDEX-1]+1, tabPos[hp.SNPID_INDEX]);

        //create the General position
        GeneralPosition.Builder apb=new GeneralPosition.Builder(currChr, Integer.parseInt(input.substring(tabPos[hp.POSITION_INDEX-1]+1, tabPos[hp.POSITION_INDEX])));
        if(snpID!=null && !snpID.equals(".")) {
            apb.snpName(snpID);
        }



        String refS=input.substring(tabPos[hp.REF_INDEX-1]+1, tabPos[hp.REF_INDEX]);
        String alt=input.substring(tabPos[hp.ALT_INDEX-1]+1, tabPos[hp.ALT_INDEX]);
        //create an String to hold the list of variants, we will have to parse this on export
        String allAlleles = refS+"/"+alt.replace(",","/");

        apb = apb.knownVariants(allAlleles);

        //loop through the INFO tag and save off any of those values into positions
        for(String annoS: Splitter.on(";").split(input.substring(tabPos[hp.INFO_INDEX-1]+1, tabPos[hp.INFO_INDEX]))) {
            apb.addAnno(annoS);
        }


        final int iGT=0; //genotype index
        int iAD=-1,iDP=-1,iGQ=-1, iPL=-1;  //alleleDepth, overall depth, genotypeQuality, phredGenotypeLikelihoods
        if(hp.FORMAT_INDEX>=0) {
            //Check to see if FORMAT tag is missing. Only applicable for single taxa files
            if(tabPos[hp.FORMAT_INDEX]==0) {
                throw new IllegalStateException("Error Processing VCF: Missing FORMAT tag.");
            }
            String unsplitInput = input.substring(tabPos[hp.FORMAT_INDEX-1]+1, tabPos[hp.FORMAT_INDEX]);
            if(unsplitInput.length()==0|| !unsplitInput.startsWith("GT")) {
                //Check to see it has the GT field
                if(unsplitInput.contains("GT")) {
                    throw new IllegalStateException("Error Processing VCF Block: GT field is not in first position of FORMAT.");
                }
                //If GT isnt in, we assume that it is missing FORMAT
                else {
                    throw new IllegalStateException("Error Processing VCF Block: Missing FORMAT tag.");
                }
            }
            String[] formatS = unsplitInput.split(":");

            iAD=firstEqualIndex(formatS,"AD");
            iDP=firstEqualIndex(formatS,"DP");
            iGQ=firstEqualIndex(formatS,"GQ");
        }



        //after info is recorded we need to record the call section
        String taxaAllG = input.substring(tabPos[hp.NUM_HAPMAP_NON_TAXA_HEADERS-1]+1);
        int f = 0;
        for(String fieldS: Splitter.on(":").split(taxaAllG)) {
            if (f == iGT) {
                apb.addAnno("GT",fieldS);
            }
            if(f == iAD) {
                apb.addAnno("AD",fieldS);
            }
            if(f == iDP) {
                apb.addAnno("DP",fieldS);
            }
            if(f == iGQ) {
                apb.addAnno("GQ",fieldS);
            }

            f++;
        }



        return apb.build();
    }

    private static int firstEqualIndex(String[] sa, String match) {
        for (int i=0; i chromPositionMap;
    private Map chromLengthLookup=new HashMap<>();
    private RangeMap wholeGenomeIndexMap= TreeRangeMap.create();
    private  PositionList gvcfAnnotationsAndCalls;
    private final long genomeSize;
    private BitSet maskBitSet;
    private BitSet filterBitSet;

    //Timing stuff
    long totalQueryTime = 0;


    HashMap positionIndex = new HashMap<>();
    int indexScaleFactor = 10000;
    protected HalfByteGenomeSequenceGVCF(MapchromPositionMap, PositionList gvcfAnnotationsAndCalls) {
        this.chromPositionMap = chromPositionMap;
        this.gvcfAnnotationsAndCalls = gvcfAnnotationsAndCalls;
        long initialSequenceSetupStart = System.currentTimeMillis();
        chromPositionMap.entrySet().stream()
                .forEach(e -> chromLengthLookup.put(e.getKey(),e.getKey().getLength()));
        LongAdder genomeIndex=new LongAdder();
        chromosomes().stream().sorted()
                .forEach(chrom -> {
                            int length=chromLengthLookup.get(chrom);
                            wholeGenomeIndexMap.put(Range.closed(genomeIndex.longValue(),
                                    genomeIndex.longValue()+length-1),chrom);
                            genomeIndex.add(length);
                            int[] currentIndexArray = new int[length/indexScaleFactor+2];
                            Arrays.fill(currentIndexArray,-1);
                            positionIndex.put(chrom,currentIndexArray);
                        }
                );
        genomeSize=genomeIndex.longValue();
        maskBitSet = new OpenBitSet(gvcfAnnotationsAndCalls.size());
        filterBitSet = new OpenBitSet(gvcfAnnotationsAndCalls.size());
        System.out.println("Initial Startup time for GVCF Sequence: "+(System.currentTimeMillis() - initialSequenceSetupStart)+"ms");
        long indexInitStart = System.currentTimeMillis();
        for(int i = 0; i < gvcfAnnotationsAndCalls.size(); i++) {
            int currentBin = gvcfAnnotationsAndCalls.get(i).getPosition()/indexScaleFactor;
            //new record.  add it to the index
            if(positionIndex.get(gvcfAnnotationsAndCalls.get(i).getChromosome())[currentBin] == -1) {
                positionIndex.get(gvcfAnnotationsAndCalls.get(i).getChromosome())[currentBin] = i;
            }
        }
        System.out.println("Done Creating index: "+(System.currentTimeMillis() - indexInitStart)+"ms");
    }

    protected HalfByteGenomeSequenceGVCF(MapchromPositionMap, PositionList gvcfAnnotationsAndCalls,BitSet maskBitSet, BitSet filterBitSet) {
        this.chromPositionMap = chromPositionMap;
        this.gvcfAnnotationsAndCalls = gvcfAnnotationsAndCalls;
        chromPositionMap.entrySet().stream()
                .forEach(e -> chromLengthLookup.put(e.getKey(),e.getKey().getLength()));
        LongAdder genomeIndex=new LongAdder();
        chromosomes().stream().sorted()
                .forEach(chrom -> {
                            int length=chromLengthLookup.get(chrom);
                            wholeGenomeIndexMap.put(Range.closed(genomeIndex.longValue(),
                                    genomeIndex.longValue()+length-1),chrom);
                            genomeIndex.add(length);
                            int[] currentIndexArray = new int[length/indexScaleFactor+2];
                            Arrays.fill(currentIndexArray,-1);
                            positionIndex.put(chrom,currentIndexArray);
                        }
                );
        genomeSize=genomeIndex.longValue();
        this.maskBitSet = maskBitSet;
        this.filterBitSet = filterBitSet;

        for(int i = 0; i < gvcfAnnotationsAndCalls.size(); i++) {
            int currentBin = gvcfAnnotationsAndCalls.get(i).getPosition()/indexScaleFactor;
            //new record.  add it to the index
            if(positionIndex.get(gvcfAnnotationsAndCalls.get(i).getChromosome())[currentBin] == -1) {
                positionIndex.get(gvcfAnnotationsAndCalls.get(i).getChromosome())[currentBin] = i;
            }
        }
    }
    @Override
    public Set chromosomes() {
        return chromPositionMap.keySet();
    }

    @Override
    public byte[] chromosomeSequence(Chromosome chrom) {
        return chromosomeSequence(chrom,1,chromLengthLookup.get(chrom));
    }

    @Override
    // Code assumes 1-based coordinates have been passed.  It will catch and return
    // null if the startSite is 0.  Otherwise, the user is on their own to ensure
    // input is 1-based.
    //TODO add in functionality using GVCF annotations
    public byte[] chromosomeSequence(Chromosome chrom, int startSite, int lastSite) {
        final int startSiteFinal = startSite;
        final int lastSiteFinal = lastSite;

        //add one as we grab the last site as well.
//        regionTotalReferenceBPCount = lastSite - startSite+1;

        //zrm22 new for getting gvcf records.
        //Basically the loop now changes to loop through the positionList to check if the allele should be the ref or not
        gvcfAnnotationsAndCalls.chromosomeSiteCount(chrom);

        //use the index to get the currentList of positions
        int[] currentIndex = positionIndex.get(chrom);
        int startPositionListIndex = currentIndex[startSite/indexScaleFactor];
        int endPositionListIndex = currentIndex[lastSite/indexScaleFactor+1];

        int startPositionListCounter = 1;
        while(startPositionListIndex==-1) {
            if (startSite / indexScaleFactor - startPositionListCounter >= 0) {
                startPositionListIndex = currentIndex[startSite / indexScaleFactor - startPositionListCounter];
                startPositionListCounter++;
            }
            else {
                startPositionListIndex = 0;
                break;
            }
        }
        ArrayList listOfChrPositions = new ArrayList<>();
        if(endPositionListIndex==-1) {
            for(int i = startPositionListIndex; i < startPositionListIndex+indexScaleFactor && i < gvcfAnnotationsAndCalls.size(); i++) {

                if(gvcfAnnotationsAndCalls.get(i).getPosition()>=startSiteFinal && gvcfAnnotationsAndCalls.get(i).getPosition()<=lastSiteFinal) {
                    listOfChrPositions.add(gvcfAnnotationsAndCalls.get(i));
                }
            }
        }
        else {
            for (int i = startPositionListIndex; i <= endPositionListIndex; i++) {
                if (gvcfAnnotationsAndCalls.get(i).getPosition() >= startSiteFinal && gvcfAnnotationsAndCalls.get(i).getPosition() <= lastSiteFinal) {
                    listOfChrPositions.add(gvcfAnnotationsAndCalls.get(i));
                }
            }
        }

        //sort things to make sure we can iterate properly
        Collections.sort(listOfChrPositions);


        int actualStartPos = 0;
        int startIndex = 0;
        int actualEndPos = 0;
        int endIndex = 0;

        boolean addingAtStart = false;
        if(listOfChrPositions.size()>0) {
            for (int i = 1; i < gvcfAnnotationsAndCalls.size(); i++) {
                if (gvcfAnnotationsAndCalls.get(i).getPosition() == listOfChrPositions.get(0).getPosition()) {
                    if (checkPositionCoverage(gvcfAnnotationsAndCalls.get(i - 1), startSiteFinal)) {
                        startIndex = i-1;
                        listOfChrPositions.add(0,gvcfAnnotationsAndCalls.get(startIndex));
                        addingAtStart = true;
                    }
                    break;
                }
            }
        }

        //loop through from startSite till the first position in the GVCF export these alleles directly from the ref
        int listOfChrPositionsCounter = 0;
        ArrayList byteList = new ArrayList<>();

        if(listOfChrPositions.size()==0) {
            //Export a single N for a missing sequence
            byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
        }
        else {
            for (int siteCounter = startSite; siteCounter <= lastSite && listOfChrPositionsCounter < listOfChrPositions.size(); siteCounter++) {
                GeneralAnnotation ga = listOfChrPositions.get(listOfChrPositionsCounter).getAnnotation();
                Set annotationKeys = ga.getAnnotationKeys();
                if(addingAtStart) {
                    if (listOfChrPositionsCounter != 0) {
                        if (filterBitSet.fastGet(listOfChrPositionsCounter-1)) {
                            //get the size of the position
                            if (annotationKeys.contains("END")) {
                                int lengthOfPosBlock = Integer.parseInt((String) ga.getTextAnnotation("END")[0]) - listOfChrPositions.get(listOfChrPositionsCounter).getPosition();
                                siteCounter+=lengthOfPosBlock;
                            }
                            else{
                                //check insertion or deletion
                                String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                                siteCounter+=variants[0].length()-1;
                            }
                            listOfChrPositionsCounter++;
                            continue;
                        }
                    }
                }
                else {
                    if (filterBitSet.fastGet(listOfChrPositionsCounter)) {
                        if (annotationKeys.contains("END")) {
                            int lengthOfPosBlock = Integer.parseInt((String) ga.getTextAnnotation("END")[0]) - listOfChrPositions.get(listOfChrPositionsCounter).getPosition();
                            siteCounter+=lengthOfPosBlock;
                        }
                        else{
                            //check insertion or deletion
                            String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                            siteCounter+=variants[0].length()-1;
                        }
                        listOfChrPositionsCounter++;
                        continue;
                    }
                }
                if (maskBitSet.fastGet(listOfChrPositionsCounter)) {
                    //if true it means we have to mask all of these positions
                    //Check to see if we have an END anno
                }
                if (listOfChrPositions.get(listOfChrPositionsCounter).getPosition() <= siteCounter) {
                    if (annotationKeys.contains("END")) {
                        //this means we have a block
                        //Check the call and grab the corresponding allele values
                        String call = ga.getTextAnnotation("GT")[0];
                        boolean phased = true;
                        boolean haploid = false;
                        if (call.contains("/")) {
                            phased = false;
                        } else if (call.contains("|")) {
                            phased = true;
                        } else {
                            haploid = true;
                        }
                        String[] callSplit = phased ? call.split("|") : call.split("/");
                        int leftAllele = Integer.parseInt(callSplit[0]);
                        int rightAllele = leftAllele;
                        if (!haploid) {
                            rightAllele = Integer.parseInt(callSplit[1]);
                        }
                        int endPoint = Integer.parseInt(ga.getTextAnnotation("END")[0]) - 1;
                        //check to see if our requested end point is smaller than the region
                        if(lastSite-1 packedBytes.length * 2 || endPoint > packedBytes.length * 2) {
                            throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested sequence is out of range"); // requested sequence is out of range
                        }
                        String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                        String leftAlleleString = variants[leftAllele];

                        //check depth
//                        String dpFullString = (String) annos.get("DP").toArray()[0];
                        String dpFullString = ga.getTextAnnotation("DP")[0];

                        // String minDPString = (String) annos.get("MIN_DP").toArray()[0];
                        //TODO check to see if we should assume only Ref or call for blocks
                        if (maskBitSet.fastGet(listOfChrPositionsCounter) || dpFullString.equals("0")) {

                            //if true we have a masking
                            for (int i = startSiteShifted; i <= endPoint; i++) {
//                                regionDepthCount+=0;
//                                regionZeroCoverageCount++;

                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                            }
                        } else if (leftAllele == 0) {
                            //fill in with reference sequence
                            //pull the sequence from siteCounter till you get to endPoint
                            //Zrm Apr17 fix for default rules
                            //check if homozygous or het based on GT calls
                            int depth = 0;
                            int minDepth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            for (int i = startSiteShifted; i <= endPoint; i++) {
//                                regionDepthCount+=depth;
//                                regionMinDepthCount+=minDepth;
                                byteList.add((byte) ((i % 2 == 0) ? ((packedBytes[i / 2] & 0xF0) >> 4) : (packedBytes[i / 2] & 0x0F)));
                            }

                        } else {
                            int depth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            for (int i = startSiteShifted; i <= endPoint; i++) {
//                                regionDepthCount+=depth;
//                                regionZeroCoverageCount++;
                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                            }
                        }
                        //shift up the siteCounter to match the end
                        siteCounter = endPoint + 1;
                    } else {
                        //it is likely a snp
                        //check the call and grab the correct allele
                        //siteCounter will increment correctly
                        String call = ga.getTextAnnotation("GT")[0];
                        boolean phased = true;
                        boolean haploid = false;
                        if (call.contains("/")) {
                            phased = false;
                        } else if (call.contains("|")) {
                            phased = true;
                        } else {
                            //haploid
                            haploid = true;
                        }
                        String[] callSplit = phased ? call.split("|") : call.split("/");
                        int leftAllele = Integer.parseInt(callSplit[0]);
                        int rightAllele = leftAllele;
                        if (!haploid) {
                            rightAllele = Integer.parseInt(callSplit[1]);
                        }

                        //Get the AD field and split it on commas
                        String adsFullString = ga.getTextAnnotation("AD")[0];
                        String dpFullString = ga.getTextAnnotation("DP")[0];
                        String[] adSplit = adsFullString.split(",");
                        //Get the known Variants
                        String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                        boolean isHet = calcHet(adSplit);
                        if (isHet) {
                            //TODO handle hets correctly for indels and such
                            String leftAlleleString = variants[0];
                            for(int alleleCounter = 0; alleleCounter < leftAlleleString.length(); alleleCounter++) {
                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                            }
                            siteCounter+=leftAlleleString.length()-1;
//                            for(int i = 0; i < leftAlleleString.length();i++) {
//                                regionHetCount++;
//                            }
                        }
                        //ZRM April17 do calls based on depth only
                        //This block is not actually used as it has already been handled if it is a het or a homozygous ref(in an allele block)
                        else if (adSplit[0].equals("1") || adSplit[0].equals("2")) {
                            if (adSplit[1].equals("0")) {
                                //export the ref
                                String leftAlleleString = variants[0];

                                int depth = 0;
                                if(!dpFullString.equals("")) {
                                    depth = Integer.parseInt(dpFullString);
                                }

                                if (maskBitSet.fastGet(listOfChrPositionsCounter)) {
                                    //if true we have to mask it
                                    for (int i = 0; i < leftAlleleString.length() && siteCounter+i < lastSite-1; i++) {
//                                        regionDepthCount+=depth;
//                                        regionNumberHomoRef1Or2Count++;
                                        byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                                    }
                                } else {
                                    for (int i = 0; i < leftAlleleString.length() && siteCounter+i < lastSite-1; i++) {
//                                        regionDepthCount+=depth;
//                                        regionNumberHomoRef1Or2Count++;
                                        byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte(leftAlleleString.charAt(i)));
                                    }
                                }
                            }

                        } else if (adSplit[1].equals("1") || adSplit[1].equals("2")) { //ZRM22 Jun19 ChangeforChristy
//                        } else if (adSplit[1].equals("0")) {
                            //call Ns
                            //export the ref
                            String leftAlleleString = variants[1];
                            int depth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            for (int i = 0; i < leftAlleleString.length(); i++) {
//                                regionDepthCount+=depth;
//                                regionNumberHomoAlt1Or2Count++;
                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                            }
                        } else if (Integer.parseInt(adSplit[1]) >= 3) { //ZRM22 Jun19 change for christy
//                        } else if (Integer.parseInt(adSplit[1]) >= 1) {
                            int depth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            String leftAlleleString = variants[1];
                            if (maskBitSet.fastGet(listOfChrPositionsCounter)) {
                                //if true we have to mask it
                                for (int i = 0; i < leftAlleleString.length(); i++) {
//                                    regionAltCount++;
//                                    regionNumberHomoAlt3PlusCount++;
//                                    regionDepthCount+=depth;
                                    byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                                }
                            } else {
                                for (int i = 0; i < leftAlleleString.length(); i++) {
//                                    regionAltCount++;
//                                    regionNumberHomoAlt3PlusCount++;
//                                    regionDepthCount+=depth;
                                    byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte(leftAlleleString.charAt(i)));
                                }
                                siteCounter+=variants[0].length()-1;
                            }
                        }
                        else {
                            //in the case where we have 0 depth for each allele it will fall through all the cases
                            //lets stick in Ns for the number of reference basepairs  fix the old method header as well.
                            String leftAlleleString = variants[0];

                            for (int i = 0; i < leftAlleleString.length(); i++) {
//                                    regionAltCount++;
//                                    regionNumberHomoAlt3PlusCount++;
//                                    regionDepthCount+=depth;
                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte(leftAlleleString.charAt(i)));
                            }
                            siteCounter+=variants[0].length()-1;
                        }
                    }
                    listOfChrPositionsCounter++;
                } else {
                    //grab the reference as we dont have a gvcf for the requested site
                    //Should this be the breaking point of the sequence??
                    //TODO should we mark these with Ns?

                    //With 0 coverage we want to export an N
                    byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
//                    regionDepthCount+=0;
//                    regionZeroCoverageCount++;

                }
            }
        }
//        regionTotalExportedBPCount = byteList.size();
        byte[] fullBytes = new byte[byteList.size()];
        for(int i = 0; i < fullBytes.length; i++) {
            fullBytes[i] = byteList.get(i);
        }
        return fullBytes;
    }


    @Override
    //Code to get the Sequence in a string as well as the various stats we have collected.
    // This should allow for safe multithreading for the stats.
    public HashMap chromosomeSequenceAndStats(Chromosome chrom, int startSite, int lastSite) {
        long queryTimeStart = System.currentTimeMillis();

        int regionTotalReferenceBPCount = 0;
        int regionTotalExportedBPCount = 0;
        int regionHetCount = 0;
        int regionAltCount = 0;
        int regionDepthCount = 0;
        int regionGQCount = 0;
        int regionMinDepthCount = 0;
        int regionZeroCoverageCount = 0;
        int regionNumberHomoRef1Or2Count = 0;
        int regionNumberHomoRef3PlusCount = 0;
        int regionNumberHomoAlt1Or2Count = 0;
        int regionNumberHomoAlt3PlusCount = 0;

        final int startSiteFinal = startSite;
        final int lastSiteFinal = lastSite;

        //add one as we grab the last site as well.
        regionTotalReferenceBPCount = lastSite - startSite+1;

        //zrm22 new for getting gvcf records.
        //Basically the loop now changes to loop through the positionList to check if the allele should be the ref or not
//        gvcfAnnotationsAndCalls.chromosomeSiteCount(chrom);

        long getCurrentSubsetPositionListTimeStart = System.currentTimeMillis();
        //use the index to get the currentList of positions
        int[] currentIndex = positionIndex.get(chrom);
        int startPositionListIndex = currentIndex[startSite/indexScaleFactor];
        int endPositionListIndex = currentIndex[lastSite/indexScaleFactor+1];

        int startPositionListCounter = 1;
        while(startPositionListIndex==-1) {
            startPositionListIndex = currentIndex[startSite/indexScaleFactor-startPositionListCounter];
            startPositionListCounter++;
        }
        ArrayList listOfChrPositions = new ArrayList<>();
        int actualStartIndex = Integer.MAX_VALUE;
        if(endPositionListIndex==-1) {
            for(int i = startPositionListIndex; i < startPositionListIndex+indexScaleFactor && i < gvcfAnnotationsAndCalls.size(); i++) {

                if(gvcfAnnotationsAndCalls.get(i).getPosition()>=startSiteFinal && gvcfAnnotationsAndCalls.get(i).getPosition()<=lastSiteFinal) {
                    if(i < actualStartIndex) {
                        actualStartIndex = i;
                    }
                    listOfChrPositions.add(gvcfAnnotationsAndCalls.get(i));
                }
            }
        }
        else {
            for (int i = startPositionListIndex; i <= endPositionListIndex; i++) {
                if (gvcfAnnotationsAndCalls.get(i).getPosition() >= startSiteFinal && gvcfAnnotationsAndCalls.get(i).getPosition() <= lastSiteFinal) {
                    if(i < actualStartIndex) {
                        actualStartIndex = i;
                    }
                    listOfChrPositions.add(gvcfAnnotationsAndCalls.get(i));
                }
            }
        }
        long totalTimeToSubsetPosList = System.currentTimeMillis() - getCurrentSubsetPositionListTimeStart;
        //sort things to make sure we can iterate properly
        long sortPosTimeStart = System.currentTimeMillis();
        Collections.sort(listOfChrPositions);
        long totalTimeToSortPosList = System.currentTimeMillis() - sortPosTimeStart;

        int actualStartPos = 0;
        int startIndex = 0;
        int actualEndPos = 0;
        int endIndex = 0;

        long addAPositionToStartTime = System.currentTimeMillis();
        boolean addingAtStart = false;
        if(listOfChrPositions.size()>0 && startPositionListIndex!=0) {
            if(checkPositionCoverage(gvcfAnnotationsAndCalls.get(actualStartIndex-1),startSiteFinal)) {
                startIndex = actualStartIndex -1;
                listOfChrPositions.add(0,gvcfAnnotationsAndCalls.get(startIndex));
                addingAtStart = true;
            }


//            for (int i = 1; i < gvcfAnnotationsAndCalls.size(); i++) {
//            for (int i = startPositionListIndex; i < gvcfAnnotationsAndCalls.size(); i++) {
//                if (gvcfAnnotationsAndCalls.get(i).getPosition() == listOfChrPositions.get(0).getPosition()) {
//                    if (checkPositionCoverage(gvcfAnnotationsAndCalls.get(i - 1), startSiteFinal)) {
//                        startIndex = i-1;
//                        listOfChrPositions.add(0,gvcfAnnotationsAndCalls.get(startIndex));
//                        addingAtStart = true;
//                    }
//                    break;
//                }
//            }
        }
        long totalTimeToAddStartPosition = System.currentTimeMillis() - addAPositionToStartTime;

        //loop through from startSite till the first position in the GVCF export these alleles directly from the ref
        int listOfChrPositionsCounter = 0;
        ArrayList byteList = new ArrayList<>();

        if(listOfChrPositions.size()==0) {
            //Export a single N for a missing sequence
            byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
        }
        else {
            for (int siteCounter = startSite; siteCounter <= lastSite && listOfChrPositionsCounter < listOfChrPositions.size(); siteCounter++) {
                GeneralAnnotation ga = listOfChrPositions.get(listOfChrPositionsCounter).getAnnotation();
                Set annotationKeys = ga.getAnnotationKeys();
                if(addingAtStart) {
                    if (listOfChrPositionsCounter != 0) {
                        if (filterBitSet.fastGet(listOfChrPositionsCounter-1)) {
                            //get the size of the position
                            if (annotationKeys.contains("END")) {
                                int lengthOfPosBlock = Integer.parseInt((String) ga.getTextAnnotation("END")[0]) - listOfChrPositions.get(listOfChrPositionsCounter).getPosition();
                                siteCounter+=lengthOfPosBlock;
                            }
                            else{
                                //check insertion or deletion
                                String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                                siteCounter+=variants[0].length()-1;
                            }
                            listOfChrPositionsCounter++;
                            continue;
                        }
                    }
                }
                else {
                    if (filterBitSet.fastGet(listOfChrPositionsCounter)) {
                        if (annotationKeys.contains("END")) {
                            int lengthOfPosBlock = Integer.parseInt((String) ga.getTextAnnotation("END")[0]) - listOfChrPositions.get(listOfChrPositionsCounter).getPosition();
                            siteCounter+=lengthOfPosBlock;
                        }
                        else{
                            //check insertion or deletion
                            String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                            siteCounter+=variants[0].length()-1;
                        }
                        listOfChrPositionsCounter++;
                        continue;
                    }
                }
                if (maskBitSet.fastGet(listOfChrPositionsCounter)) {
                    //if true it means we have to mask all of these positions
                    //Check to see if we have an END anno
                }
                if (listOfChrPositions.get(listOfChrPositionsCounter).getPosition() <= siteCounter) {
                    if (annotationKeys.contains("END")) {
                        //this means we have a block
                        //Check the call and grab the corresponding allele values
                        String call = ga.getTextAnnotation("GT")[0];
                        boolean phased = true;
                        boolean haploid = false;
                        if (call.contains("/")) {
                            phased = false;
                        } else if (call.contains("|")) {
                            phased = true;
                        } else {
                            haploid = true;
                        }
                        String[] callSplit = phased ? call.split("|") : call.split("/");
                        int leftAllele = Integer.parseInt(callSplit[0]);
                        int rightAllele = leftAllele;
                        if (!haploid) {
                            rightAllele = Integer.parseInt(callSplit[1]);
                        }
                        int endPoint = Integer.parseInt(ga.getTextAnnotation("END")[0]) - 1;
                        //check to see if our requested end point is smaller than the region
                        if(lastSite-1 packedBytes.length * 2 || endPoint > packedBytes.length * 2) {
                            throw new IllegalArgumentException("GenomeSequenceBuilder.chromosomeSequence: requested sequence is out of range"); // requested sequence is out of range
                        }
                        String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                        String leftAlleleString = variants[leftAllele];

                        //check depth
//                        String dpFullString = (String) annos.get("DP").toArray()[0];
                        String dpFullString = ga.getTextAnnotation("DP")[0];

                        // String minDPString = (String) annos.get("MIN_DP").toArray()[0];
                        //TODO check to see if we should assume only Ref or call for blocks
                        if (maskBitSet.fastGet(listOfChrPositionsCounter) || dpFullString.equals("0")) {

                            //if true we have a masking
                            for (int i = startSiteShifted; i <= endPoint; i++) {
                                regionDepthCount+=0;
                                regionZeroCoverageCount++;

                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                            }
                        } else if (leftAllele == 0) {
                            //fill in with reference sequence
                            //pull the sequence from siteCounter till you get to endPoint
                            //Zrm Apr17 fix for default rules
                            //check if homozygous or het based on GT calls
                            int depth = 0;
                            int minDepth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            for (int i = startSiteShifted; i <= endPoint; i++) {
                                regionDepthCount+=depth;
                                regionMinDepthCount+=minDepth;
                                byteList.add((byte) ((i % 2 == 0) ? ((packedBytes[i / 2] & 0xF0) >> 4) : (packedBytes[i / 2] & 0x0F)));
                            }

                        } else {
                            int depth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            for (int i = startSiteShifted; i <= endPoint; i++) {
                                regionDepthCount+=depth;
                                regionZeroCoverageCount++;
                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                            }
                        }
                        //shift up the siteCounter to match the end
                        siteCounter = endPoint + 1;
                    } else {
                        //it is likely a snp
                        //check the call and grab the correct allele
                        //siteCounter will increment correctly
                        String call = ga.getTextAnnotation("GT")[0];
                        boolean phased = true;
                        boolean haploid = false;
                        if (call.contains("/")) {
                            phased = false;
                        } else if (call.contains("|")) {
                            phased = true;
                        } else {
                            //haploid
                            haploid = true;
                        }
                        String[] callSplit = phased ? call.split("|") : call.split("/");
                        int leftAllele = Integer.parseInt(callSplit[0]);
                        int rightAllele = leftAllele;
                        if (!haploid) {
                            rightAllele = Integer.parseInt(callSplit[1]);
                        }

                        //Get the AD field and split it on commas
                        String adsFullString = ga.getTextAnnotation("AD")[0];
                        String dpFullString = ga.getTextAnnotation("DP")[0];
                        String[] adSplit = adsFullString.split(",");
                        //Get the known Variants
                        String[] variants = listOfChrPositions.get(listOfChrPositionsCounter).getKnownVariants();
                        boolean isHet = calcHet(adSplit);
                        if (isHet) {
                            //TODO handle hets correctly for indels and such
                            String leftAlleleString = variants[0];
                            siteCounter+=leftAlleleString.length()-1;
                            for(int i = 0; i < leftAlleleString.length();i++) {
                                regionHetCount++;
                            }
                        }
                        //ZRM April17 do calls based on depth only
                        else if (adSplit[0].equals("1") || adSplit[0].equals("2")) {
                            if (adSplit[1].equals("0")) {
                                //export the ref
                                String leftAlleleString = variants[0];

                                int depth = 0;
                                if(!dpFullString.equals("")) {
                                    depth = Integer.parseInt(dpFullString);
                                }

                                if (maskBitSet.fastGet(listOfChrPositionsCounter)) {
                                    //if true we have to mask it
                                    for (int i = 0; i < leftAlleleString.length() && siteCounter+i < lastSite-1; i++) {
                                        regionDepthCount+=depth;
                                        regionNumberHomoRef1Or2Count++;
                                        byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                                    }
                                } else {
                                    for (int i = 0; i < leftAlleleString.length() && siteCounter+i < lastSite-1; i++) {
                                        regionDepthCount+=depth;
                                        regionNumberHomoRef1Or2Count++;
                                        byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte(leftAlleleString.charAt(i)));
                                    }
                                }
                            }

                        } else if (adSplit[1].equals("1") || adSplit[1].equals("2")) {
                            //call Ns
                            //export the ref
                            String leftAlleleString = variants[1];
                            int depth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            for (int i = 0; i < leftAlleleString.length(); i++) {
                                regionDepthCount+=depth;
                                regionNumberHomoAlt1Or2Count++;
                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                            }
                        } else if (Integer.parseInt(adSplit[1]) >= 3) {
                            int depth = 0;
                            if(!dpFullString.equals("")) {
                                depth = Integer.parseInt(dpFullString);
                            }
                            String leftAlleleString = variants[1];
                            if (maskBitSet.fastGet(listOfChrPositionsCounter)) {
                                //if true we have to mask it
                                for (int i = 0; i < leftAlleleString.length(); i++) {
                                    regionAltCount++;
                                    regionNumberHomoAlt3PlusCount++;
                                    regionDepthCount+=depth;
                                    byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                                }
                            } else {
                                for (int i = 0; i < leftAlleleString.length(); i++) {
                                    regionAltCount++;
                                    regionNumberHomoAlt3PlusCount++;
                                    regionDepthCount+=depth;
                                    byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte(leftAlleleString.charAt(i)));
                                }
                                siteCounter+=variants[0].length()-1;
                            }
                        }
                        else {
                            //in the case where we have 0 depth for each allele it will fall through all the cases
                            //lets stick in Ns for the number of reference basepairs  fix the old method header as well.
                            String leftAlleleString = variants[0];

                            for (int i = 0; i < leftAlleleString.length(); i++) {
//                                    regionAltCount++;
//                                    regionNumberHomoAlt3PlusCount++;
//                                    regionDepthCount+=depth;
                                byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte(leftAlleleString.charAt(i)));
                            }
                            siteCounter+=variants[0].length()-1;
                        }
                    }
                    listOfChrPositionsCounter++;
                } else {
                    //grab the reference as we dont have a gvcf for the requested site
                    //Should this be the breaking point of the sequence??
                    //TODO should we mark these with Ns?

                    //With 0 coverage we want to export an N
                    byteList.add(NucleotideAlignmentConstants.getNucleotideAlleleByte("N"));
                    regionDepthCount+=0;
                    regionZeroCoverageCount++;

                }
            }
        }
        regionTotalExportedBPCount = byteList.size();
        byte[] fullBytes = new byte[byteList.size()];
        for(int i = 0; i < fullBytes.length; i++) {
            fullBytes[i] = byteList.get(i);
        }

        //Compile the HashMap();
        HashMap sequenceAndStats = new HashMap<>();
        sequenceAndStats.put("Sequence",NucleotideAlignmentConstants.nucleotideBytetoString(fullBytes));

        sequenceAndStats.put("RefSize",""+regionTotalReferenceBPCount);
        sequenceAndStats.put("Size", ""+regionTotalExportedBPCount);
        sequenceAndStats.put("HetCount",""+regionHetCount);
        sequenceAndStats.put("AltCount",""+regionAltCount);
        sequenceAndStats.put("Depth",""+regionDepthCount);
        sequenceAndStats.put("GQ",""+regionGQCount);
        sequenceAndStats.put("Min_Depth",""+regionMinDepthCount);
        sequenceAndStats.put("ZeroCoverageCount",""+regionZeroCoverageCount);
        sequenceAndStats.put("HomoRefCount",""+regionNumberHomoRef1Or2Count);
        sequenceAndStats.put("HomoAltLowDepthCount",""+regionNumberHomoAlt1Or2Count);
        sequenceAndStats.put("HomoAltHighDepthCount",""+regionNumberHomoAlt3PlusCount);

        long queryTimeEnd = System.currentTimeMillis();
//        System.out.println("Current Query Time:"+(queryTimeEnd - queryTimeStart)+"ms\n" +
//                "Time Taken to subset Pos List"+totalTimeToSubsetPosList+"ms\n" +
//                "Time Taken to sort Pos List"+totalTimeToSortPosList+"ms\n" +
//                "Time Taken to add a Position to start of List:"+totalTimeToAddStartPosition+"ms\n");

//        System.out.println("Current Query Time:"+(queryTimeEnd - queryTimeStart)+"ms");


        return sequenceAndStats;


    }

    @Override
    //TODO add in functionality to handle GVCF annotations
    public byte[] genomeSequence(long startSite, long lastSite) {
        if(lastSite-startSite>Integer.MAX_VALUE) throw
                new IllegalArgumentException("Less than "+Integer.MAX_VALUE+" sites must be requested at a time");
        byte[] fullBytes=new byte[(int)(lastSite-startSite+1)];
        long currentSiteToGet=startSite;
        while(currentSiteToGet,Chromosome> rangeChromEntry=wholeGenomeIndexMap.getEntry(currentSiteToGet);
            int chrStart=(int)(currentSiteToGet-rangeChromEntry.getKey().lowerEndpoint());
            int chrLast=(int)Math.min(rangeChromEntry.getKey().upperEndpoint()-rangeChromEntry.getKey().lowerEndpoint(),lastSite-rangeChromEntry.getKey().lowerEndpoint());
            byte[] chromoSeq=chromosomeSequence(rangeChromEntry.getValue(), chrStart+1,chrLast+1);  //+1 for 0 based genome, 1 based chromosomes
            System.arraycopy(chromoSeq,0,fullBytes,(int)(currentSiteToGet-startSite),chromoSeq.length);
            currentSiteToGet+=chromoSeq.length;
        }
        return fullBytes;
    }

    @Override
    public int chromosomeSize(Chromosome chromosome) {
        return chromLengthLookup.get(chromosome);
    }

    @Override
    public long genomeSize() {
        return genomeSize;
    }

    @Override
    public int numberOfChromosomes() {
        return chromPositionMap.size();
    }

    @Override
    //TODO see if we need to fix this for GVCF annotations
    public Map> fullRefCoordinateToChromCoordinate(ArrayList coordinates) {
        // Returns 0-based value from Chromosome array (values are stored as 0-based)
        Map> mappedCoordinates = new ConcurrentHashMap>();
        coordinates.stream().parallel().forEach(coordinate -> {
            Map.Entry,Chromosome> rangeChromEntry=wholeGenomeIndexMap.getEntry(coordinate);
            Chromosome curChrom = rangeChromEntry.getValue();
            long chromCoordinate = coordinate - rangeChromEntry.getKey().lowerEndpoint();
            Tuple chromWithCoordinate = new Tuple<>(curChrom, (int)chromCoordinate);
            mappedCoordinates.put(coordinate, chromWithCoordinate);
        });
        return mappedCoordinates;
    }

    private boolean checkPositionCoverage(Position pos, int site) {
//        SetMultimap annos = pos.getAnnotation().getAnnotationAsMap();
        GeneralAnnotation ga = pos.getAnnotation();
//        if(annos.containsKey("END")) {
//            int endPoint = Integer.parseInt((String)annos.get("END").toArray()[0]);
//            if(pos.getPosition()<=site && site <= endPoint) {
//                return true;
//            }
//            else {
//                return false;
//            }
//
//        }

        String[] endArray = ga.getTextAnnotation("END");

//        if(ga.getAnnotationKeys().contains("END")) {
        if(endArray.length!=0) {
//            int endPoint = Integer.parseInt(ga.getTextAnnotation("END")[0]);
            int endPoint = Integer.parseInt(endArray[0]);
            if(pos.getPosition()<=site && site <= endPoint) {
                return true;
            }
            else {
                return false;
            }

        }
        else if(pos.getPosition()==site) {
            return true;
        }
        else {
            String[] alleles = pos.getKnownVariants();
            //check to see if we have a deletion and the reference covers the whole
            if(alleles[0].length()+pos.getPosition()>=site) {
                return true;
            }
            else {
                return false;
            }
        }
    }

    public Map getChrPosMap() {
        return chromPositionMap;
    }
    public HashMap>> getConsecutiveRegions() {
        HashMap>> consecRegions = new HashMap<>();
        Set chromosomeSet = chromosomes();
        //Loop through each chromosome add to a Map
        HashMap> rangeMaps = new HashMap<>();
        for(Chromosome chr : chromosomeSet) {
            rangeMaps.put(chr,TreeRangeSet.create());
            consecRegions.put(chr,new ArrayList<>());
        }

        for(int i = 0 ; i < gvcfAnnotationsAndCalls.size(); i++) {
            if(filterBitSet.fastGet(i)) {
                //if we have a position to filter out, we should probably break the fasta sequence
                //to do this, just do not add in the range... RangeMap will handle it for you
                continue;
            }

            Position currentPosition = gvcfAnnotationsAndCalls.get(i);
            SetMultimap annos = currentPosition.getAnnotation().getAnnotationAsMap();
            if(annos.containsKey("END")) {
                int endPoint = Integer.parseInt((String)annos.get("END").toArray()[0]);
                rangeMaps.get(currentPosition.getChromosome()).add(Range.closed(currentPosition.getPosition(),endPoint+1));
            }
            else {
                //check to make sure that the reference allele is more than one allele(A deletion)
                String[] variants = currentPosition.getKnownVariants();
                //TODO handle hets correctly for indels and such
                String refAlleleString = variants[0];
                if(refAlleleString.length()>1) {
                    //we have a deletion or an insertion+deletion as such our END point will need to take into account the size of the deletion
                    rangeMaps.get(currentPosition.getChromosome()).add(Range.closed(currentPosition.getPosition(),currentPosition.getPosition()+refAlleleString.length()));
                }
                else {
                    //This will cause the index of the next line in the GVCF file to be non consecutive with the previous
                    rangeMaps.get(currentPosition.getChromosome()).add(Range.closed(currentPosition.getPosition(), currentPosition.getPosition() + 1));

                }
            }

        }

        for(Chromosome chr : chromosomeSet) {
            Set> rangeSet = rangeMaps.get(chr).asRanges();
            for(Range currentRange : rangeSet) {
                ArrayList currentList = new ArrayList();
                currentList.add(currentRange.lowerEndpoint());
                //We need to subtract 1 point so it will be [inclusive,inclusive] instead of [inclusive,exclusive)
                currentList.add(currentRange.upperEndpoint()-1);
                consecRegions.get(chr).add(currentList);
            }
        }

        return consecRegions;
    }
    //TODO move this to a different class
    public void writeFASTA(String fileName){
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(fileName));
            HashMap>> consecutiveRegions = getConsecutiveRegions();
            Set chromosomes = chromosomes();
            for(Chromosome chr : chromosomes) {
                for(ArrayList bounds : consecutiveRegions.get(chr)) {
                    writer.write(">Chr_"+chr.getChromosomeNumber()+"_StartSite_"+bounds.get(0)+"_EndSite_"+bounds.get(1));
                    writer.newLine();
                    writer.write(""+NucleotideAlignmentConstants.nucleotideBytetoString(chromosomeSequence(chr,bounds.get(0),bounds.get(1))));
                    writer.newLine();
                }
            }

            writer.close();
        }
        catch(Exception e){
            e.printStackTrace();
        }

    }

    public BitSet getMaskBitSet() {
        return maskBitSet;
    }
    public void setMaskBitSet(BitSet newMaskBitSet) {
        maskBitSet = newMaskBitSet;
    }

    public BitSet getFilterBitSet() {
        return filterBitSet;
    }
    public void setFilterBitSet(BitSet newFilterBitSet) {
        filterBitSet = newFilterBitSet;
    }

    public void flipMaskBit(int index) {
        maskBitSet.fastFlip(index);
    }
    public void flipFilterBit(int index) {
        filterBitSet.fastFlip(index);
    }
    public PositionList getGVCFPositions() {
        return gvcfAnnotationsAndCalls;
    }

    private boolean calcHet(String[] adSplit) {
        boolean isHet = false;
        int refDepth = Integer.parseInt(adSplit[0]);
        int altDepth = Integer.parseInt(adSplit[1]);
        if(refDepth>0 && altDepth>0) {
            isHet = true;
        }
        return isHet;
    }

    @Override
    public void resetCounters() {
//        regionTotalReferenceBPCount = 0;
//        regionTotalExportedBPCount = 0;
//        regionHetCount = 0;
//        regionAltCount = 0;
//        regionDepthCount = 0;
//        regionGQCount = 0;
//        regionMinDepthCount = 0;
//        regionZeroCoverageCount = 0;
//        regionNumberHomoRef1Or2Count = 0;
//        regionNumberHomoAlt1Or2Count = 0;
//        regionNumberHomoAlt3PlusCount = 0;
    }

    @Override
    public HashMap getPreviousRegionStats() {
        HashMap stats = new HashMap<>();
//        stats.put("RefSize",regionTotalReferenceBPCount);
//        stats.put("Size", regionTotalExportedBPCount);
//        stats.put("HetCount",regionHetCount);
//        stats.put("AltCount",regionAltCount);
//        stats.put("Depth",regionDepthCount);
//        stats.put("GQ",regionGQCount);
//        stats.put("Min_Depth",regionMinDepthCount);
//        stats.put("ZeroCoverageCount",regionZeroCoverageCount);
//        stats.put("HomoRefCount",regionNumberHomoRef1Or2Count);
//        stats.put("HomoAltLowDepthCount",regionNumberHomoAlt1Or2Count);
//        stats.put("HomoAltHighDepthCount",regionNumberHomoAlt3PlusCount);

        return stats;
    }

    @Override
    public byte genotype(Chromosome chrom, int position) {
        // TODO Auto-generated method stub
        return 0;
    }

    @Override
    public byte genotype(Chromosome chrom, Position positionObject) {
        // TODO Auto-generated method stub
        return 0;
    }

    @Override
    public String genotypeAsString(Chromosome chrom, int position) {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String genotypeAsString(Chromosome chrom, Position positionObject) {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String genotypeAsString(Chromosome chrom, int startSite, int endSite) {
        // TODO Auto-generated method stub
        return null;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy