org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.sv;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.apache.commons.lang3.tuple.MutablePair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.MultiVariantWalkerGroupedOnStart;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.copynumber.GermlineCNVCaller;
import org.broadinstitute.hellbender.tools.copynumber.PostprocessGermlineCNVCalls;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVSegmentVariantComposer;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.samples.PedigreeValidationType;
import org.broadinstitute.hellbender.utils.samples.SampleDB;
import org.broadinstitute.hellbender.utils.samples.Sex;
import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
import java.util.*;
import java.util.stream.Collectors;
/**
* Merge GCNV segments VCFs
*
* This tool takes in segmented VCFs produced by {@link PostprocessGermlineCNVCalls}. For single-sample input VCFs,
* the tool defragments CNV calls by merging adjacent events within the default or specified fractional padding margins.
* For distinct samples, the tool clusters events that meet the reciprocal overlap requirement. Output is a multi-sample VCF
* with site allele counts and frequencies (AC, AF).
*
* This tool can also be run on multiple multi-sample VCFs, as in a hierarchical gather for large cohorts. In this case,
* defragmentation is not performed because it is assumed that single-sample defragmentation occurred during the initial
* tool invocation that produced the multi-sample VCF.
*
* Required inputs
*
* - Reference fasta
* - Segmented VCFs, single-sample or multi-sample, from {@link PostprocessGermlineCNVCalls}
* - The interval list used by {@link GermlineCNVCaller} for calling the input VCFs
* - A pedigree file with an entry giving the sex of each sample
*
*
* Output
* A single multi-sample VCF with genotypes and copy numbers
*
*
* Usage example
*
*
* gatk JointGermlineCNVSegmentation \
* -R reference.fasta
* -V sample1.vcf.gz
* -V sample2.vcf.gz
* --model-call-intervals .filtered.interval_list
* --pedigree XandYassignments.ped
* -O clustered.vcf.gz
*
*
* Caveats
*
* - All input samples are required to be present in the pedigree file
* - Quality scores are not output, but can be recalculated based on the updated event boundaries with {@link PostprocessGermlineCNVCalls}
* - Multi-sample input VCFs are assumed to have been generated by this tool and already defragmented
* - This tool only supports mammalian genomes with XX/XY sex determination.
*
**/
@DocumentedFeature
@BetaFeature
@CommandLineProgramProperties(
summary = "Gathers single-sample or multi-sample segmented gCNV VCFs, harmonizes breakpoints, and outputs a cohort VCF with genotypes.",
oneLineSummary = "Combine segmented gCNV VCFs.",
programGroup = StructuralVariantDiscoveryProgramGroup.class
)
public class JointGermlineCNVSegmentation extends MultiVariantWalkerGroupedOnStart {
private SortedSet samples;
private VariantContextWriter vcfWriter;
private SAMSequenceDictionary dictionary;
private SVClusterEngine defragmenter;
private SVClusterEngine clusterEngine;
private List callIntervals;
private String currentContig;
private SampleDB sampleDB;
private boolean isMultiSampleInput = false;
private ReferenceSequenceFile reference;
private Collection defragmentBuffer;
private Collection outputBuffer;
private final Set allosomalContigs = new LinkedHashSet<>(Arrays.asList("X","Y","chrX","chrY"));
/**
 * Immutable record of a sample's copy-number call together with the end coordinate of the
 * corresponding event. Storing only the position (not the contig) is safe because this record
 * is only consulted for variant contexts that overlap, which must share a contig.
 *
 * Replaces the previous raw {@code MutablePair} with two final primitive fields:
 * type-safe, immutable, and no boxing.
 */
class CopyNumberAndEndRecord {
    private final int copyNumber;
    private final int endPosition;

    public CopyNumberAndEndRecord(final int copyNumber, final int end) {
        this.copyNumber = copyNumber;
        this.endPosition = end;
    }

    public int getCopyNumber() {
        return copyNumber;
    }

    public int getEndPosition() {
        return endPosition;
    }
}
public static final String MIN_QUALITY_LONG_NAME = "minimum-qs-score";
public static final String MIN_SAMPLE_NUM_OVERLAP_LONG_NAME = "min-sample-set-fraction-overlap";
public static final String DEFRAGMENTATION_PADDING_LONG_NAME = "defragmentation-padding-fraction";
public static final String CLUSTERING_INTERVAL_OVERLAP_LONG_NAME = "clustering-interval-overlap";
public static final String CLUSTERING_BREAKEND_WINDOW_LONG_NAME = "clustering-breakend-window";
public static final String CLUSTERING_SIZE_SIMILARITY_LONG_NAME = "clustering-size-similarity";
public static final String MODEL_CALL_INTERVALS_LONG_NAME = "model-call-intervals";
public static final String BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME = "breakpoint-summary-strategy";
public static final String ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME = "alt-allele-summary-strategy";
public static final String FLAG_FIELD_LOGIC_LONG_NAME = "flag-field-logic";
@Argument(fullName = MIN_QUALITY_LONG_NAME, doc = "Minimum QS score to combine a variant segment", optional = true)
private int minQS = 20;
@Argument(fullName = MIN_SAMPLE_NUM_OVERLAP_LONG_NAME, doc = "Minimum fraction of common samples for two variants to cluster together", optional = true)
private double minSampleSetOverlap = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getSampleOverlap();
@Argument(fullName = DEFRAGMENTATION_PADDING_LONG_NAME, doc = "Extend events by this fraction on each side when determining overlap to merge", optional = true)
private double defragmentationPadding = CNVLinkage.DEFAULT_PADDING_FRACTION;
@Argument(fullName = CLUSTERING_INTERVAL_OVERLAP_LONG_NAME,
doc="Minimum interval reciprocal overlap for clustering", optional=true)
public double clusterIntervalOverlap = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getReciprocalOverlap();
@Argument(fullName = CLUSTERING_BREAKEND_WINDOW_LONG_NAME,
doc="Cluster events whose endpoints are within this distance of each other", optional=true)
public int clusterWindow = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getWindow();
@Argument(fullName = CLUSTERING_SIZE_SIMILARITY_LONG_NAME,
doc="Minimum size similarity for clustering", optional=true)
public double clusterSizeSimilarity = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getSizeSimilarity();
@Argument(fullName = MODEL_CALL_INTERVALS_LONG_NAME, doc = "gCNV model intervals created with the FilterIntervals tool.", optional=true)
private GATKPath modelCallIntervalList = null;
@Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME, doc = "Strategy to use for choosing a representative value for a breakpoint cluster.", optional = true)
private CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy = CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END;
@Argument(fullName = ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME, doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with different subtypes.", optional = true)
private CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy = CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE;
@Argument(fullName= StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="The combined output VCF")
private GATKPath outputFile;
@Argument(
doc = "Reference copy-number on autosomal intervals.",
fullName = PostprocessGermlineCNVCalls.AUTOSOMAL_REF_COPY_NUMBER_LONG_NAME,
minValue = 0,
optional = true
)
private int refAutosomalCopyNumber = 2;
/**
* See https://software.broadinstitute.org/gatk/documentation/article.php?id=7696 for more details on the PED
* format. Note that each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to
* tell the GATK PED parser that the corresponding fields are missing from the ped file.
*
*/
@Argument(fullName=StandardArgumentDefinitions.PEDIGREE_FILE_LONG_NAME, shortName=StandardArgumentDefinitions.PEDIGREE_FILE_SHORT_NAME, doc="Pedigree file for samples")
private GATKPath pedigreeFile = null;
/**
 * Pairwise dictionary cross-validation is disabled for this tool; validation is done
 * against the reference dictionary instead (see {@link #requiresReference()}).
 */
@Override
public boolean doDictionaryCrossValidation() {
return false;
}
//require a reference to do dictionary validation since there may be too many samples for cross-validating
/**
 * A reference is mandatory: its sequence dictionary is the single source of truth for
 * validation when the number of input VCFs makes cross-validation impractical.
 */
@Override
public boolean requiresReference() {
return true;
}
// Cannot require sample overlap when clustering across samples
private static final double CLUSTER_SAMPLE_OVERLAP_FRACTION = 0;
@Argument(fullName = SVClusterWalker.MAX_RECORDS_IN_RAM_LONG_NAME,
doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " +
"RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " +
"VCF file, and increases the amount of RAM needed.",
optional=true)
public int maxRecordsInRam = 10000;
/**
 * Initializes tool state before traversal: opens the reference reader, loads and strictly
 * validates the pedigree against the input samples, builds the defragmenter (binned if model
 * call intervals were provided) and the max-clique cluster engine, allocates the output
 * buffers, and writes the output VCF header. Also decides whether inputs are treated as
 * pre-clustered multi-sample VCFs (defragmentation skipped in that case).
 */
@Override
public void onTraversalStart() {
reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
//strict validation will verify that all samples are in the pedigree
sampleDB = SampleDB.createSampleDBFromPedigreeAndDataSources(pedigreeFile, getSamplesForVariants(), PedigreeValidationType.STRICT);
dictionary = getBestAvailableSequenceDictionary();
//dictionary will not be null because this tool requiresReference()
final GenomeLocParser parser = new GenomeLocParser(this.dictionary);
// intersect model call intervals with traversal intervals, if model intervals were given
setIntervals(parser);
final ClusteringParameters clusterArgs = ClusteringParameters.createDepthParameters(clusterIntervalOverlap, clusterSizeSimilarity, clusterWindow, CLUSTER_SAMPLE_OVERLAP_FRACTION);
// binned defragmenter respects the gCNV model's call bins; plain defragmenter otherwise
if (callIntervals == null) {
defragmenter = SVClusterEngineFactory.createCNVDefragmenter(dictionary, altAlleleSummaryStrategy, reference, defragmentationPadding, minSampleSetOverlap);
} else {
defragmenter = SVClusterEngineFactory.createBinnedCNVDefragmenter(dictionary, altAlleleSummaryStrategy, reference, defragmentationPadding, minSampleSetOverlap, callIntervals);
}
clusterEngine = SVClusterEngineFactory.createCanonical(SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE, breakpointSummaryStrategy, altAlleleSummaryStrategy,
dictionary, reference, true, clusterArgs, CanonicalSVLinkage.DEFAULT_MIXED_PARAMS, CanonicalSVLinkage.DEFAULT_PESR_PARAMS);
defragmentBuffer = new ArrayList<>();
outputBuffer = new ArrayList<>();
vcfWriter = getVCFWriter();
// anything other than exactly one sample is treated as a pre-clustered multi-sample input
if (getSamplesForVariants().size() != 1) {
logger.warn("Multi-sample VCFs found, which are assumed to be pre-clustered. Skipping defragmentation.");
isMultiSampleInput = true;
} else {
isMultiSampleInput = false;
}
}
/**
 * If model intervals are supplied, subsets them to the requested traversal intervals and
 * stores the intersection in {@link #callIntervals}; otherwise {@link #callIntervals}
 * stays null and an unbinned defragmenter is used downstream.
 *
 * @param parser needed to convert and merge intervals against the sequence dictionary
 */
private void setIntervals(final GenomeLocParser parser) {
    if (modelCallIntervalList != null) {
        // intervals the gCNV model was called on
        final List<GenomeLoc> inputCoverageIntervals = IntervalUtils.featureFileToIntervals(parser, modelCallIntervalList.getURIString());
        // intervals the user asked this tool to traverse
        final List<GenomeLoc> inputTraversalIntervals = IntervalUtils.genomeLocsFromLocatables(parser, getTraversalIntervals());
        // keep only model intervals inside the traversal set
        callIntervals = IntervalUtils.mergeListsBySetOperator(inputCoverageIntervals, inputTraversalIntervals, IntervalSetRule.INTERSECTION);
    }
}
/**
 * Creates the output VCF writer and writes its header: the input header lines (order
 * preserved) plus default tool lines, SV-specific INFO lines (SVLEN, SVTYPE), and the
 * standard AC/AF/AN lines. Sample columns are the sorted set of all input samples.
 *
 * Side effect: initializes the {@link #samples} field.
 *
 * @return an open writer with the header already written; caller owns closing it
 */
private VariantContextWriter getVCFWriter() {
    samples = getSamplesForVariants();
    final VCFHeader inputVCFHeader = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples);
    // LinkedHashSet keeps the original header-line ordering while deduplicating
    final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder());
    headerLines.addAll(getDefaultToolVCFHeaderLines());
    headerLines.add(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN));
    headerLines.add(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE));
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
    final VariantContextWriter writer = createVCFWriter(outputFile);
    final Set<String> sampleNameSet = new IndexedSampleList(samples).asSetOfSamples();
    // TreeSet sorts the sample columns deterministically
    final VCFHeader vcfHeader = new VCFHeader(headerLines, new TreeSet<>(sampleNameSet));
    writer.writeHeader(vcfHeader);
    return writer;
}
/**
 * Processes one group of driving variants sharing a start position: converts each to an
 * {@link SVCallRecord} (dropping low-quality/null calls) and feeds it to the defragmenter
 * (single-sample inputs) or directly to the cluster engine (multi-sample inputs).
 * Buffered clusters are force-flushed whenever a new contig is encountered.
 *
 * @param variantContexts VariantContexts from driving variants with matching start position;
 *                        never empty
 * @param referenceContext ReferenceContext covering the reference of the longest spanning VariantContext
 * @param readsContexts reads overlapping the variants (unused by this tool)
 */
@Override
public void apply(final List<VariantContext> variantContexts, final ReferenceContext referenceContext, final List<ReadsContext> readsContexts) {
    //variantContexts should have identical start, so choose 0th arbitrarily
    final String variantContig = variantContexts.get(0).getContig();
    // finish the previous contig before starting a new one
    if (currentContig != null && !variantContig.equals(currentContig)) {
        processClusters();
    }
    currentContig = variantContig;
    for (final VariantContext vc : variantContexts) {
        final SVCallRecord record = createDepthOnlyFromGCNVWithOriginalGenotypes(vc, minQS, allosomalContigs, refAutosomalCopyNumber, sampleDB);
        if (record != null) {
            if (!isMultiSampleInput) {
                // single-sample inputs are defragmented before clustering
                bufferDefragmenterOutput(defragmenter.addAndFlush(record));
            } else {
                // multi-sample inputs are assumed pre-defragmented; cluster directly
                bufferClusterOutput(clusterEngine.addAndFlush(record));
            }
        }
    }
}
/** Accumulates defragmenter output until the current contig is complete. */
private void bufferDefragmenterOutput(final List<SVCallRecord> records) {
    defragmentBuffer.addAll(records);
}
/**
 * Drains the defragmenter buffer, returning its contents sorted by start position.
 * The buffer is reset to a fresh empty list.
 *
 * @return an unmodifiable, position-sorted snapshot of the buffered records
 */
private List<SVCallRecord> flushDefragmenterBuffer() {
    final List<SVCallRecord> result = defragmentBuffer.stream()
            .sorted(Comparator.comparingInt(SVCallRecord::getPositionA))
            .collect(Collectors.toUnmodifiableList());
    defragmentBuffer = new ArrayList<>();
    return result;
}
/** Accumulates cluster-engine output until the current contig is complete. */
private void bufferClusterOutput(final List<SVCallRecord> records) {
    outputBuffer.addAll(records);
}
/**
 * Drains the cluster output buffer, returning its contents sorted by start position.
 * The buffer is reset to a fresh empty list.
 *
 * @return an unmodifiable, position-sorted snapshot of the buffered records
 */
private List<SVCallRecord> flushClusterBuffer() {
    final List<SVCallRecord> result = outputBuffer.stream()
            .sorted(Comparator.comparingInt(SVCallRecord::getPositionA))
            .collect(Collectors.toUnmodifiableList());
    outputBuffer = new ArrayList<>();
    return result;
}
/** Flushes any calls still buffered for the final contig before the tool exits. */
@Override
public Object onTraversalSuccess() {
processClusters();
return null;
}
/**
 * Force-flushes the defragmenter, adds the resulting calls to the clustering engine, and flushes the clustering
 * engine. Since we need to check for variant overlap and reset genotypes, only flush clustering when we hit a
 * new contig.
 */
private void processClusters() {
    bufferDefragmenterOutput(defragmenter.flush());
    //Jack and Isaac cluster first and then defragment
    // feed defragmented calls into the cluster engine in position-sorted order
    final List<SVCallRecord> clustered = new ArrayList<>();
    for (final SVCallRecord record : flushDefragmenterBuffer()) {
        clustered.addAll(clusterEngine.addAndFlush(record));
    }
    bufferClusterOutput(clustered);
    bufferClusterOutput(clusterEngine.flush());
    write(flushClusterBuffer());
}
/**
 * Converts a clustered call back into a VariantContext, stripping bookkeeping attributes
 * (cluster member IDs, algorithms) and the per-genotype expected-copy-number format field.
 *
 * @param record clustered call to convert
 * @return sanitized VariantContext ready for genotype resolution
 */
private VariantContext buildAndSanitizeRecord(final SVCallRecord record) {
    final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(record)
            .rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)
            .rmAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE);
    final List<Genotype> genotypes = new ArrayList<>(builder.getGenotypes().size());
    for (final Genotype g : builder.getGenotypes()) {
        // copy attributes minus ECN; noAttributes() first so the copy fully replaces them
        final Map<String, Object> attr = new HashMap<>(g.getExtendedAttributes());
        attr.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT);
        genotypes.add(new GenotypeBuilder(g).noAttributes().attributes(attr).make());
    }
    return builder.genotypes(genotypes).make();
}
/**
 * Sanitizes and writes the given calls. Calls are grouped into runs of mutually overlapping
 * variants (same contig, each starting before the running max end of the group) and each
 * group's genotypes are resolved so copy numbers account for earlier overlapping events.
 * Assumes {@code calls} is sorted by start position within each contig.
 *
 * @param calls position-sorted records for one flush
 */
private void write(final List<SVCallRecord> calls) {
    final List<VariantContext> sanitizedRecords = calls.stream()
            .map(this::buildAndSanitizeRecord)
            .collect(Collectors.toList());
    final Iterator<VariantContext> it = sanitizedRecords.iterator();
    ArrayList<VariantContext> overlappingVCs = new ArrayList<>(calls.size());
    if (!it.hasNext()) {
        return;
    }
    int clusterEnd = -1;
    String clusterContig = null;
    //gather groups of overlapping VCs and update the genotype copy numbers appropriately
    while (it.hasNext()) {
        final VariantContext curr = it.next();
        // still in the current overlap group: starts before the group's max end, same contig
        if ((clusterEnd == -1 || curr.getStart() < clusterEnd) && (clusterContig == null || curr.getContig().equals(clusterContig))) {
            overlappingVCs.add(curr);
            if (curr.getEnd() > clusterEnd) {
                clusterEnd = curr.getEnd();
            }
            if (clusterContig == null) {
                clusterContig = curr.getContig();
            }
        } else {
            // group finished: resolve copy numbers and emit, then start a new group with curr
            final List<VariantContext> resolvedVCs = resolveVariantContexts(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, overlappingVCs);
            resolvedVCs.forEach(vcfWriter::add);
            overlappingVCs = new ArrayList<>();
            overlappingVCs.add(curr);
            clusterEnd = curr.getEnd();
            clusterContig = curr.getContig();
        }
    }
    //write out the last set of overlapping VCs
    final List<VariantContext> resolvedVCs = resolveVariantContexts(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, overlappingVCs);
    resolvedVCs.forEach(vcfWriter::add);
}
/**
 * Correct genotype calls for overlapping variant contexts.
 * Note that we assume that a sample will not occur twice with the same copy number because it should have been defragmented.
 * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY)
 * @param refAutosomalCopyNumber reference copy number for autosomes
 * @param sampleDB data structure containing sample sex assignments
 * @param samples full set of samples to be output
 * @param overlappingVCs overlapping variant contexts with correct genotypes, but maybe not copy numbers
 * @return a list of VariantContexts with genotypes corrected for called alleles and copy number
 */
@VisibleForTesting
protected List<VariantContext> resolveVariantContexts(final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                                      final SampleDB sampleDB, final SortedSet<String> samples,
                                                      final List<VariantContext> overlappingVCs) {
    Utils.nonNull(overlappingVCs);
    final List<VariantContext> resolvedVCs = new ArrayList<>(overlappingVCs.size());
    //sampleName -> (copyNumber, endPos) -- it's safe to just use position because if the VCs overlap then they must be on the same contig
    final Map<String, CopyNumberAndEndRecord> sampleCopyNumbers = new LinkedHashMap<>(SVUtils.hashMapCapacity(overlappingVCs.size()));
    for (final VariantContext curr : overlappingVCs) {
        resolvedVCs.add(updateGenotypes(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, curr, sampleCopyNumbers));
        //update copy number table for subsequent VCs using variant genotypes from input VCs
        for (final Genotype g : curr.getGenotypes()) {
            if (g.hasAnyAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) {
                sampleCopyNumbers.put(g.getSampleName(),
                        new CopyNumberAndEndRecord(
                                VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, refAutosomalCopyNumber),
                                curr.getAttributeAsInt(VCFConstants.END_KEY, curr.getStart())));
            }
        }
    }
    return resolvedVCs;
}
/**
 * For CNVs, i.e. DEL and DUP alts only.
 * Squares off the genotype matrix over all output samples, assigns copy numbers from the
 * genotype, the overlapping-event table, or the sex-aware ploidy default, and recomputes
 * AC/AF/AN from the called alleles.
 *
 * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY)
 * @param refAutosomalCopyNumber reference copy number for autosomes
 * @param sampleDB data structure containing sample sex assignments
 * @param samples full set of samples to be output
 * @param vc VariantContext with just variant samples
 * @param sampleCopyNumbers copy numbers of still-open overlapping events, keyed by sample
 * @return new VariantContext with AC and AF
 */
@VisibleForTesting
protected static VariantContext updateGenotypes(final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                                final SampleDB sampleDB, final SortedSet<String> samples,
                                                final VariantContext vc, final Map<String, CopyNumberAndEndRecord> sampleCopyNumbers) {
    final VariantContextBuilder builder = new VariantContextBuilder(vc);
    final List<Genotype> newGenotypes = new ArrayList<>();
    final Allele vcRefAllele = vc.getReference();
    final Map<Allele, Long> alleleCountMap = new LinkedHashMap<>(2);
    // only DEL/DUP alts are supported here
    if (vc.getAlternateAlleles().stream().anyMatch(a -> !a.equals(GATKSVVCFConstants.DEL_ALLELE) && !a.equals(GATKSVVCFConstants.DUP_ALLELE))) {
        throw new IllegalArgumentException("At site " + vc.getContig() + ":" + vc.getStart() + " variant context contains alternate alleles in addition to CNV DEL and DUP alleles: " + vc.getAlternateAlleles());
    }
    alleleCountMap.put(GATKSVVCFConstants.DEL_ALLELE, 0L);
    alleleCountMap.put(GATKSVVCFConstants.DUP_ALLELE, 0L);
    int alleleNumber = 0;
    for (final String sample : samples) {
        final Genotype g = vc.getGenotype(sample); //may be null
        final GenotypeBuilder genotypeBuilder = g == null ? new GenotypeBuilder(sample) : new GenotypeBuilder(g); //use g to set alleles
        final List<Allele> alleles;
        //get proper ploidy for autosomes and allosomes (sex chromosomes)
        final int samplePloidy = getSamplePloidy(allosomalContigs, refAutosomalCopyNumber, sampleDB, sample, vc.getContig(), g);
        alleleNumber += samplePloidy;
        //"square off" the genotype matrix by filling in missing (non-variant) samples with homRef calls with reference copy number
        if (!sampleCopyNumbers.containsKey(sample) && !vc.hasGenotype(sample)) {
            genotypeBuilder.alleles(GATKVariantContextUtils.makePloidyLengthAlleleList(samplePloidy, vcRefAllele));
            genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, samplePloidy);
            newGenotypes.add(genotypeBuilder.make());
        //handle variant genotypes and reference genotypes with non-reference copy number due to overlapping events
        } else {
            //determine sample copy number from VC genotype or sampleCopyNumbers map or default to reference, i.e. samplePloidy
            final int copyNumber;
            if (sampleCopyNumbers.containsKey(sample) && sampleCopyNumbers.get(sample).getEndPosition() > vc.getStart()) {
                // an earlier overlapping event is still open at this position: inherit its CN, call hom-ref here
                copyNumber = sampleCopyNumbers.get(sample).getCopyNumber();
                alleles = GATKVariantContextUtils.makePloidyLengthAlleleList(samplePloidy, vcRefAllele);
            } else if (g != null) {
                copyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, samplePloidy);
                if (samplePloidy == g.getPloidy()) {
                    alleles = g.getAlleles();
                } else {
                    // ploidy disagreement (e.g. allosome): rebuild alleles from copy number
                    alleles = GATKSVVariantContextUtils.makeGenotypeAllelesFromCopyNumber(copyNumber, samplePloidy, vcRefAllele);
                }
            } else {
                copyNumber = samplePloidy;
                alleles = GATKSVVariantContextUtils.makeGenotypeAllelesFromCopyNumber(copyNumber, samplePloidy, vcRefAllele);
            }
            genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber);
            genotypeBuilder.alleles(alleles);
            newGenotypes.add(genotypeBuilder.make());
            //use called alleles to update AC
            //check for genotype in VC because we don't want to count overlapping events (in sampleCopyNumbers map) towards AC
            if (vc.hasGenotype(sample)) {
                if (alleles.contains(GATKSVVCFConstants.DEL_ALLELE)) {
                    final Long count = alleleCountMap.get(GATKSVVCFConstants.DEL_ALLELE);
                    alleleCountMap.put(GATKSVVCFConstants.DEL_ALLELE, count + alleles.stream().filter(Allele::isNonReference).count());
                } else if (copyNumber > samplePloidy) {
                    final Long count = alleleCountMap.get(GATKSVVCFConstants.DUP_ALLELE);
                    alleleCountMap.put(GATKSVVCFConstants.DUP_ALLELE, count + 1); //best we can do for dupes is carrier frequency
                }
            }
        }
    }
    builder.genotypes(newGenotypes);
    if (alleleNumber > 0) {
        if (vc.getAlternateAlleles().size() == 1) {
            final long AC = alleleCountMap.get(vc.getAlternateAllele(0));
            builder.attribute(VCFConstants.ALLELE_COUNT_KEY, AC)
                    .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, (double) AC / alleleNumber)
                    .attribute(VCFConstants.ALLELE_NUMBER_KEY, alleleNumber);
        } else {
            final List<Long> alleleCounts = new ArrayList<>(vc.getNAlleles());
            final List<Double> alleleFreqs = new ArrayList<>(vc.getNAlleles());
            for (final Allele a : builder.getAlleles()) {
                if (a.isReference()) {
                    continue;
                }
                alleleCounts.add(alleleCountMap.get(a));
                // AF is count / AN, consistent with the biallelic branch above
                // (previous code stored the raw count as AF)
                alleleFreqs.add(alleleCountMap.get(a) / (double) alleleNumber);
            }
            builder.attribute(VCFConstants.ALLELE_COUNT_KEY, alleleCounts)
                    .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs)
                    .attribute(VCFConstants.ALLELE_NUMBER_KEY, alleleNumber);
        }
    }
    return builder.make();
}
/**
 * Get the sample ploidy for a given contig based on the input pedigree (if available);
 * defaults to {@code refAutosomalCopyNumber} for autosomes. The genotype's ECN attribute,
 * when present, takes precedence over everything else.
 * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY)
 * @param refAutosomalCopyNumber reference copy number for autosomes
 * @param sampleDB data structure containing sample sex assignments
 * @param sampleName current sample of interest
 * @param contig current contig of interest
 * @param g may be null
 * @return expected ploidy of {@code sampleName} on {@code contig}
 */
@VisibleForTesting
protected static int getSamplePloidy(final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                     final SampleDB sampleDB, final String sampleName, final String contig, final Genotype g) {
    // explicit ECN on the genotype wins
    if (g != null && g.hasExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT)) {
        return VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
    }
    if (!allosomalContigs.contains(contig)) {
        return refAutosomalCopyNumber;
    }
    // allosome without pedigree info: fall back to the genotype's own ploidy if we have one
    if (sampleDB == null || sampleDB.getSample(sampleName) == null) {
        if (g != null) {
            return g.getPloidy();
        } else {
            throw new IllegalStateException("Sample " + sampleName + " is missing from the pedigree");
        }
    } else {
        final Sex sampleSex = sampleDB.getSample(sampleName).getSex();
        if (contig.equals("X") || contig.equals("chrX")) {
            if (sampleSex.equals(Sex.FEMALE)) {
                return 2;
            } else if (sampleSex.equals(Sex.MALE)) {
                return 1;
            } else { //UNKNOWN -- treated like male on X
                return 1;
            }
        } else if (contig.equals("Y") || contig.equals("chrY")) {
            if (sampleSex.equals(Sex.FEMALE)) {
                return 0;
            } else if (sampleSex.equals(Sex.MALE)) {
                return 1;
            } else { //UNKNOWN -- treated like male on Y
                return 1;
            }
        } else {
            throw new IllegalArgumentException("Encountered unknown allosomal contig: " + contig + ". This tool only " +
                    "supports mammalian genomes with XX/XY sex determination.");
        }
    }
}
/**
 * Builds a VariantContext for a call, restricting alt alleles to those actually called in
 * genotypes (falling back to the record's alts if none). Sites with both DEL and DUP calls
 * become multi-allelic CNVs with SVTYPE=MCNV; genotype ref alleles are rewritten to the
 * record's ref allele and the ECN format field is stripped.
 *
 * @param call clustered call to convert; non-null
 * @param reference reference reader; non-null
 * @return new VariantContext with END, SVLEN, SVTYPE and updated genotypes
 */
@VisibleForTesting
protected static VariantContext buildVariantContext(final SVCallRecord call, final ReferenceSequenceFile reference) {
    Utils.nonNull(call);
    Utils.nonNull(reference);
    // Use only the alt alleles actually called in genotypes
    final List<Allele> newAltAlleles = getCalledAllelesOrOriginalIfNone(call);
    final boolean isCNV = newAltAlleles.size() != 1;
    final StructuralVariantType svtype;
    if (isCNV) {
        svtype = StructuralVariantType.CNV;
    } else {
        final Allele altAllele = newAltAlleles.get(0);
        if (altAllele.equals(Allele.SV_SIMPLE_DEL)) {
            svtype = StructuralVariantType.DEL;
        } else if (altAllele.equals(Allele.SV_SIMPLE_DUP)) {
            svtype = StructuralVariantType.DUP;
        } else {
            throw new IllegalArgumentException("Unsupported alt allele: " + altAllele.getDisplayString());
        }
    }
    final Allele refAllele = call.getRefAllele();
    final List<Allele> outputAlleles = new ArrayList<>(newAltAlleles);
    outputAlleles.add(refAllele);
    final VariantContextBuilder builder = new VariantContextBuilder("", call.getContigA(), call.getPositionA(), call.getPositionB(),
            outputAlleles);
    builder.attribute(VCFConstants.END_KEY, call.getPositionB());
    builder.attribute(GATKSVVCFConstants.SVLEN, call.getLength());
    if (svtype == StructuralVariantType.CNV) {
        builder.attribute(VCFConstants.SVTYPE, "MCNV"); //MCNV for compatibility with svtk annotate
    } else {
        builder.attribute(VCFConstants.SVTYPE, svtype);
    }
    final List<Genotype> genotypes = new ArrayList<>(call.getGenotypes().size());
    for (final Genotype g : call.getGenotypes()) {
        final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
        genotypeBuilder.noAttributes();
        final Map<String, Object> attributes = new HashMap<>(g.getExtendedAttributes());
        //update reference alleles
        final List<Allele> newGenotypeAlleles = new ArrayList<>(g.getAlleles().size());
        for (final Allele a : g.getAlleles()) {
            if (a.isReference()) {
                newGenotypeAlleles.add(refAllele);
            } else {
                newGenotypeAlleles.add(a);
            }
        }
        genotypeBuilder.alleles(newGenotypeAlleles);
        if (g.hasAnyAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) {
            attributes.put(GATKSVVCFConstants.COPY_NUMBER_FORMAT, g.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT));
        }
        // Strip this for gCNV pipeline
        attributes.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT);
        genotypeBuilder.attributes(attributes);
        genotypes.add(genotypeBuilder.make());
    }
    builder.genotypes(genotypes);
    return builder.make();
}
/**
 * Reduces alt alleles to those actually called in genotypes, if any. If none remain
 * (e.g. all calls were dropped), simply returns the original alt allele list for the record.
 */
private static List<Allele> getCalledAllelesOrOriginalIfNone(final SVCallRecord call) {
    // Use only the alt alleles actually called in genotypes
    List<Allele> newAltAlleles = call.getGenotypes().stream()
            .flatMap(g -> g.getAlleles().stream())
            .filter(allele -> allele != null && !allele.isReference() && allele.isCalled())
            .distinct()
            .collect(Collectors.toList());
    if (newAltAlleles.isEmpty()) {
        // If calls were dropped, use original alt alleles
        newAltAlleles = call.getAltAlleles();
    }
    return newAltAlleles;
}
/** Closes the output VCF writer; null-safe in case onTraversalStart failed before creating it. */
@Override
public void closeTool(){
if (vcfWriter != null) {
vcfWriter.close();
}
}
/**
 * Attempts to create a new record from the given variant produced by
 * {@link org.broadinstitute.hellbender.tools.copynumber.GermlineCNVCaller}. If the variant contains only one
 * genotype, then null is returned if either the genotype is hom-ref, is a no-call but does not have
 * a CN value, does not meet the min QS value, or is a null call (see {@link #isNullCall(Genotype)}).
 * Genotypes that are hom-ref or are both a no-call and missing a CN value are filtered in the resulting record.
 *
 * This currently provides legacy support for older GermlineCNVCaller records that were not spec-compliant, although
 * this may be deprecated in the future.
 *
 * @param variant single-sample variant from a gCNV segments VCF
 * @param minQuality drop events with quality lower than this
 * @return a new record or null
 */
public SVCallRecord createDepthOnlyFromGCNVWithOriginalGenotypes(final VariantContext variant,
                                                                 final double minQuality,
                                                                 final Set<String> allosomalContigs,
                                                                 final int refAutosomalCopyNumber,
                                                                 final SampleDB sampleDB) {
    Utils.nonNull(variant);
    if (variant.getGenotypes().size() == 1) {
        //only cluster good variants
        final Genotype g = variant.getGenotypes().get(0);
        if (g.isHomRef() || (g.isNoCall() && !g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT))
                || VariantContextGetters.getAttributeAsInt(g, GermlineCNVSegmentVariantComposer.QS, 0) < minQuality
                || isNullCall(g)) {
            return null;
        }
    }
    final VariantContextBuilder svBuilder = new VariantContextBuilder(variant);
    svBuilder.attribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM));
    // Add expected copy number to each genotype, correcting ploidy via the pedigree
    final Allele refAllele = variant.getReference();
    final List<Genotype> genotypesWithECN = variant.getGenotypes().stream()
            .map(g -> prepareGenotype(g, refAllele, allosomalContigs, refAutosomalCopyNumber, sampleDB, variant.getContig()))
            .collect(Collectors.toList());
    svBuilder.genotypes(genotypesWithECN);
    final SVCallRecord baseRecord = SVCallRecordUtils.create(svBuilder.make(), true, dictionary);
    // drop hom-ref genotypes and CN-less no-calls from the output record
    final List<Genotype> nonRefGenotypes = baseRecord.getGenotypes().stream()
            .filter(g -> !(g.isHomRef() || (g.isNoCall() && !g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT))))
            .collect(Collectors.toList());
    return SVCallRecordUtils.copyCallWithNewGenotypes(baseRecord, GenotypesContext.copy(nonRefGenotypes));
}
/**
 * Returns a copy of the genotype with its allele list padded to the pedigree-derived ploidy
 * and the expected copy number (ECN) format field set.
 */
private static Genotype prepareGenotype(final Genotype g, final Allele refAllele,
                                        final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                        final SampleDB sampleDB, final String contig) {
    final GenotypeBuilder builder = new GenotypeBuilder(g);
    final int ploidy = getSamplePloidy(allosomalContigs, refAutosomalCopyNumber, sampleDB, g.getSampleName(), contig, g);
    correctGenotypePloidy(builder, g, ploidy, refAllele);
    addExpectedCopyNumber(builder, ploidy);
    return builder.make();
}
/**
 * "Fills" genotype alleles so that it has the correct ploidy.
 * @param builder new alleles will be set for this builder
 * @param g non-ref alleles will be carried over from this genotype
 * @param ploidy desired ploidy for the new genotype
 * @param refAllele desired ref allele for new genotype
 */
private static void correctGenotypePloidy(final GenotypeBuilder builder, final Genotype g, final int ploidy,
                                          final Allele refAllele) {
    if (g.getAlleles().size() == 1 && g.getAllele(0).isNoCall()) {
        // Special case to force interpretation of a single no-call allele as a possible null GT
        builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
    } else {
        final ArrayList<Allele> alleles = new ArrayList<>(g.getAlleles());
        Utils.validate(alleles.size() <= ploidy, "Encountered genotype with ploidy " + ploidy +
                " but " + alleles.size() + " alleles.");
        // pad with ref alleles up to the desired ploidy
        while (alleles.size() < ploidy) {
            alleles.add(refAllele);
        }
        alleles.trimToSize();
        builder.alleles(alleles);
    }
}
/** Sets the expected copy number (ECN) format field on the genotype builder. */
private static void addExpectedCopyNumber(final GenotypeBuilder g, final int ploidy) {
    // Previous code chained .make() here and discarded the built Genotype; only the
    // attribute mutation on the builder is needed.
    g.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy);
}
/**
 * @param g genotype to test
 * @return true if this is a "null" call, i.e. a no-call genotype whose copy number is 0 —
 * a call on a contig the sample effectively doesn't have (presumably e.g. chrY in an XX sample;
 * see {@link #getSamplePloidy})
 */
private static boolean isNullCall(final Genotype g) {
return g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)
&& VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0) == 0
&& g.isNoCall();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy