All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.sv;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.apache.commons.lang3.tuple.MutablePair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.MultiVariantWalkerGroupedOnStart;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.copynumber.GermlineCNVCaller;
import org.broadinstitute.hellbender.tools.copynumber.PostprocessGermlineCNVCalls;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVSegmentVariantComposer;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.samples.PedigreeValidationType;
import org.broadinstitute.hellbender.utils.samples.SampleDB;
import org.broadinstitute.hellbender.utils.samples.Sex;
import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;

import java.util.*;
import java.util.stream.Collectors;

/**
 * Merge GCNV segments VCFs
 *
 * This tool takes in segmented VCFs produced by {@link PostprocessGermlineCNVCalls}.  For single-sample input VCFs,
 * the tool defragments CNV calls by merging adjacent events within the default or specified fractional padding margins.
 * For distinct samples, the tool clusters events that meet the reciprocal overlap requirement. Output is a multi-sample VCF
 * with site allele counts and frequencies (AC, AF).
 *
 * This tool can also be run on multiple multi-sample VCFs, as in a hierarchical gather for large cohorts.  In this case,
 * defragmentation is not performed because it is assumed that single-sample defragmentation occurred during the initial
 * tool invocation that produced the multi-sample VCF.
 *
 * 

Required inputs

*
    *
  • Reference fasta
  • *
  • Segmented VCFs, single-sample or multi-sample, from {@link PostprocessGermlineCNVCalls}
  • *
  • The interval list used by {@link GermlineCNVCaller} for calling the input VCFs
  • *
  • A pedigree file with an entry giving the sex of each sample
  • *
* *

Output

* A single multi-sample VCF with genotypes and copy numbers * * *

Usage example

* *
 *   gatk JointGermlineCNVSegmentation \
 *         -R reference.fasta \
 *         -V sample1.vcf.gz \
 *         -V sample2.vcf.gz \
 *         --model-call-intervals .filtered.interval_list \
 *         --pedigree XandYassignments.ped \
 *         -O clustered.vcf.gz
 * 
* *

Caveats

*

    *
  • All input samples are required to be present in the pedigree file
  • *
  • Quality scores are not output, but can be recalculated based on the updated event boundaries with {@link PostprocessGermlineCNVCalls}
  • *
  • Multi-sample input VCFs are assumed to have been generated by this tool and already defragmented
  • *
  • This tool only supports mammalian genomes with XX/XY sex determination.
  • *

**/ @DocumentedFeature @BetaFeature @CommandLineProgramProperties( summary = "Gathers single-sample or multi-sample segmented gCNV VCFs, harmonizes breakpoints, and outputs a cohort VCF with genotypes.", oneLineSummary = "Combine segmented gCNV VCFs.", programGroup = StructuralVariantDiscoveryProgramGroup.class ) public class JointGermlineCNVSegmentation extends MultiVariantWalkerGroupedOnStart { private SortedSet samples; private VariantContextWriter vcfWriter; private SAMSequenceDictionary dictionary; private SVClusterEngine defragmenter; private SVClusterEngine clusterEngine; private List callIntervals; private String currentContig; private SampleDB sampleDB; private boolean isMultiSampleInput = false; private ReferenceSequenceFile reference; private Collection defragmentBuffer; private Collection outputBuffer; private final Set allosomalContigs = new LinkedHashSet<>(Arrays.asList("X","Y","chrX","chrY")); class CopyNumberAndEndRecord { private MutablePair record; public CopyNumberAndEndRecord(final int copyNumber, final int end) { record = new MutablePair<>(copyNumber, end); } public int getCopyNumber() { return record.getLeft(); } public int getEndPosition() { return record.getRight(); } } public static final String MIN_QUALITY_LONG_NAME = "minimum-qs-score"; public static final String MIN_SAMPLE_NUM_OVERLAP_LONG_NAME = "min-sample-set-fraction-overlap"; public static final String DEFRAGMENTATION_PADDING_LONG_NAME = "defragmentation-padding-fraction"; public static final String CLUSTERING_INTERVAL_OVERLAP_LONG_NAME = "clustering-interval-overlap"; public static final String CLUSTERING_BREAKEND_WINDOW_LONG_NAME = "clustering-breakend-window"; public static final String CLUSTERING_SIZE_SIMILARITY_LONG_NAME = "clustering-size-similarity"; public static final String MODEL_CALL_INTERVALS_LONG_NAME = "model-call-intervals"; public static final String BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME = "breakpoint-summary-strategy"; public static final String 
ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME = "alt-allele-summary-strategy"; public static final String FLAG_FIELD_LOGIC_LONG_NAME = "flag-field-logic"; @Argument(fullName = MIN_QUALITY_LONG_NAME, doc = "Minimum QS score to combine a variant segment", optional = true) private int minQS = 20; @Argument(fullName = MIN_SAMPLE_NUM_OVERLAP_LONG_NAME, doc = "Minimum fraction of common samples for two variants to cluster together", optional = true) private double minSampleSetOverlap = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getSampleOverlap(); @Argument(fullName = DEFRAGMENTATION_PADDING_LONG_NAME, doc = "Extend events by this fraction on each side when determining overlap to merge", optional = true) private double defragmentationPadding = CNVLinkage.DEFAULT_PADDING_FRACTION; @Argument(fullName = CLUSTERING_INTERVAL_OVERLAP_LONG_NAME, doc="Minimum interval reciprocal overlap for clustering", optional=true) public double clusterIntervalOverlap = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getReciprocalOverlap(); @Argument(fullName = CLUSTERING_BREAKEND_WINDOW_LONG_NAME, doc="Cluster events whose endpoints are within this distance of each other", optional=true) public int clusterWindow = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getWindow(); @Argument(fullName = CLUSTERING_SIZE_SIMILARITY_LONG_NAME, doc="Minimum size similarity for clustering", optional=true) public double clusterSizeSimilarity = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getSizeSimilarity(); @Argument(fullName = MODEL_CALL_INTERVALS_LONG_NAME, doc = "gCNV model intervals created with the FilterIntervals tool.", optional=true) private GATKPath modelCallIntervalList = null; @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME, doc = "Strategy to use for choosing a representative value for a breakpoint cluster.", optional = true) private CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy = CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END; 
// NOTE(review): this file was recovered from an HTML scrape — the class body is collapsed onto very long
// physical lines, and some line breaks below fall INSIDE string literals and javadoc comments. The code is
// left byte-identical here; only standalone comments are inserted at token-safe boundaries.
@Argument(fullName = ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME, doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with different subtypes.", optional = true) private CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy = CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE; @Argument(fullName= StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="The combined output VCF") private GATKPath outputFile; @Argument( doc = "Reference copy-number on autosomal intervals.", fullName = PostprocessGermlineCNVCalls.AUTOSOMAL_REF_COPY_NUMBER_LONG_NAME, minValue = 0, optional = true ) private int refAutosomalCopyNumber = 2; /** * See https://software.broadinstitute.org/gatk/documentation/article.php?id=7696 for more details on the PED * format. Note that each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to * tell the GATK PED parser that the corresponding fields are missing from the ped file. * */ @Argument(fullName=StandardArgumentDefinitions.PEDIGREE_FILE_LONG_NAME, shortName=StandardArgumentDefinitions.PEDIGREE_FILE_SHORT_NAME, doc="Pedigree file for samples") private GATKPath pedigreeFile = null; @Override public boolean doDictionaryCrossValidation() { return false; } //require a reference to do dictionary validation since there may be too many samples for cross-validating @Override public boolean requiresReference() { return true; } // Cannot require sample overlap when clustering across samples private static final double CLUSTER_SAMPLE_OVERLAP_FRACTION = 0; @Argument(fullName = SVClusterWalker.MAX_RECORDS_IN_RAM_LONG_NAME, doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " + "RAM before spilling to disk. 
Increasing this number reduces the number of file handles needed to sort a " + "VCF file, and increases the amount of RAM needed.", optional=true) public int maxRecordsInRam = 10000; @Override public void onTraversalStart() { reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier()); //strict validation will verify that all samples are in the pedigree sampleDB = SampleDB.createSampleDBFromPedigreeAndDataSources(pedigreeFile, getSamplesForVariants(), PedigreeValidationType.STRICT); dictionary = getBestAvailableSequenceDictionary(); //dictionary will not be null because this tool requiresReference() final GenomeLocParser parser = new GenomeLocParser(this.dictionary); setIntervals(parser); final ClusteringParameters clusterArgs = ClusteringParameters.createDepthParameters(clusterIntervalOverlap, clusterSizeSimilarity, clusterWindow, CLUSTER_SAMPLE_OVERLAP_FRACTION); if (callIntervals == null) { defragmenter = SVClusterEngineFactory.createCNVDefragmenter(dictionary, altAlleleSummaryStrategy, reference, defragmentationPadding, minSampleSetOverlap); } else { defragmenter = SVClusterEngineFactory.createBinnedCNVDefragmenter(dictionary, altAlleleSummaryStrategy, reference, defragmentationPadding, minSampleSetOverlap, callIntervals); } clusterEngine = SVClusterEngineFactory.createCanonical(SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE, breakpointSummaryStrategy, altAlleleSummaryStrategy, dictionary, reference, true, clusterArgs, CanonicalSVLinkage.DEFAULT_MIXED_PARAMS, CanonicalSVLinkage.DEFAULT_PESR_PARAMS); defragmentBuffer = new ArrayList<>(); outputBuffer = new ArrayList<>(); vcfWriter = getVCFWriter(); if (getSamplesForVariants().size() != 1) { logger.warn("Multi-sample VCFs found, which are assumed to be pre-clustered. 
Skipping defragmentation."); isMultiSampleInput = true; } else { isMultiSampleInput = false; } } /** * If model intervals are supplied, subset to the requested traversal intervals * @param parser needed to merge intervals if necessary */ private void setIntervals(final GenomeLocParser parser) { if (modelCallIntervalList != null) { final List inputCoverageIntervals = IntervalUtils.featureFileToIntervals(parser, modelCallIntervalList.getURIString()); final List inputTraversalIntervals = IntervalUtils.genomeLocsFromLocatables(parser,getTraversalIntervals()); callIntervals = IntervalUtils.mergeListsBySetOperator(inputCoverageIntervals, inputTraversalIntervals, IntervalSetRule.INTERSECTION); } } private VariantContextWriter getVCFWriter() { samples = getSamplesForVariants(); final VCFHeader inputVCFHeader = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples); final Set headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder()); headerLines.addAll(getDefaultToolVCFHeaderLines()); headerLines.add(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN)); headerLines.add(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE)); headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY)); headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY)); headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY)); VariantContextWriter writer = createVCFWriter(outputFile); final Set sampleNameSet = new IndexedSampleList(samples).asSetOfSamples(); final VCFHeader vcfHeader = new VCFHeader(headerLines, new TreeSet<>(sampleNameSet)); writer.writeHeader(vcfHeader); return writer; } /** * @param variantContexts VariantContexts from driving variants with matching start position * NOTE: This will never be empty * @param referenceContext ReferenceContext object covering the reference of the longest spanning VariantContext * @param readsContexts */ @Override public void 
/* Per-start-position callback: on a contig change, force-flushes the current clusters; then each variant is
   converted to an SVCallRecord and routed to the defragmenter (single-sample input) or straight to the
   cluster engine (multi-sample input). */
apply(final List variantContexts, final ReferenceContext referenceContext, final List readsContexts) { //variantContexts should have identical start, so choose 0th arbitrarily final String variantContig = variantContexts.get(0).getContig(); if (currentContig != null && !variantContig.equals(currentContig)) { processClusters(); } currentContig = variantContig; for (final VariantContext vc : variantContexts) { final SVCallRecord record = createDepthOnlyFromGCNVWithOriginalGenotypes(vc, minQS, allosomalContigs, refAutosomalCopyNumber, sampleDB); if (record != null) { if (!isMultiSampleInput) { bufferDefragmenterOutput(defragmenter.addAndFlush(record)); } else { bufferClusterOutput(clusterEngine.addAndFlush(record)); } } } } private void bufferDefragmenterOutput(final List records) { defragmentBuffer.addAll(records); } private List flushDefragmenterBuffer() { final List result = defragmentBuffer.stream() .sorted(Comparator.comparingInt(SVCallRecord::getPositionA)) .collect(Collectors.toUnmodifiableList()); defragmentBuffer = new ArrayList<>(); return result; } private void bufferClusterOutput(final List records) { outputBuffer.addAll(records); } private List flushClusterBuffer() { final List result = outputBuffer.stream() .sorted(Comparator.comparingInt(SVCallRecord::getPositionA)) .collect(Collectors.toUnmodifiableList()); outputBuffer = new ArrayList<>(); return result; } @Override public Object onTraversalSuccess() { processClusters(); return null; } /** * Force-flushes the defragmenter, adds the resulting calls to the clustering engine, and flushes the clustering * engine. Since we need to check for variant overlap and reset genotypes, only flush clustering when we hit a * new contig. 
*/ private void processClusters() { bufferDefragmenterOutput(defragmenter.flush()); //Jack and Isaac cluster first and then defragment bufferClusterOutput( flushDefragmenterBuffer().stream() .map(clusterEngine::addAndFlush) .flatMap(List::stream) .collect(Collectors.toList()) ); bufferClusterOutput(clusterEngine.flush()); write(flushClusterBuffer()); } private VariantContext buildAndSanitizeRecord(final SVCallRecord record) { final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(record) .rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY) .rmAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE); final List genotypes = new ArrayList<>(builder.getGenotypes().size()); for (final Genotype g : builder.getGenotypes()) { final Map attr = new HashMap<>(g.getExtendedAttributes()); attr.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); genotypes.add(new GenotypeBuilder(g).noAttributes().attributes(attr).make()); } return builder.genotypes(genotypes).make(); } private void write(final List calls) { final List sanitizedRecords = calls.stream() .map(this::buildAndSanitizeRecord) .collect(Collectors.toList()); final Iterator it = sanitizedRecords.iterator(); ArrayList overlappingVCs = new ArrayList<>(calls.size()); if (!it.hasNext()) { return; } int clusterEnd = -1; String clusterContig = null; //gather groups of overlapping VCs and update the genotype copy numbers appropriately while (it.hasNext()) { final VariantContext curr = it.next(); if ((clusterEnd == -1 || curr.getStart() < clusterEnd) && (clusterContig == null || curr.getContig().equals(clusterContig))) { overlappingVCs.add(curr); if (curr.getEnd() > clusterEnd) { clusterEnd = curr.getEnd(); } if (clusterContig == null) { clusterContig = curr.getContig(); } } else { final List resolvedVCs = resolveVariantContexts(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, overlappingVCs); resolvedVCs.forEach(vcfWriter::add); overlappingVCs = new ArrayList<>(); overlappingVCs.add(curr); 
// current variant does not overlap the open cluster: start a new cluster anchored at curr
clusterEnd = curr.getEnd(); clusterContig = curr.getContig(); } } //write out the last set of overlapping VCs final List resolvedVCs = resolveVariantContexts(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, overlappingVCs); resolvedVCs.forEach(vcfWriter::add); } /** * Correct genotype calls for overlapping variant contexts * Note that we assume that a sample will not occur twice with the same copy number because it should have been defragmented * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY) * @param refAutosomalCopyNumber reference copy number for autosomes * @param sampleDB data structure containing sample sex assignments * @param samples full set of samples to be output * @param overlappingVCs overlapping variant contexts with correct genotypes, but maybe not copy numbers * @return a list of VariantContexts with genotypes corrected for called alleles and copy number */ @VisibleForTesting protected List resolveVariantContexts(final Set allosomalContigs, final int refAutosomalCopyNumber, final SampleDB sampleDB, final SortedSet samples, final List overlappingVCs) { Utils.nonNull(overlappingVCs); final List resolvedVCs = new ArrayList<>(overlappingVCs.size()); final Iterator it = overlappingVCs.iterator(); //sampleName, copyNumber, endPos -- it's safe to just use position because if the VCs overlap then they must be on the same contig final Map sampleCopyNumbers = new LinkedHashMap<>(SVUtils.hashMapCapacity(overlappingVCs.size())); while (it.hasNext()) { final VariantContext curr = it.next(); resolvedVCs.add(updateGenotypes(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, curr, sampleCopyNumbers)); //update copy number table for subsequent VCs using variant genotypes from input VCs for (final Genotype g : curr.getGenotypes()) { if (g.hasAnyAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { sampleCopyNumbers.put(g.getSampleName(), new CopyNumberAndEndRecord( VariantContextGetters.getAttributeAsInt(g, 
/* record this genotype's CN together with the variant's END so that later VCs overlapping this sample's
   event can inherit the copy number (see sampleCopyNumbers lookups in updateGenotypes) */
GATKSVVCFConstants.COPY_NUMBER_FORMAT, refAutosomalCopyNumber), curr.getAttributeAsInt(VCFConstants.END_KEY, curr.getStart()))); } } } return resolvedVCs; } /** * For CNVs, i.e. DEL and DUP alts only * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY) * @param refAutosomalCopyNumber reference copy number for autosomes * @param sampleDB data structure containing sample sex assignments * @param samples full set of samples to be output * @param vc VariantContext with just variant samples * @param sampleCopyNumbers may be modified to remove terminated variants * @return new VariantContext with AC and AF */ @VisibleForTesting protected static VariantContext updateGenotypes(final Set allosomalContigs, final int refAutosomalCopyNumber, final SampleDB sampleDB, final SortedSet samples, final VariantContext vc, final Map sampleCopyNumbers) { final VariantContextBuilder builder = new VariantContextBuilder(vc); final List newGenotypes = new ArrayList<>(); final Allele vcRefAllele = vc.getReference(); final Map alleleCountMap = new LinkedHashMap<>(2); if (vc.getAlternateAlleles().stream().filter(a -> !a.equals(GATKSVVCFConstants.DEL_ALLELE)).filter(a -> !a.equals(GATKSVVCFConstants.DUP_ALLELE)).count() > 0) { throw new IllegalArgumentException("At site " + vc.getContig() + ":" + vc.getStart() + " variant context contains alternate alleles in addition to CNV and alleles: " + vc.getAlternateAlleles()); } alleleCountMap.put(GATKSVVCFConstants.DEL_ALLELE, 0L); alleleCountMap.put(GATKSVVCFConstants.DUP_ALLELE, 0L); int alleleNumber = 0; for (final String sample : samples) { final Genotype g = vc.getGenotype(sample); //may be null final GenotypeBuilder genotypeBuilder = g == null? 
/* a sample absent from this VC gets a fresh GenotypeBuilder so every cohort sample appears in the output
   ("squaring off" the genotype matrix, per the comment below) */
new GenotypeBuilder(sample) : new GenotypeBuilder(g); //use g to set alleles final List alleles; //get proper ploidy for autosomes and allosomes (sex chromosomes) final int samplePloidy = getSamplePloidy(allosomalContigs, refAutosomalCopyNumber, sampleDB, sample, vc.getContig(), g); alleleNumber += samplePloidy; //"square off" the genotype matrix by filling in missing (non-variant) samples with homRef calls with reference copy number if (!sampleCopyNumbers.containsKey(sample) && !vc.hasGenotype(sample)) { genotypeBuilder.alleles(GATKVariantContextUtils.makePloidyLengthAlleleList(samplePloidy, vcRefAllele)); genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, samplePloidy); newGenotypes.add(genotypeBuilder.make()); //handle variant genotypes and reference genotypes with non-reference copy number due to overlapping events } else { //determine sample copy number from VC genotype or sampleCopyNumbers map or default to reference, i.e. samplePloidy final int copyNumber; if (sampleCopyNumbers.containsKey(sample) && sampleCopyNumbers.get(sample).getEndPosition() > vc.getStart()) { copyNumber = sampleCopyNumbers.get(sample).getCopyNumber(); alleles = GATKVariantContextUtils.makePloidyLengthAlleleList(samplePloidy, vcRefAllele); } else if (g != null) { copyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, samplePloidy); if (samplePloidy == g.getPloidy()) { alleles = g.getAlleles(); } else { alleles = GATKSVVariantContextUtils.makeGenotypeAllelesFromCopyNumber(copyNumber, samplePloidy, vcRefAllele); } } else { copyNumber = samplePloidy; alleles = GATKSVVariantContextUtils.makeGenotypeAllelesFromCopyNumber(copyNumber, samplePloidy, vcRefAllele); } genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber); genotypeBuilder.alleles(alleles); newGenotypes.add(genotypeBuilder.make()); //use called alleles to update AC //check for genotype in VC because we don't want to count overlapping events (in 
sampleCopyNumbers map) towards AC if (vc.hasGenotype(sample)) { if (alleles.contains(GATKSVVCFConstants.DEL_ALLELE)) { final Long count = alleleCountMap.get(GATKSVVCFConstants.DEL_ALLELE); alleleCountMap.put(GATKSVVCFConstants.DEL_ALLELE, count + alleles.stream().filter(Allele::isNonReference).count()); } else if (copyNumber > samplePloidy) { final Long count = alleleCountMap.get(GATKSVVCFConstants.DUP_ALLELE); alleleCountMap.put(GATKSVVCFConstants.DUP_ALLELE, count + 1); //best we can do for dupes is carrier frequency } } } } builder.genotypes(newGenotypes); if (alleleNumber > 0) { if (vc.getAlternateAlleles().size() == 1) { final long AC = alleleCountMap.get(vc.getAlternateAllele(0)); builder.attribute(VCFConstants.ALLELE_COUNT_KEY, AC) .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, (double)AC / alleleNumber) .attribute(VCFConstants.ALLELE_NUMBER_KEY, alleleNumber); } else { final List alleleCounts = new ArrayList<>(vc.getNAlleles()); final List alleleFreqs = new ArrayList<>(vc.getNAlleles()); for (final Allele a : builder.getAlleles()) { if (a.isReference()) { continue; } alleleCounts.add(alleleCountMap.get(a)); alleleFreqs.add(Double.valueOf(alleleCountMap.get(a))); } builder.attribute(VCFConstants.ALLELE_COUNT_KEY, alleleCounts) .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs) .attribute(VCFConstants.ALLELE_NUMBER_KEY, alleleNumber); } } return builder.make(); } /** * Get the sample ploidy for a given contig based on the input pedigree (if available) * defaults to {@param refAutosomalCopyNumber} for autosomes * @param allosomalContigs names of allosomal contigs (e.g. 
X, Y, chrX, chrY) * @param refAutosomalCopyNumber reference copy number for autosomes * @param sampleDB data structure containing sample sex assignments * @param sampleName current sample of interest * @param contig current contig of interest * @param g may be null * @return */ @VisibleForTesting protected static int getSamplePloidy(final Set allosomalContigs, final int refAutosomalCopyNumber, final SampleDB sampleDB, final String sampleName, final String contig, final Genotype g) { if (g != null && g.hasExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT)) { return VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0); } if (!allosomalContigs.contains(contig)) { return refAutosomalCopyNumber; } if (sampleDB == null || sampleDB.getSample(sampleName) == null) { if (g != null) { return g.getPloidy(); } else { throw new IllegalStateException("Sample " + sampleName + " is missing from the pedigree"); } } else { final Sex sampleSex = sampleDB.getSample(sampleName).getSex(); if (contig.equals("X") || contig.equals("chrX")) { if (sampleSex.equals(Sex.FEMALE)) { return 2; } else if (sampleSex.equals(Sex.MALE)) { return 1; } else { //UNKNOWN return 1; } } else if (contig.equals("Y") || contig.equals("chrY")) { if (sampleSex.equals(Sex.FEMALE)) { return 0; } else if (sampleSex.equals(Sex.MALE)) { return 1; } else { //UNKNOWN return 1; } } else { throw new IllegalArgumentException("Encountered unknown allosomal contig: " + contig + ". 
This tool only " + "supports mammalian genomes with XX/XY sex determination."); } } } @VisibleForTesting protected static VariantContext buildVariantContext(final SVCallRecord call, final ReferenceSequenceFile reference) { Utils.nonNull(call); Utils.nonNull(reference); // Use only the alt alleles actually called in genotypes final List newAltAlleles = getCalledAllelesOrOriginalIfNone(call); final boolean isCNV = newAltAlleles.size() != 1; final StructuralVariantType svtype; if (isCNV) { svtype = StructuralVariantType.CNV; } else { final Allele altAllele = newAltAlleles.get(0); if (altAllele.equals(Allele.SV_SIMPLE_DEL)) { svtype = StructuralVariantType.DEL; } else if (altAllele.equals(Allele.SV_SIMPLE_DUP)) { svtype = StructuralVariantType.DUP; } else { throw new IllegalArgumentException("Unsupported alt allele: " + altAllele.getDisplayString()); } } final Allele refAllele = call.getRefAllele(); final List outputAlleles = new ArrayList<>(newAltAlleles); outputAlleles.add(refAllele); final VariantContextBuilder builder = new VariantContextBuilder("", call.getContigA(), call.getPositionA(), call.getPositionB(), outputAlleles); builder.attribute(VCFConstants.END_KEY, call.getPositionB()); builder.attribute(GATKSVVCFConstants.SVLEN, call.getLength()); if (svtype == StructuralVariantType.CNV) { builder.attribute(VCFConstants.SVTYPE, "MCNV"); //MCNV for compatibility with svtk annotate } else { builder.attribute(VCFConstants.SVTYPE, svtype); } final List genotypes = new ArrayList<>(call.getGenotypes().size()); for (final Genotype g : call.getGenotypes()) { final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g); genotypeBuilder.noAttributes(); final Map attributes = new HashMap<>(g.getExtendedAttributes()); //update reference alleles final List newGenotypeAlleles = new ArrayList<>(g.getAlleles().size()); for (final Allele a : g.getAlleles()) { if (a.isReference()) { newGenotypeAlleles.add(refAllele); } else { newGenotypeAlleles.add(a); } } 
// carry CN over explicitly and strip ECN (expected copy number) from output genotypes, per the gCNV pipeline
genotypeBuilder.alleles(newGenotypeAlleles); if (g.hasAnyAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { attributes.put(GATKSVVCFConstants.COPY_NUMBER_FORMAT, g.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)); } // Strip this for gCNV pipeline attributes.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); genotypeBuilder.attributes(attributes); genotypes.add(genotypeBuilder.make()); } builder.genotypes(genotypes); return builder.make(); } /** * Reduces alt alleles in genotypes, if any. If none, simply return the original alt allele list for the record. */ private static List getCalledAllelesOrOriginalIfNone(final SVCallRecord call) { // Use only the alt alleles actually called in genotypes List newAltAlleles = call.getGenotypes().stream() .flatMap(g -> g.getAlleles().stream()) .filter(allele -> allele != null && !allele.isReference() && allele.isCalled()) .distinct() .collect(Collectors.toList()); if (newAltAlleles.isEmpty()) { // If calls were dropped, use original alt alleles newAltAlleles = call.getAltAlleles(); } return newAltAlleles; } @Override public void closeTool(){ if (vcfWriter != null) { vcfWriter.close(); } } /** * Attempts to create a new record from the given variant produced by * {@link org.broadinstitute.hellbender.tools.copynumber.GermlineCNVCaller}. If the variant contains only one * genotype, then null is returned if either the genotype is hom-ref, is a no-call but does not have * a CN value, does not meet the min QS value, or is a null call (see {@link #isNullCall(Genotype)}. * Genotypes that are hom-ref or are both a no-call and missing a CN value are filtered in the resulting record. * * This currently provides legacy support for older GermlineCNVCaller records that were not spec-compliant, although * this may be deprecated in the future. 
* * @param variant single-sample variant from a gCNV segments VCF * @param minQuality drop events with quality lower than this * @return a new record or null */ public SVCallRecord createDepthOnlyFromGCNVWithOriginalGenotypes(final VariantContext variant, final double minQuality, final Set allosomalContigs, final int refAutosomalCopyNumber, final SampleDB sampleDB) { Utils.nonNull(variant); if (variant.getGenotypes().size() == 1) { //only cluster good variants final Genotype g = variant.getGenotypes().get(0); if (g.isHomRef() || (g.isNoCall() && !g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) || VariantContextGetters.getAttributeAsInt(g, GermlineCNVSegmentVariantComposer.QS, 0) < minQuality || isNullCall(g)) { return null; } } final VariantContextBuilder svBuilder = new VariantContextBuilder(variant); svBuilder.attribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)); // Add expected copy number, assume diploid final Allele refAllele = variant.getReference(); final List genotypesWithECN = variant.getGenotypes().stream() .map(g -> prepareGenotype(g, refAllele, allosomalContigs, refAutosomalCopyNumber, sampleDB, variant.getContig())) .collect(Collectors.toList()); svBuilder.genotypes(genotypesWithECN); final SVCallRecord baseRecord = SVCallRecordUtils.create(svBuilder.make(), true, dictionary); final List nonRefGenotypes = baseRecord.getGenotypes().stream() .filter(g -> !(g.isHomRef() || (g.isNoCall() && !g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)))) .collect(Collectors.toList()); return SVCallRecordUtils.copyCallWithNewGenotypes(baseRecord, GenotypesContext.copy(nonRefGenotypes)); } private static Genotype prepareGenotype(final Genotype g, final Allele refAllele, final Set allosomalContigs, final int refAutosomalCopyNumber, final SampleDB sampleDB, final String contig) { final GenotypeBuilder builder = new GenotypeBuilder(g); final int ploidy = 
/* ploidy resolved from ECN attribute, pedigree sex, or genotype ploidy (see getSamplePloidy); used to fill
   the allele list to full length and to set the expected-copy-number attribute */
getSamplePloidy(allosomalContigs, refAutosomalCopyNumber, sampleDB, g.getSampleName(), contig, g); correctGenotypePloidy(builder, g, ploidy, refAllele); addExpectedCopyNumber(builder, ploidy); return builder.make(); } /** * "Fills" genotype alleles so that it has the correct ploidy * @param builder new alleles will be set for this builder * @param g non-ref alleles will be carried over from this genotype * @param ploidy desired ploidy for the new genotype * @param refAllele desired ref allele for new genotype */ private static void correctGenotypePloidy(final GenotypeBuilder builder, final Genotype g, final int ploidy, final Allele refAllele) { if (g.getAlleles().size() == 1 && g.getAllele(0).isNoCall()) { // Special case to force interpretation of a single no-call allele as a possible null GT builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); } else { final ArrayList alleles = new ArrayList<>(g.getAlleles()); Utils.validate(alleles.size() <= ploidy, "Encountered genotype with ploidy " + ploidy + " but " + alleles.size() + " alleles."); while (alleles.size() < ploidy) { alleles.add(refAllele); } alleles.trimToSize(); builder.alleles(alleles); } } private static void addExpectedCopyNumber(final GenotypeBuilder g, final int ploidy) { g.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy).make(); } /** * @param g * @return true if this is a call on a missing contig */ private static boolean isNullCall(final Genotype g) { return g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT) && VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0) == 0 && g.isNoCall(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy