org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.sv;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.apache.commons.lang3.tuple.MutablePair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.MultiVariantWalkerGroupedOnStart;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.copynumber.GermlineCNVCaller;
import org.broadinstitute.hellbender.tools.copynumber.PostprocessGermlineCNVCalls;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVSegmentVariantComposer;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.samples.PedigreeValidationType;
import org.broadinstitute.hellbender.utils.samples.SampleDB;
import org.broadinstitute.hellbender.utils.samples.Sex;
import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
import java.util.*;
import java.util.stream.Collectors;
/**
* Merge GCNV segments VCFs
*
* This tool takes in segmented VCFs produced by {@link PostprocessGermlineCNVCalls}. For single-sample input VCFs,
* the tool defragments CNV calls by merging adjacent events within the default or specified fractional padding margins.
* For distinct samples, the tool clusters events that meet the reciprocal overlap requirement. Output is a multi-sample VCF
* with site allele counts and frequencies (AC, AF).
*
* This tool can also be run on multiple multi-sample VCFs, as in a hierarchical gather for large cohorts. In this case,
* defragmentation is not performed because it is assumed that single-sample defragmentation occurred during the initial
* tool invocation that produced the multi-sample VCF.
*
* Required inputs
*
* - Reference fasta
* - Segmented VCFs, single-sample or multi-sample, from {@link PostprocessGermlineCNVCalls}
* - The interval list used by {@link GermlineCNVCaller} for calling the input VCFs
* - A pedigree file with an entry giving the sex of each sample
*
*
* Output
* A single multi-sample VCF with genotypes and copy numbers
*
*
* Usage example
*
*
* gatk JointGermlineCNVSegmentation \
* -R reference.fasta
* -V sample1.vcf.gz
* -V sample2.vcf.gz
* --model-call-intervals .filtered.interval_list
* --pedigree XandYassignments.ped
* -O clustered.vcf.gz
*
*
* Caveats
*
* - All input samples are required to be present in the pedigree file
* - Quality scores are not output, but can be recalculated based on the updated event boundaries with {@link PostprocessGermlineCNVCalls}
* - Multi-sample input VCFs are assumed to have been generated by this tool and already defragmented
* - This tool only supports mammalian genomes with XX/XY sex determination.
*
**/
@DocumentedFeature
@BetaFeature
@CommandLineProgramProperties(
summary = "Gathers single-sample or multi-sample segmented gCNV VCFs, harmonizes breakpoints, and outputs a cohort VCF with genotypes.",
oneLineSummary = "Combine segmented gCNV VCFs.",
programGroup = StructuralVariantDiscoveryProgramGroup.class
)
public class JointGermlineCNVSegmentation extends MultiVariantWalkerGroupedOnStart {
private SortedSet samples;
private VariantContextWriter vcfWriter;
private SAMSequenceDictionary dictionary;
private SVClusterEngine defragmenter;
private SVClusterEngine clusterEngine;
private List callIntervals;
private String currentContig;
private SampleDB sampleDB;
private boolean isMultiSampleInput = false;
private ReferenceSequenceFile reference;
private Collection defragmentBuffer;
private Collection outputBuffer;
private final Set allosomalContigs = new LinkedHashSet<>(Arrays.asList("X","Y","chrX","chrY"));
/**
 * Immutable record of a sample's copy-number call together with the end coordinate of the
 * corresponding event. Storing only the position (not the contig) is safe because this record
 * is only consulted for variant contexts that overlap, which must share a contig.
 *
 * Replaces the previous raw {@code MutablePair} with two final primitive fields:
 * type-safe, immutable, and no boxing.
 */
class CopyNumberAndEndRecord {
    private final int copyNumber;
    private final int endPosition;

    public CopyNumberAndEndRecord(final int copyNumber, final int end) {
        this.copyNumber = copyNumber;
        this.endPosition = end;
    }

    public int getCopyNumber() {
        return copyNumber;
    }

    public int getEndPosition() {
        return endPosition;
    }
}
public static final String MIN_QUALITY_LONG_NAME = "minimum-qs-score";
public static final String MIN_SAMPLE_NUM_OVERLAP_LONG_NAME = "min-sample-set-fraction-overlap";
public static final String DEFRAGMENTATION_PADDING_LONG_NAME = "defragmentation-padding-fraction";
public static final String CLUSTERING_INTERVAL_OVERLAP_LONG_NAME = "clustering-interval-overlap";
public static final String CLUSTERING_BREAKEND_WINDOW_LONG_NAME = "clustering-breakend-window";
public static final String CLUSTERING_SIZE_SIMILARITY_LONG_NAME = "clustering-size-similarity";
public static final String MODEL_CALL_INTERVALS_LONG_NAME = "model-call-intervals";
public static final String BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME = "breakpoint-summary-strategy";
public static final String ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME = "alt-allele-summary-strategy";
public static final String FLAG_FIELD_LOGIC_LONG_NAME = "flag-field-logic";
@Argument(fullName = MIN_QUALITY_LONG_NAME, doc = "Minimum QS score to combine a variant segment", optional = true)
private int minQS = 20;
@Argument(fullName = MIN_SAMPLE_NUM_OVERLAP_LONG_NAME, doc = "Minimum fraction of common samples for two variants to cluster together", optional = true)
private double minSampleSetOverlap = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getSampleOverlap();
@Argument(fullName = DEFRAGMENTATION_PADDING_LONG_NAME, doc = "Extend events by this fraction on each side when determining overlap to merge", optional = true)
private double defragmentationPadding = CNVLinkage.DEFAULT_PADDING_FRACTION;
@Argument(fullName = CLUSTERING_INTERVAL_OVERLAP_LONG_NAME,
doc="Minimum interval reciprocal overlap for clustering", optional=true)
public double clusterIntervalOverlap = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getReciprocalOverlap();
@Argument(fullName = CLUSTERING_BREAKEND_WINDOW_LONG_NAME,
doc="Cluster events whose endpoints are within this distance of each other", optional=true)
public int clusterWindow = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getWindow();
@Argument(fullName = CLUSTERING_SIZE_SIMILARITY_LONG_NAME,
doc="Minimum size similarity for clustering", optional=true)
public double clusterSizeSimilarity = CanonicalSVLinkage.DEFAULT_DEPTH_ONLY_PARAMS.getSizeSimilarity();
@Argument(fullName = MODEL_CALL_INTERVALS_LONG_NAME, doc = "gCNV model intervals created with the FilterIntervals tool.", optional=true)
private GATKPath modelCallIntervalList = null;
@Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME, doc = "Strategy to use for choosing a representative value for a breakpoint cluster.", optional = true)
private CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy = CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END;
@Argument(fullName = ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME, doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with different subtypes.", optional = true)
private CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy = CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE;
@Argument(fullName= StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="The combined output VCF")
private GATKPath outputFile;
@Argument(
doc = "Reference copy-number on autosomal intervals.",
fullName = PostprocessGermlineCNVCalls.AUTOSOMAL_REF_COPY_NUMBER_LONG_NAME,
minValue = 0,
optional = true
)
private int refAutosomalCopyNumber = 2;
/**
* See https://software.broadinstitute.org/gatk/documentation/article.php?id=7696 for more details on the PED
* format. Note that each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to
* tell the GATK PED parser that the corresponding fields are missing from the ped file.
*
*/
@Argument(fullName=StandardArgumentDefinitions.PEDIGREE_FILE_LONG_NAME, shortName=StandardArgumentDefinitions.PEDIGREE_FILE_SHORT_NAME, doc="Pedigree file for samples")
private GATKPath pedigreeFile = null;
/**
 * Pairwise dictionary cross-validation is disabled for this tool; validation is done
 * against the reference dictionary instead (see {@link #requiresReference()}).
 */
@Override
public boolean doDictionaryCrossValidation() {
return false;
}
//require a reference to do dictionary validation since there may be too many samples for cross-validating
/**
 * A reference is mandatory: its sequence dictionary is the single source of truth for
 * validation when the number of input VCFs makes cross-validation impractical.
 */
@Override
public boolean requiresReference() {
return true;
}
// Cannot require sample overlap when clustering across samples
private static final double CLUSTER_SAMPLE_OVERLAP_FRACTION = 0;
@Argument(fullName = SVClusterWalker.MAX_RECORDS_IN_RAM_LONG_NAME,
doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " +
"RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " +
"VCF file, and increases the amount of RAM needed.",
optional=true)
public int maxRecordsInRam = 10000;
/**
 * Initializes tool state before traversal: opens the reference reader, loads and strictly
 * validates the pedigree against the input samples, builds the defragmenter (binned if model
 * call intervals were provided) and the max-clique cluster engine, allocates the output
 * buffers, and writes the output VCF header. Also decides whether inputs are treated as
 * pre-clustered multi-sample VCFs (defragmentation skipped in that case).
 */
@Override
public void onTraversalStart() {
reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
//strict validation will verify that all samples are in the pedigree
sampleDB = SampleDB.createSampleDBFromPedigreeAndDataSources(pedigreeFile, getSamplesForVariants(), PedigreeValidationType.STRICT);
dictionary = getBestAvailableSequenceDictionary();
//dictionary will not be null because this tool requiresReference()
final GenomeLocParser parser = new GenomeLocParser(this.dictionary);
// intersect model call intervals with traversal intervals, if model intervals were given
setIntervals(parser);
final ClusteringParameters clusterArgs = ClusteringParameters.createDepthParameters(clusterIntervalOverlap, clusterSizeSimilarity, clusterWindow, CLUSTER_SAMPLE_OVERLAP_FRACTION);
// binned defragmenter respects the gCNV model's call bins; plain defragmenter otherwise
if (callIntervals == null) {
defragmenter = SVClusterEngineFactory.createCNVDefragmenter(dictionary, altAlleleSummaryStrategy, reference, defragmentationPadding, minSampleSetOverlap);
} else {
defragmenter = SVClusterEngineFactory.createBinnedCNVDefragmenter(dictionary, altAlleleSummaryStrategy, reference, defragmentationPadding, minSampleSetOverlap, callIntervals);
}
clusterEngine = SVClusterEngineFactory.createCanonical(SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE, breakpointSummaryStrategy, altAlleleSummaryStrategy,
dictionary, reference, true, clusterArgs, CanonicalSVLinkage.DEFAULT_MIXED_PARAMS, CanonicalSVLinkage.DEFAULT_PESR_PARAMS);
defragmentBuffer = new ArrayList<>();
outputBuffer = new ArrayList<>();
vcfWriter = getVCFWriter();
// anything other than exactly one sample is treated as a pre-clustered multi-sample input
if (getSamplesForVariants().size() != 1) {
logger.warn("Multi-sample VCFs found, which are assumed to be pre-clustered. Skipping defragmentation.");
isMultiSampleInput = true;
} else {
isMultiSampleInput = false;
}
}
/**
 * If model intervals are supplied, subsets them to the requested traversal intervals and
 * stores the intersection in {@link #callIntervals}; otherwise {@link #callIntervals}
 * stays null and an unbinned defragmenter is used downstream.
 *
 * @param parser needed to convert and merge intervals against the sequence dictionary
 */
private void setIntervals(final GenomeLocParser parser) {
    if (modelCallIntervalList != null) {
        // intervals the gCNV model was called on
        final List<GenomeLoc> inputCoverageIntervals = IntervalUtils.featureFileToIntervals(parser, modelCallIntervalList.getURIString());
        // intervals the user asked this tool to traverse
        final List<GenomeLoc> inputTraversalIntervals = IntervalUtils.genomeLocsFromLocatables(parser, getTraversalIntervals());
        // keep only model intervals inside the traversal set
        callIntervals = IntervalUtils.mergeListsBySetOperator(inputCoverageIntervals, inputTraversalIntervals, IntervalSetRule.INTERSECTION);
    }
}
/**
 * Creates the output VCF writer and writes its header: the input header lines (order
 * preserved) plus default tool lines, SV-specific INFO lines (SVLEN, SVTYPE), and the
 * standard AC/AF/AN lines. Sample columns are the sorted set of all input samples.
 *
 * Side effect: initializes the {@link #samples} field.
 *
 * @return an open writer with the header already written; caller owns closing it
 */
private VariantContextWriter getVCFWriter() {
    samples = getSamplesForVariants();
    final VCFHeader inputVCFHeader = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples);
    // LinkedHashSet keeps the original header-line ordering while deduplicating
    final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder());
    headerLines.addAll(getDefaultToolVCFHeaderLines());
    headerLines.add(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN));
    headerLines.add(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE));
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
    headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
    final VariantContextWriter writer = createVCFWriter(outputFile);
    final Set<String> sampleNameSet = new IndexedSampleList(samples).asSetOfSamples();
    // TreeSet sorts the sample columns deterministically
    final VCFHeader vcfHeader = new VCFHeader(headerLines, new TreeSet<>(sampleNameSet));
    writer.writeHeader(vcfHeader);
    return writer;
}
/**
 * Processes one group of driving variants sharing a start position: converts each to an
 * {@link SVCallRecord} (dropping low-quality/null calls) and feeds it to the defragmenter
 * (single-sample inputs) or directly to the cluster engine (multi-sample inputs).
 * Buffered clusters are force-flushed whenever a new contig is encountered.
 *
 * @param variantContexts VariantContexts from driving variants with matching start position;
 *                        never empty
 * @param referenceContext ReferenceContext covering the reference of the longest spanning VariantContext
 * @param readsContexts reads overlapping the variants (unused by this tool)
 */
@Override
public void apply(final List<VariantContext> variantContexts, final ReferenceContext referenceContext, final List<ReadsContext> readsContexts) {
    //variantContexts should have identical start, so choose 0th arbitrarily
    final String variantContig = variantContexts.get(0).getContig();
    // finish the previous contig before starting a new one
    if (currentContig != null && !variantContig.equals(currentContig)) {
        processClusters();
    }
    currentContig = variantContig;
    for (final VariantContext vc : variantContexts) {
        final SVCallRecord record = createDepthOnlyFromGCNVWithOriginalGenotypes(vc, minQS, allosomalContigs, refAutosomalCopyNumber, sampleDB);
        if (record != null) {
            if (!isMultiSampleInput) {
                // single-sample inputs are defragmented before clustering
                bufferDefragmenterOutput(defragmenter.addAndFlush(record));
            } else {
                // multi-sample inputs are assumed pre-defragmented; cluster directly
                bufferClusterOutput(clusterEngine.addAndFlush(record));
            }
        }
    }
}
/** Accumulates defragmenter output until the current contig is complete. */
private void bufferDefragmenterOutput(final List<SVCallRecord> records) {
    defragmentBuffer.addAll(records);
}
/**
 * Drains the defragmenter buffer, returning its contents sorted by start position.
 * The buffer is reset to a fresh empty list.
 *
 * @return an unmodifiable, position-sorted snapshot of the buffered records
 */
private List<SVCallRecord> flushDefragmenterBuffer() {
    final List<SVCallRecord> result = defragmentBuffer.stream()
            .sorted(Comparator.comparingInt(SVCallRecord::getPositionA))
            .collect(Collectors.toUnmodifiableList());
    defragmentBuffer = new ArrayList<>();
    return result;
}
/** Accumulates cluster-engine output until the current contig is complete. */
private void bufferClusterOutput(final List<SVCallRecord> records) {
    outputBuffer.addAll(records);
}
/**
 * Drains the cluster output buffer, returning its contents sorted by start position.
 * The buffer is reset to a fresh empty list.
 *
 * @return an unmodifiable, position-sorted snapshot of the buffered records
 */
private List<SVCallRecord> flushClusterBuffer() {
    final List<SVCallRecord> result = outputBuffer.stream()
            .sorted(Comparator.comparingInt(SVCallRecord::getPositionA))
            .collect(Collectors.toUnmodifiableList());
    outputBuffer = new ArrayList<>();
    return result;
}
/** Flushes any calls still buffered for the final contig before the tool exits. */
@Override
public Object onTraversalSuccess() {
processClusters();
return null;
}
/**
 * Force-flushes the defragmenter, adds the resulting calls to the clustering engine, and flushes the clustering
 * engine. Since we need to check for variant overlap and reset genotypes, only flush clustering when we hit a
 * new contig.
 */
private void processClusters() {
    bufferDefragmenterOutput(defragmenter.flush());
    //Jack and Isaac cluster first and then defragment
    // feed defragmented calls into the cluster engine in position-sorted order
    final List<SVCallRecord> clustered = new ArrayList<>();
    for (final SVCallRecord record : flushDefragmenterBuffer()) {
        clustered.addAll(clusterEngine.addAndFlush(record));
    }
    bufferClusterOutput(clustered);
    bufferClusterOutput(clusterEngine.flush());
    write(flushClusterBuffer());
}
/**
 * Converts a clustered call back into a VariantContext, stripping bookkeeping attributes
 * (cluster member IDs, algorithms) and the per-genotype expected-copy-number format field.
 *
 * @param record clustered call to convert
 * @return sanitized VariantContext ready for genotype resolution
 */
private VariantContext buildAndSanitizeRecord(final SVCallRecord record) {
    final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(record)
            .rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)
            .rmAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE);
    final List<Genotype> genotypes = new ArrayList<>(builder.getGenotypes().size());
    for (final Genotype g : builder.getGenotypes()) {
        // copy attributes minus ECN; noAttributes() first so the copy fully replaces them
        final Map<String, Object> attr = new HashMap<>(g.getExtendedAttributes());
        attr.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT);
        genotypes.add(new GenotypeBuilder(g).noAttributes().attributes(attr).make());
    }
    return builder.genotypes(genotypes).make();
}
/**
 * Sanitizes and writes the given calls. Calls are grouped into runs of mutually overlapping
 * variants (same contig, each starting before the running max end of the group) and each
 * group's genotypes are resolved so copy numbers account for earlier overlapping events.
 * Assumes {@code calls} is sorted by start position within each contig.
 *
 * @param calls position-sorted records for one flush
 */
private void write(final List<SVCallRecord> calls) {
    final List<VariantContext> sanitizedRecords = calls.stream()
            .map(this::buildAndSanitizeRecord)
            .collect(Collectors.toList());
    final Iterator<VariantContext> it = sanitizedRecords.iterator();
    ArrayList<VariantContext> overlappingVCs = new ArrayList<>(calls.size());
    if (!it.hasNext()) {
        return;
    }
    int clusterEnd = -1;
    String clusterContig = null;
    //gather groups of overlapping VCs and update the genotype copy numbers appropriately
    while (it.hasNext()) {
        final VariantContext curr = it.next();
        // still in the current overlap group: starts before the group's max end, same contig
        if ((clusterEnd == -1 || curr.getStart() < clusterEnd) && (clusterContig == null || curr.getContig().equals(clusterContig))) {
            overlappingVCs.add(curr);
            if (curr.getEnd() > clusterEnd) {
                clusterEnd = curr.getEnd();
            }
            if (clusterContig == null) {
                clusterContig = curr.getContig();
            }
        } else {
            // group finished: resolve copy numbers and emit, then start a new group with curr
            final List<VariantContext> resolvedVCs = resolveVariantContexts(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, overlappingVCs);
            resolvedVCs.forEach(vcfWriter::add);
            overlappingVCs = new ArrayList<>();
            overlappingVCs.add(curr);
            clusterEnd = curr.getEnd();
            clusterContig = curr.getContig();
        }
    }
    //write out the last set of overlapping VCs
    final List<VariantContext> resolvedVCs = resolveVariantContexts(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, overlappingVCs);
    resolvedVCs.forEach(vcfWriter::add);
}
/**
 * Correct genotype calls for overlapping variant contexts.
 * Note that we assume that a sample will not occur twice with the same copy number because it should have been defragmented.
 * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY)
 * @param refAutosomalCopyNumber reference copy number for autosomes
 * @param sampleDB data structure containing sample sex assignments
 * @param samples full set of samples to be output
 * @param overlappingVCs overlapping variant contexts with correct genotypes, but maybe not copy numbers
 * @return a list of VariantContexts with genotypes corrected for called alleles and copy number
 */
@VisibleForTesting
protected List<VariantContext> resolveVariantContexts(final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                                      final SampleDB sampleDB, final SortedSet<String> samples,
                                                      final List<VariantContext> overlappingVCs) {
    Utils.nonNull(overlappingVCs);
    final List<VariantContext> resolvedVCs = new ArrayList<>(overlappingVCs.size());
    //sampleName -> (copyNumber, endPos) -- it's safe to just use position because if the VCs overlap then they must be on the same contig
    final Map<String, CopyNumberAndEndRecord> sampleCopyNumbers = new LinkedHashMap<>(SVUtils.hashMapCapacity(overlappingVCs.size()));
    for (final VariantContext curr : overlappingVCs) {
        resolvedVCs.add(updateGenotypes(allosomalContigs, refAutosomalCopyNumber, sampleDB, samples, curr, sampleCopyNumbers));
        //update copy number table for subsequent VCs using variant genotypes from input VCs
        for (final Genotype g : curr.getGenotypes()) {
            if (g.hasAnyAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) {
                sampleCopyNumbers.put(g.getSampleName(),
                        new CopyNumberAndEndRecord(
                                VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, refAutosomalCopyNumber),
                                curr.getAttributeAsInt(VCFConstants.END_KEY, curr.getStart())));
            }
        }
    }
    return resolvedVCs;
}
/**
 * For CNVs, i.e. DEL and DUP alts only.
 * Squares off the genotype matrix over all output samples, assigns copy numbers from the
 * genotype, the overlapping-event table, or the sex-aware ploidy default, and recomputes
 * AC/AF/AN from the called alleles.
 *
 * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY)
 * @param refAutosomalCopyNumber reference copy number for autosomes
 * @param sampleDB data structure containing sample sex assignments
 * @param samples full set of samples to be output
 * @param vc VariantContext with just variant samples
 * @param sampleCopyNumbers copy numbers of still-open overlapping events, keyed by sample
 * @return new VariantContext with AC and AF
 */
@VisibleForTesting
protected static VariantContext updateGenotypes(final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                                final SampleDB sampleDB, final SortedSet<String> samples,
                                                final VariantContext vc, final Map<String, CopyNumberAndEndRecord> sampleCopyNumbers) {
    final VariantContextBuilder builder = new VariantContextBuilder(vc);
    final List<Genotype> newGenotypes = new ArrayList<>();
    final Allele vcRefAllele = vc.getReference();
    final Map<Allele, Long> alleleCountMap = new LinkedHashMap<>(2);
    // only DEL/DUP alts are supported here
    if (vc.getAlternateAlleles().stream().anyMatch(a -> !a.equals(GATKSVVCFConstants.DEL_ALLELE) && !a.equals(GATKSVVCFConstants.DUP_ALLELE))) {
        throw new IllegalArgumentException("At site " + vc.getContig() + ":" + vc.getStart() + " variant context contains alternate alleles in addition to CNV DEL and DUP alleles: " + vc.getAlternateAlleles());
    }
    alleleCountMap.put(GATKSVVCFConstants.DEL_ALLELE, 0L);
    alleleCountMap.put(GATKSVVCFConstants.DUP_ALLELE, 0L);
    int alleleNumber = 0;
    for (final String sample : samples) {
        final Genotype g = vc.getGenotype(sample); //may be null
        final GenotypeBuilder genotypeBuilder = g == null ? new GenotypeBuilder(sample) : new GenotypeBuilder(g); //use g to set alleles
        final List<Allele> alleles;
        //get proper ploidy for autosomes and allosomes (sex chromosomes)
        final int samplePloidy = getSamplePloidy(allosomalContigs, refAutosomalCopyNumber, sampleDB, sample, vc.getContig(), g);
        alleleNumber += samplePloidy;
        //"square off" the genotype matrix by filling in missing (non-variant) samples with homRef calls with reference copy number
        if (!sampleCopyNumbers.containsKey(sample) && !vc.hasGenotype(sample)) {
            genotypeBuilder.alleles(GATKVariantContextUtils.makePloidyLengthAlleleList(samplePloidy, vcRefAllele));
            genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, samplePloidy);
            newGenotypes.add(genotypeBuilder.make());
        //handle variant genotypes and reference genotypes with non-reference copy number due to overlapping events
        } else {
            //determine sample copy number from VC genotype or sampleCopyNumbers map or default to reference, i.e. samplePloidy
            final int copyNumber;
            if (sampleCopyNumbers.containsKey(sample) && sampleCopyNumbers.get(sample).getEndPosition() > vc.getStart()) {
                // an earlier overlapping event is still open at this position: inherit its CN, call hom-ref here
                copyNumber = sampleCopyNumbers.get(sample).getCopyNumber();
                alleles = GATKVariantContextUtils.makePloidyLengthAlleleList(samplePloidy, vcRefAllele);
            } else if (g != null) {
                copyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, samplePloidy);
                if (samplePloidy == g.getPloidy()) {
                    alleles = g.getAlleles();
                } else {
                    // ploidy disagreement (e.g. allosome): rebuild alleles from copy number
                    alleles = GATKSVVariantContextUtils.makeGenotypeAllelesFromCopyNumber(copyNumber, samplePloidy, vcRefAllele);
                }
            } else {
                copyNumber = samplePloidy;
                alleles = GATKSVVariantContextUtils.makeGenotypeAllelesFromCopyNumber(copyNumber, samplePloidy, vcRefAllele);
            }
            genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber);
            genotypeBuilder.alleles(alleles);
            newGenotypes.add(genotypeBuilder.make());
            //use called alleles to update AC
            //check for genotype in VC because we don't want to count overlapping events (in sampleCopyNumbers map) towards AC
            if (vc.hasGenotype(sample)) {
                if (alleles.contains(GATKSVVCFConstants.DEL_ALLELE)) {
                    final Long count = alleleCountMap.get(GATKSVVCFConstants.DEL_ALLELE);
                    alleleCountMap.put(GATKSVVCFConstants.DEL_ALLELE, count + alleles.stream().filter(Allele::isNonReference).count());
                } else if (copyNumber > samplePloidy) {
                    final Long count = alleleCountMap.get(GATKSVVCFConstants.DUP_ALLELE);
                    alleleCountMap.put(GATKSVVCFConstants.DUP_ALLELE, count + 1); //best we can do for dupes is carrier frequency
                }
            }
        }
    }
    builder.genotypes(newGenotypes);
    if (alleleNumber > 0) {
        if (vc.getAlternateAlleles().size() == 1) {
            final long AC = alleleCountMap.get(vc.getAlternateAllele(0));
            builder.attribute(VCFConstants.ALLELE_COUNT_KEY, AC)
                    .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, (double) AC / alleleNumber)
                    .attribute(VCFConstants.ALLELE_NUMBER_KEY, alleleNumber);
        } else {
            final List<Long> alleleCounts = new ArrayList<>(vc.getNAlleles());
            final List<Double> alleleFreqs = new ArrayList<>(vc.getNAlleles());
            for (final Allele a : builder.getAlleles()) {
                if (a.isReference()) {
                    continue;
                }
                alleleCounts.add(alleleCountMap.get(a));
                // AF is count / AN, consistent with the biallelic branch above
                // (previous code stored the raw count as AF)
                alleleFreqs.add(alleleCountMap.get(a) / (double) alleleNumber);
            }
            builder.attribute(VCFConstants.ALLELE_COUNT_KEY, alleleCounts)
                    .attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs)
                    .attribute(VCFConstants.ALLELE_NUMBER_KEY, alleleNumber);
        }
    }
    return builder.make();
}
/**
 * Get the sample ploidy for a given contig based on the input pedigree (if available);
 * defaults to {@code refAutosomalCopyNumber} for autosomes. The genotype's ECN attribute,
 * when present, takes precedence over everything else.
 * @param allosomalContigs names of allosomal contigs (e.g. X, Y, chrX, chrY)
 * @param refAutosomalCopyNumber reference copy number for autosomes
 * @param sampleDB data structure containing sample sex assignments
 * @param sampleName current sample of interest
 * @param contig current contig of interest
 * @param g may be null
 * @return expected ploidy of {@code sampleName} on {@code contig}
 */
@VisibleForTesting
protected static int getSamplePloidy(final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                     final SampleDB sampleDB, final String sampleName, final String contig, final Genotype g) {
    // explicit ECN on the genotype wins
    if (g != null && g.hasExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT)) {
        return VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
    }
    if (!allosomalContigs.contains(contig)) {
        return refAutosomalCopyNumber;
    }
    // allosome without pedigree info: fall back to the genotype's own ploidy if we have one
    if (sampleDB == null || sampleDB.getSample(sampleName) == null) {
        if (g != null) {
            return g.getPloidy();
        } else {
            throw new IllegalStateException("Sample " + sampleName + " is missing from the pedigree");
        }
    } else {
        final Sex sampleSex = sampleDB.getSample(sampleName).getSex();
        if (contig.equals("X") || contig.equals("chrX")) {
            if (sampleSex.equals(Sex.FEMALE)) {
                return 2;
            } else if (sampleSex.equals(Sex.MALE)) {
                return 1;
            } else { //UNKNOWN -- treated like male on X
                return 1;
            }
        } else if (contig.equals("Y") || contig.equals("chrY")) {
            if (sampleSex.equals(Sex.FEMALE)) {
                return 0;
            } else if (sampleSex.equals(Sex.MALE)) {
                return 1;
            } else { //UNKNOWN -- treated like male on Y
                return 1;
            }
        } else {
            throw new IllegalArgumentException("Encountered unknown allosomal contig: " + contig + ". This tool only " +
                    "supports mammalian genomes with XX/XY sex determination.");
        }
    }
}
/**
 * Builds a VariantContext for a call, restricting alt alleles to those actually called in
 * genotypes (falling back to the record's alts if none). Sites with both DEL and DUP calls
 * become multi-allelic CNVs with SVTYPE=MCNV; genotype ref alleles are rewritten to the
 * record's ref allele and the ECN format field is stripped.
 *
 * @param call clustered call to convert; non-null
 * @param reference reference reader; non-null
 * @return new VariantContext with END, SVLEN, SVTYPE and updated genotypes
 */
@VisibleForTesting
protected static VariantContext buildVariantContext(final SVCallRecord call, final ReferenceSequenceFile reference) {
    Utils.nonNull(call);
    Utils.nonNull(reference);
    // Use only the alt alleles actually called in genotypes
    final List<Allele> newAltAlleles = getCalledAllelesOrOriginalIfNone(call);
    final boolean isCNV = newAltAlleles.size() != 1;
    final StructuralVariantType svtype;
    if (isCNV) {
        svtype = StructuralVariantType.CNV;
    } else {
        final Allele altAllele = newAltAlleles.get(0);
        if (altAllele.equals(Allele.SV_SIMPLE_DEL)) {
            svtype = StructuralVariantType.DEL;
        } else if (altAllele.equals(Allele.SV_SIMPLE_DUP)) {
            svtype = StructuralVariantType.DUP;
        } else {
            throw new IllegalArgumentException("Unsupported alt allele: " + altAllele.getDisplayString());
        }
    }
    final Allele refAllele = call.getRefAllele();
    final List<Allele> outputAlleles = new ArrayList<>(newAltAlleles);
    outputAlleles.add(refAllele);
    final VariantContextBuilder builder = new VariantContextBuilder("", call.getContigA(), call.getPositionA(), call.getPositionB(),
            outputAlleles);
    builder.attribute(VCFConstants.END_KEY, call.getPositionB());
    builder.attribute(GATKSVVCFConstants.SVLEN, call.getLength());
    if (svtype == StructuralVariantType.CNV) {
        builder.attribute(VCFConstants.SVTYPE, "MCNV"); //MCNV for compatibility with svtk annotate
    } else {
        builder.attribute(VCFConstants.SVTYPE, svtype);
    }
    final List<Genotype> genotypes = new ArrayList<>(call.getGenotypes().size());
    for (final Genotype g : call.getGenotypes()) {
        final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
        genotypeBuilder.noAttributes();
        final Map<String, Object> attributes = new HashMap<>(g.getExtendedAttributes());
        //update reference alleles
        final List<Allele> newGenotypeAlleles = new ArrayList<>(g.getAlleles().size());
        for (final Allele a : g.getAlleles()) {
            if (a.isReference()) {
                newGenotypeAlleles.add(refAllele);
            } else {
                newGenotypeAlleles.add(a);
            }
        }
        genotypeBuilder.alleles(newGenotypeAlleles);
        if (g.hasAnyAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) {
            attributes.put(GATKSVVCFConstants.COPY_NUMBER_FORMAT, g.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT));
        }
        // Strip this for gCNV pipeline
        attributes.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT);
        genotypeBuilder.attributes(attributes);
        genotypes.add(genotypeBuilder.make());
    }
    builder.genotypes(genotypes);
    return builder.make();
}
/**
 * Reduces alt alleles to those actually called in genotypes, if any. If none remain
 * (e.g. all calls were dropped), simply returns the original alt allele list for the record.
 */
private static List<Allele> getCalledAllelesOrOriginalIfNone(final SVCallRecord call) {
    // Use only the alt alleles actually called in genotypes
    List<Allele> newAltAlleles = call.getGenotypes().stream()
            .flatMap(g -> g.getAlleles().stream())
            .filter(allele -> allele != null && !allele.isReference() && allele.isCalled())
            .distinct()
            .collect(Collectors.toList());
    if (newAltAlleles.isEmpty()) {
        // If calls were dropped, use original alt alleles
        newAltAlleles = call.getAltAlleles();
    }
    return newAltAlleles;
}
/** Closes the output VCF writer; null-safe in case onTraversalStart failed before creating it. */
@Override
public void closeTool(){
if (vcfWriter != null) {
vcfWriter.close();
}
}
/**
 * Attempts to create a new record from the given variant produced by
 * {@link org.broadinstitute.hellbender.tools.copynumber.GermlineCNVCaller}. If the variant contains only one
 * genotype, then null is returned if either the genotype is hom-ref, is a no-call but does not have
 * a CN value, does not meet the min QS value, or is a null call (see {@link #isNullCall(Genotype)}).
 * Genotypes that are hom-ref or are both a no-call and missing a CN value are filtered in the resulting record.
 *
 * This currently provides legacy support for older GermlineCNVCaller records that were not spec-compliant, although
 * this may be deprecated in the future.
 *
 * @param variant single-sample variant from a gCNV segments VCF
 * @param minQuality drop events with quality lower than this
 * @return a new record or null
 */
public SVCallRecord createDepthOnlyFromGCNVWithOriginalGenotypes(final VariantContext variant,
                                                                 final double minQuality,
                                                                 final Set<String> allosomalContigs,
                                                                 final int refAutosomalCopyNumber,
                                                                 final SampleDB sampleDB) {
    Utils.nonNull(variant);
    if (variant.getGenotypes().size() == 1) {
        //only cluster good variants
        final Genotype g = variant.getGenotypes().get(0);
        if (g.isHomRef() || (g.isNoCall() && !g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT))
                || VariantContextGetters.getAttributeAsInt(g, GermlineCNVSegmentVariantComposer.QS, 0) < minQuality
                || isNullCall(g)) {
            return null;
        }
    }
    final VariantContextBuilder svBuilder = new VariantContextBuilder(variant);
    svBuilder.attribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM));
    // Add expected copy number to each genotype, correcting ploidy via the pedigree
    final Allele refAllele = variant.getReference();
    final List<Genotype> genotypesWithECN = variant.getGenotypes().stream()
            .map(g -> prepareGenotype(g, refAllele, allosomalContigs, refAutosomalCopyNumber, sampleDB, variant.getContig()))
            .collect(Collectors.toList());
    svBuilder.genotypes(genotypesWithECN);
    final SVCallRecord baseRecord = SVCallRecordUtils.create(svBuilder.make(), true, dictionary);
    // drop hom-ref genotypes and CN-less no-calls from the output record
    final List<Genotype> nonRefGenotypes = baseRecord.getGenotypes().stream()
            .filter(g -> !(g.isHomRef() || (g.isNoCall() && !g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT))))
            .collect(Collectors.toList());
    return SVCallRecordUtils.copyCallWithNewGenotypes(baseRecord, GenotypesContext.copy(nonRefGenotypes));
}
/**
 * Returns a copy of the genotype with its allele list padded to the pedigree-derived ploidy
 * and the expected copy number (ECN) format field set.
 */
private static Genotype prepareGenotype(final Genotype g, final Allele refAllele,
                                        final Set<String> allosomalContigs, final int refAutosomalCopyNumber,
                                        final SampleDB sampleDB, final String contig) {
    final GenotypeBuilder builder = new GenotypeBuilder(g);
    final int ploidy = getSamplePloidy(allosomalContigs, refAutosomalCopyNumber, sampleDB, g.getSampleName(), contig, g);
    correctGenotypePloidy(builder, g, ploidy, refAllele);
    addExpectedCopyNumber(builder, ploidy);
    return builder.make();
}
/**
 * "Fills" genotype alleles so that it has the correct ploidy.
 * @param builder new alleles will be set for this builder
 * @param g non-ref alleles will be carried over from this genotype
 * @param ploidy desired ploidy for the new genotype
 * @param refAllele desired ref allele for new genotype
 */
private static void correctGenotypePloidy(final GenotypeBuilder builder, final Genotype g, final int ploidy,
                                          final Allele refAllele) {
    if (g.getAlleles().size() == 1 && g.getAllele(0).isNoCall()) {
        // Special case to force interpretation of a single no-call allele as a possible null GT
        builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
    } else {
        final ArrayList<Allele> alleles = new ArrayList<>(g.getAlleles());
        Utils.validate(alleles.size() <= ploidy, "Encountered genotype with ploidy " + ploidy +
                " but " + alleles.size() + " alleles.");
        // pad with ref alleles up to the desired ploidy
        while (alleles.size() < ploidy) {
            alleles.add(refAllele);
        }
        alleles.trimToSize();
        builder.alleles(alleles);
    }
}
/** Sets the expected copy number (ECN) format field on the genotype builder. */
private static void addExpectedCopyNumber(final GenotypeBuilder g, final int ploidy) {
    // Previous code chained .make() here and discarded the built Genotype; only the
    // attribute mutation on the builder is needed.
    g.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy);
}
/**
 * @param g genotype to test
 * @return true if this is a "null" call, i.e. a no-call genotype whose copy number is 0 —
 * a call on a contig the sample effectively doesn't have (presumably e.g. chrY in an XX sample;
 * see {@link #getSamplePloidy})
 */
private static boolean isNullCall(final Genotype g) {
return g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)
&& VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0) == 0
&& g.isNoCall();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy