All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.sv.SVStratify Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.sv;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
import org.broadinstitute.hellbender.utils.*;

import java.io.File;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

/**
 * 

Stratifies structural variants into mutually exclusive groups according to the following customizable criteria: *

    *
  • SV type
  • *
  • Size range
  • *
  • Reference track overlap
  • *
* Records are annotated with their respective strata names in the {@link GATKSVVCFConstants#STRATUM_INFO_KEY} INFO * field. Users must provide a stratification configuration .tsv file (tab-delimited table) with the following column * header on the first line: *
    *
  1. NAME
  2. *
  3. SVTYPE
  4. *
  5. MIN_SIZE
  6. *
  7. MAX_SIZE
  8. *
  9. TRACKS
  10. *
*

*

For example:

* * * * * * * * * * * * * * * * * * * *
NAMESVTYPEMIN_SIZEMAX_SIZETRACKS
DEL_large_SDDEL5000-1SD
DUP_large_SDDUP5000-1SD
DEL_small_SR_RMDEL-15000SR,RM
DUP_small_SR_RMDUP-15000SR,RM
INS_SRINS-1-1SR
*

* The "NAME" column is an arbitrary identifier, "SVTYPE" is the class of variant (DEL, DUP, INS, etc.), MIN_SIZE in an * inclusive size lower-bound, MAX_SIZE is an exclusive size upper-bound, and TRACKS is a comma-delimited list of * reference tracks defined using the {@link SVStratificationEngineArgumentsCollection#trackFileList} and * {@link SVStratificationEngineArgumentsCollection#trackNameList} parameters. For example, *

*
 *     gatk GroupedSVCluster \
 *       --track-name RM \
 *       --track-intervals repeatmasker.bed \
 *       --track-name SD \
 *       --track-intervals segmental_duplications.bed \
 *       --track-name SR \
 *       --track-intervals simple_repeats.bed \
 *       ...
 * 
*

* The MIN_SIZE, MAX_SIZE, and TRACKS columns may contain null values {"-1", "", "NULL", "NA"}. Null MIN_SIZE and * MAX_SIZE correspond to negative and positive infinity, respectively, and a null TRACKS value means that variants * will not be matched based on track. Variants with undefined SVLEN will only match if both MIN_SIZE and MAX_SIZE * are null. *

* *

The {@link SVStratificationEngineArgumentsCollection#overlapFraction}, * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps}, and * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom} can be used to modify the overlap * criteria for assigning variants to each group based on overlap with the given reference track intervals. By * default, only one endpoint of the variant needs to lie in a track interval in order to match. INS variants are * treated as single points and only {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps} is used, * ignoring {@link SVStratificationEngineArgumentsCollection#overlapFraction}. Similarly, CTX and BND variant * overlap is only defined by {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom}. *

* *

By default, each stratification group must be mutually exclusive, meaning that any given SV can only belong to * one group. An error is thrown if the tool encounters a variant that meets the criteria for more than one group. * This restriction can be overridden with the {@link SVStratify#ALLOW_MULTIPLE_MATCHES_LONG_NAME} argument, in which * case the record will be written out multiple times: once for each matching stratification group with the corresponding * {@link GATKSVVCFConstants#STRATUM_INFO_KEY} value. Furthermore, SVs that do not match any of the groups will be * annotated with the {@link SVStratify#DEFAULT_STRATUM} group.

* *

If using {@link #SPLIT_OUTPUT_LONG_NAME} then the tool generates a set of VCFs as output with each VCF containing * the records of each group.

* *

This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.

* *

Inputs

* *
    *
  • * One or more SV VCFs *
  • *
  • * Stratification configuration TSV file *
  • *
  • * Reference dictionary *
  • *
* *

Output

* *
    *
  • * Annotated VCF(s) *
  • *
* *

Usage example, generating stratified VCFs:

* *
 *     gatk SVStratify \
 *       -V variants.vcf.gz \
 *       --split-output \
 *       -O ./ \
 *       --output-prefix out \
 *       --sequence-dictionary reference.dict \
 *       --track-name RM \
 *       --track-intervals repeatmasker.bed \
 *       --stratify-config strata.tsv
 * 
* *

Usage example, a single annotated VCF:

* *
 *     gatk SVStratify \
 *       -V variants.vcf.gz \
 *       -O out.vcf.gz \
 *       --sequence-dictionary reference.dict \
 *       --track-name RM \
 *       --track-intervals repeatmasker.bed \
 *       --stratify-config strata.tsv
 * 
* * @author Mark Walker <[email protected]> */ @CommandLineProgramProperties( summary = "Annotates variants by SV type, size, and reference tracks", oneLineSummary = "Annotates variants by SV type, size, and reference tracks", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature @DocumentedFeature public final class SVStratify extends MultiVariantWalker { public static final String ALLOW_MULTIPLE_MATCHES_LONG_NAME = "allow-multiple-matches"; public static final String SPLIT_OUTPUT_LONG_NAME = "split-output"; // Default output group name for unmatched records public static final String DEFAULT_STRATUM = "default"; @Argument( doc = "Output path. Must be a directory if using --" + SPLIT_OUTPUT_LONG_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME ) private GATKPath outputPath; @Argument( doc = "Prefix for output filenames, only if using --" + SPLIT_OUTPUT_LONG_NAME, fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, optional = true ) private String outputPrefix; @ArgumentCollection private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection(); @Argument( doc = "Do not enforce mutual exclusivity for each stratification group", fullName = ALLOW_MULTIPLE_MATCHES_LONG_NAME ) private boolean allowMultipleMatches = false; @Argument( doc = "Split output into multiple VCFs, one per stratification group. If used, then --" + StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be the output directory and --" + CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME + " must be provided.", fullName = SPLIT_OUTPUT_LONG_NAME ) private boolean splitOutput = false; protected SAMSequenceDictionary dictionary; protected Map writers; protected SVStatificationEngine engine; @Override public void onTraversalStart() { super.onTraversalStart(); dictionary = getMasterSequenceDictionary(); Utils.validateArg(dictionary != null, "Reference dictionary is required; please specify with --" + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME); engine = loadStratificationConfig(stratArgs, dictionary); logger.debug("Loaded stratification groups:"); for (final SVStatificationEngine.Stratum s : engine.getStrata()) { logger.debug(s); } initializeWriters(); } protected void createGroupWriter(final String name, final Path path) { final VariantContextWriter writer = createVCFWriter(path); final VCFHeader header = new VCFHeader(getHeaderForVariants()); addStratifyMetadata(header); writer.writeHeader(header); if (writers.containsKey(name)) { throw new GATKException.ShouldNeverReachHereException("Stratification name already exists: " + name); } writers.put(name, writer); } public static void addStratifyMetadata(final VCFHeader header) { header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRATUM_INFO_KEY, 1, VCFHeaderLineType.String, "Stratum ID")); } protected Path generateGroupOutputPath(final String name) { final String filename = outputPrefix + "." + name + ".vcf.gz"; return outputPath.toPath().resolve(filename); } protected void initializeWriters() { writers = new HashMap<>(); if (splitOutput) { Utils.validateArg(outputPrefix != null, "Argument --" + CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME + " required if using --" + SPLIT_OUTPUT_LONG_NAME); Utils.validateArg(new File(outputPath.toString()).isDirectory(), "Argument --" + StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be a directory if using " + SPLIT_OUTPUT_LONG_NAME); createGroupWriter(DEFAULT_STRATUM, generateGroupOutputPath(DEFAULT_STRATUM)); for (final SVStatificationEngine.Stratum s : engine.getStrata()) { createGroupWriter(s.getName(), generateGroupOutputPath(s.getName())); } } else { createGroupWriter(DEFAULT_STRATUM, outputPath.toPath()); } } /** * Reusable method for loading the stratification configuration table. See tool doc for the expected format. */ public static SVStatificationEngine loadStratificationConfig(final SVStratificationEngineArgumentsCollection args, final SAMSequenceDictionary dictionary) { Utils.validateArg(args.trackNameList.size() == args.trackFileList.size(), "Arguments --" + SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME + " and --" + SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME + " must be specified the same number of times."); final Map> map = new HashMap<>(); final Iterator nameIterator = args.trackNameList.iterator(); final Iterator pathIterator = args.trackFileList.iterator(); final GenomeLocParser genomeLocParser = new GenomeLocParser(dictionary); while (nameIterator.hasNext() && pathIterator.hasNext()) { final String name = nameIterator.next(); final GATKPath path = pathIterator.next(); final GenomeLocSortedSet genomeLocs = IntervalUtils.loadIntervals(Collections.singletonList(path.toString()), IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser); final List intervals = Collections.unmodifiableList(genomeLocs.toList()); if (map.containsKey(name)) { throw new UserException.BadInput("Duplicate track name was specified: " + name); } map.put(name, intervals); } final SVStatificationEngine engine = SVStatificationEngine.create(map, args.configFile, dictionary); if (engine.getStrata().stream().anyMatch(s -> s.getName().equals(DEFAULT_STRATUM))) { throw new UserException.BadInput("Stratification configuration contains entry with reserved " + "ID \"" + DEFAULT_STRATUM + "\""); } return engine; } @Override public void closeTool() { for (final VariantContextWriter writer : writers.values()) { writer.close(); } super.closeTool(); } @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { // Save a ton of compute by not copying genotypes into the new record final VariantContext variantNoGenotypes = new VariantContextBuilder(variant).genotypes(Collections.emptyList()).make(); final SVCallRecord record = SVCallRecordUtils.create(variantNoGenotypes, dictionary); final Collection stratifications = engine.getMatches(record, stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom); final VariantContextBuilder builder = new VariantContextBuilder(variant); if (stratifications.isEmpty()) { writers.get(DEFAULT_STRATUM).add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, DEFAULT_STRATUM).make()); } else { if (!allowMultipleMatches && stratifications.size() > 1) { final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList())); throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". Bypass this error using the --" + ALLOW_MULTIPLE_MATCHES_LONG_NAME + " argument"); } for (final SVStatificationEngine.Stratum stratum : stratifications) { final VariantContextWriter writer = splitOutput ? writers.get(stratum.getName()) : writers.get(DEFAULT_STRATUM); if (writer == null) { throw new GATKException("Writer not found for group: " + stratum.getName()); } writer.add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, stratum.getName()).make()); } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy