org.broadinstitute.hellbender.tools.walkers.sv.SVStratify Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.walkers.sv;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
import org.broadinstitute.hellbender.utils.*;

import java.io.File;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Stratifies structural variants into mutually exclusive groups according to the following customizable criteria:
 * 

 *     SV type
 *     Size range
 *     Reference track overlap
 * 
 * Records are annotated with their respective strata names in the {@link GATKSVVCFConstants#STRATUM_INFO_KEY} INFO
 * field. Users must provide a stratification configuration .tsv file (tab-delimited table) with the following column
 * header on the first line:
 * 
 *     NAME
 *     SVTYPE
 *     MIN_SIZE
 *     MAX_SIZE
 *     TRACKS
 * 
 * 
 * For example:
 * 
 *   
 *      
 *   
 *   
 *      
 *   
 *   
 *      
 *   
 *   
 *      
 *   
 *   
 *      
 *   
 *   
 *      
 *   
 * NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS
DEL_large_SD DEL 5000 -1 SD
DUP_large_SD DUP 5000 -1 SD
DEL_small_SR_RM DEL -1 5000 SR,RM
DUP_small_SR_RM DUP -1 5000 SR,RM
INS_SR INS -1 -1 SR
 * 
 * The "NAME" column is an arbitrary identifier, "SVTYPE" is the class of variant (DEL, DUP, INS, etc.), MIN_SIZE in an
 * inclusive size lower-bound, MAX_SIZE is an exclusive size upper-bound, and TRACKS is a comma-delimited list of
 * reference tracks defined using the {@link SVStratificationEngineArgumentsCollection#trackFileList} and
 * {@link SVStratificationEngineArgumentsCollection#trackNameList} parameters. For example,
 * 
 *  *     gatk GroupedSVCluster \
 *       --track-name RM \
 *       --track-intervals repeatmasker.bed \
 *       --track-name SD \
 *       --track-intervals segmental_duplications.bed \
 *       --track-name SR \
 *       --track-intervals simple_repeats.bed \
 *       ...
 * 
 * 
 * The MIN_SIZE, MAX_SIZE, and TRACKS columns may contain null values {"-1", "", "NULL", "NA"}. Null MIN_SIZE and
 * MAX_SIZE correspond to negative and positive infinity, respectively, and a null TRACKS value means that variants
 * will not be matched based on track. Variants with undefined SVLEN will only match if both MIN_SIZE and MAX_SIZE
 * are null.
 * 
 *
 * The {@link SVStratificationEngineArgumentsCollection#overlapFraction},
 * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps}, and
 * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom} can be used to modify the overlap
 * criteria for assigning variants to each group based on overlap with the given reference track intervals. By
 * default, only one endpoint of the variant needs to lie in a track interval in order to match. INS variants are
 * treated as single points and only {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps} is used,
 * ignoring {@link SVStratificationEngineArgumentsCollection#overlapFraction}. Similarly, CTX and BND variant
 * overlap is only defined by {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom}.
 * 
 *
 * By default, each stratification group must be mutually exclusive, meaning that any given SV can only belong to
 * one group. An error is thrown if the tool encounters a variant that meets the criteria for more than one group.
 * This restriction can be overridden with the {@link SVStratify#ALLOW_MULTIPLE_MATCHES_LONG_NAME} argument, in which
 * case the record will be written out multiple times: once for each matching stratification group with the corresponding
 * {@link GATKSVVCFConstants#STRATUM_INFO_KEY} value. Furthermore, SVs that do not match any of the groups will be
 * annotated with the {@link SVStratify#DEFAULT_STRATUM} group.
 *
 * If using {@link #SPLIT_OUTPUT_LONG_NAME} then the tool generates a set of VCFs as output with each VCF containing
 * the records of each group.
 *
 * This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.
 *
 * Inputs
 *
 * 
 *     
 *         One or more SV VCFs
 *     
 *     
 *         Stratification configuration TSV file
 *     
 *     
 *         Reference dictionary
 *     
 * 
 *
 * Output
 *
 * 
 *     
 *         Annotated VCF(s)
 *     
 * 
 *
 * Usage example, generating stratified VCFs:
 *
 *  *     gatk SVStratify \
 *       -V variants.vcf.gz \
 *       --split-output \
 *       -O ./ \
 *       --output-prefix out \
 *       --sequence-dictionary reference.dict \
 *       --track-name RM \
 *       --track-intervals repeatmasker.bed \
 *       --stratify-config strata.tsv
 * 
 *
 * Usage example, a single annotated VCF:
 *
 *  *     gatk SVStratify \
 *       -V variants.vcf.gz \
 *       -O out.vcf.gz \
 *       --sequence-dictionary reference.dict \
 *       --track-name RM \
 *       --track-intervals repeatmasker.bed \
 *       --stratify-config strata.tsv
 * 
 *
 * @author Mark Walker <[email protected]>
 */
@CommandLineProgramProperties(
        summary = "Annotates variants by SV type, size, and reference tracks",
        oneLineSummary = "Annotates variants by SV type, size, and reference tracks",
        programGroup = StructuralVariantDiscoveryProgramGroup.class
)
@BetaFeature
@DocumentedFeature
public final class SVStratify extends MultiVariantWalker {

    public static final String ALLOW_MULTIPLE_MATCHES_LONG_NAME = "allow-multiple-matches";
    public static final String SPLIT_OUTPUT_LONG_NAME = "split-output";

    // Default output group name for unmatched records
    public static final String DEFAULT_STRATUM = "default";

    @Argument(
            doc = "Output path. Must be a directory if using --" + SPLIT_OUTPUT_LONG_NAME,
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
    )
    private GATKPath outputPath;

    @Argument(
            doc = "Prefix for output filenames, only if using --" + SPLIT_OUTPUT_LONG_NAME,
            fullName =  CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME,
            optional = true
    )
    private String outputPrefix;

    @ArgumentCollection
    private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection();

    @Argument(
            doc = "Do not enforce mutual exclusivity for each stratification group",
            fullName = ALLOW_MULTIPLE_MATCHES_LONG_NAME
    )
    private boolean allowMultipleMatches = false;

    @Argument(
            doc = "Split output into multiple VCFs, one per stratification group. If used, then --" +
                    StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be the output directory and  --" +
                    CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME + " must be provided.",
            fullName = SPLIT_OUTPUT_LONG_NAME
    )
    private boolean splitOutput = false;

    protected SAMSequenceDictionary dictionary;
    protected Map writers;
    protected SVStatificationEngine engine;

    @Override
    public void onTraversalStart() {
        super.onTraversalStart();
        dictionary = getMasterSequenceDictionary();
        Utils.validateArg(dictionary != null, "Reference dictionary is required; please specify with --" +
                StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME);
        engine = loadStratificationConfig(stratArgs, dictionary);
        logger.debug("Loaded stratification groups:");
        for (final SVStatificationEngine.Stratum s : engine.getStrata()) {
            logger.debug(s);
        }
        initializeWriters();
    }

    protected void createGroupWriter(final String name, final Path path) {
        final VariantContextWriter writer = createVCFWriter(path);
        final VCFHeader header = new VCFHeader(getHeaderForVariants());
        addStratifyMetadata(header);
        writer.writeHeader(header);
        if (writers.containsKey(name)) {
            throw new GATKException.ShouldNeverReachHereException("Stratification name already exists: " + name);
        }
        writers.put(name, writer);
    }

    public static void addStratifyMetadata(final VCFHeader header) {
        header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRATUM_INFO_KEY, 1,
                VCFHeaderLineType.String, "Stratum ID"));
    }

    protected Path generateGroupOutputPath(final String name) {
        final String filename = outputPrefix + "." + name + ".vcf.gz";
        return outputPath.toPath().resolve(filename);
    }

    protected void initializeWriters() {
        writers = new HashMap<>();
        if (splitOutput) {
            Utils.validateArg(outputPrefix != null, "Argument --" + CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME + " required if using --" + SPLIT_OUTPUT_LONG_NAME);
            Utils.validateArg(new File(outputPath.toString()).isDirectory(), "Argument --" + StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be a directory if using " + SPLIT_OUTPUT_LONG_NAME);
            createGroupWriter(DEFAULT_STRATUM, generateGroupOutputPath(DEFAULT_STRATUM));
            for (final SVStatificationEngine.Stratum s : engine.getStrata()) {
                createGroupWriter(s.getName(), generateGroupOutputPath(s.getName()));
            }
        } else {
            createGroupWriter(DEFAULT_STRATUM, outputPath.toPath());
        }
    }

    /**
     * Reusable method for loading the stratification configuration table. See tool doc for the expected format.
     */
    public static SVStatificationEngine loadStratificationConfig(final SVStratificationEngineArgumentsCollection args,
                                                                 final SAMSequenceDictionary dictionary) {
        Utils.validateArg(args.trackNameList.size() == args.trackFileList.size(), "Arguments --" +
                SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME + " and --" + SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME +
                " must be specified the same number of times.");
        final Map> map = new HashMap<>();
        final Iterator nameIterator = args.trackNameList.iterator();
        final Iterator pathIterator = args.trackFileList.iterator();
        final GenomeLocParser genomeLocParser = new GenomeLocParser(dictionary);
        while (nameIterator.hasNext() && pathIterator.hasNext()) {
            final String name = nameIterator.next();
            final GATKPath path = pathIterator.next();
            final GenomeLocSortedSet genomeLocs = IntervalUtils.loadIntervals(Collections.singletonList(path.toString()), IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser);
            final List intervals = Collections.unmodifiableList(genomeLocs.toList());
            if (map.containsKey(name)) {
                throw new UserException.BadInput("Duplicate track name was specified: " + name);
            }
            map.put(name, intervals);
        }
        final SVStatificationEngine engine = SVStatificationEngine.create(map, args.configFile, dictionary);
        if (engine.getStrata().stream().anyMatch(s -> s.getName().equals(DEFAULT_STRATUM))) {
            throw new UserException.BadInput("Stratification configuration contains entry with reserved " +
                    "ID \"" + DEFAULT_STRATUM + "\"");
        }
        return engine;
    }

    @Override
    public void closeTool() {
        for (final VariantContextWriter writer : writers.values()) {
            writer.close();
        }
        super.closeTool();
    }

    @Override
    public void apply(final VariantContext variant, final ReadsContext readsContext,
                      final ReferenceContext referenceContext, final FeatureContext featureContext) {
        // Save a ton of compute by not copying genotypes into the new record
        final VariantContext variantNoGenotypes = new VariantContextBuilder(variant).genotypes(Collections.emptyList()).make();
        final SVCallRecord record = SVCallRecordUtils.create(variantNoGenotypes, dictionary);
        final Collection stratifications = engine.getMatches(record,
                stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom);
        final VariantContextBuilder builder = new VariantContextBuilder(variant);
        if (stratifications.isEmpty()) {
            writers.get(DEFAULT_STRATUM).add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, DEFAULT_STRATUM).make());
        } else {
            if (!allowMultipleMatches && stratifications.size() > 1) {
                final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()));
                throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". Bypass this error using the --" + ALLOW_MULTIPLE_MATCHES_LONG_NAME + " argument");
            }
            for (final SVStatificationEngine.Stratum stratum : stratifications) {
                final VariantContextWriter writer = splitOutput ? writers.get(stratum.getName()) : writers.get(DEFAULT_STRATUM);
                if (writer == null) {
                    throw new GATKException("Writer not found for group: " + stratum.getName());
                }
                writer.add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, stratum.getName()).make());
            }
        }
    }
}
NAME	SVTYPE	MIN_SIZE	MAX_SIZE	TRACKS
DEL_large_SD	DEL	5000	-1	SD
DUP_large_SD	DUP	5000	-1	SD
DEL_small_SR_RM	DEL	-1	5000	SR,RM
DUP_small_SR_RM	DUP	-1	5000	SR,RM
INS_SR	INS	-1	-1	SR