org.broadinstitute.hellbender.tools.walkers.sv.GroupedSVCluster Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.walkers.sv;

import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.tsv.TableReader;
import org.broadinstitute.hellbender.utils.tsv.TableUtils;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Clusters structural variants using the same base algorithms as {@link SVCluster}. In addition, variants are
 * grouped according to customizable stratification criteria including:
 * 

 *     SV type
 *     Size range
 *     Reference track overlap
 * 
 * The first step is to define these groups in a stratification configuration TSV file. Please see the
 * {@link SVStratify} tool for a description of the stratification method and expected table format.
 *
 * Each SV is only clustered with other SVs in its own group. Each group must be mutually exclusive, meaning that
 * any given SV should only belong to one group. Furthermore, SVs that do not fall into any of the groups will not be
 * clustered.
 *
 * The second step is to define the clustering configuration for each group. This is again done by creating a TSV
 * file with the following columns defined on the first line:
 * 

 *     NAME
 *     RECIPROCAL_OVERLAP
 *     SIZE_SIMILARITY
 *     BREAKEND_WINDOW
 *     SAMPLE_OVERLAP
 * 
 * where NAME corresponds to the same name given in the stratification configuration. Every group needs to be given
 * a configuration here. That is, there should be a 1:1 correspondence of the rows in the two configuration files
 * (order does not matter).
 * 
 *
 * The remaining columns define the clustering parameters for the group. See {@link SVCluster} for more information
 * on the different parameters. Note that, unlike {@link SVCluster}, distinct parameter sets for depth-only,
 * PESR, and "mixed" clustering cannot be defined for this tool. Instead, the same parameters are applied to
 * all three cases.
 *
 * For example,
 * 
 *   
 *     
 *   
 *   
 *     
 *   
 *   
 *     
 *   
 *   
 *     
 *   
 *   
 *     
 *   
 *   
 *     
 *   
 * NAME RECIPROCAL_OVERLAP SIZE_SIMILARITY BREAKEND_WINDOW SAMPLE_OVERLAP
DEL_large_SD 0.3 0.5 1000000 0.1
DUP_large_SD 0.3 0.5 1000000 0.1
DEL_small_SR_RM 0.5 0.5 100 0.1
DUP_small_SR_RM 0.5 0.5 100 0.1
INS_SR 0.5 0.5 100 0
 *
 * This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.
 *
 * This tool does not support CNV defragmentation via the {@link #algorithm} parameter.
 *
 * Inputs
 *
 * 
 *     
 *         One or more SV VCFs
 *     
 *     
 *         Stratification configuration TSV file
 *     
 *     
 *         Clustering configuration TSV file
 *     
 *     
 *         Reference fasta
 *     
 * 
 *
 * Output
 *
 * 
 *     
 *         Clustered VCF
 *     
 * 
 *
 * Usage example
 *
 *  *     gatk GroupedSVCluster \
 *       -R reference.fasta \
 *       -V variants.vcf.gz \
 *       -O clustered.vcf.gz \
 *       --track-name repeatmasker \
 *       --track-intervals repeatmasker.bed \
 *       --stratify-config strata.tsv \
 *       --clustering-config cluster.tsv
 * 
 *
 * @author Mark Walker <[email protected]>
 */
@CommandLineProgramProperties(
        summary = "Clusters structural variants within independent stratification groups",
        oneLineSummary = "Clusters structural variants grouping by type, size, and track overlap",
        programGroup = StructuralVariantDiscoveryProgramGroup.class
)
@BetaFeature
@DocumentedFeature
public final class GroupedSVCluster extends SVClusterWalker {
    public static final String CLUSTERING_CONFIG_FILE_LONG_NAME = "clustering-config";

    @ArgumentCollection
    private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection();

    /**
     * Expected format is tab-delimited and contains columns NAME, RECIPROCAL_OVERLAP, SIZE_SIMILARITY, BREAKEND_WINDOW,
     * SAMPLE_OVERLAP. First line must be a header with column names. Comment lines starting with
     * {@link TableUtils#COMMENT_PREFIX} are ignored.
     */
    @Argument(
            doc = "Configuration file (.tsv) containing the clustering parameters for each group",
            fullName = CLUSTERING_CONFIG_FILE_LONG_NAME
    )
    public GATKPath strataClusteringConfigFile;

    private SVStatificationEngine stratEngine;
    private final Map clusterEngineMap = new HashMap<>();

    @Override
    public void onTraversalStart() {
        super.onTraversalStart();
        // sorting not guaranteed
        createOutputVariantIndex = false;
        stratEngine = SVStratify.loadStratificationConfig(stratArgs, dictionary);
        Utils.validate(!stratEngine.getStrata().isEmpty(),
                "No strata defined with --" + SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME);
        readStrataClusteringConfig();
        Utils.validate(stratEngine.getStrata().size() == clusterEngineMap.size(),
                "Stratification and clustering configurations have a different number of groups.");
        for (final SVStatificationEngine.Stratum stratum : stratEngine.getStrata()) {
            Utils.validate(clusterEngineMap.containsKey(stratum.getName()),
                    "Could not find group " + stratum.getName() + " in clustering configuration.");
        }
    }

    @Override
    protected VCFHeader createHeader() {
        final VCFHeader header = super.createHeader();
        SVStratify.addStratifyMetadata(header);
        return header;
    }

    private void readStrataClusteringConfig() {
        try (final TableReader tableReader = TableUtils.reader(strataClusteringConfigFile.toPath(), StratifiedClusteringTableParser::tableParser)) {
            for (final StratifiedClusteringTableParser.StratumParameters parameters : tableReader) {
                // Identical parameters for each linkage type
                final ClusteringParameters pesrParams = ClusteringParameters.createPesrParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
                final ClusteringParameters mixedParams = ClusteringParameters.createMixedParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
                final ClusteringParameters depthParams = ClusteringParameters.createDepthParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
                final SVClusterEngine clusterEngine = createClusteringEngine(pesrParams, mixedParams, depthParams);
                clusterEngineMap.put(parameters.name(), clusterEngine);
            }
        } catch (final IOException e) {
            throw new GATKException("IO error while reading config table", e);
        }
    }

    private SVClusterEngine createClusteringEngine(final ClusteringParameters pesrParams, final ClusteringParameters mixedParams, final ClusteringParameters depthParams) {
        if (algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE || algorithm == CLUSTER_ALGORITHM.MAX_CLIQUE) {
            final SVClusterEngine.CLUSTERING_TYPE type = algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE ?
                    SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE : SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE;
            return SVClusterEngineFactory.createCanonical(type, breakpointSummaryStrategy,
                    altAlleleSummaryStrategy, dictionary, reference, enableCnv,
                    depthParams, mixedParams, pesrParams);
        } else {
            throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name());
        }
    }

    @Override
    public Object onTraversalSuccess() {
        for (final SVClusterEngine engine : clusterEngineMap.values()) {
            engine.flush().stream().forEach(this::write);
        }
        return super.onTraversalSuccess();
    }

    @Override
    public void closeTool() {
        super.closeTool();
    }

    @Override
    public void applyRecord(final SVCallRecord record) {
        final Collection stratifications = stratEngine.getMatches(record,
                stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom);
        if (stratifications.size() > 1) {
            // don't allow more than one match since it would proliferate variants
            final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()));
            throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString +
                    ". Groups must be mutually exclusive. Please modify the group configurations and/or tracks so that " +
                    "no variant can match more than one group.");
        } else if (stratifications.isEmpty()) {
            // no match, don't cluster
            record.getAttributes().put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList(record.getId()));
            record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(SVStratify.DEFAULT_STRATUM));
            write(record);
        } else {
            // exactly one match
            final SVStatificationEngine.Stratum stratum = stratifications.iterator().next();
            Utils.validate(clusterEngineMap.containsKey(stratum.getName()), "Group undefined: " + stratum.getName());
            record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(stratum.getName()));
            clusterAndWrite(record, clusterEngineMap.get(stratum.getName()));
        }
    }

    private void clusterAndWrite(final SVCallRecord record, final SVClusterEngine clusterEngine) {
        clusterEngine.addAndFlush(record).stream().forEach(this::write);
    }
}
NAME	RECIPROCAL_OVERLAP	SIZE_SIMILARITY	BREAKEND_WINDOW	SAMPLE_OVERLAP
DEL_large_SD	0.3	0.5	1000000	0.1
DUP_large_SD	0.3	0.5	1000000	0.1
DEL_small_SR_RM	0.5	0.5	100	0.1
DUP_small_SR_RM	0.5	0.5	100	0.1
INS_SR	0.5	0.5	100	0