All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.sv.GroupedSVCluster Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.sv;

import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.tsv.TableReader;
import org.broadinstitute.hellbender.utils.tsv.TableUtils;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * 

Clusters structural variants using the same base algorithms as {@link SVCluster}. In addition, variants are * grouped according to customizable stratification criteria including: *

    *
  • SV type
  • *
  • Size range
  • *
  • Reference track overlap
  • *
* The first step is to define these groups in a stratification configuration TSV file. Please see the * {@link SVStratify} tool for a description of the stratification method and expected table format. * *

Each SV is only clustered with other SVs in its own group. Each group must be mutually exclusive, meaning that * any given SV should only belong to one group. Furthermore, SVs that do not fall into any of the groups will not be * clustered.

* *

The second step is to define the clustering configuration for each group. This is again done by creating a TSV * file with the following columns defined on the first line: *

    *
  1. NAME
  2. *
  3. RECIPROCAL_OVERLAP
  4. *
  5. SIZE_SIMILARITY
  6. *
  7. BREAKEND_WINDOW
  8. *
  9. SAMPLE_OVERLAP
  10. *
* where NAME corresponds to the same name given in the stratification configuration. Every group needs to be given * a configuration here. That is, there should be a 1:1 correspondence of the rows in the two configuration files * (order does not matter). *

* *

The remaining columns define the clustering parameters for the group. See {@link SVCluster} for more information * on the different parameters. Note that, unlike {@link SVCluster}, distinct parameter sets for depth-only, * PESR, and "mixed" clustering cannot be defined for this tool. Instead, the same parameters are applied to * all three cases.

* *

For example,

* * * * * * * * * * * * * * * * * * * *
NAMERECIPROCAL_OVERLAPSIZE_SIMILARITYBREAKEND_WINDOWSAMPLE_OVERLAP
DEL_large_SD0.30.510000000.1
DUP_large_SD0.30.510000000.1
DEL_small_SR_RM0.50.51000.1
DUP_small_SR_RM0.50.51000.1
INS_SR0.50.51000
* *

This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.

* *

This tool does not support CNV defragmentation via the {@link #algorithm} parameter.

* *

Inputs

* *
    *
  • * One or more SV VCFs *
  • *
  • * Stratification configuration TSV file *
  • *
  • * Clustering configuration TSV file *
  • *
  • * Reference fasta *
  • *
* *

Output

* *
    *
  • * Clustered VCF *
  • *
* *

Usage example

* *
 *     gatk GroupedSVCluster \
 *       -R reference.fasta \
 *       -V variants.vcf.gz \
 *       -O clustered.vcf.gz \
 *       --track-name repeatmasker \
 *       --track-intervals repeatmasker.bed \
 *       --stratify-config strata.tsv \
 *       --clustering-config cluster.tsv
 * 
* * @author Mark Walker <[email protected]> */ @CommandLineProgramProperties( summary = "Clusters structural variants within independent stratification groups", oneLineSummary = "Clusters structural variants grouping by type, size, and track overlap", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature @DocumentedFeature public final class GroupedSVCluster extends SVClusterWalker { public static final String CLUSTERING_CONFIG_FILE_LONG_NAME = "clustering-config"; @ArgumentCollection private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection(); /** * Expected format is tab-delimited and contains columns NAME, RECIPROCAL_OVERLAP, SIZE_SIMILARITY, BREAKEND_WINDOW, * SAMPLE_OVERLAP. First line must be a header with column names. Comment lines starting with * {@link TableUtils#COMMENT_PREFIX} are ignored. */ @Argument( doc = "Configuration file (.tsv) containing the clustering parameters for each group", fullName = CLUSTERING_CONFIG_FILE_LONG_NAME ) public GATKPath strataClusteringConfigFile; private SVStatificationEngine stratEngine; private final Map clusterEngineMap = new HashMap<>(); @Override public void onTraversalStart() { super.onTraversalStart(); // sorting not guaranteed createOutputVariantIndex = false; stratEngine = SVStratify.loadStratificationConfig(stratArgs, dictionary); Utils.validate(!stratEngine.getStrata().isEmpty(), "No strata defined with --" + SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME); readStrataClusteringConfig(); Utils.validate(stratEngine.getStrata().size() == clusterEngineMap.size(), "Stratification and clustering configurations have a different number of groups."); for (final SVStatificationEngine.Stratum stratum : stratEngine.getStrata()) { Utils.validate(clusterEngineMap.containsKey(stratum.getName()), "Could not find group " + stratum.getName() + " in clustering configuration."); } } @Override protected VCFHeader createHeader() { final VCFHeader header = super.createHeader(); SVStratify.addStratifyMetadata(header); return header; } private void readStrataClusteringConfig() { try (final TableReader tableReader = TableUtils.reader(strataClusteringConfigFile.toPath(), StratifiedClusteringTableParser::tableParser)) { for (final StratifiedClusteringTableParser.StratumParameters parameters : tableReader) { // Identical parameters for each linkage type final ClusteringParameters pesrParams = ClusteringParameters.createPesrParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap()); final ClusteringParameters mixedParams = ClusteringParameters.createMixedParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap()); final ClusteringParameters depthParams = ClusteringParameters.createDepthParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap()); final SVClusterEngine clusterEngine = createClusteringEngine(pesrParams, mixedParams, depthParams); clusterEngineMap.put(parameters.name(), clusterEngine); } } catch (final IOException e) { throw new GATKException("IO error while reading config table", e); } } private SVClusterEngine createClusteringEngine(final ClusteringParameters pesrParams, final ClusteringParameters mixedParams, final ClusteringParameters depthParams) { if (algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE || algorithm == CLUSTER_ALGORITHM.MAX_CLIQUE) { final SVClusterEngine.CLUSTERING_TYPE type = algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE ? SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE : SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE; return SVClusterEngineFactory.createCanonical(type, breakpointSummaryStrategy, altAlleleSummaryStrategy, dictionary, reference, enableCnv, depthParams, mixedParams, pesrParams); } else { throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name()); } } @Override public Object onTraversalSuccess() { for (final SVClusterEngine engine : clusterEngineMap.values()) { engine.flush().stream().forEach(this::write); } return super.onTraversalSuccess(); } @Override public void closeTool() { super.closeTool(); } @Override public void applyRecord(final SVCallRecord record) { final Collection stratifications = stratEngine.getMatches(record, stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom); if (stratifications.size() > 1) { // don't allow more than one match since it would proliferate variants final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList())); throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". Groups must be mutually exclusive. Please modify the group configurations and/or tracks so that " + "no variant can match more than one group."); } else if (stratifications.isEmpty()) { // no match, don't cluster record.getAttributes().put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList(record.getId())); record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(SVStratify.DEFAULT_STRATUM)); write(record); } else { // exactly one match final SVStatificationEngine.Stratum stratum = stratifications.iterator().next(); Utils.validate(clusterEngineMap.containsKey(stratum.getName()), "Group undefined: " + stratum.getName()); record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(stratum.getName())); clusterAndWrite(record, clusterEngineMap.get(stratum.getName())); } } private void clusterAndWrite(final SVCallRecord record, final SVClusterEngine clusterEngine) { clusterEngine.addAndFlush(record).stream().forEach(this::write); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy