org.broadinstitute.hellbender.tools.walkers.sv.GroupedSVCluster Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.walkers.sv;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.tsv.TableReader;
import org.broadinstitute.hellbender.utils.tsv.TableUtils;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
/**
* Clusters structural variants using the same base algorithms as {@link SVCluster}. In addition, variants are
* grouped according to customizable stratification criteria including:
*
* - SV type
* - Size range
* - Reference track overlap
*
* The first step is to define these groups in a stratification configuration TSV file. Please see the
* {@link SVStratify} tool for a description of the stratification method and expected table format.
*
* Each SV is only clustered with other SVs in its own group. Each group must be mutually exclusive, meaning that
* any given SV should only belong to one group. Furthermore, SVs that do not fall into any of the groups will not be
* clustered.
*
* The second step is to define the clustering configuration for each group. This is again done by creating a TSV
* file with the following columns defined on the first line:
*
* - NAME
* - RECIPROCAL_OVERLAP
* - SIZE_SIMILARITY
* - BREAKEND_WINDOW
* - SAMPLE_OVERLAP
*
* where NAME corresponds to the same name given in the stratification configuration. Every group needs to be given
* a configuration here. That is, there should be a 1:1 correspondence of the rows in the two configuration files
* (order does not matter).
*
*
* The remaining columns define the clustering parameters for the group. See {@link SVCluster} for more information
* on the different parameters. Note that, unlike {@link SVCluster}, distinct parameter sets for depth-only,
* PESR, and "mixed" clustering cannot be defined for this tool. Instead, the same parameters are applied to
* all three cases.
*
* For example,
*
*
* NAME RECIPROCAL_OVERLAP SIZE_SIMILARITY BREAKEND_WINDOW SAMPLE_OVERLAP
*
*
* DEL_large_SD 0.3 0.5 1000000 0.1
*
*
* DUP_large_SD 0.3 0.5 1000000 0.1
*
*
* DEL_small_SR_RM 0.5 0.5 100 0.1
*
*
* DUP_small_SR_RM 0.5 0.5 100 0.1
*
*
* INS_SR 0.5 0.5 100 0
*
*
*
* This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.
*
* This tool does not support CNV defragmentation via the {@link #algorithm} parameter.
*
* Inputs
*
*
* -
* One or more SV VCFs
*
* -
* Stratification configuration TSV file
*
* -
* Clustering configuration TSV file
*
* -
* Reference fasta
*
*
*
* Output
*
*
* -
* Clustered VCF
*
*
*
* Usage example
*
*
* gatk GroupedSVCluster \
* -R reference.fasta \
* -V variants.vcf.gz \
* -O clustered.vcf.gz \
* --track-name repeatmasker \
* --track-intervals repeatmasker.bed \
* --stratify-config strata.tsv \
* --clustering-config cluster.tsv
*
*
* @author Mark Walker <[email protected]>
*/
@CommandLineProgramProperties(
summary = "Clusters structural variants within independent stratification groups",
oneLineSummary = "Clusters structural variants grouping by type, size, and track overlap",
programGroup = StructuralVariantDiscoveryProgramGroup.class
)
@BetaFeature
@DocumentedFeature
public final class GroupedSVCluster extends SVClusterWalker {
public static final String CLUSTERING_CONFIG_FILE_LONG_NAME = "clustering-config";
@ArgumentCollection
private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection();
/**
* Expected format is tab-delimited and contains columns NAME, RECIPROCAL_OVERLAP, SIZE_SIMILARITY, BREAKEND_WINDOW,
* SAMPLE_OVERLAP. First line must be a header with column names. Comment lines starting with
* {@link TableUtils#COMMENT_PREFIX} are ignored.
*/
@Argument(
doc = "Configuration file (.tsv) containing the clustering parameters for each group",
fullName = CLUSTERING_CONFIG_FILE_LONG_NAME
)
public GATKPath strataClusteringConfigFile;
private SVStatificationEngine stratEngine;
private final Map clusterEngineMap = new HashMap<>();
@Override
public void onTraversalStart() {
super.onTraversalStart();
// sorting not guaranteed
createOutputVariantIndex = false;
stratEngine = SVStratify.loadStratificationConfig(stratArgs, dictionary);
Utils.validate(!stratEngine.getStrata().isEmpty(),
"No strata defined with --" + SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME);
readStrataClusteringConfig();
Utils.validate(stratEngine.getStrata().size() == clusterEngineMap.size(),
"Stratification and clustering configurations have a different number of groups.");
for (final SVStatificationEngine.Stratum stratum : stratEngine.getStrata()) {
Utils.validate(clusterEngineMap.containsKey(stratum.getName()),
"Could not find group " + stratum.getName() + " in clustering configuration.");
}
}
@Override
protected VCFHeader createHeader() {
final VCFHeader header = super.createHeader();
SVStratify.addStratifyMetadata(header);
return header;
}
private void readStrataClusteringConfig() {
try (final TableReader tableReader = TableUtils.reader(strataClusteringConfigFile.toPath(), StratifiedClusteringTableParser::tableParser)) {
for (final StratifiedClusteringTableParser.StratumParameters parameters : tableReader) {
// Identical parameters for each linkage type
final ClusteringParameters pesrParams = ClusteringParameters.createPesrParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
final ClusteringParameters mixedParams = ClusteringParameters.createMixedParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
final ClusteringParameters depthParams = ClusteringParameters.createDepthParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
final SVClusterEngine clusterEngine = createClusteringEngine(pesrParams, mixedParams, depthParams);
clusterEngineMap.put(parameters.name(), clusterEngine);
}
} catch (final IOException e) {
throw new GATKException("IO error while reading config table", e);
}
}
private SVClusterEngine createClusteringEngine(final ClusteringParameters pesrParams, final ClusteringParameters mixedParams, final ClusteringParameters depthParams) {
if (algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE || algorithm == CLUSTER_ALGORITHM.MAX_CLIQUE) {
final SVClusterEngine.CLUSTERING_TYPE type = algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE ?
SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE : SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE;
return SVClusterEngineFactory.createCanonical(type, breakpointSummaryStrategy,
altAlleleSummaryStrategy, dictionary, reference, enableCnv,
depthParams, mixedParams, pesrParams);
} else {
throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name());
}
}
@Override
public Object onTraversalSuccess() {
for (final SVClusterEngine engine : clusterEngineMap.values()) {
engine.flush().stream().forEach(this::write);
}
return super.onTraversalSuccess();
}
@Override
public void closeTool() {
super.closeTool();
}
@Override
public void applyRecord(final SVCallRecord record) {
final Collection stratifications = stratEngine.getMatches(record,
stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom);
if (stratifications.size() > 1) {
// don't allow more than one match since it would proliferate variants
final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()));
throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString +
". Groups must be mutually exclusive. Please modify the group configurations and/or tracks so that " +
"no variant can match more than one group.");
} else if (stratifications.isEmpty()) {
// no match, don't cluster
record.getAttributes().put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList(record.getId()));
record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(SVStratify.DEFAULT_STRATUM));
write(record);
} else {
// exactly one match
final SVStatificationEngine.Stratum stratum = stratifications.iterator().next();
Utils.validate(clusterEngineMap.containsKey(stratum.getName()), "Group undefined: " + stratum.getName());
record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(stratum.getName()));
clusterAndWrite(record, clusterEngineMap.get(stratum.getName()));
}
}
private void clusterAndWrite(final SVCallRecord record, final SVClusterEngine clusterEngine) {
clusterEngine.addAndFlush(record).stream().forEach(this::write);
}
}