All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVLinkage Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.sv.cluster;

import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.Iterator;
import java.util.List;

/**
 * 

Main class for SV clustering. Two items are clustered together if they:

*
    *
  • Match event types
  • *
  • Match strands
  • *
  • Match contigs
  • *
  • Meet minimum interval reciprocal overlap (unless inter-chromosomal or a BND).
  • *
  • Meet minimum sample reciprocal overlap using carrier status. Genotypes (GT fields) are used to determine carrier * status first. If not found and the event is of type DEL or DUP, then copy number (CN fields) are used instead. * In the latter case, it is expected that ploidy can be determined from the number of entries in GT.
  • *
  • Start and end coordinates are within a defined distance.
  • *
* *

Interval overlap, sample overlap, and coordinate proximity parameters are defined separately for depth-only/depth-only, * depth-only/PESR, and PESR/PESR item pairs using the {@link ClusteringParameters} class. Note that all criteria must be met for * two candidate items to cluster, unless the comparison is depth-only/depth-only, in which case only one of * interval overlap or break-end proximity must be met. For insertions with undefined length (SVLEN less than 1), interval * overlap is tested assuming the given start position and a length of * {@value CanonicalSVLinkage#INSERTION_ASSUMED_LENGTH_FOR_OVERLAP}.

*/ public class CanonicalSVLinkage extends SVClusterLinkage { protected final SAMSequenceDictionary dictionary; protected final boolean clusterDelWithDup; // DEL/DUP multi-allelic site clustering protected ClusteringParameters depthOnlyParams; protected ClusteringParameters mixedParams; protected ClusteringParameters evidenceParams; public static final int INSERTION_ASSUMED_LENGTH_FOR_OVERLAP = 50; public static final int INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY = 1; public static final double DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY = 0.8; public static final double DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY = 0; public static final int DEFAULT_WINDOW_DEPTH_ONLY = 10000000; public static final double DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY = 0; public static final double DEFAULT_RECIPROCAL_OVERLAP_MIXED = 0.8; public static final double DEFAULT_SIZE_SIMILARITY_MIXED = 0; public static final int DEFAULT_WINDOW_MIXED = 1000; public static final double DEFAULT_SAMPLE_OVERLAP_MIXED = 0; public static final double DEFAULT_RECIPROCAL_OVERLAP_PESR = 0.5; public static final double DEFAULT_SIZE_SIMILARITY_PESR = 0; public static final int DEFAULT_WINDOW_PESR = 500; public static final double DEFAULT_SAMPLE_OVERLAP_PESR = 0; protected static final Logger logger = LogManager.getLogger(CanonicalSVLinkage.class); public static final ClusteringParameters DEFAULT_DEPTH_ONLY_PARAMS = ClusteringParameters.createDepthParameters( DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY, DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY, DEFAULT_WINDOW_DEPTH_ONLY, DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY); public static final ClusteringParameters DEFAULT_MIXED_PARAMS = ClusteringParameters.createMixedParameters( DEFAULT_RECIPROCAL_OVERLAP_MIXED, DEFAULT_SIZE_SIMILARITY_MIXED, DEFAULT_WINDOW_MIXED, DEFAULT_SAMPLE_OVERLAP_MIXED); public static final ClusteringParameters DEFAULT_PESR_PARAMS = ClusteringParameters.createPesrParameters( DEFAULT_RECIPROCAL_OVERLAP_PESR, DEFAULT_SIZE_SIMILARITY_PESR, DEFAULT_WINDOW_PESR, DEFAULT_SAMPLE_OVERLAP_PESR); /** * Create a new engine * @param dictionary sequence dictionary pertaining to clustered records * @param clusterDelWithDup enables clustering of DEL/DUP variants into CNVs */ public CanonicalSVLinkage(final SAMSequenceDictionary dictionary, final boolean clusterDelWithDup) { Utils.nonNull(dictionary); this.dictionary = dictionary; this.depthOnlyParams = DEFAULT_DEPTH_ONLY_PARAMS; this.mixedParams = DEFAULT_MIXED_PARAMS; this.evidenceParams = DEFAULT_PESR_PARAMS; this.clusterDelWithDup = clusterDelWithDup; } @Override public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) { if (!typesMatch(a, b)) { return false; } // Only require matching strands if BND or INV type if ((a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND || a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INV) && !strandsMatch(a, b)) { return false; } // Checks appropriate parameter set if (evidenceParams.isValidPair(a, b)) { return clusterTogetherWithParams(a, b, evidenceParams, dictionary); } else if (mixedParams.isValidPair(a, b)) { return clusterTogetherWithParams(a, b, mixedParams, dictionary); } else if (depthOnlyParams.isValidPair(a, b)) { return clusterTogetherWithParams(a, b, depthOnlyParams, dictionary); } else { return false; } } /** * Tests if SVTYPEs match, allowing for DEL/DUP/CNVs to match in some cases. */ protected boolean typesMatch(final SVCallRecord a, final SVCallRecord b) { final GATKSVVCFConstants.StructuralVariantAnnotationType aType = a.getType(); final GATKSVVCFConstants.StructuralVariantAnnotationType bType = b.getType(); if (aType == bType && a.getComplexSubtype() == b.getComplexSubtype()) { return true; } // Allow CNVs to cluster with both DELs and DUPs, but only allow DEL/DUP clustering if enabled if (a.isSimpleCNV() && b.isSimpleCNV()) { if (clusterDelWithDup || (aType == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV || bType == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV)) { return true; } } return false; } /** * Test if breakend strands match. True if one has null strands. */ protected boolean strandsMatch(final SVCallRecord a, final SVCallRecord b) { if (a.nullStrands() || b.nullStrands()) { return true; } return a.getStrandA() == b.getStrandA() && a.getStrandB() == b.getStrandB(); } /** * Helper function to test for clustering between two items given a particular set of parameters. * Note that the records have already been subjected to the checks in {@link #areClusterable(SVCallRecord, SVCallRecord)}. */ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVCallRecord b, final ClusteringParameters params, final SAMSequenceDictionary dictionary) { // Contigs match if (!(a.getContigA().equals(b.getContigA()) && a.getContigB().equals(b.getContigB()))) { return false; } // If complex, test complex intervals if (a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX && b.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX && !testComplexIntervals(a, b, params.getReciprocalOverlap(), params.getSizeSimilarity(), params.getWindow(), dictionary)) { return false; } // Reciprocal overlap and size similarity // Check bypassed if both are inter-chromosomal final Boolean hasReciprocalOverlapAndSizeSimilarity; if (a.isIntrachromosomal()) { final boolean hasReciprocalOverlap = testReciprocalOverlap(a, b, params.getReciprocalOverlap()); final boolean hasSizeSimilarity = testSizeSimilarity(a, b, params.getSizeSimilarity()); hasReciprocalOverlapAndSizeSimilarity = hasReciprocalOverlap && hasSizeSimilarity; if (params.requiresOverlapAndProximity() && !hasReciprocalOverlapAndSizeSimilarity) { return false; } } else { hasReciprocalOverlapAndSizeSimilarity = null; } // Breakend proximity final boolean hasBreakendProximity = testBreakendProximity(a, b, params.getWindow(), dictionary); // Use short-circuiting statements since sample overlap is the least scalable / slowest check if (hasReciprocalOverlapAndSizeSimilarity == null) { return hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap()); } else { if (params.requiresOverlapAndProximity()) { return hasReciprocalOverlapAndSizeSimilarity && hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap()); } else { return (hasReciprocalOverlapAndSizeSimilarity || hasBreakendProximity) && hasSampleOverlap(a, b, params.getSampleOverlap()); } } } /** * Performs overlap testing on each pair of complex intervals in two records, requiring each pair to be * sufficiently similar by reciprocal overlap, size similarity, and breakend proximity. */ private static boolean testComplexIntervals(final SVCallRecord a, final SVCallRecord b, final double overlapThreshold, final double sizeSimilarityThreshold, final int window, final SAMSequenceDictionary dictionary) { final List intervalsA = a.getComplexEventIntervals(); final List intervalsB = b.getComplexEventIntervals(); if (intervalsA.size() != intervalsB.size()) { return false; } final Iterator iterA = intervalsA.iterator(); final Iterator iterB = intervalsB.iterator(); for (int i = 0; i < intervalsA.size(); i++) { final SVCallRecord.ComplexEventInterval cpxIntervalA = iterA.next(); final SVCallRecord.ComplexEventInterval cpxIintervalB = iterB.next(); if (cpxIntervalA.getIntervalSVType() != cpxIintervalB.getIntervalSVType()) { return false; } final SimpleInterval intervalA = cpxIntervalA.getInterval(); final SimpleInterval intervalB = cpxIintervalB.getInterval(); if (!(IntervalUtils.isReciprocalOverlap(intervalA, intervalB, overlapThreshold) && testSizeSimilarity(intervalA.getLengthOnReference(), intervalB.getLengthOnReference(), sizeSimilarityThreshold) && testBreakendProximity(new SimpleInterval(intervalA.getContig(), intervalA.getStart(), intervalA.getStart()), new SimpleInterval(intervalA.getContig(), intervalA.getEnd(), intervalA.getEnd()), new SimpleInterval(intervalB.getContig(), intervalB.getStart(), intervalB.getStart()), new SimpleInterval(intervalB.getContig(), intervalB.getEnd(), intervalB.getEnd()), window, dictionary))) { return false; } } return true; } private static boolean testReciprocalOverlap(final SVCallRecord a, final SVCallRecord b, final double threshold) { final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLength(a, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1); final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLength(b, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1); return IntervalUtils.isReciprocalOverlap(intervalA, intervalB, threshold); } private static boolean testSizeSimilarity(final SVCallRecord a, final SVCallRecord b, final double threshold) { return testSizeSimilarity(getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY), getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY), threshold); } private static boolean testSizeSimilarity(final int lengthA, final int lengthB, final double threshold) { return Math.min(lengthA, lengthB) / (double) Math.max(lengthA, lengthB) >= threshold; } private static boolean testBreakendProximity(final SVCallRecord a, final SVCallRecord b, final int window, final SAMSequenceDictionary dictionary) { return testBreakendProximity(a.getPositionAInterval(), a.getPositionBInterval(), b.getPositionAInterval(), b.getPositionBInterval(), window, dictionary); } private static boolean testBreakendProximity(final SimpleInterval intervalA1, final SimpleInterval intervalA2, final SimpleInterval intervalB1, final SimpleInterval intervalB2, final int window, final SAMSequenceDictionary dictionary) { final SimpleInterval intervalA1Padded = intervalA1.expandWithinContig(window, dictionary); final SimpleInterval intervalA2Padded = intervalA2.expandWithinContig(window, dictionary); if (intervalA1Padded == null) { logger.warn("Invalid start position " + intervalA1.getContig() + ":" + intervalA1.getStart() + " - record will not be matched"); return false; } if (intervalA2Padded == null) { logger.warn("Invalid end position " + intervalA2.getContig() + ":" + intervalA2.getStart() + " - record will not be matched"); return false; } return intervalA1Padded.overlaps(intervalB1) && intervalA2Padded.overlaps(intervalB2); } /** * Gets event length used for overlap testing. */ private static int getLength(final SVCallRecord record, final int missingInsertionLength) { Utils.validate(record.isIntrachromosomal(), "Record must be intra-chromosomal"); if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS || record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) { return record.getLength() == null ? missingInsertionLength : Math.max(record.getLength(), 1); } else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND) { return record.getPositionB() - record.getPositionA() + 1; } else { // TODO lengths less than 1 shouldn't be valid return Math.max(record.getLength() == null ? 1 : record.getLength(), 1); } } @Override public int getMaxClusterableStartingPosition(final SVCallRecord record) { return Math.max( getMaxClusterableStartingPositionWithParams(record, record.isDepthOnly() ? depthOnlyParams : evidenceParams, dictionary), getMaxClusterableStartingPositionWithParams(record, mixedParams, dictionary) ); } /** * Returns max feasible start position of an item clusterable with the given record, given a set of clustering parameters. */ private static int getMaxClusterableStartingPositionWithParams(final SVCallRecord call, final ClusteringParameters params, final SAMSequenceDictionary dictionary) { final String contig = call.getContigA(); final int contigLength = dictionary.getSequence(contig).getSequenceLength(); // Breakend proximity window final int maxPositionByWindow = Math.min(call.getPositionA() + params.getWindow(), contigLength); // Don't use overlap for inter-chromosomal events if (!call.isIntrachromosomal()) { return maxPositionByWindow; } // Reciprocal overlap window final int maxPositionByOverlap; final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLength(call, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP)); maxPositionByOverlap = Math.min(maxPosition, contigLength); if (params.requiresOverlapAndProximity()) { return Math.min(maxPositionByOverlap, maxPositionByWindow); } else { return Math.max(maxPositionByOverlap, maxPositionByWindow); } } public final ClusteringParameters getDepthOnlyParams() { return depthOnlyParams; } public final ClusteringParameters getMixedParams() { return mixedParams; } public final ClusteringParameters getEvidenceParams() { return evidenceParams; } public final void setDepthOnlyParams(ClusteringParameters depthOnlyParams) { this.depthOnlyParams = depthOnlyParams; } public final void setMixedParams(ClusteringParameters mixedParams) { this.mixedParams = mixedParams; } public final void setEvidenceParams(ClusteringParameters evidenceParams) { this.evidenceParams = evidenceParams; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy