 
                        
        
                        
        org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVLinkage Maven / Gradle / Ivy
 The newest version!
        
        package org.broadinstitute.hellbender.tools.sv.cluster;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import java.util.Iterator;
import java.util.List;
/**
 * Main class for SV clustering. Two items are clustered together if they:
 * 
 *   - Match event types*
- Match strands*
- Match contigs*
- Meet minimum interval reciprocal overlap (unless inter-chromosomal or a BND).*
- Meet minimum sample reciprocal overlap using carrier status. Genotypes (GT fields) are used to determine carrier
 *   status first. If not found and the event is of type DEL or DUP, then copy number (CN fields) are used instead.
 *   In the latter case, it is expected that ploidy can be determined from the number of entries in GT.*
- Start and end coordinates are within a defined distance.*
*
 *Interval overlap, sample overlap, and coordinate proximity parameters are defined separately for depth-only/depth-only,
 * depth-only/PESR, and PESR/PESR item pairs using the {@link ClusteringParameters} class. Note that all criteria must be met for
 * two candidate items to cluster, unless the comparison is depth-only/depth-only, in which case only one of
 * interval overlap or break-end proximity must be met. For insertions with undefined length (SVLEN less than 1), interval
 * overlap is tested assuming the given start position and a length of
 * {@value CanonicalSVLinkage#INSERTION_ASSUMED_LENGTH_FOR_OVERLAP}.
 */
public class CanonicalSVLinkage extends SVClusterLinkage {
    protected final SAMSequenceDictionary dictionary;
    protected final boolean clusterDelWithDup;  // DEL/DUP multi-allelic site clustering
    protected ClusteringParameters depthOnlyParams;
    protected ClusteringParameters mixedParams;
    protected ClusteringParameters evidenceParams;
    public static final int INSERTION_ASSUMED_LENGTH_FOR_OVERLAP = 50;
    public static final int INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY = 1;
    public static final double DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY = 0.8;
    public static final double DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY = 0;
    public static final int DEFAULT_WINDOW_DEPTH_ONLY = 10000000;
    public static final double DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY = 0;
    public static final double DEFAULT_RECIPROCAL_OVERLAP_MIXED = 0.8;
    public static final double DEFAULT_SIZE_SIMILARITY_MIXED = 0;
    public static final int DEFAULT_WINDOW_MIXED = 1000;
    public static final double DEFAULT_SAMPLE_OVERLAP_MIXED = 0;
    public static final double DEFAULT_RECIPROCAL_OVERLAP_PESR = 0.5;
    public static final double DEFAULT_SIZE_SIMILARITY_PESR = 0;
    public static final int DEFAULT_WINDOW_PESR = 500;
    public static final double DEFAULT_SAMPLE_OVERLAP_PESR = 0;
    protected static final Logger logger = LogManager.getLogger(CanonicalSVLinkage.class);
    public static final ClusteringParameters DEFAULT_DEPTH_ONLY_PARAMS =
            ClusteringParameters.createDepthParameters(
                    DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY,
                    DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY,
                    DEFAULT_WINDOW_DEPTH_ONLY,
                    DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY);
    public static final ClusteringParameters DEFAULT_MIXED_PARAMS =
            ClusteringParameters.createMixedParameters(
                    DEFAULT_RECIPROCAL_OVERLAP_MIXED,
                    DEFAULT_SIZE_SIMILARITY_MIXED,
                    DEFAULT_WINDOW_MIXED,
                    DEFAULT_SAMPLE_OVERLAP_MIXED);
    public static final ClusteringParameters DEFAULT_PESR_PARAMS =
            ClusteringParameters.createPesrParameters(
                    DEFAULT_RECIPROCAL_OVERLAP_PESR,
                    DEFAULT_SIZE_SIMILARITY_PESR,
                    DEFAULT_WINDOW_PESR,
                    DEFAULT_SAMPLE_OVERLAP_PESR);
    /**
     * Create a new engine
     * @param dictionary sequence dictionary pertaining to clustered records
     * @param clusterDelWithDup enables clustering of DEL/DUP variants into CNVs
     */
    public CanonicalSVLinkage(final SAMSequenceDictionary dictionary,
                              final boolean clusterDelWithDup) {
        Utils.nonNull(dictionary);
        this.dictionary = dictionary;
        this.depthOnlyParams = DEFAULT_DEPTH_ONLY_PARAMS;
        this.mixedParams = DEFAULT_MIXED_PARAMS;
        this.evidenceParams = DEFAULT_PESR_PARAMS;
        this.clusterDelWithDup = clusterDelWithDup;
    }
    @Override
    public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {
        if (!typesMatch(a, b)) {
            return false;
        }
        // Only require matching strands if BND or INV type
        if ((a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
                || a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INV)
                && !strandsMatch(a, b)) {
            return false;
        }
        // Checks appropriate parameter set
        if (evidenceParams.isValidPair(a, b)) {
            return clusterTogetherWithParams(a, b, evidenceParams, dictionary);
        } else if (mixedParams.isValidPair(a, b)) {
            return clusterTogetherWithParams(a, b, mixedParams, dictionary);
        } else if (depthOnlyParams.isValidPair(a, b)) {
            return clusterTogetherWithParams(a, b, depthOnlyParams, dictionary);
        } else {
            return false;
        }
    }
    /**
     * Tests if SVTYPEs match, allowing for DEL/DUP/CNVs to match in some cases.
     */
    protected boolean typesMatch(final SVCallRecord a, final SVCallRecord b) {
        final GATKSVVCFConstants.StructuralVariantAnnotationType aType = a.getType();
        final GATKSVVCFConstants.StructuralVariantAnnotationType bType = b.getType();
        if (aType == bType && a.getComplexSubtype() == b.getComplexSubtype()) {
            return true;
        }
        // Allow CNVs to cluster with both DELs and DUPs, but only allow DEL/DUP clustering if enabled
        if (a.isSimpleCNV() && b.isSimpleCNV()) {
            if (clusterDelWithDup || (aType == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV || bType == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV)) {
                return true;
            }
        }
        return false;
    }
    /**
     * Test if breakend strands match. True if one has null strands.
     */
    protected boolean strandsMatch(final SVCallRecord a, final SVCallRecord b) {
        if (a.nullStrands() || b.nullStrands()) {
            return true;
        }
        return a.getStrandA() == b.getStrandA() && a.getStrandB() == b.getStrandB();
    }
    /**
     * Helper function to test for clustering between two items given a particular set of parameters.
     * Note that the records have already been subjected to the checks in {@link #areClusterable(SVCallRecord, SVCallRecord)}.
     */
    private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVCallRecord b,
                                                     final ClusteringParameters params,
                                                     final SAMSequenceDictionary dictionary) {
        // Contigs match
        if (!(a.getContigA().equals(b.getContigA()) && a.getContigB().equals(b.getContigB()))) {
            return false;
        }
        // If complex, test complex intervals
        if (a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX
                && b.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX &&
                !testComplexIntervals(a, b, params.getReciprocalOverlap(), params.getSizeSimilarity(), params.getWindow(), dictionary)) {
            return false;
        }
        // Reciprocal overlap and size similarity
        // Check bypassed if both are inter-chromosomal
        final Boolean hasReciprocalOverlapAndSizeSimilarity;
        if (a.isIntrachromosomal()) {
            final boolean hasReciprocalOverlap = testReciprocalOverlap(a, b, params.getReciprocalOverlap());
            final boolean hasSizeSimilarity = testSizeSimilarity(a, b, params.getSizeSimilarity());
            hasReciprocalOverlapAndSizeSimilarity = hasReciprocalOverlap && hasSizeSimilarity;
            if (params.requiresOverlapAndProximity() && !hasReciprocalOverlapAndSizeSimilarity) {
                return false;
            }
        } else {
            hasReciprocalOverlapAndSizeSimilarity = null;
        }
        // Breakend proximity
        final boolean hasBreakendProximity = testBreakendProximity(a, b, params.getWindow(), dictionary);
        // Use short-circuiting statements since sample overlap is the least scalable / slowest check
        if (hasReciprocalOverlapAndSizeSimilarity == null) {
            return hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
        } else {
            if (params.requiresOverlapAndProximity()) {
                return hasReciprocalOverlapAndSizeSimilarity && hasBreakendProximity && hasSampleOverlap(a, b, params.getSampleOverlap());
            } else {
                return (hasReciprocalOverlapAndSizeSimilarity || hasBreakendProximity) && hasSampleOverlap(a, b, params.getSampleOverlap());
            }
        }
    }
    /**
     * Performs overlap testing on each pair of complex intervals in two records, requiring each pair to be
     * sufficiently similar by reciprocal overlap, size similarity, and breakend proximity.
     */
    private static boolean testComplexIntervals(final SVCallRecord a, final SVCallRecord b, final double overlapThreshold,
                                                final double sizeSimilarityThreshold, final int window,
                                                final SAMSequenceDictionary dictionary) {
        final List intervalsA = a.getComplexEventIntervals();
        final List intervalsB = b.getComplexEventIntervals();
        if (intervalsA.size() != intervalsB.size()) {
            return false;
        }
        final Iterator iterA = intervalsA.iterator();
        final Iterator iterB = intervalsB.iterator();
        for (int i = 0; i < intervalsA.size(); i++) {
            final SVCallRecord.ComplexEventInterval cpxIntervalA = iterA.next();
            final SVCallRecord.ComplexEventInterval cpxIintervalB = iterB.next();
            if (cpxIntervalA.getIntervalSVType() != cpxIintervalB.getIntervalSVType()) {
                return false;
            }
            final SimpleInterval intervalA = cpxIntervalA.getInterval();
            final SimpleInterval intervalB = cpxIintervalB.getInterval();
            if (!(IntervalUtils.isReciprocalOverlap(intervalA, intervalB, overlapThreshold)
                    && testSizeSimilarity(intervalA.getLengthOnReference(), intervalB.getLengthOnReference(), sizeSimilarityThreshold)
                    && testBreakendProximity(new SimpleInterval(intervalA.getContig(), intervalA.getStart(), intervalA.getStart()),
                    new SimpleInterval(intervalA.getContig(), intervalA.getEnd(), intervalA.getEnd()),
                    new SimpleInterval(intervalB.getContig(), intervalB.getStart(), intervalB.getStart()),
                    new SimpleInterval(intervalB.getContig(), intervalB.getEnd(), intervalB.getEnd()), window, dictionary))) {
                return false;
            }
        }
        return true;
    }
    private static boolean testReciprocalOverlap(final SVCallRecord a, final SVCallRecord b, final double threshold) {
        final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLength(a, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
        final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLength(b, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
        return IntervalUtils.isReciprocalOverlap(intervalA, intervalB, threshold);
    }
    private static boolean testSizeSimilarity(final SVCallRecord a, final SVCallRecord b, final double threshold) {
        return testSizeSimilarity(getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY),
                getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY), threshold);
    }
    private static boolean testSizeSimilarity(final int lengthA, final int lengthB, final double threshold) {
        return Math.min(lengthA, lengthB) / (double) Math.max(lengthA, lengthB) >= threshold;
    }
    private static boolean testBreakendProximity(final SVCallRecord a, final SVCallRecord b, final int window,
                                                 final SAMSequenceDictionary dictionary) {
        return testBreakendProximity(a.getPositionAInterval(), a.getPositionBInterval(),
                b.getPositionAInterval(), b.getPositionBInterval(), window, dictionary);
    }
    private static boolean testBreakendProximity(final SimpleInterval intervalA1, final SimpleInterval intervalA2,
                                                 final SimpleInterval intervalB1, final SimpleInterval intervalB2,
                                                 final int window, final SAMSequenceDictionary dictionary) {
        final SimpleInterval intervalA1Padded = intervalA1.expandWithinContig(window, dictionary);
        final SimpleInterval intervalA2Padded = intervalA2.expandWithinContig(window, dictionary);
        if (intervalA1Padded == null) {
            logger.warn("Invalid start position " + intervalA1.getContig() + ":" + intervalA1.getStart() +
                    " - record will not be matched");
            return false;
        }
        if (intervalA2Padded == null) {
            logger.warn("Invalid end position " + intervalA2.getContig() + ":" + intervalA2.getStart() +
                    " - record will not be matched");
            return false;
        }
        return intervalA1Padded.overlaps(intervalB1) && intervalA2Padded.overlaps(intervalB2);
    }
    /**
     * Gets event length used for overlap testing.
     */
    private static int getLength(final SVCallRecord record, final int missingInsertionLength) {
        Utils.validate(record.isIntrachromosomal(), "Record must be intra-chromosomal");
        if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS ||
                record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
            return record.getLength() == null ? missingInsertionLength : Math.max(record.getLength(), 1);
        } else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND) {
            return record.getPositionB() - record.getPositionA() + 1;
        } else {
            // TODO lengths less than 1 shouldn't be valid
            return Math.max(record.getLength() == null ? 1 : record.getLength(), 1);
        }
    }
    @Override
    public int getMaxClusterableStartingPosition(final SVCallRecord record) {
        return Math.max(
                getMaxClusterableStartingPositionWithParams(record, record.isDepthOnly() ? depthOnlyParams : evidenceParams, dictionary),
                getMaxClusterableStartingPositionWithParams(record, mixedParams, dictionary)
        );
    }
    /**
     * Returns max feasible start position of an item clusterable with the given record, given a set of clustering parameters.
     */
    private static int getMaxClusterableStartingPositionWithParams(final SVCallRecord call,
                                                                   final ClusteringParameters params,
                                                                   final SAMSequenceDictionary dictionary) {
        final String contig = call.getContigA();
        final int contigLength = dictionary.getSequence(contig).getSequenceLength();
        // Breakend proximity window
        final int maxPositionByWindow = Math.min(call.getPositionA() + params.getWindow(), contigLength);
        // Don't use overlap for inter-chromosomal events
        if (!call.isIntrachromosomal()) {
            return maxPositionByWindow;
        }
        // Reciprocal overlap window
        final int maxPositionByOverlap;
        final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLength(call, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP));
        maxPositionByOverlap = Math.min(maxPosition, contigLength);
        if (params.requiresOverlapAndProximity()) {
            return Math.min(maxPositionByOverlap, maxPositionByWindow);
        } else {
            return Math.max(maxPositionByOverlap, maxPositionByWindow);
        }
    }
    public final ClusteringParameters getDepthOnlyParams() {
        return depthOnlyParams;
    }
    public final ClusteringParameters getMixedParams() {
        return mixedParams;
    }
    public final ClusteringParameters getEvidenceParams() {
        return evidenceParams;
    }
    public final void setDepthOnlyParams(ClusteringParameters depthOnlyParams) { this.depthOnlyParams = depthOnlyParams; }
    public final void setMixedParams(ClusteringParameters mixedParams) {
        this.mixedParams = mixedParams;
    }
    public final void setEvidenceParams(ClusteringParameters evidenceParams) {
        this.evidenceParams = evidenceParams;
    }
}
      © 2015 - 2025 Weber Informatics LLC | Privacy Policy