package org.broadinstitute.hellbender.tools.walkers.sv;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.sv.DepthEvidence;
import org.broadinstitute.hellbender.tools.sv.DiscordantPairEvidence;
import org.broadinstitute.hellbender.tools.sv.SiteDepth;
import org.broadinstitute.hellbender.tools.sv.SplitReadEvidence;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.*;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.*;
import java.util.function.Predicate;

import static org.broadinstitute.hellbender.utils.read.ReadUtils.isBaseInsideAdaptor;

/**
 * Creates discordant read pair, split read, site depth, and read depth evidence files for use in the GATK-SV pipeline.
 * This tool emulates the functionality of the "svtk collect-pesr" command used in v1 of the GATK-SV pipeline.
 *
 * The first output file, which should be named "*.pe.txt" or "*.pe.txt.gz", is a tab-delimited
 * file containing information on discordant read pairs in the input cram, with the following
 * columns:
 * <ul>
 *     <li>read contig</li>
 *     <li>read start</li>
 *     <li>read strand</li>
 *     <li>mate contig</li>
 *     <li>mate start</li>
 *     <li>mate strand</li>
 *     <li>sample name</li>
 * </ul>
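 *
 * For example, a pe record might look like this (columns are tab-separated; the values,
 * including the "+"/"-" strand encoding, are illustrative rather than taken from real data):
 * <pre>
 *     chr1    10000   +   chr1    15000   -   NA12878
 * </pre>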
 *
 * Only one record is emitted for each discordant read pair, at the read in the pair with the
 * "upstream" start position according to the sequence dictionary contig ordering and coordinate.
 *
 * The second output file, which should be named "*.sr.txt" or "*.sr.txt.gz", contains the
 * locations of all split read clippings in the input bam or cram, with the following columns:
 * <ul>
 *     <li>contig</li>
 *     <li>clipping position</li>
 *     <li>direction: side of the read that was clipped (either "left" or "right")</li>
 *     <li>count: the number of reads clipped at this location in this direction</li>
 *     <li>sample name</li>
 * </ul>
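 *
 * For example, an sr record might look like this (tab-separated, illustrative values):
 * <pre>
 *     chr1    10500   right   3   NA12878
 * </pre>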
 *
 * The third output file, which should be named "*.sd.txt" or "*.sd.txt.gz", specifies site depth
 * counts: for each locus specified in an input VCF as a simple, biallelic SNP, it gives a count,
 * for each base call, of the number of reads that cover that locus.
 * It has the following columns:
 * <ul>
 *     <li>contig</li>
 *     <li>position</li>
 *     <li>sample name</li>
 *     <li>A observations</li>
 *     <li>C observations</li>
 *     <li>G observations</li>
 *     <li>T observations</li>
 * </ul>
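 *
 * For example, an sd record might look like this (tab-separated, illustrative values for a
 * locus covered by 17 reads calling "A" and 12 calling "G"):
 * <pre>
 *     chr1    10600   NA12878 17  0   12  0
 * </pre>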
 *
 * The fourth output file, which should be named "*.rd.txt" or "*.rd.txt.gz", specifies read
 * depths: for each interval specified by a 3-column, tab-delimited input file, the number of
 * reads that start in that interval is reported.
 * It has the following columns:
 * <ul>
 *     <li>contig</li>
 *     <li>starting position</li>
 *     <li>ending position</li>
 *     <li>read count</li>
 * </ul>
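 *
 * For example, an rd record might look like this (tab-separated, illustrative values):
 * <pre>
 *     chr1    10000   10100   42
 * </pre>
 *
 * <h3>Usage example</h3>
 * A representative invocation (the input, output names, and sample name below are illustrative):
 * <pre>
 *     gatk CollectSVEvidence \
 *         -I input.bam \
 *         --sample-name NA12878 \
 *         -PE NA12878.pe.txt.gz \
 *         -SR NA12878.sr.txt.gz
 * </pre>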
 *
 * Note: when only collecting RD evidence, users should consider providing the same interval list
 * with -L as --depth-evidence-intervals in order to avoid processing unused reads outside the
 * intervals.
 *
 * Each of these output files may also be written as a block-compressed interval file, rather than
 * as a tab-delimited text file, by specifying an output file name that ends with ".bci" rather
 * than ".txt". These files are self-indexing, and contain complete header information including
 * sample name(s) and a dictionary for the contigs.
 */
@BetaFeature
@DocumentedFeature
@CommandLineProgramProperties(
        summary = "Gathers paired-end and split read evidence files for use in the GATK-SV pipeline. Output files " +
                "are a file containing the location and orientation of read pairs marked as discordant, and a " +
                "file containing the clipping location of all soft-clipped reads and the orientation of the clipping.",
        oneLineSummary = "Gathers paired-end and split read evidence files for use in the GATK-SV pipeline.",
        programGroup = StructuralVariantDiscoveryProgramGroup.class
)
public class CollectSVEvidence extends ReadWalker {

    public static final String PAIRED_END_FILE_ARGUMENT_SHORT_NAME = "PE";
    public static final String PAIRED_END_FILE_ARGUMENT_LONG_NAME = "pe-file";
    public static final String SPLIT_READ_FILE_ARGUMENT_SHORT_NAME = "SR";
    public static final String SPLIT_READ_FILE_ARGUMENT_LONG_NAME = "sr-file";
    public static final String SITE_DEPTH_OUTPUT_ARGUMENT_SHORT_NAME = "SD";
    public static final String SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME = "sd-file";
    public static final String SITE_DEPTH_INPUT_ARGUMENT_SHORT_NAME = "F";
    public static final String SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME = "site-depth-locs-vcf";
    public static final String DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_SHORT_NAME = "RD";
    public static final String DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_LONG_NAME = "depth-evidence-file";
    public static final String DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_SHORT_NAME = "DS";
    public static final String DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_LONG_NAME = "depth-summary-file";
    public static final String DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_SHORT_NAME = "DI";
    public static final String DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_LONG_NAME = "depth-evidence-intervals";
    public static final String MIN_DEPTH_EVIDENCE_MAPQ_ARGUMENT_NAME = "depth-evidence-min-mapq";
    public static final String MIN_SITE_DEPTH_MAPQ_ARGUMENT_NAME = "site-depth-min-mapq";
    public static final String MIN_SITE_DEPTH_BASEQ_ARGUMENT_NAME = "site-depth-min-baseq";
    public static final String SAMPLE_NAME_ARGUMENT_LONG_NAME = "sample-name";
    public static final String COMPRESSION_LEVEL_ARGUMENT_LONG_NAME = "compression-level";

    @Argument(shortName = PAIRED_END_FILE_ARGUMENT_SHORT_NAME,
            fullName = PAIRED_END_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for paired end evidence", optional = true)
    public GATKPath peFile;

    @Argument(shortName = SPLIT_READ_FILE_ARGUMENT_SHORT_NAME,
            fullName = SPLIT_READ_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for split read evidence", optional = true)
    public GATKPath srFile;

    @Argument(shortName = SITE_DEPTH_OUTPUT_ARGUMENT_SHORT_NAME,
            fullName = SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME,
            doc = "Output file for site depth counts", optional = true)
    public GATKPath siteDepthOutputFilename;

    @Argument(shortName = SITE_DEPTH_INPUT_ARGUMENT_SHORT_NAME,
            fullName = SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME,
            doc = "Input VCF of SNPs marking loci for site depth counts", optional = true)
    public GATKPath siteDepthInputFilename;

    @Argument(shortName = DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_SHORT_NAME,
            fullName = DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for depth evidence", optional = true)
    public GATKPath depthEvidenceOutputFilename;

    @Argument(shortName = DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_SHORT_NAME,
            fullName = DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for depth evidence summary statistics", optional = true)
    public GATKPath depthEvidenceSummaryFilename;

    @Argument(shortName = DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_SHORT_NAME,
            fullName = DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_LONG_NAME,
            doc = "Input feature file specifying intervals where depth evidence will be gathered", optional = true)
    public GATKPath depthEvidenceInputFilename;

    @Argument(fullName = MIN_DEPTH_EVIDENCE_MAPQ_ARGUMENT_NAME,
            doc = "minimum mapping quality for read to be counted as depth evidence", optional = true)
    public int minDepthEvidenceMapQ = 0;

    @Argument(fullName = MIN_SITE_DEPTH_MAPQ_ARGUMENT_NAME,
            doc = "minimum mapping quality for read to be counted toward site depth", optional = true)
    public int minMapQ = 30;

    @Argument(fullName = MIN_SITE_DEPTH_BASEQ_ARGUMENT_NAME,
            doc = "minimum base call quality for SNP to be counted toward site depth", optional = true)
    public int minQ = 20;

    @Argument(fullName = SAMPLE_NAME_ARGUMENT_LONG_NAME, doc = "Sample name")
    String sampleName = null;

    @Argument(fullName = COMPRESSION_LEVEL_ARGUMENT_LONG_NAME, doc = "Output compression level")
    int compressionLevel = 4;

    final Set<String> observedDiscordantNames = new HashSet<>();
    final PriorityQueue<SplitPos> splitPosBuffer = new PriorityQueue<>(new SplitPosComparator());
    final List<DiscordantRead> discordantPairs = new ArrayList<>();

    int currentDiscordantPosition = -1;
    String currentChrom = null;

    private FeatureSink<DiscordantPairEvidence> peWriter;
    private FeatureSink<SplitReadEvidence> srWriter;
    private SiteDepthCounter siteDepthCounter;
    private DepthEvidenceCollector depthEvidenceCollector;
    private SAMSequenceDictionary sequenceDictionary;

    @Override
    public boolean requiresReads() {
        return true;
    }

    @Override
    public void onTraversalStart() {
        super.onTraversalStart();
        sequenceDictionary = getBestAvailableSequenceDictionary();
        peWriter = createPEWriter();
        srWriter = createSRWriter();
        siteDepthCounter = createSiteDepthCounter();
        depthEvidenceCollector = createDepthEvidenceCollector();
        if ( peWriter == null && srWriter == null &&
                siteDepthCounter == null && depthEvidenceCollector == null ) {
            throw new UserException("You must supply at least one output file: PE, SR, SD, or RD");
        }
    }

    @Override
    public List<ReadFilter> getDefaultReadFilters() {
        final List<ReadFilter> readFilters = new ArrayList<>(super.getDefaultReadFilters());
        readFilters.add(ReadFilterLibrary.MAPPED);
        readFilters.add(ReadFilterLibrary.NOT_DUPLICATE);
        return readFilters;
    }

    @Override
    public void apply(final GATKRead read,
                      final ReferenceContext referenceContext,
                      final FeatureContext featureContext) {
        if ( !(read.isPaired() && read.mateIsUnmapped()) &&
                !read.isSupplementaryAlignment() && !read.isSecondaryAlignment() ) {
            if ( srWriter != null && isSoftClipped(read) ) {
                countSplitRead(read, splitPosBuffer, srWriter);
            }
            if ( peWriter != null && !read.isProperlyPaired() ) {
                reportDiscordantReadPair(read);
            }
        }
        if ( siteDepthCounter != null ) {
            siteDepthCounter.apply(read);
        }
        if ( depthEvidenceCollector != null ) {
            depthEvidenceCollector.apply(read);
        }
    }
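
    // The two factory methods below choose an output codec by probing the requested file name:
    // the block-compressed interval codec is used when the name matches the ".bci" convention,
    // and otherwise the tab-delimited text codec must recognize the name, which enforces the
    // documented ".pe.txt"/".sr.txt" (optionally ".gz") naming conventions.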
    private FeatureSink<DiscordantPairEvidence> createPEWriter() {
        if ( peFile == null ) {
            return null;
        }
        final String peFilename = peFile.toPath().toString();
        final List<String> sampleNames = Collections.singletonList(sampleName);
        final DiscordantPairEvidenceCodec peCodec = new DiscordantPairEvidenceCodec();
        final DiscordantPairEvidenceBCICodec peBCICodec = new DiscordantPairEvidenceBCICodec();
        if ( peBCICodec.canDecode(peFilename) ) {
            return peBCICodec.makeSink(peFile, sequenceDictionary, sampleNames, compressionLevel);
        }
        if ( !peCodec.canDecode(peFilename) ) {
            throw new UserException("Attempting to write discordant pair evidence to a file that " +
                    "can't be read as discordant pair evidence: " + peFilename + ". The file " +
                    "name should end with \".pe.txt\", \".pe.txt.gz\", or \".pe.bci\".");
        }
        return peCodec.makeSink(peFile, sequenceDictionary, sampleNames, compressionLevel);
    }

    private FeatureSink<SplitReadEvidence> createSRWriter() {
        if ( srFile == null ) {
            return null;
        }
        final String srFilename = srFile.toPath().toString();
        final List<String> sampleNames = Collections.singletonList(sampleName);
        final SplitReadEvidenceCodec srCodec = new SplitReadEvidenceCodec();
        final SplitReadEvidenceBCICodec srBCICodec = new SplitReadEvidenceBCICodec();
        if ( srBCICodec.canDecode(srFilename) ) {
            return srBCICodec.makeSink(srFile, sequenceDictionary, sampleNames, compressionLevel);
        }
        if ( !srCodec.canDecode(srFilename) ) {
            throw new UserException("Attempting to write split read evidence to a file that " +
                    "can't be read as split read evidence: " + srFilename + ". The file " +
                    "name should end with \".sr.txt\", \".sr.txt.gz\", or \".sr.bci\".");
        }
        return srCodec.makeSink(srFile, sequenceDictionary, sampleNames, compressionLevel);
    }

    private SiteDepthCounter createSiteDepthCounter() {
        if ( siteDepthInputFilename != null && siteDepthOutputFilename != null ) {
            return new SiteDepthCounter(sequenceDictionary, sampleName, compressionLevel,
                    siteDepthInputFilename, siteDepthOutputFilename, minMapQ, minQ);
        }
        if ( siteDepthInputFilename != null ) {
            throw new UserException("Having specified a " + SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME +
                    " input, you must also supply an " + SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME +
                    " for output.");
        }
        if ( siteDepthOutputFilename != null ) {
            throw new UserException("Having specified an " + SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME +
                    " for output, you must also supply a " + SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME +
                    " as input.");
        }
        return null;
    }

    private DepthEvidenceCollector createDepthEvidenceCollector() {
        if ( depthEvidenceInputFilename != null && depthEvidenceOutputFilename != null ) {
            return new DepthEvidenceCollector(sequenceDictionary, sampleName, compressionLevel,
                    depthEvidenceInputFilename, depthEvidenceOutputFilename, minDepthEvidenceMapQ);
        }
        if ( depthEvidenceInputFilename != null ) {
            throw new UserException("Having specified a depth-evidence-intervals input, " +
                    "you must also supply a depth-evidence-file for output.");
        }
        if ( depthEvidenceOutputFilename != null ) {
            throw new UserException("Having specified a depth-evidence-file for output, " +
                    "you must also supply depth-evidence-intervals as input.");
        }
        return null;
    }
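
    // A discordant pair is reported once, at the "upstream" read of the pair.  When both reads
    // start at the same position on the same contig, the first read encountered is reported and
    // its name is tracked so that its mate is not reported a second time.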
    private void reportDiscordantReadPair(final GATKRead read) {
        if (read.getStart() != currentDiscordantPosition) {
            flushDiscordantReadPairs();
            currentDiscordantPosition = read.getStart();
            observedDiscordantNames.clear();
        }
        final DiscordantRead reportableDiscordantReadPair =
                getReportableDiscordantReadPair(read, observedDiscordantNames, sequenceDictionary);
        if (reportableDiscordantReadPair != null) {
            discordantPairs.add(reportableDiscordantReadPair);
        }
    }

    @VisibleForTesting
    public DiscordantRead getReportableDiscordantReadPair(final GATKRead read,
                                                          final Set<String> observedDiscordantNamesAtThisLocus,
                                                          final SAMSequenceDictionary samSequenceDictionary) {
        final int readSeqId = samSequenceDictionary.getSequenceIndex(read.getContig());
        final int mateSeqId = samSequenceDictionary.getSequenceIndex(read.getMateContig());
        if (readSeqId < mateSeqId) {
            return new DiscordantRead(read);
        } else if (readSeqId == mateSeqId) {
            if (read.getStart() < read.getMateStart()) {
                return new DiscordantRead(read);
            } else if (read.getStart() == read.getMateStart()) {
                final boolean seenBefore = observedDiscordantNamesAtThisLocus.remove(read.getName());
                if (! seenBefore) {
                    final DiscordantRead discordantRead = new DiscordantRead(read);
                    observedDiscordantNamesAtThisLocus.add(read.getName());
                    return discordantRead;
                }
            }
        }
        return null;
    }

    private void flushDiscordantReadPairs() {
        final Comparator<DiscordantRead> discReadComparator = new DiscordantReadComparator(sequenceDictionary);
        discordantPairs.sort(discReadComparator);
        discordantPairs.forEach(this::writeDiscordantPair);
        discordantPairs.clear();
    }

    private void writeDiscordantPair(final DiscordantRead r) {
        peWriter.write(new DiscordantPairEvidence(sampleName,
                r.getContig(), r.getStart(), !r.isReadReverseStrand(),
                r.getMateContig(), r.getMateStart(), !r.isMateReverseStrand()));
    }
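
    // Split-read positions are buffered in a priority queue ordered by position so that counts
    // can be written in coordinate order: positions upstream of the current read are flushed as
    // the traversal advances, and the whole buffer is flushed on a contig change.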
    /**
     * Adds split read information about the current read to the counts in splitCounts.
     * Flushes split read counts to srWriter if necessary.
     */
    @VisibleForTesting
    public void countSplitRead(final GATKRead read,
                               final PriorityQueue<SplitPos> splitCounts,
                               final FeatureSink<SplitReadEvidence> srWriter) {
        final SplitPos splitPosition = getSplitPosition(read);
        final int readStart = read.getStart();
        if (splitPosition.direction == POSITION.MIDDLE) {
            return;
        }
        if (currentChrom == null) {
            currentChrom = read.getContig();
        } else if (!currentChrom.equals(read.getContig())) {
            flushSplitCounts(splitPos -> true, splitCounts, srWriter);
            currentChrom = read.getContig();
        } else {
            flushSplitCounts(sp -> (sp.pos < readStart - 1), splitCounts, srWriter);
        }
        splitCounts.add(splitPosition);
    }

    private void flushSplitCounts(final Predicate<SplitPos> flushablePosition,
                                  final PriorityQueue<SplitPos> splitCounts,
                                  final FeatureSink<SplitReadEvidence> srWriter) {
        while (splitCounts.size() > 0 && flushablePosition.test(splitCounts.peek())) {
            final SplitPos pos = splitCounts.poll();
            int countAtPos = 1;
            while (splitCounts.size() > 0 && splitCounts.peek().equals(pos)) {
                countAtPos++;
                splitCounts.poll();
            }
            final SplitReadEvidence splitRead = new SplitReadEvidence(sampleName, currentChrom,
                    pos.pos, countAtPos, pos.direction.equals(POSITION.RIGHT));
            srWriter.write(splitRead);
        }
    }

    private SplitPos getSplitPosition( final GATKRead read ) {
        if (read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.M) {
            final int matchLength = read.getCigar().getCigarElements().stream()
                    .filter(e -> e.getOperator().consumesReferenceBases())
                    .mapToInt(CigarElement::getLength).sum();
            return new SplitPos(read.getStart() + matchLength, POSITION.RIGHT);
        } else if (read.getCigar().getLastCigarElement().getOperator() == CigarOperator.M) {
            return new SplitPos(read.getStart(), POSITION.LEFT);
        }
        return new SplitPos(-1, POSITION.MIDDLE);
    }

    // Note that a read counts as soft-clipped only when exactly one of its ends is clipped:
    // reads clipped at both ends are not treated as split-read evidence.
    private boolean isSoftClipped( final GATKRead read ) {
        final CigarOperator firstOperator = read.getCigar().getFirstCigarElement().getOperator();
        final CigarOperator lastOperator = read.getCigar().getLastCigarElement().getOperator();
        return (firstOperator == CigarOperator.SOFT_CLIP && lastOperator != CigarOperator.SOFT_CLIP) ||
                (firstOperator != CigarOperator.SOFT_CLIP && lastOperator == CigarOperator.SOFT_CLIP);
    }
    @Override
    public Object onTraversalSuccess() {
        flushSplitCounts(splitPos -> true, splitPosBuffer, srWriter);
        flushDiscordantReadPairs();
        if ( siteDepthCounter != null ) {
            siteDepthCounter.close();
        }
        if ( depthEvidenceCollector != null ) {
            depthEvidenceCollector.close();
            if ( depthEvidenceSummaryFilename != null ) {
                depthEvidenceCollector.reportSummaryStats(depthEvidenceSummaryFilename, sampleName);
            }
        }
        return null;
    }

    @Override
    public void closeTool() {
        super.closeTool();
        if ( peWriter != null ) {
            peWriter.close();
        }
        if ( srWriter != null ) {
            srWriter.close();
        }
    }

    enum POSITION {
        LEFT ("left"),
        MIDDLE ("middle"),
        RIGHT ("right");

        private final String description;

        POSITION(final String description) {
            this.description = description;
        }

        public String getDescription() { return description; }
    }

    @VisibleForTesting final static class DiscordantRead {
        private boolean readReverseStrand;
        private boolean mateReverseStrand;
        private String contig;
        private int start;
        private String mateContig;
        private int mateStart;
        private String name;

        public DiscordantRead(final GATKRead read) {
            this.readReverseStrand = read.isReverseStrand();
            this.mateReverseStrand = read.mateIsReverseStrand();
            this.contig = read.getContig();
            this.start = read.getStart();
            this.mateContig = read.getMateContig();
            this.mateStart = read.getMateStart();
            this.name = read.getName();
        }

        public boolean isReadReverseStrand() { return readReverseStrand; }
        public void setReadReverseStrand(final boolean readReverseStrand) { this.readReverseStrand = readReverseStrand; }
        public boolean isMateReverseStrand() { return mateReverseStrand; }
        public void setMateReverseStrand(final boolean mateReverseStrand) { this.mateReverseStrand = mateReverseStrand; }
        public String getContig() { return contig; }
        public void setContig(final String contig) { this.contig = contig; }
        public int getStart() { return start; }
        public void setStart(final int start) { this.start = start; }
        public String getMateContig() { return mateContig; }
        public void setMateContig(final String mateContig) { this.mateContig = mateContig; }
        public int getMateStart() { return mateStart; }
        public void setMateStart(final int mateStart) { this.mateStart = mateStart; }
        public String getName() { return name; }
        public void setName(final String name) { this.name = name; }

        @Override
        public boolean equals(final Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;

            final DiscordantRead that = (DiscordantRead) o;

            if (readReverseStrand != that.readReverseStrand) return false;
            if (mateReverseStrand != that.mateReverseStrand) return false;
            if (start != that.start) return false;
            if (mateStart != that.mateStart) return false;
            if ( !Objects.equals(contig, that.contig) ) return false;
            if ( !Objects.equals(mateContig, that.mateContig) ) return false;
            return Objects.equals(name, that.name);
        }

        @Override
        public int hashCode() {
            int result = (readReverseStrand ? 1 : 0);
            result = 31 * result + (mateReverseStrand ? 1 : 0);
            result = 31 * result + (contig != null ? contig.hashCode() : 0);
            result = 31 * result + start;
            result = 31 * result + (mateContig != null ? mateContig.hashCode() : 0);
            result = 31 * result + mateStart;
            result = 31 * result + (name != null ? name.hashCode() : 0);
            return result;
        }
    }
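
    // Two SplitPos instances are equal when they name the same clip position and direction;
    // flushSplitCounts() relies on this to collapse duplicate queue entries into a single count.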
    @VisibleForTesting final static class SplitPos {
        public POSITION direction;
        public int pos;

        public SplitPos(final int start, final POSITION direction) {
            this.pos = start;
            this.direction = direction;
        }

        @Override
        public boolean equals(final Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;

            final SplitPos splitPos = (SplitPos) o;

            if (pos != splitPos.pos) return false;
            return direction.ordinal() == splitPos.direction.ordinal();
        }

        @Override
        public int hashCode() {
            int result = direction != null ? direction.ordinal() : 0;
            result = 31 * result + pos;
            return result;
        }
    }

    @VisibleForTesting final static class SplitPosComparator implements Comparator<SplitPos> {
        @Override
        public int compare(final SplitPos o1, final SplitPos o2) {
            if (o1.pos != o2.pos) {
                return Integer.compare(o1.pos, o2.pos);
            } else {
                return o1.direction.compareTo(o2.direction);
            }
        }
    }

    @VisibleForTesting final static class DiscordantReadComparator implements Comparator<DiscordantRead> {
        private final Comparator<DiscordantRead> internalComparator;

        public DiscordantReadComparator(final SAMSequenceDictionary sequenceDictionary) {
            internalComparator = Comparator
                    .comparing((DiscordantRead r) -> sequenceDictionary.getSequenceIndex(r.getContig()))
                    .thenComparing(DiscordantRead::getStart)
                    .thenComparing(DiscordantRead::isReadReverseStrand)
                    .thenComparing((DiscordantRead r) -> sequenceDictionary.getSequenceIndex(r.getMateContig()))
                    .thenComparing(DiscordantRead::getMateStart)
                    .thenComparing(DiscordantRead::isMateReverseStrand);
        }

        @Override
        public int compare(final DiscordantRead o1, final DiscordantRead o2) {
            return internalComparator.compare(o1, o2);
        }
    }

    /**
     * Compares a locus to an interval and indicates whether the locus is
     * upstream (returns -1), within (returns 0), or downstream (returns 1) of the interval.
     */
    @VisibleForTesting final static class LocusComparator {
        private final SAMSequenceDictionary dict;

        public LocusComparator( final SAMSequenceDictionary dict ) {
            this.dict = dict;
        }

        public int compareLocus( final String contig, final int position, final Locatable loc ) {
            int cmp = Integer.compare(dict.getSequenceIndex(contig),
                    dict.getSequenceIndex(loc.getContig()));
            if ( cmp == 0 ) {
                if ( position < loc.getStart() ) {
                    cmp = -1;
                } else if ( position > loc.getEnd() ) {
                    cmp = 1;
                }
            }
            return cmp;
        }
    }
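
    // SiteDepthCounter streams SNP loci from the input VCF in lock step with the read traversal:
    // queued loci upstream of the current read are flushed to the writer, and new loci are pulled
    // from the VCF until one lies past the read, so the queue always brackets the read's span.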
    @VisibleForTesting final static class SiteDepthCounter {
        private final LocusComparator lComp;
        private final String sampleName;
        private final FeatureSink<SiteDepth> writer;
        private final int minMapQ;
        private final int minQ;
        private final Iterator<VariantContext> snpSourceItr;
        private final Deque<SiteDepth> siteDepthQueue;

        public SiteDepthCounter( final SAMSequenceDictionary dict,
                                 final String sampleName,
                                 final int compressionLevel,
                                 final GATKPath inputPath,
                                 final GATKPath outputPath,
                                 final int minMapQ,
                                 final int minQ ) {
            this.lComp = new LocusComparator(dict);
            this.sampleName = sampleName;
            final String outputFilename = outputPath.toPath().toString();
            final SiteDepthBCICodec bciCodec = new SiteDepthBCICodec();
            final List<String> sampleNames = Collections.singletonList(sampleName);
            if ( bciCodec.canDecode(outputFilename) ) {
                this.writer = bciCodec.makeSink(outputPath, dict, sampleNames, compressionLevel);
            } else {
                final SiteDepthCodec codec = new SiteDepthCodec();
                if ( !codec.canDecode(outputFilename) ) {
                    throw new UserException("Attempting to write site depth evidence to a file that " +
                            "can't be read as site depth evidence: " + outputFilename + ". The file " +
                            "name should end with \".sd.txt\", \".sd.txt.gz\", or \".sd.bci\".");
                }
                this.writer = codec.makeSink(outputPath, dict, sampleNames, compressionLevel);
            }
            this.minMapQ = minMapQ;
            this.minQ = minQ;
            final FeatureDataSource<VariantContext> snpSource =
                    new FeatureDataSource<>(inputPath.toPath().toString(), null,
                            FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, VariantContext.class);
            dict.assertSameDictionary(snpSource.getSequenceDictionary());
            this.snpSourceItr = new BAFSiteIterator(snpSource.iterator());
            this.siteDepthQueue = new ArrayDeque<>(100);
            readNextLocus();
        }

        public void apply( final GATKRead read ) {
            if ( read.getMappingQuality() < minMapQ || siteDepthQueue.isEmpty() ) {
                return;
            }

            // clean queue of SiteDepths that precede the current read
            final SimpleInterval readLoc = new SimpleInterval(read.getContig(), read.getStart(), read.getEnd());
            while ( true ) {
                final SiteDepth siteDepth = siteDepthQueue.getFirst();
                if ( lComp.compareLocus(siteDepth.getContig(), siteDepth.getStart(), readLoc) >= 0 ) {
                    break;
                }
                writer.write(siteDepthQueue.removeFirst());
                if ( siteDepthQueue.isEmpty() ) {
                    if ( !readNextLocus() ) {
                        return;
                    }
                }
            }

            // make sure that the last SiteDepth in the queue occurs after the current read,
            // if such a SiteDepth is available
            while ( true ) {
                final SiteDepth siteDepth = siteDepthQueue.getLast();
                if ( lComp.compareLocus(siteDepth.getContig(), siteDepth.getStart(), readLoc) > 0 ||
                        !readNextLocus() ) {
                    break;
                }
            }

            walkReadMatches(read);
        }

        private void walkReadMatches( final GATKRead read ) {
            walkReadMatches(read, minQ, siteDepthQueue, lComp);
        }

        static void walkReadMatches( final GATKRead read,
                                     final int minQ,
                                     final Iterable<SiteDepth> sites,
                                     final LocusComparator locusComparator ) {
            int opStart = read.getStart();
            int readIdx = 0;
            final byte[] calls = read.getBasesNoCopy();
            final byte[] quals = read.getBaseQualitiesNoCopy();
            for ( final CigarElement cigEle : read.getCigar().getCigarElements() ) {
                final int eleLen = cigEle.getLength();
                final CigarOperator cigOp = cigEle.getOperator();
                if ( cigOp.isAlignment() ) {
                    final int opEnd = opStart + eleLen - 1;
                    final SimpleInterval opLoc = new SimpleInterval(read.getContig(), opStart, opEnd);
                    for ( final SiteDepth siteDepth : sites ) {
                        final String siteContig = siteDepth.getContig();
                        final int sitePos = siteDepth.getStart();
                        final int cmp = locusComparator.compareLocus(siteContig, sitePos, opLoc);
                        if ( cmp > 0 ) {
                            break;
                        }
                        // don't count base calls that aren't really part of the template
                        // (if the template is shorter than the read, we can call into adaptor sequence)
                        if ( cmp == 0 && !isBaseInsideAdaptor(read, sitePos) ) {
                            final int callIdx = readIdx + sitePos - opStart;
                            if ( quals[callIdx] < minQ ) {
                                continue;
                            }
                            final Nucleotide call = Nucleotide.decode(calls[callIdx]);
                            if ( call.isStandard() ) {
                                siteDepth.observe(call.ordinal());
                            }
                        }
                    }
                }
                if ( cigOp.consumesReadBases() ) {
                    readIdx += eleLen;
                }
                if ( cigOp.consumesReferenceBases() ) {
                    opStart += eleLen;
                }
            }
        }

        public void close() {
            do {
                while ( !siteDepthQueue.isEmpty() ) {
                    writer.write(siteDepthQueue.removeFirst());
                }
            } while ( readNextLocus() );
            writer.close();
        }

        private boolean readNextLocus() {
            if ( !snpSourceItr.hasNext() ) {
                return false;
            }
            final SiteDepth siteDepth = new SiteDepth(snpSourceItr.next(), sampleName);
            siteDepthQueue.add(siteDepth);
            return true;
        }
    }
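
    // BAFSiteIterator filters the VCF stream down to biallelic SNPs, skipping any record that
    // duplicates the locus of the previously returned record.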
    public final static class BAFSiteIterator implements Iterator<VariantContext> {
        final Iterator<VariantContext> vcIterator;
        VariantContext last;
        VariantContext next;

        public BAFSiteIterator( final Iterator<VariantContext> vcIterator ) {
            this.vcIterator = vcIterator;
            advance();
        }

        @Override
        public boolean hasNext() {
            if ( next != null ) {
                return true;
            }
            advance();
            return next != null;
        }

        @Override
        public VariantContext next() {
            if ( !hasNext() ) {
                throw new NoSuchElementException("baf sites iterator is exhausted");
            }
            final VariantContext result = next;
            last = next;
            next = null;
            return result;
        }

        private void advance() {
            while ( vcIterator.hasNext() ) {
                final VariantContext vc = vcIterator.next();
                // if it's a SNP, it's biallelic, and it occurs at a new locus
                if ( vc.isSNP() && vc.isBiallelic() &&
                        (last == null || !last.getContig().equals(vc.getContig()) ||
                                last.getStart() < vc.getStart()) ) {
                    next = vc;
                    break;
                }
            }
        }
    }

    @VisibleForTesting final static class CountCounter {
        private final int[] lowCounts;
        private final SortedMap<Integer, Integer> highCounts;
        private long nCounts;
        private long totalCounts;

        public CountCounter() {
            lowCounts = new int[3000];
            highCounts = new TreeMap<>();
        }

        public void addCount( final int count ) {
            nCounts += 1;
            totalCounts += count;
            if ( count < lowCounts.length ) {
                lowCounts[count] += 1;
            } else {
                highCounts.compute(count, (k, v) -> v == null ? 1 : v + 1);
            }
        }

        public int getNZeroCounts() { return lowCounts[0]; }

        public double getMeanCount() { return (double)totalCounts / nCounts; }

        public int[] getQuartiles() {
            final int[] quartiles = new int[4];
            int counts = 0;
            int quartile = 0;
            long targetCounts = (nCounts + 3) / 4;
            for ( int idx = 0; idx != lowCounts.length; ++idx ) {
                counts += lowCounts[idx];
                while ( counts >= targetCounts ) {
                    quartiles[quartile++] = idx;
                    targetCounts = ((quartile + 1) * nCounts + 3) / 4;
                }
            }
            for ( final Map.Entry<Integer, Integer> entry : highCounts.entrySet() ) {
                counts += entry.getValue();
                while ( counts >= targetCounts ) {
                    quartiles[quartile++] = entry.getKey();
                    targetCounts = ((quartile + 1) * nCounts + 3) / 4;
                }
            }
            return quartiles;
        }
    }
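
    // DepthEvidenceCollector walks the sorted input intervals in step with the read traversal,
    // counting the reads that start in each interval and tallying the per-interval counts in a
    // CountCounter so that summary statistics (quartiles, mean) can be reported at the end.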
    @VisibleForTesting final static class DepthEvidenceCollector {
        private final LocusComparator lComp;
        private final CountCounter countCounter;
        private final FeatureSink<DepthEvidence> writer;
        private final Iterator<Feature> intervalIterator;
        private final int minMapQ;
        private DepthEvidence depthEvidence;
        private long summedIntervalLengths;
        private long nIntervals;

        public DepthEvidenceCollector( final SAMSequenceDictionary dict,
                                       final String sampleName,
                                       final int cmprLevel,
                                       final GATKPath inputIntervalsPath,
                                       final GATKPath outputDepthEvidencePath,
                                       final int minMapQ ) {
            lComp = new LocusComparator(dict);
            countCounter = new CountCounter();
            final String outputFilename = outputDepthEvidencePath.toPath().toString();
            final DepthEvidenceBCICodec bciCodec = new DepthEvidenceBCICodec();
            final List<String> sampleNames = Collections.singletonList(sampleName);
            if ( bciCodec.canDecode(outputFilename) ) {
                writer = bciCodec.makeSink(outputDepthEvidencePath, dict, sampleNames, cmprLevel);
            } else {
                final DepthEvidenceCodec codec = new DepthEvidenceCodec();
                if ( !codec.canDecode(outputFilename) ) {
                    throw new UserException("Attempting to write depth evidence to a file that " +
                            "can't be read as depth evidence: " + outputFilename + ". The file " +
                            "name should end with \".rd.txt\", \".rd.txt.gz\", or \".rd.bci\".");
                }
                writer = codec.makeSink(outputDepthEvidencePath, dict, sampleNames, cmprLevel);
            }
            final FeatureDataSource<Feature> intervalSource =
                    new FeatureDataSource<>(inputIntervalsPath.toPath().toString());
            this.intervalIterator = intervalSource.iterator();
            if ( !intervalIterator.hasNext() ) {
                throw new UserException(inputIntervalsPath + " contains no intervals.");
            }
            this.minMapQ = minMapQ;
            depthEvidence = new DepthEvidence(intervalIterator.next(), new int[1]);
            summedIntervalLengths += depthEvidence.getLengthOnReference();
            nIntervals += 1;
        }

        void apply( final GATKRead read ) {
            if ( read.getMappingQuality() < minMapQ ) {
                return;
            }
            while ( depthEvidence != null ) {
                final int cmp = lComp.compareLocus(read.getContig(), read.getStart(), depthEvidence);
                if ( cmp < 0 ) {
                    // if read is upstream of interval of interest, nothing to do
                    return;
                }
                if ( cmp == 0 ) {
                    // if read is in the interval of interest, count it
                    depthEvidence.getCounts()[0] += 1;
                    return;
                }
                // read is downstream of interval
                writer.write(depthEvidence);
                countCounter.addCount(depthEvidence.getCounts()[0]);
                if ( !intervalIterator.hasNext() ) {
                    depthEvidence = null;
                    return;
                }
                depthEvidence = new DepthEvidence(intervalIterator.next(), new int[1]);
                summedIntervalLengths += depthEvidence.getLengthOnReference();
                nIntervals += 1;
            }
        }

        void close() {
            if ( depthEvidence != null ) {
                writer.write(depthEvidence);
                countCounter.addCount(depthEvidence.getCounts()[0]);
                final int[] emptyCount = new int[1];
                while ( intervalIterator.hasNext() ) {
                    writer.write(new DepthEvidence(intervalIterator.next(), emptyCount));
                    countCounter.addCount(0);
                }
            }
            writer.close();
        }

        void reportSummaryStats( final GATKPath summaryPath, final String sampleName ) {
            try ( final BufferedWriter writer =
                          new BufferedWriter(new OutputStreamWriter(summaryPath.getOutputStream())) ) {
                final int[] quartiles = countCounter.getQuartiles();
                writer.write("rd_q25\t" + sampleName + "\t" + quartiles[0]);
                writer.newLine();
                writer.write("rd_q50\t" + sampleName + "\t" + quartiles[1]);
                writer.newLine();
                writer.write("rd_q75\t" + sampleName + "\t" + quartiles[2]);
                writer.newLine();
                final String val = String.format("%.2f", countCounter.getMeanCount());
                writer.write("rd_mean\t" + sampleName + "\t" + val);
                writer.newLine();
                writer.write("rd_num_intervals\t" + sampleName + "\t" + nIntervals);
                writer.newLine();
                writer.write("rd_intervals_size\t" + sampleName + "\t" + summedIntervalLengths);
                writer.newLine();
            } catch ( final IOException ioe ) {
                throw new UserException("Can't write depth evidence summary statistics to " + summaryPath);
            }
        }
    }
}



