package org.broadinstitute.hellbender.tools.walkers.sv;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.sv.DepthEvidence;
import org.broadinstitute.hellbender.tools.sv.DiscordantPairEvidence;
import org.broadinstitute.hellbender.tools.sv.SiteDepth;
import org.broadinstitute.hellbender.tools.sv.SplitReadEvidence;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.*;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.*;
import java.util.function.Predicate;

import static org.broadinstitute.hellbender.utils.read.ReadUtils.isBaseInsideAdaptor;

/**
 * Creates discordant read pair, split read, site depth, and read depth evidence files for use in the GATK-SV pipeline.
 * This tool emulates the functionality of the "svtk collect-pesr" command used in v1 of the GATK-SV pipeline.
 *
 * The first output file, which should be named "*.pe.txt" or "*.pe.txt.gz", is a tab-delimited
 * file containing information on discordant read pairs in the input cram, with the following
 * columns:
 * <ul>
 *     <li>read contig</li>
 *     <li>read start</li>
 *     <li>read strand</li>
 *     <li>mate contig</li>
 *     <li>mate start</li>
 *     <li>mate strand</li>
 *     <li>sample name</li>
 * </ul>
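 *
 * For example, a pe record might look like this (columns are tab-separated; the values,
 * including the "+"/"-" strand encoding, are illustrative rather than taken from real data):
 * <pre>
 *     chr1    10000   +   chr1    15000   -   NA12878
 * </pre>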
 *
 * Only one record is emitted for each discordant read pair, at the read in the pair with the
 * "upstream" start position according to the sequence dictionary contig ordering and coordinate.
 *
 * The second output file, which should be named "*.sr.txt" or "*.sr.txt.gz", contains the
 * locations of all split read clippings in the input bam or cram, with the following columns:
 * <ul>
 *     <li>contig</li>
 *     <li>clipping position</li>
 *     <li>direction: side of the read that was clipped (either "left" or "right")</li>
 *     <li>count: the number of reads clipped at this location in this direction</li>
 *     <li>sample name</li>
 * </ul>
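 *
 * For example, an sr record might look like this (tab-separated, illustrative values):
 * <pre>
 *     chr1    10500   right   3   NA12878
 * </pre>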
 *
 * The third output file, which should be named "*.sd.txt" or "*.sd.txt.gz", specifies site depth
 * counts: for each locus specified in an input VCF as a simple, biallelic SNP, it gives a count,
 * for each base call, of the number of reads that cover that locus.
 * It has the following columns:
 * <ul>
 *     <li>contig</li>
 *     <li>position</li>
 *     <li>sample name</li>
 *     <li>A observations</li>
 *     <li>C observations</li>
 *     <li>G observations</li>
 *     <li>T observations</li>
 * </ul>
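 *
 * For example, an sd record might look like this (tab-separated, illustrative values for a
 * locus covered by 17 reads calling "A" and 12 calling "G"):
 * <pre>
 *     chr1    10600   NA12878 17  0   12  0
 * </pre>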
 *
 * The fourth output file, which should be named "*.rd.txt" or "*.rd.txt.gz", specifies read
 * depths: for each interval specified by a 3-column, tab-delimited input file, the number of
 * reads that start in that interval is reported.
 * It has the following columns:
 * <ul>
 *     <li>contig</li>
 *     <li>starting position</li>
 *     <li>ending position</li>
 *     <li>read count</li>
 * </ul>
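 *
 * For example, an rd record might look like this (tab-separated, illustrative values):
 * <pre>
 *     chr1    10000   10100   42
 * </pre>
 *
 * <h3>Usage example</h3>
 * A representative invocation (the input, output names, and sample name below are illustrative):
 * <pre>
 *     gatk CollectSVEvidence \
 *         -I input.bam \
 *         --sample-name NA12878 \
 *         -PE NA12878.pe.txt.gz \
 *         -SR NA12878.sr.txt.gz
 * </pre>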
 *
 * Note: when only collecting RD evidence, users should consider providing the same interval list
 * with -L as --depth-evidence-intervals in order to avoid processing unused reads outside the
 * intervals.
 *
 * Each of these output files may also be written as a block-compressed interval file, rather than
 * as a tab-delimited text file, by specifying an output file name that ends with ".bci" rather
 * than ".txt". These files are self-indexing, and contain complete header information including
 * sample name(s) and a dictionary for the contigs.
 */
@BetaFeature
@DocumentedFeature
@CommandLineProgramProperties(
        summary = "Gathers paired-end and split read evidence files for use in the GATK-SV pipeline. Output files " +
                "are a file containing the location and orientation of read pairs marked as discordant, and a " +
                "file containing the clipping location of all soft-clipped reads and the orientation of the clipping.",
        oneLineSummary = "Gathers paired-end and split read evidence files for use in the GATK-SV pipeline.",
        programGroup = StructuralVariantDiscoveryProgramGroup.class
)
public class CollectSVEvidence extends ReadWalker {

    public static final String PAIRED_END_FILE_ARGUMENT_SHORT_NAME = "PE";
    public static final String PAIRED_END_FILE_ARGUMENT_LONG_NAME = "pe-file";
    public static final String SPLIT_READ_FILE_ARGUMENT_SHORT_NAME = "SR";
    public static final String SPLIT_READ_FILE_ARGUMENT_LONG_NAME = "sr-file";
    public static final String SITE_DEPTH_OUTPUT_ARGUMENT_SHORT_NAME = "SD";
    public static final String SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME = "sd-file";
    public static final String SITE_DEPTH_INPUT_ARGUMENT_SHORT_NAME = "F";
    public static final String SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME = "site-depth-locs-vcf";
    public static final String DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_SHORT_NAME = "RD";
    public static final String DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_LONG_NAME = "depth-evidence-file";
    public static final String DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_SHORT_NAME = "DS";
    public static final String DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_LONG_NAME = "depth-summary-file";
    public static final String DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_SHORT_NAME = "DI";
    public static final String DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_LONG_NAME = "depth-evidence-intervals";
    public static final String MIN_DEPTH_EVIDENCE_MAPQ_ARGUMENT_NAME = "depth-evidence-min-mapq";
    public static final String MIN_SITE_DEPTH_MAPQ_ARGUMENT_NAME = "site-depth-min-mapq";
    public static final String MIN_SITE_DEPTH_BASEQ_ARGUMENT_NAME = "site-depth-min-baseq";
    public static final String SAMPLE_NAME_ARGUMENT_LONG_NAME = "sample-name";
    public static final String COMPRESSION_LEVEL_ARGUMENT_LONG_NAME = "compression-level";

    @Argument(shortName = PAIRED_END_FILE_ARGUMENT_SHORT_NAME,
            fullName = PAIRED_END_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for paired end evidence", optional = true)
    public GATKPath peFile;

    @Argument(shortName = SPLIT_READ_FILE_ARGUMENT_SHORT_NAME,
            fullName = SPLIT_READ_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for split read evidence", optional = true)
    public GATKPath srFile;

    @Argument(shortName = SITE_DEPTH_OUTPUT_ARGUMENT_SHORT_NAME,
            fullName = SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME,
            doc = "Output file for site depth counts", optional = true)
    public GATKPath siteDepthOutputFilename;

    @Argument(shortName = SITE_DEPTH_INPUT_ARGUMENT_SHORT_NAME,
            fullName = SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME,
            doc = "Input VCF of SNPs marking loci for site depth counts", optional = true)
    public GATKPath siteDepthInputFilename;

    @Argument(shortName = DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_SHORT_NAME,
            fullName = DEPTH_EVIDENCE_OUTPUT_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for depth evidence", optional = true)
    public GATKPath depthEvidenceOutputFilename;

    @Argument(shortName = DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_SHORT_NAME,
            fullName = DEPTH_EVIDENCE_SUMMARY_FILE_ARGUMENT_LONG_NAME,
            doc = "Output file for depth evidence summary statistics", optional = true)
    public GATKPath depthEvidenceSummaryFilename;

    @Argument(shortName = DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_SHORT_NAME,
            fullName = DEPTH_EVIDENCE_INTERVALS_INPUT_FILE_ARGUMENT_LONG_NAME,
            doc = "Input feature file specifying intervals where depth evidence will be gathered", optional = true)
    public GATKPath depthEvidenceInputFilename;

    @Argument(fullName = MIN_DEPTH_EVIDENCE_MAPQ_ARGUMENT_NAME,
            doc = "minimum mapping quality for read to be counted as depth evidence", optional = true)
    public int minDepthEvidenceMapQ = 0;

    @Argument(fullName = MIN_SITE_DEPTH_MAPQ_ARGUMENT_NAME,
            doc = "minimum mapping quality for read to be counted toward site depth", optional = true)
    public int minMapQ = 30;

    @Argument(fullName = MIN_SITE_DEPTH_BASEQ_ARGUMENT_NAME,
            doc = "minimum base call quality for SNP to be counted toward site depth", optional = true)
    public int minQ = 20;

    @Argument(fullName = SAMPLE_NAME_ARGUMENT_LONG_NAME, doc = "Sample name")
    String sampleName = null;

    @Argument(fullName = COMPRESSION_LEVEL_ARGUMENT_LONG_NAME, doc = "Output compression level")
    int compressionLevel = 4;

    final Set<String> observedDiscordantNames = new HashSet<>();
    final PriorityQueue<SplitPos> splitPosBuffer = new PriorityQueue<>(new SplitPosComparator());
    final List<DiscordantRead> discordantPairs = new ArrayList<>();

    int currentDiscordantPosition = -1;
    String currentChrom = null;

    private FeatureSink<DiscordantPairEvidence> peWriter;
    private FeatureSink<SplitReadEvidence> srWriter;
    private SiteDepthCounter siteDepthCounter;
    private DepthEvidenceCollector depthEvidenceCollector;
    private SAMSequenceDictionary sequenceDictionary;

    @Override
    public boolean requiresReads() {
        return true;
    }

    @Override
    public void onTraversalStart() {
        super.onTraversalStart();
        sequenceDictionary = getBestAvailableSequenceDictionary();
        peWriter = createPEWriter();
        srWriter = createSRWriter();
        siteDepthCounter = createSiteDepthCounter();
        depthEvidenceCollector = createDepthEvidenceCollector();
        if ( peWriter == null && srWriter == null &&
                siteDepthCounter == null && depthEvidenceCollector == null ) {
            throw new UserException("You must supply at least one output file: PE, SR, SD, or RD");
        }
    }

    @Override
    public List<ReadFilter> getDefaultReadFilters() {
        final List<ReadFilter> readFilters = new ArrayList<>(super.getDefaultReadFilters());
        readFilters.add(ReadFilterLibrary.MAPPED);
        readFilters.add(ReadFilterLibrary.NOT_DUPLICATE);
        return readFilters;
    }

    @Override
    public void apply(final GATKRead read,
                      final ReferenceContext referenceContext,
                      final FeatureContext featureContext) {
        if ( !(read.isPaired() && read.mateIsUnmapped()) &&
                !read.isSupplementaryAlignment() && !read.isSecondaryAlignment() ) {
            if ( srWriter != null && isSoftClipped(read) ) {
                countSplitRead(read, splitPosBuffer, srWriter);
            }
            if ( peWriter != null && !read.isProperlyPaired() ) {
                reportDiscordantReadPair(read);
            }
        }
        if ( siteDepthCounter != null ) {
            siteDepthCounter.apply(read);
        }
        if ( depthEvidenceCollector != null ) {
            depthEvidenceCollector.apply(read);
        }
    }
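
    // The two factory methods below choose an output codec by probing the requested file name:
    // the block-compressed interval codec is used when the name matches the ".bci" convention,
    // and otherwise the tab-delimited text codec must recognize the name, which enforces the
    // documented ".pe.txt"/".sr.txt" (optionally ".gz") naming conventions.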
    private FeatureSink<DiscordantPairEvidence> createPEWriter() {
        if ( peFile == null ) {
            return null;
        }
        final String peFilename = peFile.toPath().toString();
        final List<String> sampleNames = Collections.singletonList(sampleName);
        final DiscordantPairEvidenceCodec peCodec = new DiscordantPairEvidenceCodec();
        final DiscordantPairEvidenceBCICodec peBCICodec = new DiscordantPairEvidenceBCICodec();
        if ( peBCICodec.canDecode(peFilename) ) {
            return peBCICodec.makeSink(peFile, sequenceDictionary, sampleNames, compressionLevel);
        }
        if ( !peCodec.canDecode(peFilename) ) {
            throw new UserException("Attempting to write discordant pair evidence to a file that " +
                    "can't be read as discordant pair evidence: " + peFilename + ". The file " +
                    "name should end with \".pe.txt\", \".pe.txt.gz\", or \".pe.bci\".");
        }
        return peCodec.makeSink(peFile, sequenceDictionary, sampleNames, compressionLevel);
    }

    private FeatureSink<SplitReadEvidence> createSRWriter() {
        if ( srFile == null ) {
            return null;
        }
        final String srFilename = srFile.toPath().toString();
        final List<String> sampleNames = Collections.singletonList(sampleName);
        final SplitReadEvidenceCodec srCodec = new SplitReadEvidenceCodec();
        final SplitReadEvidenceBCICodec srBCICodec = new SplitReadEvidenceBCICodec();
        if ( srBCICodec.canDecode(srFilename) ) {
            return srBCICodec.makeSink(srFile, sequenceDictionary, sampleNames, compressionLevel);
        }
        if ( !srCodec.canDecode(srFilename) ) {
            throw new UserException("Attempting to write split read evidence to a file that " +
                    "can't be read as split read evidence: " + srFilename + ". The file " +
                    "name should end with \".sr.txt\", \".sr.txt.gz\", or \".sr.bci\".");
        }
        return srCodec.makeSink(srFile, sequenceDictionary, sampleNames, compressionLevel);
    }

    private SiteDepthCounter createSiteDepthCounter() {
        if ( siteDepthInputFilename != null && siteDepthOutputFilename != null ) {
            return new SiteDepthCounter(sequenceDictionary, sampleName, compressionLevel,
                    siteDepthInputFilename, siteDepthOutputFilename, minMapQ, minQ);
        }
        if ( siteDepthInputFilename != null ) {
            throw new UserException("Having specified a " + SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME +
                    " input, you must also supply an " + SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME +
                    " for output.");
        }
        if ( siteDepthOutputFilename != null ) {
            throw new UserException("Having specified an " + SITE_DEPTH_OUTPUT_ARGUMENT_LONG_NAME +
                    " for output, you must also supply a " + SITE_DEPTH_INPUT_ARGUMENT_LONG_NAME +
                    " as input.");
        }
        return null;
    }

    private DepthEvidenceCollector createDepthEvidenceCollector() {
        if ( depthEvidenceInputFilename != null && depthEvidenceOutputFilename != null ) {
            return new DepthEvidenceCollector(sequenceDictionary, sampleName, compressionLevel,
                    depthEvidenceInputFilename, depthEvidenceOutputFilename, minDepthEvidenceMapQ);
        }
        if ( depthEvidenceInputFilename != null ) {
            throw new UserException("Having specified a depth-evidence-intervals input, " +
                    "you must also supply a depth-evidence-file for output.");
        }
        if ( depthEvidenceOutputFilename != null ) {
            throw new UserException("Having specified a depth-evidence-file for output, " +
                    "you must also supply depth-evidence-intervals as input.");
        }
        return null;
    }
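
    // A discordant pair is reported once, at the "upstream" read of the pair.  When both reads
    // start at the same position on the same contig, the first read encountered is reported and
    // its name is tracked so that its mate is not reported a second time.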
    private void reportDiscordantReadPair(final GATKRead read) {
        if (read.getStart() != currentDiscordantPosition) {
            flushDiscordantReadPairs();
            currentDiscordantPosition = read.getStart();
            observedDiscordantNames.clear();
        }
        final DiscordantRead reportableDiscordantReadPair =
                getReportableDiscordantReadPair(read, observedDiscordantNames, sequenceDictionary);
        if (reportableDiscordantReadPair != null) {
            discordantPairs.add(reportableDiscordantReadPair);
        }
    }

    @VisibleForTesting
    public DiscordantRead getReportableDiscordantReadPair(final GATKRead read,
                                                          final Set<String> observedDiscordantNamesAtThisLocus,
                                                          final SAMSequenceDictionary samSequenceDictionary) {
        final int readSeqId = samSequenceDictionary.getSequenceIndex(read.getContig());
        final int mateSeqId = samSequenceDictionary.getSequenceIndex(read.getMateContig());
        if (readSeqId < mateSeqId) {
            return new DiscordantRead(read);
        } else if (readSeqId == mateSeqId) {
            if (read.getStart() < read.getMateStart()) {
                return new DiscordantRead(read);
            } else if (read.getStart() == read.getMateStart()) {
                final boolean seenBefore = observedDiscordantNamesAtThisLocus.remove(read.getName());
                if (! seenBefore) {
                    final DiscordantRead discordantRead = new DiscordantRead(read);
                    observedDiscordantNamesAtThisLocus.add(read.getName());
                    return discordantRead;
                }
            }
        }
        return null;
    }

    private void flushDiscordantReadPairs() {
        final Comparator<DiscordantRead> discReadComparator = new DiscordantReadComparator(sequenceDictionary);
        discordantPairs.sort(discReadComparator);
        discordantPairs.forEach(this::writeDiscordantPair);
        discordantPairs.clear();
    }

    private void writeDiscordantPair(final DiscordantRead r) {
        peWriter.write(new DiscordantPairEvidence(sampleName,
                r.getContig(), r.getStart(), !r.isReadReverseStrand(),
                r.getMateContig(), r.getMateStart(), !r.isMateReverseStrand()));
    }
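
    // Split-read positions are buffered in a priority queue ordered by position so that counts
    // can be written in coordinate order: positions upstream of the current read are flushed as
    // the traversal advances, and the whole buffer is flushed on a contig change.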
    /**
     * Adds split read information about the current read to the counts in splitCounts.
     * Flushes split read counts to srWriter if necessary.
     */
    @VisibleForTesting
    public void countSplitRead(final GATKRead read,
                               final PriorityQueue<SplitPos> splitCounts,
                               final FeatureSink<SplitReadEvidence> srWriter) {
        final SplitPos splitPosition = getSplitPosition(read);
        final int readStart = read.getStart();
        if (splitPosition.direction == POSITION.MIDDLE) {
            return;
        }
        if (currentChrom == null) {
            currentChrom = read.getContig();
        } else if (!currentChrom.equals(read.getContig())) {
            flushSplitCounts(splitPos -> true, splitCounts, srWriter);
            currentChrom = read.getContig();
        } else {
            flushSplitCounts(sp -> (sp.pos < readStart - 1), splitCounts, srWriter);
        }
        splitCounts.add(splitPosition);
    }

    private void flushSplitCounts(final Predicate<SplitPos> flushablePosition,
                                  final PriorityQueue<SplitPos> splitCounts,
                                  final FeatureSink<SplitReadEvidence> srWriter) {
        while (splitCounts.size() > 0 && flushablePosition.test(splitCounts.peek())) {
            final SplitPos pos = splitCounts.poll();
            int countAtPos = 1;
            while (splitCounts.size() > 0 && splitCounts.peek().equals(pos)) {
                countAtPos++;
                splitCounts.poll();
            }
            final SplitReadEvidence splitRead = new SplitReadEvidence(sampleName, currentChrom,
                    pos.pos, countAtPos, pos.direction.equals(POSITION.RIGHT));
            srWriter.write(splitRead);
        }
    }

    private SplitPos getSplitPosition( final GATKRead read ) {
        if (read.getCigar().getFirstCigarElement().getOperator() == CigarOperator.M) {
            final int matchLength = read.getCigar().getCigarElements().stream()
                    .filter(e -> e.getOperator().consumesReferenceBases())
                    .mapToInt(CigarElement::getLength).sum();
            return new SplitPos(read.getStart() + matchLength, POSITION.RIGHT);
        } else if (read.getCigar().getLastCigarElement().getOperator() == CigarOperator.M) {
            return new SplitPos(read.getStart(), POSITION.LEFT);
        }
        return new SplitPos(-1, POSITION.MIDDLE);
    }

    // Note that a read counts as soft-clipped only when exactly one of its ends is clipped:
    // reads clipped at both ends are not treated as split-read evidence.
    private boolean isSoftClipped( final GATKRead read ) {
        final CigarOperator firstOperator = read.getCigar().getFirstCigarElement().getOperator();
        final CigarOperator lastOperator = read.getCigar().getLastCigarElement().getOperator();
        return (firstOperator == CigarOperator.SOFT_CLIP && lastOperator != CigarOperator.SOFT_CLIP) ||
                (firstOperator != CigarOperator.SOFT_CLIP && lastOperator == CigarOperator.SOFT_CLIP);
    }
    @Override
    public Object onTraversalSuccess() {
        flushSplitCounts(splitPos -> true, splitPosBuffer, srWriter);
        flushDiscordantReadPairs();
        if ( siteDepthCounter != null ) {
            siteDepthCounter.close();
        }
        if ( depthEvidenceCollector != null ) {
            depthEvidenceCollector.close();
            if ( depthEvidenceSummaryFilename != null ) {
                depthEvidenceCollector.reportSummaryStats(depthEvidenceSummaryFilename, sampleName);
            }
        }
        return null;
    }

    @Override
    public void closeTool() {
        super.closeTool();
        if ( peWriter != null ) {
            peWriter.close();
        }
        if ( srWriter != null ) {
            srWriter.close();
        }
    }

    enum POSITION {
        LEFT ("left"),
        MIDDLE ("middle"),
        RIGHT ("right");

        private final String description;

        POSITION(final String description) {
            this.description = description;
        }

        public String getDescription() { return description; }
    }

    @VisibleForTesting final static class DiscordantRead {
        private boolean readReverseStrand;
        private boolean mateReverseStrand;
        private String contig;
        private int start;
        private String mateContig;
        private int mateStart;
        private String name;

        public DiscordantRead(final GATKRead read) {
            this.readReverseStrand = read.isReverseStrand();
            this.mateReverseStrand = read.mateIsReverseStrand();
            this.contig = read.getContig();
            this.start = read.getStart();
            this.mateContig = read.getMateContig();
            this.mateStart = read.getMateStart();
            this.name = read.getName();
        }

        public boolean isReadReverseStrand() { return readReverseStrand; }
        public void setReadReverseStrand(final boolean readReverseStrand) { this.readReverseStrand = readReverseStrand; }
        public boolean isMateReverseStrand() { return mateReverseStrand; }
        public void setMateReverseStrand(final boolean mateReverseStrand) { this.mateReverseStrand = mateReverseStrand; }
        public String getContig() { return contig; }
        public void setContig(final String contig) { this.contig = contig; }
        public int getStart() { return start; }
        public void setStart(final int start) { this.start = start; }
        public String getMateContig() { return mateContig; }
        public void setMateContig(final String mateContig) { this.mateContig = mateContig; }
        public int getMateStart() { return mateStart; }
        public void setMateStart(final int mateStart) { this.mateStart = mateStart; }
        public String getName() { return name; }
        public void setName(final String name) { this.name = name; }

        @Override
        public boolean equals(final Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;

            final DiscordantRead that = (DiscordantRead) o;

            if (readReverseStrand != that.readReverseStrand) return false;
            if (mateReverseStrand != that.mateReverseStrand) return false;
            if (start != that.start) return false;
            if (mateStart != that.mateStart) return false;
            if ( !Objects.equals(contig, that.contig) ) return false;
            if ( !Objects.equals(mateContig, that.mateContig) ) return false;
            return Objects.equals(name, that.name);
        }

        @Override
        public int hashCode() {
            int result = (readReverseStrand ? 1 : 0);
            result = 31 * result + (mateReverseStrand ? 1 : 0);
            result = 31 * result + (contig != null ? contig.hashCode() : 0);
            result = 31 * result + start;
            result = 31 * result + (mateContig != null ? mateContig.hashCode() : 0);
            result = 31 * result + mateStart;
            result = 31 * result + (name != null ? name.hashCode() : 0);
            return result;
        }
    }
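
    // Two SplitPos instances are equal when they name the same clip position and direction;
    // flushSplitCounts() relies on this to collapse duplicate queue entries into a single count.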
    @VisibleForTesting final static class SplitPos {
        public POSITION direction;
        public int pos;

        public SplitPos(final int start, final POSITION direction) {
            this.pos = start;
            this.direction = direction;
        }

        @Override
        public boolean equals(final Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;

            final SplitPos splitPos = (SplitPos) o;

            if (pos != splitPos.pos) return false;
            return direction.ordinal() == splitPos.direction.ordinal();
        }

        @Override
        public int hashCode() {
            int result = direction != null ? direction.ordinal() : 0;
            result = 31 * result + pos;
            return result;
        }
    }

    @VisibleForTesting final static class SplitPosComparator implements Comparator<SplitPos> {
        @Override
        public int compare(final SplitPos o1, final SplitPos o2) {
            if (o1.pos != o2.pos) {
                return Integer.compare(o1.pos, o2.pos);
            } else {
                return o1.direction.compareTo(o2.direction);
            }
        }
    }

    @VisibleForTesting final static class DiscordantReadComparator implements Comparator<DiscordantRead> {
        private final Comparator<DiscordantRead> internalComparator;

        public DiscordantReadComparator(final SAMSequenceDictionary sequenceDictionary) {
            internalComparator = Comparator
                    .comparing((DiscordantRead r) -> sequenceDictionary.getSequenceIndex(r.getContig()))
                    .thenComparing(DiscordantRead::getStart)
                    .thenComparing(DiscordantRead::isReadReverseStrand)
                    .thenComparing((DiscordantRead r) -> sequenceDictionary.getSequenceIndex(r.getMateContig()))
                    .thenComparing(DiscordantRead::getMateStart)
                    .thenComparing(DiscordantRead::isMateReverseStrand);
        }

        @Override
        public int compare(final DiscordantRead o1, final DiscordantRead o2) {
            return internalComparator.compare(o1, o2);
        }
    }

    /**
     * Compares a locus to an interval and indicates whether the locus is
     * upstream (returns -1), within (returns 0), or downstream (returns 1) of the interval.
     */
    @VisibleForTesting final static class LocusComparator {
        private final SAMSequenceDictionary dict;

        public LocusComparator( final SAMSequenceDictionary dict ) {
            this.dict = dict;
        }

        public int compareLocus( final String contig, final int position, final Locatable loc ) {
            int cmp = Integer.compare(dict.getSequenceIndex(contig),
                    dict.getSequenceIndex(loc.getContig()));
            if ( cmp == 0 ) {
                if ( position < loc.getStart() ) {
                    cmp = -1;
                } else if ( position > loc.getEnd() ) {
                    cmp = 1;
                }
            }
            return cmp;
        }
    }
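
    // SiteDepthCounter streams SNP loci from the input VCF in lock step with the read traversal:
    // queued loci upstream of the current read are flushed to the writer, and new loci are pulled
    // from the VCF until one lies past the read, so the queue always brackets the read's span.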
    @VisibleForTesting final static class SiteDepthCounter {
        private final LocusComparator lComp;
        private final String sampleName;
        private final FeatureSink<SiteDepth> writer;
        private final int minMapQ;
        private final int minQ;
        private final Iterator<VariantContext> snpSourceItr;
        private final Deque<SiteDepth> siteDepthQueue;

        public SiteDepthCounter( final SAMSequenceDictionary dict,
                                 final String sampleName,
                                 final int compressionLevel,
                                 final GATKPath inputPath,
                                 final GATKPath outputPath,
                                 final int minMapQ,
                                 final int minQ ) {
            this.lComp = new LocusComparator(dict);
            this.sampleName = sampleName;
            final String outputFilename = outputPath.toPath().toString();
            final SiteDepthBCICodec bciCodec = new SiteDepthBCICodec();
            final List<String> sampleNames = Collections.singletonList(sampleName);
            if ( bciCodec.canDecode(outputFilename) ) {
                this.writer = bciCodec.makeSink(outputPath, dict, sampleNames, compressionLevel);
            } else {
                final SiteDepthCodec codec = new SiteDepthCodec();
                if ( !codec.canDecode(outputFilename) ) {
                    throw new UserException("Attempting to write site depth evidence to a file that " +
                            "can't be read as site depth evidence: " + outputFilename + ". The file " +
                            "name should end with \".sd.txt\", \".sd.txt.gz\", or \".sd.bci\".");
                }
                this.writer = codec.makeSink(outputPath, dict, sampleNames, compressionLevel);
            }
            this.minMapQ = minMapQ;
            this.minQ = minQ;
            final FeatureDataSource<VariantContext> snpSource =
                    new FeatureDataSource<>(inputPath.toPath().toString(), null,
                            FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, VariantContext.class);
            dict.assertSameDictionary(snpSource.getSequenceDictionary());
            this.snpSourceItr = new BAFSiteIterator(snpSource.iterator());
            this.siteDepthQueue = new ArrayDeque<>(100);
            readNextLocus();
        }

        public void apply( final GATKRead read ) {
            if ( read.getMappingQuality() < minMapQ || siteDepthQueue.isEmpty() ) {
                return;
            }

            // clean queue of SiteDepths that precede the current read
            final SimpleInterval readLoc = new SimpleInterval(read.getContig(), read.getStart(), read.getEnd());
            while ( true ) {
                final SiteDepth siteDepth = siteDepthQueue.getFirst();
                if ( lComp.compareLocus(siteDepth.getContig(), siteDepth.getStart(), readLoc) >= 0 ) {
                    break;
                }
                writer.write(siteDepthQueue.removeFirst());
                if ( siteDepthQueue.isEmpty() ) {
                    if ( !readNextLocus() ) {
                        return;
                    }
                }
            }

            // make sure that the last SiteDepth in the queue occurs after the current read,
            // if such a SiteDepth is available
            while ( true ) {
                final SiteDepth siteDepth = siteDepthQueue.getLast();
                if ( lComp.compareLocus(siteDepth.getContig(), siteDepth.getStart(), readLoc) > 0 ||
                        !readNextLocus() ) {
                    break;
                }
            }

            walkReadMatches(read);
        }

        private void walkReadMatches( final GATKRead read ) {
            walkReadMatches(read, minQ, siteDepthQueue, lComp);
        }

        static void walkReadMatches( final GATKRead read,
                                     final int minQ,
                                     final Iterable<SiteDepth> sites,
                                     final LocusComparator locusComparator ) {
            int opStart = read.getStart();
            int readIdx = 0;
            final byte[] calls = read.getBasesNoCopy();
            final byte[] quals = read.getBaseQualitiesNoCopy();
            for ( final CigarElement cigEle : read.getCigar().getCigarElements() ) {
                final int eleLen = cigEle.getLength();
                final CigarOperator cigOp = cigEle.getOperator();
                if ( cigOp.isAlignment() ) {
                    final int opEnd = opStart + eleLen - 1;
                    final SimpleInterval opLoc = new SimpleInterval(read.getContig(), opStart, opEnd);
                    for ( final SiteDepth siteDepth : sites ) {
                        final String siteContig = siteDepth.getContig();
                        final int sitePos = siteDepth.getStart();
                        final int cmp = locusComparator.compareLocus(siteContig, sitePos, opLoc);
                        if ( cmp > 0 ) {
                            break;
                        }
                        // don't count base calls that aren't really part of the template
                        // (if the template is shorter than the read, we can call into adaptor sequence)
                        if ( cmp == 0 && !isBaseInsideAdaptor(read, sitePos) ) {
                            final int callIdx = readIdx + sitePos - opStart;
                            if ( quals[callIdx] < minQ ) {
                                continue;
                            }
                            final Nucleotide call = Nucleotide.decode(calls[callIdx]);
                            if ( call.isStandard() ) {
                                siteDepth.observe(call.ordinal());
                            }
                        }
                    }
                }
                if ( cigOp.consumesReadBases() ) {
                    readIdx += eleLen;
                }
                if ( cigOp.consumesReferenceBases() ) {
                    opStart += eleLen;
                }
            }
        }

        public void close() {
            do {
                while ( !siteDepthQueue.isEmpty() ) {
                    writer.write(siteDepthQueue.removeFirst());
                }
            } while ( readNextLocus() );
            writer.close();
        }

        private boolean readNextLocus() {
            if ( !snpSourceItr.hasNext() ) {
                return false;
            }
            final SiteDepth siteDepth = new SiteDepth(snpSourceItr.next(), sampleName);
            siteDepthQueue.add(siteDepth);
            return true;
        }
    }
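
    // BAFSiteIterator filters the VCF stream down to biallelic SNPs, skipping any record that
    // duplicates the locus of the previously returned record.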
    public final static class BAFSiteIterator implements Iterator<VariantContext> {
        final Iterator<VariantContext> vcIterator;
        VariantContext last;
        VariantContext next;

        public BAFSiteIterator( final Iterator<VariantContext> vcIterator ) {
            this.vcIterator = vcIterator;
            advance();
        }

        @Override
        public boolean hasNext() {
            if ( next != null ) {
                return true;
            }
            advance();
            return next != null;
        }

        @Override
        public VariantContext next() {
            if ( !hasNext() ) {
                throw new NoSuchElementException("baf sites iterator is exhausted");
            }
            final VariantContext result = next;
            last = next;
            next = null;
            return result;
        }

        private void advance() {
            while ( vcIterator.hasNext() ) {
                final VariantContext vc = vcIterator.next();
                // if it's a SNP, it's biallelic, and it occurs at a new locus
                if ( vc.isSNP() && vc.isBiallelic() &&
                        (last == null || !last.getContig().equals(vc.getContig()) ||
                                last.getStart() < vc.getStart()) ) {
                    next = vc;
                    break;
                }
            }
        }
    }

    @VisibleForTesting final static class CountCounter {
        private final int[] lowCounts;
        private final SortedMap<Integer, Integer> highCounts;
        private long nCounts;
        private long totalCounts;

        public CountCounter() {
            lowCounts = new int[3000];
            highCounts = new TreeMap<>();
        }

        public void addCount( final int count ) {
            nCounts += 1;
            totalCounts += count;
            if ( count < lowCounts.length ) {
                lowCounts[count] += 1;
            } else {
                highCounts.compute(count, (k, v) -> v == null ? 1 : v + 1);
            }
        }

        public int getNZeroCounts() { return lowCounts[0]; }

        public double getMeanCount() { return (double)totalCounts / nCounts; }

        public int[] getQuartiles() {
            final int[] quartiles = new int[4];
            int counts = 0;
            int quartile = 0;
            long targetCounts = (nCounts + 3) / 4;
            for ( int idx = 0; idx != lowCounts.length; ++idx ) {
                counts += lowCounts[idx];
                while ( counts >= targetCounts ) {
                    quartiles[quartile++] = idx;
                    targetCounts = ((quartile + 1) * nCounts + 3) / 4;
                }
            }
            for ( final Map.Entry<Integer, Integer> entry : highCounts.entrySet() ) {
                counts += entry.getValue();
                while ( counts >= targetCounts ) {
                    quartiles[quartile++] = entry.getKey();
                    targetCounts = ((quartile + 1) * nCounts + 3) / 4;
                }
            }
            return quartiles;
        }
    }
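
    // DepthEvidenceCollector walks the sorted input intervals in step with the read traversal,
    // counting the reads that start in each interval and tallying the per-interval counts in a
    // CountCounter so that summary statistics (quartiles, mean) can be reported at the end.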
    @VisibleForTesting final static class DepthEvidenceCollector {
        private final LocusComparator lComp;
        private final CountCounter countCounter;
        private final FeatureSink<DepthEvidence> writer;
        private final Iterator<Feature> intervalIterator;
        private final int minMapQ;
        private DepthEvidence depthEvidence;
        private long summedIntervalLengths;
        private long nIntervals;

        public DepthEvidenceCollector( final SAMSequenceDictionary dict,
                                       final String sampleName,
                                       final int cmprLevel,
                                       final GATKPath inputIntervalsPath,
                                       final GATKPath outputDepthEvidencePath,
                                       final int minMapQ ) {
            lComp = new LocusComparator(dict);
            countCounter = new CountCounter();
            final String outputFilename = outputDepthEvidencePath.toPath().toString();
            final DepthEvidenceBCICodec bciCodec = new DepthEvidenceBCICodec();
            final List<String> sampleNames = Collections.singletonList(sampleName);
            if ( bciCodec.canDecode(outputFilename) ) {
                writer = bciCodec.makeSink(outputDepthEvidencePath, dict, sampleNames, cmprLevel);
            } else {
                final DepthEvidenceCodec codec = new DepthEvidenceCodec();
                if ( !codec.canDecode(outputFilename) ) {
                    throw new UserException("Attempting to write depth evidence to a file that " +
                            "can't be read as depth evidence: " + outputFilename + ". The file " +
                            "name should end with \".rd.txt\", \".rd.txt.gz\", or \".rd.bci\".");
                }
                writer = codec.makeSink(outputDepthEvidencePath, dict, sampleNames, cmprLevel);
            }
            final FeatureDataSource<Feature> intervalSource =
                    new FeatureDataSource<>(inputIntervalsPath.toPath().toString());
            this.intervalIterator = intervalSource.iterator();
            if ( !intervalIterator.hasNext() ) {
                throw new UserException(inputIntervalsPath + " contains no intervals.");
            }
            this.minMapQ = minMapQ;
            depthEvidence = new DepthEvidence(intervalIterator.next(), new int[1]);
            summedIntervalLengths += depthEvidence.getLengthOnReference();
            nIntervals += 1;
        }

        void apply( final GATKRead read ) {
            if ( read.getMappingQuality() < minMapQ ) {
                return;
            }
            while ( depthEvidence != null ) {
                final int cmp = lComp.compareLocus(read.getContig(), read.getStart(), depthEvidence);
                if ( cmp < 0 ) {
                    // if read is upstream of interval of interest, nothing to do
                    return;
                }
                if ( cmp == 0 ) {
                    // if read is in the interval of interest, count it
                    depthEvidence.getCounts()[0] += 1;
                    return;
                }
                // read is downstream of interval
                writer.write(depthEvidence);
                countCounter.addCount(depthEvidence.getCounts()[0]);
                if ( !intervalIterator.hasNext() ) {
                    depthEvidence = null;
                    return;
                }
                depthEvidence = new DepthEvidence(intervalIterator.next(), new int[1]);
                summedIntervalLengths += depthEvidence.getLengthOnReference();
                nIntervals += 1;
            }
        }

        void close() {
            if ( depthEvidence != null ) {
                writer.write(depthEvidence);
                countCounter.addCount(depthEvidence.getCounts()[0]);
                final int[] emptyCount = new int[1];
                while ( intervalIterator.hasNext() ) {
                    writer.write(new DepthEvidence(intervalIterator.next(), emptyCount));
                    countCounter.addCount(0);
                }
            }
            writer.close();
        }

        void reportSummaryStats( final GATKPath summaryPath, final String sampleName ) {
            try ( final BufferedWriter writer =
                          new BufferedWriter(new OutputStreamWriter(summaryPath.getOutputStream())) ) {
                final int[] quartiles = countCounter.getQuartiles();
                writer.write("rd_q25\t" + sampleName + "\t" + quartiles[0]);
                writer.newLine();
                writer.write("rd_q50\t" + sampleName + "\t" + quartiles[1]);
                writer.newLine();
                writer.write("rd_q75\t" + sampleName + "\t" + quartiles[2]);
                writer.newLine();
                final String val = String.format("%.2f", countCounter.getMeanCount());
                writer.write("rd_mean\t" + sampleName + "\t" + val);
                writer.newLine();
                writer.write("rd_num_intervals\t" + sampleName + "\t" + nIntervals);
                writer.newLine();
                writer.write("rd_intervals_size\t" + sampleName + "\t" + summedIntervalLengths);
                writer.newLine();
            } catch ( final IOException ioe ) {
                throw new UserException("Can't write depth evidence summary statistics to " + summaryPath);
            }
        }
    }
}



