All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.coverage.CallableLoci Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.coverage;


import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.LocusWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.exceptions.GATKException;
import htsjdk.samtools.util.Locatable;

import java.util.ArrayList;
import java.util.List;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.stream.Collectors;
import org.broadinstitute.hellbender.utils.Utils;
import htsjdk.samtools.util.Lazy;
import java.util.EnumMap;

/**
 * Collect statistics on callable, uncallable, poorly mapped, and other parts of the genome
 *
 * 

* A very common question about a NGS set of reads is what areas of the genome are considered callable. This tool * considers the coverage at each locus and emits either a per base state or a summary interval BED file that * partitions the genomic intervals into the following callable states: *

*
REF_N
*
The reference base was an N, which is not considered callable the GATK
*
PASS
*
The base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
*
NO_COVERAGE
*
Absolutely no reads were seen at this locus, regardless of the filtering parameters
*
LOW_COVERAGE
*
There were fewer than min. depth bases at the locus, after applying filters
*
EXCESSIVE_COVERAGE
*
More than --max-depth read at the locus, indicating some sort of mapping problem
*
POOR_MAPPING_QUALITY
*
More than --max-fraction-of-reads-with-low-mapq at the locus, indicating a poor mapping quality of the reads
*
*

*

*

Input

*

* A BAM file containing exactly one sample. *

*

*

Output

*

* A file with the callable status covering each base and a table of callable status x count of all examined bases *

*

Usage example

*
 *  gatk CallableLoci \
 *      -I myreads.bam \
 *      -R myreference.fasta \
 *      -O callable_status.bed \    
 *      --summary table.txt 
 * 
*

* would produce a BED file that looks like: *

 *     20 10000000 10000864 PASS
 *     20 10000865 10000985 POOR_MAPPING_QUALITY
 *     20 10000986 10001138 PASS
 *     20 10001139 10001254 POOR_MAPPING_QUALITY
 *     20 10001255 10012255 PASS
 *     20 10012256 10012259 POOR_MAPPING_QUALITY
 *     20 10012260 10012263 PASS
 *     20 10012264 10012328 POOR_MAPPING_QUALITY
 *     20 10012329 10012550 PASS
 *     20 10012551 10012551 LOW_COVERAGE
 *     20 10012552 10012554 PASS
 *     20 10012555 10012557 LOW_COVERAGE
 *     20 10012558 10012558 PASS
 * 
* as well as a summary table that looks like: *

*

 *                        state nBases
 *                        REF_N 0
 *                         PASS 996046
 *                  NO_COVERAGE 121
 *                 LOW_COVERAGE 928
 *           EXCESSIVE_COVERAGE 0
 *         POOR_MAPPING_QUALITY 2906
 * 
* * @author Mark DePristo / Jonn Smith * @since May 7, 2010 / Nov 1, 2024 */ @ExperimentalFeature @DocumentedFeature(groupName = "Coverage Analysis") @CommandLineProgramProperties( summary = "Collect statistics on callable, uncallable, poorly mapped, and other parts of the genome", oneLineSummary = "Determine callable status of loci", programGroup = CoverageAnalysisProgramGroup.class ) public class CallableLoci extends LocusWalker { @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (BED or per-base format)") private GATKPath outputFile = null; /** * Callable loci summary counts will be written to this file. */ @Argument(fullName = "summary", doc = "Name of file for output summary") private GATKPath summaryFile; /** * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to * calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states */ @Argument(fullName = "max-low-mapq", shortName = "mlmq", minValue = 0, maxValue = 255, optional = true, doc = "Maximum value for MAPQ to be considered a problematic mapped read") private int maxLowMAPQ = 1; /** * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS * state. */ @Argument(fullName = "min-mapping-quality", shortName = "mmq", minValue = 0, maxValue = 255, optional = true, doc = "Minimum mapping quality of reads to count towards depth") private int minMappingQuality = 10; /** * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state */ @Argument(fullName = "min-base-quality", shortName = "mbq", minValue = 0, maxValue = 255, optional = true, doc = "Minimum quality of bases to count towards depth") private int minBaseQuality = 20; /** * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this * value and is less than maxDepth the site is considered PASS. */ @Advanced @Argument(fullName = "min-depth", shortName = "min-depth", minValue = 0, optional = true, doc = "Minimum QC+ read depth before a locus is considered callable") private int minDepth = 4; /** * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH */ @Argument(fullName = "max-depth", shortName = "max-depth", optional = true, doc = "Maximum read depth before a locus is considered poorly mapped") private Integer maxDepth = null; /** * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads * covering the site. */ @Advanced @Argument(fullName = "min-depth-for-low-mapq", shortName = "mdflmq", optional = true, doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped") private int minDepthLowMAPQ = 10; /** * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality * exceeds this fraction then the site has POOR_MAPPING_QUALITY. */ @Argument(fullName = "max-fraction-of-reads-with-low-mapq", shortName = "frlmq", optional = true, doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped") private double maxLowMAPQFraction = 0.1; /** * The output of this tool will be written in this format. The recommended option is BED. */ @Advanced @Argument(fullName = "format", shortName = "format", optional = true, doc = "Output format") private OutputFormat outputFormat = OutputFormat.BED; private OutputStreamWriter outputStream = null; private OutputStreamWriter summaryStream = null; public enum OutputFormat { BED, STATE_PER_BASE } public enum State { REF_N, CALLABLE, NO_COVERAGE, LOW_COVERAGE, EXCESSIVE_COVERAGE, POOR_MAPPING_QUALITY } private BaseState currentState = null; private final StateCounter stateCounter = new StateCounter(); protected static class BaseState implements Locatable { private Locatable interval; private final State state; public BaseState(Locatable interval, State state) { this.interval = interval; this.state = state; } public State getState() { return state; } @Override public String getContig() { return interval.getContig(); } @Override public int getStart() { return interval.getStart(); } @Override public int getEnd() { return interval.getEnd(); } public void updateInterval(final Locatable newInterval) { this.interval = new SimpleInterval( interval.getContig(), interval.getStart(), newInterval.getEnd() ); } @Override public String toString() { return toBedString(); } public String toBedString() { // BED format is 0-based, so subtract 1 from start return String.format("%s\t%d\t%d\t%s", interval.getContig(), interval.getStart() - 1, interval.getEnd(), state); } } @Override public boolean requiresReference() { return true; } @Override public boolean emitEmptyLoci() { return true; } @Override public boolean includeNs() { return true; } @Override // This is the default set of filters for CallableLoci as implemented in the GATK3 version of this tool. public List getDefaultReadFilters() { final List defaultFilters = new ArrayList<>(6); defaultFilters.add(new ReadFilterLibrary.GoodCigarReadFilter()); defaultFilters.add(new ReadFilterLibrary.NotDuplicateReadFilter()); defaultFilters.add(new ReadFilterLibrary.PassesVendorQualityCheckReadFilter()); defaultFilters.add(new WellformedReadFilter()); defaultFilters.add(new ReadFilterLibrary.PrimaryLineReadFilter()); defaultFilters.add(new ReadFilterLibrary.MappedReadFilter()); return defaultFilters; } @Override public void onTraversalStart() { // Validate sample count final List sampleList = getHeaderForReads().getReadGroups().stream() .map(rg -> rg.getSample()) .distinct() .collect(Collectors.toList()); if (sampleList.size() != 1) { throw new UserException.BadInput("CallableLoci only works for a single sample. Found " + sampleList.size() + " samples (" + String.join(", ", sampleList) + ")."); } outputStream = new OutputStreamWriter(outputFile.getOutputStream()); summaryStream = new OutputStreamWriter(summaryFile.getOutputStream()); } private State getCurrentState(ReferenceContext referenceContext, AlignmentContext alignmentContext) { State state; if (BaseUtils.isNBase(referenceContext.getBase())) { state = State.REF_N; } else { int rawDepth = 0, QCDepth = 0, lowMAPQDepth = 0; for (PileupElement e : alignmentContext.getBasePileup()) { rawDepth++; if (e.getMappingQual() <= maxLowMAPQ) { lowMAPQDepth++; } if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) { QCDepth++; } } if (rawDepth == 0) { state = State.NO_COVERAGE; } else if ((rawDepth >= minDepthLowMAPQ) && (((double)lowMAPQDepth) / ((double)rawDepth) >= maxLowMAPQFraction)) { state = State.POOR_MAPPING_QUALITY; } else if (QCDepth < minDepth) { state = State.LOW_COVERAGE; } else if (maxDepth != null && rawDepth >= maxDepth) { state = State.EXCESSIVE_COVERAGE; } else { state = State.CALLABLE; } } return state; } @Override public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) { final State state = getCurrentState(referenceContext, alignmentContext); BaseState callableState = new BaseState(alignmentContext, state); // Update counts using the counter stateCounter.increment(state); try { if (outputFormat == OutputFormat.STATE_PER_BASE) { outputStream.write(callableState.toBedString() + "\n"); } else { // BED format - integrate adjacent regions with same state if (currentState == null) { currentState = callableState; } else if (callableState.getStart() != currentState.getEnd() + 1 || currentState.getState() != callableState.getState()) { outputStream.write(currentState.toBedString() + "\n"); currentState = callableState; } else { currentState.updateInterval(callableState); } } } catch (IOException e) { throw new GATKException("Error writing to output stream", e); } } @Override public Object onTraversalSuccess() { try{ // Print final state for BED format if (outputFormat == OutputFormat.BED && currentState != null) { outputStream.write(currentState.toBedString() + "\n"); } // Write summary statistics using the counter summaryStream.write(String.format("%30s %s%n", "state", "nBases")); for (State state : State.values()) { summaryStream.write(String.format("%30s %d%n", state, stateCounter.getCount(state))); } } catch (IOException e) { throw new GATKException("Error writing to summary stream", e); } return null; } @Override public void closeTool() { try { outputStream.close(); summaryStream.close(); } catch (IOException e) { throw new GATKException("Error closing output streams", e); } } private static class StateCounter { private final EnumMap counts = new EnumMap<>(State.class); public StateCounter() { for (State state : State.values()) { counts.put(state, 0L); } } public void increment(State state) { counts.put(state, counts.get(state) + 1); } public long getCount(State state) { return counts.get(state); } public EnumMap getCounts() { return new EnumMap<>(counts); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy