
package org.broadinstitute.hellbender.tools.walkers.coverage;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import java.util.*;
/**
* Utility class for the DepthOfCoverage tool
*/
class CoverageUtils {
// private constructor to prevent instantiation
private CoverageUtils() {}
public enum CountPileupType {
/**
* Count all reads independently (even if from the same fragment).
*/
COUNT_READS,
/**
* Count all fragments (even if the reads that compose the fragment are not consistent at that base).
*/
COUNT_FRAGMENTS,
/**
* Count all fragments (but only if the reads that compose the fragment are consistent at that base).
*/
COUNT_FRAGMENTS_REQUIRE_SAME_BASE
}
/**
* Method takes a ReadGroup record and a partition type and returns the appropriate summary string for displaying in output tables.
* Since each partition type has its own pattern for separating out readgroups this method is intended to handle all of the
* processing for regroups so that outputs can be properly partitioned in all cases.
*
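* <p>For illustration (hypothetical read group values, not taken from the source):</p>
* <pre>{@code
* // rg has sample "NA12878" and platform "ILLUMINA"
* getTypeID(rg, DoCOutputType.Partition.sample_by_platform); // -> "NA12878_pl_ILLUMINA"
* }</pre>
*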
* @param rg readGroupRecord to be processed.
* @param type partition type to generate read group string for.
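* @return the partition-specific summary string for the read group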
*/
public static String getTypeID(final SAMReadGroupRecord rg, final DoCOutputType.Partition type ) {
switch (type) {
case sample:
return rg.getSample();
case readgroup:
return rg.getSample()+"_rg_"+rg.getReadGroupId();
case library:
return rg.getLibrary();
case center:
return rg.getSequencingCenter();
case platform:
return rg.getPlatform();
case sample_by_center:
return rg.getSample()+"_cn_"+rg.getSequencingCenter();
case sample_by_platform:
return rg.getSample()+"_pl_"+rg.getPlatform();
case sample_by_platform_by_center:
return rg.getSample()+"_pl_"+rg.getPlatform()+"_cn_"+rg.getSequencingCenter();
default:
throw new GATKException(String.format("Invalid aggregation type %s", type));
}
}
/**
* Returns the counts of bases from reads with maxBaseQ >= base quality >= minBaseQ in the context
* as an array of ints, indexed by the index fields of BaseUtils. These counts are computed separately for each
* {@link DoCOutputType.Partition} and then by type ID within each partition.
*
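* <p>Sketch of a call site (argument values here are illustrative assumptions, not taken from the source):</p>
* <pre>{@code
* Map<DoCOutputType.Partition, Map<String, int[]>> counts =
*     CoverageUtils.getBaseCountsByPartition(context, (byte) 0, (byte) 127,
*         CountPileupType.COUNT_READS, EnumSet.of(DoCOutputType.Partition.sample), header);
* int[] countsForSample = counts.get(DoCOutputType.Partition.sample).get("NA12878");
* }</pre>
*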
* @param context Alignment context to compute base counts over
* @param minBaseQ minimum base quality for a base to be counted towards depth
* @param maxBaseQ maximum base quality for a base to be counted towards depth
* @param countType flag for controlling whether to count fragments or independent reads (currently only COUNT_READS is supported)
* @param types partitions across which to aggregate count information
* @param header header to collect read group associations from
* @return a map from each requested partition type to a map from type ID to base counts
*/
public static Map<DoCOutputType.Partition, Map<String, int[]>>
getBaseCountsByPartition(final AlignmentContext context, final byte minBaseQ, final byte maxBaseQ,
final CountPileupType countType, final Collection<DoCOutputType.Partition> types, final SAMFileHeader header) {
final Map<DoCOutputType.Partition, Map<String, int[]>> countsByIDByType = new HashMap<>();
final Map<SAMReadGroupRecord, int[]> countsByRG = getBaseCountsByReadGroup(context, minBaseQ, maxBaseQ, countType, header);
for (DoCOutputType.Partition t : types ) {
// iterate through the read group counts and build the type associations
for ( Map.Entry<SAMReadGroupRecord, int[]> readGroupCountEntry : countsByRG.entrySet() ) {
String typeID = getTypeID(readGroupCountEntry.getKey(), t);
if ( ! countsByIDByType.containsKey(t) ) {
countsByIDByType.put(t, new HashMap<>());
}
if ( ! countsByIDByType.get(t).containsKey(typeID) ) {
countsByIDByType.get(t).put(typeID, readGroupCountEntry.getValue().clone());
} else {
addCounts(countsByIDByType.get(t).get(typeID), readGroupCountEntry.getValue());
}
}
}
return countsByIDByType;
}
private static void addCounts(int[] updateMe, int[] leaveMeAlone ) {
for ( int index = 0; index < leaveMeAlone.length; index++ ) {
updateMe[index] += leaveMeAlone[index];
}
}
/**
* Takes an AlignmentContext object and extracts all the reads that pass the provided base quality filters, and then
* returns a count breakdown for each base (plus D and N bases) present at the site.
*
* NOTE: this currently doesn't support counting by fragments as was the case in GATK3
*/
private static Map<SAMReadGroupRecord, int[]> getBaseCountsByReadGroup(final AlignmentContext context, final byte minBaseQ, final byte maxBaseQ, final CountPileupType countType, final SAMFileHeader header) {
Map<SAMReadGroupRecord, int[]> countsByRG = new HashMap<>();
Map<String, int[]> countsByRGName = new HashMap<>();
Map<String, SAMReadGroupRecord> RGByName = new HashMap<>();
List<PileupElement> countPileup = new ArrayList<>(context.getBasePileup().size());
switch (countType) {
case COUNT_READS:
for (PileupElement read : context.getBasePileup()) {
if (elementWithinQualRange(read, minBaseQ, maxBaseQ)) {
countPileup.add(read);
}
}
break;
// TODO: reconcile FragmentUtils.create() and its various idiosyncrasies to re-enable this feature; see https://github.com/broadinstitute/gatk/issues/6491
case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters:
throw new UnsupportedOperationException("Fragment based counting is currently unsupported");
case COUNT_FRAGMENTS_REQUIRE_SAME_BASE:
throw new UnsupportedOperationException("Fragment based counting is currently unsupported");
default:
throw new UserException("Must use valid CountPileupType");
}
for (PileupElement e : countPileup) {
SAMReadGroupRecord readGroup = ReadUtils.getSAMReadGroupRecord(e.getRead(), header);
Utils.nonNull(readGroup, () -> String.format("Read %s was missing read group information", e.getRead()));
// uniqueReadGroupId is unique across the sample, read group ID, library, and platform unit
String uniqueReadGroupId = readGroup.getSample() + "_" + readGroup.getReadGroupId() + "_" + readGroup.getLibrary() + "_" + readGroup.getPlatformUnit();
int[] counts = countsByRGName.get(uniqueReadGroupId);
if (counts == null) {
counts = new int[6]; // four simple-base slots plus D and N (see updateCounts)
countsByRGName.put(uniqueReadGroupId, counts);
RGByName.put(uniqueReadGroupId, readGroup);
}
updateCounts(counts, e);
}
for (Map.Entry<String, SAMReadGroupRecord> entry : RGByName.entrySet()) {
countsByRG.put(entry.getValue(), countsByRGName.get(entry.getKey()));
}
return countsByRG;
}
// Tests whether the pileup element falls within the provided base quality range (deletions always pass, as they carry no base quality)
private static boolean elementWithinQualRange(final PileupElement e, final byte minBaseQ, final byte maxBaseQ) {
return ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ ) || e.isDeletion();
}
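// Increments the slot in the counts array matching this pileup element: a BaseUtils simple-base
// index for A/C/G/T, or the BaseUtils.Base ordinal for D (deletion) or N.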
private static void updateCounts(int[] counts, PileupElement e) {
if ( e.isDeletion() ) {
counts[BaseUtils.Base.D.ordinal()]++;
} else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) {
counts[BaseUtils.Base.N.ordinal()]++;
} else {
try {
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++;
} catch (ArrayIndexOutOfBoundsException exc) {
throw new UserException("Expected a simple base, but actually received"+(char)e.getBase());
}
}
}
/**
* This method is used to construct the bins we use to compute coverage histograms.
*
* Returns the coverage "leftEndpoints" which correspond to the lowest value inclusive to be included in the given
* histogram bins. This method will attempt to distribute the bins between the specified values based on a logarithmic
* distribution.
*
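* <p>Worked example (values follow from the log-scale formula in the implementation; shown for illustration):</p>
* <pre>{@code
* // length = 499, scale = log10(499) / 4 ~= 0.6745
* calculateCoverageHistogramBinEndpoints(1, 500, 4); // -> [1, 2, 5, 23, 500]
* }</pre>
*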
* @param lower lower bound of the second bin in the histogram (the first bin includes everything below this value)
* @param upper lower bound of the last bin in the histogram (the last bin will correspond to everything over this value)
* @param nBins the number of bins to try and fit between lower and upper (must not exceed upper - lower)
* @return an array corresponding to all of the inclusive left edges for a histogram with the specified parameters.
*/
public static int[] calculateCoverageHistogramBinEndpoints(int lower, int upper, int nBins) {
if ( nBins > upper - lower || lower < 1 ) {
throw new UserException.BadInput("lower must be at least 1 and the number of bins may not exceed upper - lower");
}
int[] binLeftEndpoints = new int[nBins+1];
binLeftEndpoints[0] = lower;
int length = upper - lower;
double scale = Math.log10((double) length)/nBins;
for ( int b = 1; b < nBins ; b++ ) {
int leftEnd = lower + (int) Math.floor(Math.pow(10.0,(b-1.0)*scale));
// todo -- simplify to length^(scale/bins); make non-constant to put bin ends in more "useful"
// todo -- positions on the number line
while ( leftEnd <= binLeftEndpoints[b-1] ) {
leftEnd++;
}
binLeftEndpoints[b] = leftEnd;
}
binLeftEndpoints[binLeftEndpoints.length-1] = upper;
return binLeftEndpoints;
}
/**
* Updates the table of counts of how many *targets* have at least K samples with
* median coverage of at least X.
*
* To that end: iterate over the samples in the DepthOfCoverageStats object and determine how many
* have median coverage > leftEnds[0], how many have median coverage > leftEnds[1],
* and so on. Then this target has at least N, N-1, N-2, ..., 1, 0 samples covered
* to leftEnds[0], at least M, M-1, M-2, ..., 1, 0 samples covered to leftEnds[1],
* and so on.
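*
* <p>Small worked example (sample medians are illustrative assumptions): suppose 3 samples have median
* coverage in bins 0, 2, and 2. All 3 qualify at cutoff bin 0 and 2 qualify at bins 1 and 2, so
* {@code table[0][0]}, {@code table[1][0]}, {@code table[2][0]}, {@code table[0][1]}, {@code table[1][1]},
* {@code table[0][2]}, and {@code table[1][2]} are each incremented by one.</p>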
*/
public static void updateTargetTable(int[][] table, DepthOfCoverageStats stats) {
int[] cutoffs = stats.getEndpoints();
int[] countsOfMediansAboveCutoffs = new int[cutoffs.length+1]; // 0 bin to catch everything
for ( String s : stats.getAllSamples() ) {
int medianBin = getQuantile(stats.getHistograms().get(s),0.5);
for ( int i = 0; i <= medianBin; i ++) {
countsOfMediansAboveCutoffs[i]++;
}
}
for ( int medianBin = 0; medianBin < countsOfMediansAboveCutoffs.length; medianBin++) {
for ( ; countsOfMediansAboveCutoffs[medianBin] > 0; countsOfMediansAboveCutoffs[medianBin]-- ) {
table[countsOfMediansAboveCutoffs[medianBin]-1][medianBin]++;
// the -1 is due to counts being 1-based and offsets being 0-based
}
}
}
/**
* Returns the percentage of the overall histogram dataset that lies at or above a certain bin.
*
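* <p>For instance (illustrative numbers): {@code getPctBasesAbove(new long[]{5, 3, 2}, 1)} returns
* {@code 50.0}, since 5 of the 10 total counts sit at or above bin 1.</p>
*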
* @param histogram histogram to test
* @param bin bin to evaluate
* @return value from 0-100 corresponding to the portion of the total count at or above the given bin
*/
public static double getPctBasesAbove(long[] histogram, int bin) {
long below = 0;
long above = 0;
for ( int index = 0; index < histogram.length; index++) {
if ( index < bin ) {
below+=histogram[index];
} else {
above+=histogram[index];
}
}
return 100*( (double) above )/( above + below );
}
/**
* Returns the bin that corresponds to the provided quantile value.
*
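* <p>Example (illustrative histogram): {@code getQuantile(new long[]{2, 3, 5}, 0.5)} returns
* {@code 1}, since the cumulative count first reaches 5 (= 0.5 * 10) at bin 1.</p>
*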
* @param histogram histogram to compute quantile for
* @param prop Quantile proportion
* @return index of the first histogram bin at which the cumulative count reaches at least total*prop
*/
public static int getQuantile(long[] histogram, double prop) {
Utils.validate(prop >= 0 && prop <= 1, "Quantile proportion must fall between 0.0 and 1.0 inclusive");
long total = MathUtils.sum(histogram);
long counts = 0;
int bin = -1;
while ( counts < prop*total ) {
counts += histogram[bin+1];
bin++;
}
return bin == -1 ? 0 : bin;
}
}