All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.transformers.BQSRReadTransformer Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.transformers;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SAMUtils;
import org.broadinstitute.hellbender.exceptions.UserException.MalformedRead;
import org.broadinstitute.hellbender.tools.ApplyBQSRArgumentCollection;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.collections.NestedIntegerArray;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.recalibration.*;
import org.broadinstitute.hellbender.utils.recalibration.covariates.CovariateKeyCache;
import org.broadinstitute.hellbender.utils.recalibration.covariates.ReadCovariates;
import org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList;

import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import static org.broadinstitute.hellbender.utils.MathUtils.fastRound;
import static org.broadinstitute.hellbender.utils.QualityUtils.boundQual;
import static org.broadinstitute.hellbender.utils.recalibration.RecalDatum.MAX_RECALIBRATED_Q_SCORE;

public final class BQSRReadTransformer implements ReadTransformer {
    private static final long serialVersionUID = 1L;

    private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
    private final RecalibrationTables recalibrationTables;
    private final StandardCovariateList covariates; // list of all covariates to be used in this calculation
    private final SAMFileHeader header;
    
    private final int preserveQLessThan;
    private final double globalQScorePrior;
    private final boolean emitOriginalQuals;

    //These fields are created to avoid redoing these calculations for every read
    private final int totalCovariateCount;
    private final int specialCovariateCount;

    private static final int BASE_SUBSTITUTION_INDEX = EventType.BASE_SUBSTITUTION.ordinal();

    //Note: varargs allocates a new array every time. We'll pre-allocate one array and reuse it to avoid object allocation for every base of every read.
    private final RecalDatum[] empiricalQualCovsArgs;
    private final boolean useOriginalBaseQualities;

    private byte[] staticQuantizedMapping;
    private final CovariateKeyCache keyCache;

    /**
     * Constructor using a GATK Report file
     *
     * @param header header for the reads
     * @param bqsrRecalFile a GATK Report file containing the recalibration information
     * @param args ApplyBQSR args
     */
    public BQSRReadTransformer(final SAMFileHeader header, final File bqsrRecalFile, final ApplyBQSRArgumentCollection args) {
        this(header, new RecalibrationReport(bqsrRecalFile), args);
    }

    /**
     * Constructor using separate RecalibrationTables, QuantizationInfo, and StandardCovariateList
     *
     * @param header header for the reads
     * @param recalibrationTables recalibration tables output from BQSR
     * @param quantizationInfo quantization info
     * @param covariates standard covariate set
     * @param args ApplyBQSR arguments
     */
    private BQSRReadTransformer(final SAMFileHeader header, final RecalibrationTables recalibrationTables, final QuantizationInfo quantizationInfo, final StandardCovariateList covariates, final ApplyBQSRArgumentCollection args) {
        this.header = header;
        this.recalibrationTables = recalibrationTables;
        this.covariates = covariates;
        this.quantizationInfo = quantizationInfo;

        if (args.quantizationLevels == 0) { // quantizationLevels == 0 means no quantization, preserve the quality scores
            quantizationInfo.noQuantization();
        } else if (args.quantizationLevels > 0 && args.quantizationLevels != quantizationInfo.getQuantizationLevels()) { // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report.
            quantizationInfo.quantizeQualityScores(args.quantizationLevels);
        }

        this.preserveQLessThan = args.PRESERVE_QSCORES_LESS_THAN;
        this.globalQScorePrior = args.globalQScorePrior;
        this.emitOriginalQuals = args.emitOriginalQuals;
        this.useOriginalBaseQualities = args.useOriginalBaseQualities;

        // staticQuantizedQuals is entirely separate from the dynamic binning that quantizationLevels, and
        // staticQuantizedQuals does not make use of quantizationInfo
        if(args.staticQuantizationQuals != null && !args.staticQuantizationQuals.isEmpty()) {
            staticQuantizedMapping = constructStaticQuantizedMapping(args.staticQuantizationQuals, args.roundDown);
        }

        totalCovariateCount = covariates.size();
        specialCovariateCount = covariates.numberOfSpecialCovariates();

        //Note: We pre-create the varargs arrays that will be used in the calls. Otherwise we're spending a lot of time allocating those int[] objects
        empiricalQualCovsArgs = new RecalDatum[totalCovariateCount - specialCovariateCount];
        keyCache = new CovariateKeyCache();//one cache per transformer
    }

    /**
     * Constructor using a RecalibrationReport
     *
     * @param header header for the reads
     * @param recalInfo the output of BaseRecalibration, containing the recalibration information
     * @param args a set of arguments to control how bqsr is applied
     */
    public BQSRReadTransformer(final SAMFileHeader header, final RecalibrationReport recalInfo, final ApplyBQSRArgumentCollection args) {
        this(header, recalInfo.getRecalibrationTables(), recalInfo.getQuantizationInfo(), recalInfo.getCovariates(), args);
    }

    /**
     * Recalibrates the base qualities of a read
     * 

* It updates the base qualities of the read with the new recalibrated qualities (for all event types) *

* Implements a serial recalibration of the reads using the combinational table. * First, we perform a positional recalibration, and then a subsequent dinuc correction. *

* Given the full recalibration table, we perform the following preprocessing steps: *

* - calculate the global quality score shift across all data [DeltaQ] * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual * - The final shift equation is: *

* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) * * @param originalRead the read to recalibrate */ @Override public GATKRead apply(final GATKRead originalRead) { final GATKRead read = useOriginalBaseQualities ? ReadUtils.resetOriginalBaseQualities(originalRead) : originalRead; if (emitOriginalQuals && ! read.hasAttribute(SAMTag.OQ.name())) { // Save the old qualities if the tag isn't already taken in the read try { read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities())); } catch (final IllegalArgumentException e) { throw new MalformedRead(read, "illegal base quality encountered; " + e.getMessage()); } } final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, header, covariates, false, keyCache); //clear indel qualities read.clearAttribute(ReadUtils.BQSR_BASE_INSERTION_QUALITIES); read.clearAttribute(ReadUtils.BQSR_BASE_DELETION_QUALITIES); // get the keyset for this base using the error model final int[][] fullReadKeySet = readCovariates.getKeySet(EventType.BASE_SUBSTITUTION); // the rg key is constant over the whole read, the global deltaQ is too final int rgKey = fullReadKeySet[0][0]; final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get2Keys(rgKey, BASE_SUBSTITUTION_INDEX); if (empiricalQualRG == null) { return read; } final byte[] quals = read.getBaseQualities(); final int readLength = quals.length; final double epsilon = globalQScorePrior > 0.0 ? globalQScorePrior : empiricalQualRG.getEstimatedQReported(); final NestedIntegerArray qualityScoreTable = recalibrationTables.getQualityScoreTable(); final List quantizedQuals = quantizationInfo.getQuantizedQuals(); //Note: this loop is under very heavy use in applyBQSR. Keep it slim. for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) if (quals[offset] < preserveQLessThan) { continue; } Arrays.fill(empiricalQualCovsArgs, null); //clear the array final int[] keySet = fullReadKeySet[offset]; final RecalDatum empiricalQualQS = qualityScoreTable.get3Keys(keySet[0], keySet[1], BASE_SUBSTITUTION_INDEX); for (int i = specialCovariateCount; i < totalCovariateCount; i++) { if (keySet[i] >= 0) { empiricalQualCovsArgs[i - specialCovariateCount] = recalibrationTables.getTable(i).get4Keys(keySet[0], keySet[1], keySet[i], BASE_SUBSTITUTION_INDEX); } } final double recalibratedQualDouble = hierarchicalBayesianQualityEstimate(epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovsArgs); final byte recalibratedQualityScore = quantizedQuals.get(getRecalibratedQual(recalibratedQualDouble)); // Bin to static quals quals[offset] = staticQuantizedMapping == null ? recalibratedQualityScore : staticQuantizedMapping[recalibratedQualityScore]; } read.setBaseQualities(quals); return read; } // recalibrated quality is bound between 1 and MAX_QUAL private byte getRecalibratedQual(final double recalibratedQualDouble) { return boundQual(fastRound(recalibratedQualDouble), MAX_RECALIBRATED_Q_SCORE); } public static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final RecalDatum... empiricalQualCovs ) { final double globalDeltaQ = empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon; final double deltaQReported = empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon); double deltaQCovariates = 0.0; final double conditionalPrior2 = deltaQReported + globalDeltaQ + epsilon; for( final RecalDatum empiricalQualCov : empiricalQualCovs ) { if (empiricalQualCov != null) { deltaQCovariates += empiricalQualCov.getEmpiricalQuality(conditionalPrior2) - conditionalPrior2; } } return conditionalPrior2 + deltaQCovariates; } /** * Constructs an array that maps particular quantized values to a rounded value in staticQuantizedQuals * * Rounding is done in probability space. When roundDown is true, we simply round down to the nearest * available qual in staticQuantizedQuals * * @param staticQuantizedQuals the list of qualities to round to * @param roundDown round down if true, round to nearest (in probability space) otherwise * @return Array where index representing the quality score to be mapped and the value is the rounded quality score */ public static byte[] constructStaticQuantizedMapping(final List staticQuantizedQuals, final boolean roundDown) { if (staticQuantizedQuals == null || staticQuantizedQuals.isEmpty()){ return createIdentityMatrix(QualityUtils.MAX_QUAL); } Utils.nonNull(staticQuantizedQuals); // Create array mapping that maps quals to their rounded value. final byte[] mapping = new byte[QualityUtils.MAX_QUAL]; Collections.sort(staticQuantizedQuals); // Fill mapping with one-to-one mappings for values between 0 and MIN_USABLE_Q_SCORE // This ensures that quals used as special codes will be preserved for(int i = 0 ; i < QualityUtils.MIN_USABLE_Q_SCORE ; i++) { mapping[i] = (byte) i; } // If only one staticQuantizedQual is given, fill mappings larger than QualityUtils.MAX_QUAL with that value if(staticQuantizedQuals.size() == 1) { final int onlyQual = staticQuantizedQuals.get(0); for(int i = QualityUtils.MIN_USABLE_Q_SCORE ; i < QualityUtils.MAX_QUAL ; i++) { mapping[i] = (byte) onlyQual; } return mapping; } final int firstQual = QualityUtils.MIN_USABLE_Q_SCORE; int previousQual = firstQual; double previousProb = QualityUtils.qualToProb(previousQual); for (final int nextQual : staticQuantizedQuals) { final double nextProb = QualityUtils.qualToProb(nextQual); for (int i = previousQual; i < nextQual; i++) { if (roundDown) { mapping[i] = (byte) previousQual; } else { final double iProb = QualityUtils.qualToProb(i); if (iProb - previousProb > nextProb - iProb) { mapping[i] = (byte) nextQual; } else { mapping[i] = (byte) previousQual; } } } previousQual = nextQual; previousProb = nextProb; } // Round all quals larger than the largest static qual down to the largest static qual for(int i = previousQual; i < QualityUtils.MAX_QUAL ; i++) { mapping[i] = (byte) previousQual; } return mapping; } private static byte[] createIdentityMatrix(int maxQual) { final byte[] bytes = new byte[maxQual]; for (int i = 0; i < bytes.length; i++) { bytes[i] = (byte) i; } return bytes; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy