All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.groundtruth.AddFlowBaseQuality Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.groundtruth;

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

/**
 * Convert quality string that reports probability of indel errors (in flow chemistry) to the quality string that approximates probability of mismatch error.
 *
 * <p>The reference is strictly required when handling CRAM files.</p>
 *
 * <h3>Input</h3>
 * <ul>
 *     <li>Coordinate-sorted and indexed SAM/BAM/CRAM</li>
 * </ul>
 *
 * <h3>Output</h3>
 * <ul>
 *     <li>Coordinate-sorted and indexed SAM/BAM/CRAM</li>
 * </ul>
 *
* * {@GATK.walkertype ReadWalker} */ @CommandLineProgramProperties( summary = "Prints reads from the input SAM/BAM/CRAM file to the SAM/BAM/CRAM file while adding a base quality attribute.", oneLineSummary = "Add base quality attribute to reads in in the SAM/BAM/CRAM file", programGroup = FlowBasedProgramGroup.class ) @ExperimentalFeature @WorkflowProperties public final class AddFlowBaseQuality extends ReadWalker { public static final String MINIMAL_ERROR_RATE_LONG_NAME = "minimal-error-rate"; public static final String MAXIMAL_QUALITY_SCORE_LONG_NAME = "maximal-quality-score"; public static final String REPLACE_QUALITY_MODE_LONG_NAME = "replace-quality-mode"; public static final String BASE_QUALITY_ATTRIBUTE_NAME = "XQ"; public static final String OLD_QUALITY_ATTRIBUTE_NAME = "OQ"; public static final char PHRED_ASCII_BASE = '!'; public static final int ERROR_PROB_BAND_1LESS = 0; public static final int ERROR_PROB_BAND_KEY = 1; public static final int ERROR_PROB_BAND_1MORE = 2; public static final int ERROR_PROB_BANDS = 3; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="Write output to this file") @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION}) public GATKPath output; private SAMFileGATKReadWriter outputWriter; @Argument(fullName = MINIMAL_ERROR_RATE_LONG_NAME, doc = "lower floor for flow error rate values") public double minErrorRate = 1e-3; @Argument(fullName = MAXIMAL_QUALITY_SCORE_LONG_NAME, doc = "clip quality score to the given value)") public int maxQualityScore = 93; @Argument(fullName = REPLACE_QUALITY_MODE_LONG_NAME, doc = "replace existing base qualities while saving previous qualities to OQ (when true) or simply write to BQ (when false) ") public boolean replaceQualityMode = false; @ArgumentCollection public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection(); @Override public void onTraversalStart() { 
outputWriter = createSAMWriter(output, true); // do not trim the boundary homopolymers as the default fbargs.keepBoundaryFlows = true; } @Override public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { outputWriter.addRead(addBaseQuality(read)); } @Override public void closeTool() { if ( outputWriter != null ) { outputWriter.close(); } } /** * this is the actual 'worker' of the tool */ private GATKRead addBaseQuality(final GATKRead read) { // convert to a flow base read final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); final FlowBasedRead fbRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs); final int flowOrderLength = calcFlowOrderLength(rgInfo.flowOrder); // generate base quality final double[] baseErrorProb = generateBaseErrorProbability(fbRead, flowOrderLength); final byte[] phred = convertErrorProbToPhred(baseErrorProb); // install in read if ( !replaceQualityMode ) { read.setAttribute(BASE_QUALITY_ATTRIBUTE_NAME, convertPhredToString(phred)); } else { read.setAttribute(OLD_QUALITY_ATTRIBUTE_NAME, convertPhredToString(read.getBaseQualitiesNoCopy())); read.setBaseQualities(phred); } // return reused read return read; } private byte[] convertErrorProbToPhred(double[] errorProb) { final byte[] phred = new byte[errorProb.length]; for ( int i = 0 ; i < errorProb.length ; i++ ) { if ( errorProb[i] == 0 ) { phred[i] = (byte)(maxQualityScore); } else { phred[i] = (byte)Math.min((maxQualityScore), (int) (-10 * Math.log10(errorProb[i]))); } } return phred; } private String convertPhredToString(final byte[] phred) { final byte[] s = new byte[phred.length]; for ( int i = 0 ; i < phred.length ; i++ ) { s[i] = (byte)(phred[i] + PHRED_ASCII_BASE); } return new String(s); } private int calcFlowOrderLength(String flowOrder) { final int i = flowOrder.indexOf(flowOrder.charAt(0), 1); return (i < 0) ? 
flowOrder.length() : i; } private double[] generateBaseErrorProbability(final FlowBasedRead fbRead, final int flowOrderLength) { /** * access key and error probabilities * for a description of the flow probabilities see {@link FlowBasedRead#flowMatrix} */ final int[] key = fbRead.getKey(); final double[][] errorProbBands = extractErrorProbBands(fbRead, minErrorRate); final double[] result = new double[fbRead.getBasesNoCopy().length]; // loop over hmers via flow key int base = 0; for ( int flow = 0 ; flow < key.length ; flow++ ) { if ( key[flow] != 0 ) { // establish hmer quality final int hmerLength = key[flow]; final double[] hmerBaseErrorProbs = generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength); // fill in // first base of the read gets the original error probability of the hmer, otherwise use computed error probability if ( base == 0 ) { result[base++] = errorProbBands[ERROR_PROB_BAND_KEY][flow]; } else { result[base++] = hmerBaseErrorProbs[0]; // first base, or only base in case of a single base hmer } // for hmers longer than 1 if ( hmerLength > 1 ) { // skip all but last (leave with zero error probability) base += (hmerLength - 2); // fill last base from computed error probability result[base++] = hmerBaseErrorProbs[1]; // last base, if hmer is longer than 1 } // override result for the last base with the orignal hmer error probability if ( base == result.length ) { result[base - 1] = errorProbBands[ERROR_PROB_BAND_KEY][flow]; } } } return result; } // extract error probability bands. middle (1) band is the key prob. 
// lower (0) and high (2) are corresponding to -1 and +1 in hmer lengths private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, final double minValue) { // access key final int[] key = flowRead.getKey(); // allocate result double[][] result = new double[ERROR_PROB_BANDS][]; for ( int i = 0 ; i < result.length ; i++ ) { result[i] = new double[key.length]; } for ( int i = 0 ; i < key.length ; i++ ) { // extract key probability result[ERROR_PROB_BAND_KEY][i] = Math.max(flowRead.getProb(i, key[i]), minValue); // -1 if ( key[i] > 0 ) { result[ERROR_PROB_BAND_1LESS][i] = Math.max(flowRead.getProb(i, key[i] - 1), minValue); } else { result[ERROR_PROB_BAND_1LESS][i] = minValue; } // +1 if ( key[i] < flowRead.getMaxHmer() ) { result[ERROR_PROB_BAND_1MORE][i] = Math.max(flowRead.getProb(i, key[i] + 1), minValue); } else { result[ERROR_PROB_BAND_1MORE][i] = minValue; } } return result; } /** * The following functions estimate the error probability for an hmer. specifically two error * probability values are generated: one for the first base of the hmer and another for the * rest of its bases. * * The computation itself is performed in a subsequent function: generateSidedHmerBaseErrorProbability * It iterates over the possible valid combinations of errors and sums them up. 
* * @param key - key (hmer length) in flow space * @param errorProbBands - for each flow (position in the key) three error probabilities are provided: * [0] - for the hmer being one base shorter * [1] - for the hmer to be at its length * [2] - for the hmer to be one base longer * @param flow - the flow (index) for which to generate the probabilities (0 <= flow < key.length) * @param flowOrderLength - the cycle length of of the flow order (usually 4) * @return an array of two probabilities: * [0] - probability for the first base of the hmer * [1] - probability for the rest of the bases of the hmer */ @VisibleForTesting protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) { // result is left/right error probabilities final double[] errorProbs = new double[2]; final int hmerLength = key[flow]; errorProbs[0] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, -1, flowOrderLength); if ( hmerLength != 1 ) { errorProbs[1] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, 1, flowOrderLength); } return errorProbs; } private static double generateSidedHmerBaseErrorProbability(final int[] key, final double[][] errorProbBands, final int flow, final int sideIncr, final int flowOrderLength) { // create a key slice of the area around the flow/hmer. final int minIndex = Math.max(flow - flowOrderLength + 1, 0); final int maxIndex = Math.min(flow + flowOrderLength - 1, key.length - 1); final int[] slice = Arrays.copyOfRange(key, minIndex, maxIndex + 1); final int hmerLength = key[flow]; // walk the flows towards the side until (and including) the first non-zero key // on hmers of length 1 we walk both sides final List slices = new LinkedList<>(); final int[] incrs = (hmerLength != 1) ? 
new int[] { sideIncr } : new int[] { sideIncr, -sideIncr}; for (int incr : incrs) { for (int sideFlow = flow + incr; sideFlow >= 0 && sideFlow < key.length; sideFlow += incr) { // create a alternative key slice by incrementing sideFlow and decrementing flow final int[] altSlice = Arrays.copyOf(slice, slice.length); altSlice[sideFlow - minIndex] += 1; altSlice[flow - minIndex] -= 1; if (sliceIsValid(altSlice, flowOrderLength)) { slices.add(altSlice); } // is the sideFlow non-zero? if so, break if (key[sideFlow] != 0) { break; } } } // at this point, we have a list of valid slices. figure out the error probability for each of them // and compute the base quality final double keyP = sliceProb(slice, minIndex, key, errorProbBands); double sumP = keyP; for ( int[] s : slices ) { sumP += sliceProb(s, minIndex, key, errorProbBands); } final double ep = 1 - (keyP / sumP); return ep; } // compute probability for a slice private static double sliceProb(int[] slice, int minIndex, int[] key, double[][] errorProbBands) { double p = 1.0; for ( int i = 0 ; i < slice.length ; i++ ) { final int band; if ( slice[i] < key[i + minIndex] ) { band = ERROR_PROB_BAND_1LESS; } else if ( slice[i] > key[i + minIndex] ) { band = ERROR_PROB_BAND_1MORE; } else { band = ERROR_PROB_BAND_KEY; } p *= errorProbBands[band][i + minIndex]; } return p; } private static boolean sliceIsValid(final int[] slice, final int flowOrderLength) { // look for strings of consecutive zeros in length of flowOrderLength - 1 int consecutiveZeros = 0; for ( int key : slice ) { if ( key != 0 ) { consecutiveZeros = 0; } else { consecutiveZeros++; if ( consecutiveZeros >= (flowOrderLength - 1) ) { return false; } } } // if here, not found -> valid return true; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy