All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.read.FlowBasedRead Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.utils.read;

import htsjdk.samtools.*;
import htsjdk.samtools.util.SequenceUtil;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedHMMEngine;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Tail;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;

import java.io.*;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.*;
/**
 * Adds flow information to the usual GATKRead. In addition to the usual read data this class keeps flowMatrix,
 * that contains probabilities for alternative hmer calls.
 *
 * Main function deals with parsing flow-specific QUAL representation readFlowMatrix.
 * Note that there is a lot of code that deals with other various formats of the representation (e.g. when the matrix
 * is coded in the tags of the BAM and is given in flow space). This code is not used in production, but was used in
 * development and testing
 *
 * A common usage pattern is to covert a GATKRead into a FlowBasedRead. Additionally,
 * a SAMRecord can also be converted into a FlowBasedRead. Follows a common usage pattern:
 *
 * For a self-contained example of a usage pattern, see {@link FlowBasedReadUtils#convertToFlowBasedRead(GATKRead, SAMFileHeader)}
 *
 **/

public class FlowBasedRead extends SAMRecordToGATKReadAdapter implements GATKRead, Serializable {

    public static final int  MAX_CLASS = 12; //note - this is a historical value to support files with max class is not defined in the header, it is expected that mc tag exists in the CRAM
    public static final String DEFAULT_FLOW_ORDER = "TGCA";
    private static final long serialVersionUID = 42L;
    private static final Logger logger = LogManager.getLogger(FlowBasedRead.class);
    private static final OneShotLogger vestigialOneShotLogger = new OneShotLogger(FlowBasedRead.class);

    // constants
    static private final int MINIMAL_READ_LENGTH = 10; // check if this is the right number
    private final double MINIMAL_CALL_PROB = 0.1;

    // constants for clippingTagContains.
    // The tag is present when the end of the read was clipped at base calling.
    // The value of the tag is a string consisting of any one or more of the following:
    // A - adaptor clipped
    // Q - quality clipped
    // Z - "three zeros" clipped
    final  public static String  CLIPPING_TAG_NAME = "tm";

    final public static String FLOW_MATRIX_TAG_NAME = "tp";
    final public static String FLOW_MATRIX_T0_TAG_NAME = "t0";
    final public static String FLOW_MATRiX_OLD_TAG_KR = "kr";
    final public static String FLOW_MATRiX_OLD_TAG_TI = "ti";
    public static final String FLOW_MATRiX_OLD_TAG_KH = "kh";
    public static final String FLOW_MATRiX_OLD_TAG_KF = "kf";
    public static final String FLOW_MATRiX_OLD_TAG_KD = "kd";

    final public static String MAX_CLASS_READ_GROUP_TAG = "mc";

    final public static String[] HARD_CLIPPED_TAGS = new String[] {
            FLOW_MATRIX_TAG_NAME, FLOW_MATRiX_OLD_TAG_KR, FLOW_MATRiX_OLD_TAG_TI,
            FLOW_MATRiX_OLD_TAG_KH, FLOW_MATRiX_OLD_TAG_KF, FLOW_MATRiX_OLD_TAG_KD,
            FLOW_MATRIX_T0_TAG_NAME
    };

    /**
     * The sam record from which this flow based read originated
     */
    private final SAMRecord samRecord;

    /**
     * The flow key for the read - i.e. lengths of hmers in flow order.
     *
     * For example, assuming a flow order of TGCA, and a forward sequence of GGAAT, the key will be 0,2,0,2,1
     */
    private int[] key;

    /**
     * the mapping of key elements to their origin locations in the sequence. Entry n contains the offset in the sequence
     * where the hmer described by this key element starts.
     */
    private int [] flow2base;

    /**
     * The maximal length of an hmer that can be encoded (normally in the 10-12 range)
     */
    private final int maxHmer;

    /**
     * The value to fill the flow matrix with. Normally 0.001
     */
    private double totalMinErrorProb;
    private double perHmerMinErrorProb;

    /**
     * The order in which flow key in encoded (See description for key field). Flow order may be wrapped if a longer one
     * needed.
     */
    private byte[] flowOrder;

    /**
     * The probability matrix for this read. [n][m] position represents that probability that an hmer of n length will be
     * present at the m key position. Therefore, the first dimension is in the maxHmer order, where the second dimension
     * is length(key).
     */
    private double[][] flowMatrix;

    /**
     * The validity status of the key. Certain operations may produce undefined/erroneous results. This is signaled by
     * the read being marked with a validKey == false
     */
    private boolean validKey = true;

    /**
     * The direction of this read. After being red, the direction will also swing to be REFERENCE
     */
    private Direction direction = Direction.SYNTHESIS;

    /**
     * Was base clipping applied to this read? (normally to trim a read to the span of a haplotype)
     */
    private boolean baseClipped = false;

    /**
     * If applyBaseClipping was called, the left trimming that was actually applied to the read
     */
    private int trimLeftBase = 0 ;

    /**
     * If applyBaseClipping was called, the right trimming that was actually applied to the read
     */
    private int trimRightBase = 0 ;

    /**
     * The flow based argument collection under which this read was created
     */
    private final FlowBasedArgumentCollection fbargs;

    /**
     * This allows tools to reduce/enlarge the lower limit of read size for clipping operations.
     */
    static private int minimalReadLength = MINIMAL_READ_LENGTH;


    /**
     * Various methods for storing arrays of pre-computed qualities to be used in {@link FlowBasedHMMEngine}.
     */
    public byte[] getReadInsQuals() {
        return readInsQuals;
    }
    public void setReadInsQuals(byte[] readInsQuals) {
        this.readInsQuals = readInsQuals;
    }
    public byte[] getReadDelQuals() {
        return readDelQuals;
    }
    public void setReadDelQuals(byte[] readDelQuals) {
        this.readDelQuals = readDelQuals;
    }
    public byte[] getOverallGCP() {
        return overallGCP;
    }
    public void setOverallGCP(byte[] overallGCP) {
        this.overallGCP = overallGCP;
    }
    private byte[] readInsQuals;
    private byte[] readDelQuals;
    private byte[] overallGCP;

    public enum Direction {
        REFERENCE, SYNTHESIS
    }

    /**

     * Constructor from GATKRead. flow order, hmer and arguments
     * @param read GATK read
     * @param flowOrder flow order string (one cycle)
     * @param maxHmer maximal hmer to keep in the flow matrix
     * @param fbargs arguments that control resolution etc. of the flow matrix
     */
    public FlowBasedRead(final GATKRead read, final String flowOrder, final int maxHmer, final FlowBasedArgumentCollection fbargs) {
        this(read.convertToSAMRecord(null), flowOrder, maxHmer, fbargs);
    }

    /**
     * Same as above but constructs from SAMRecord
     * @param samRecord record from SAM file
     * @param flowOrder flow order (single cycle)
     * @param maxHmer maximal hmer to keep in the flow matrix
     * @param fbargs arguments that control resolution of the flow matrix
     */
    public FlowBasedRead(final SAMRecord samRecord, final String flowOrder, final int maxHmer, final FlowBasedArgumentCollection fbargs) {
        super(samRecord);
        Utils.nonNull(fbargs);
        Utils.validate(FlowBasedReadUtils.hasFlowTags(samRecord), "FlowBasedRead can only be used on flow reads. failing read: " + samRecord);
        this.fbargs = fbargs;
        this.maxHmer = maxHmer;
        this.samRecord = samRecord;

        // read flow matrix in. note that below code contains accommodates for old formats
        if ( samRecord.hasAttribute(FLOW_MATRIX_TAG_NAME) ) {
            perHmerMinErrorProb = fbargs.fillingValue;
            totalMinErrorProb = perHmerMinErrorProb;
            // this path is the production path. A flow read should contain a FLOW_MATRIX_TAG_NAME tag
            readFlowMatrix(flowOrder);

        } else {

            // NOTE: this path is vestigial and deals with old formats of the matrix
            if ( samRecord.hasAttribute(FLOW_MATRiX_OLD_TAG_KR) ) {
                readVestigialFlowMatrixFromKR(flowOrder);
            } else if ( samRecord.hasAttribute(FLOW_MATRiX_OLD_TAG_TI) ) {
                readVestigialFlowMatrixFromTI(flowOrder);
            } else {
                throw new GATKException("read missing flow matrix attribute: " + FLOW_MATRIX_TAG_NAME);
            }
        }

        implementMatrixMods(FlowBasedReadUtils.getFlowMatrixModsInstructions(fbargs.flowMatrixMods, maxHmer));

        //Spread boundary flow probabilities when the read is unclipped
        //in this case the value of the hmer is uncertain
        if ( !fbargs.keepBoundaryFlows ) {
            if (samRecord.getReadUnmappedFlag() || CigarUtils.countClippedBases(samRecord.getCigar(), Tail.LEFT, CigarOperator.HARD_CLIP) == 0) {
                spreadFlowLengthProbsAcrossCountsAtFlow(findFirstNonZero(key));
            }
            if (samRecord.getReadUnmappedFlag() || CigarUtils.countClippedBases(samRecord.getCigar(), Tail.RIGHT, CigarOperator.HARD_CLIP) == 0) {
                spreadFlowLengthProbsAcrossCountsAtFlow(findLastNonZero(key));
            }
        }

        if ( logger.isDebugEnabled() ) {
            logger.debug("cons: name: " + samRecord.getReadName()
                    + " len: " + samRecord.getReadLength()
                    + " loc: " + samRecord.getStart() + "-" + samRecord.getEnd()
                    + " rev: " + isReverseStrand()
                    + " cigar:" + samRecord.getCigarString());
            logger.debug("     bases: " + new String(samRecord.getReadBases()));
            logger.debug("       key: " + FlowBasedKeyCodec.keyAsString(key));
        }

    }

    //since the last unclipped flow is uncertain (we give high probabilities to
    //also hmers higher than the called hmer)
    private void spreadFlowLengthProbsAcrossCountsAtFlow(final int flowToSpread) {
        if (flowToSpread<0) //boundary case when all the key is zero
            return;

        final int call = key[flowToSpread];
        if (call==0){
            throw new IllegalStateException("Boundary key value should not be zero for the spreading");
        }

        final int numberToFill = maxHmer - call+1;
        double total = 0;
        for (int i = call; i < maxHmer+1; i++)
            total += flowMatrix[i][flowToSpread];
        final double fillProb = Math.max(total / numberToFill, perHmerMinErrorProb);
        for (int i = call; i < maxHmer+1; i++){
            flowMatrix[i][flowToSpread] = fillProb;
        }
    }



    // This is the code for parsing the current/production BAM format (with TP tag)
    private void readFlowMatrix(final String _flowOrder) {

        // generate key (base to flow space)
        setDirection(Direction.REFERENCE);  // base is always in reference/alignment direction

        key = FlowBasedKeyCodec.baseArrayToKey(samRecord.getReadBases(), _flowOrder);
        flow2base = FlowBasedKeyCodec.getKeyToBase(key);
        flowOrder = FlowBasedKeyCodec.getFlowToBase(_flowOrder, key.length);

        if (perHmerMinErrorProb == 0){
            // estimate the lowest possible error probability and fill the matrix with it
            // it is not recommended to do this in the M2 or HaplotypeCaller, since obviously the error are not
            // equally distributed over the homopolymers
            totalMinErrorProb = estimateMinErrorProb();
            perHmerMinErrorProb = totalMinErrorProb/maxHmer;
        }

        // initialize matrix. fill first line, copy subsequent lines from first
        flowMatrix = new double[maxHmer+1][key.length];
        Arrays.fill(flowMatrix[0], perHmerMinErrorProb);
        for (int i = 1 ; i < maxHmer+1; i++) {
            System.arraycopy(flowMatrix[0], 0, flowMatrix[i], 0, key.length);
        }

        // access qual, convert to flow representation
        final byte[]      quals = samRecord.getBaseQualities();
        final byte[]      tp = samRecord.getSignedByteArrayAttribute(FLOW_MATRIX_TAG_NAME);

        boolean specialTreatmentForZeroCalls = false;
        final byte[]      t0 = SAMUtils.fastqToPhred(samRecord.getStringAttribute(FLOW_MATRIX_T0_TAG_NAME));
        final double[]     t0probs = new double[quals.length];
        if ((t0!=null) && fbargs.useT0Tag){
            specialTreatmentForZeroCalls = true;

            if (t0.length!=tp.length){
                throw new GATKException("Illegal read len(t0)!=len(qual): " + samRecord.getReadName());
            }
        }

        final double[]    probs = new double[quals.length];
        for ( int i = 0 ; i < quals.length ; i++ ) {
            final double q = quals[i];
            final double p = Math.pow(10, -q/10);
            probs[i] = p;
            if (specialTreatmentForZeroCalls){
                final double qq = t0[i];
                t0probs[i] = Math.pow(10,-qq/10);
            }
        }

        // apply key and qual/tp to matrix
        int     qualOfs = 0; //converts between base -> flow
        for ( int i = 0 ; i < key.length ; i++ ) {
            final int        run = key[i];
            if (run > 0) {
                parseSingleHmer(probs, tp, i, run, qualOfs);
            }

            if ((run == 0)&&(specialTreatmentForZeroCalls)){
                parseZeroQuals(t0probs, i, qualOfs);
            }

            double totalErrorProb = 0;

            for (int k=0; k < maxHmer; k++ ){
                totalErrorProb += flowMatrix[k][i];
            }
            final double callProb = Math.max(MINIMAL_CALL_PROB, 1-totalErrorProb);
            // the probability in the recalibration is not divided by two for hmers of length 1
            flowMatrix[Math.min(run, maxHmer)][i] = callProb;
            qualOfs+=run;
        }
        applyFilteringFlowMatrix();
    }


    //convert qualities from the single hmer to a column in a flow matrix
    private void parseSingleHmer(final double[] probs, final byte[] tp, final int flowIdx,
                                 final int flowCall, final int qualOfs){
        for (int i = qualOfs ; i < qualOfs+flowCall; i++) {
            if (tp[i]!=0) {
                final int loc = Math.max(Math.min(flowCall+tp[i], maxHmer),0);
                if (flowMatrix[loc][flowIdx] == perHmerMinErrorProb) {
                    flowMatrix[loc][flowIdx] = probs[i];
                } else {
                    flowMatrix[loc][flowIdx]    += probs[i];
                }
            }
        }
    }

    // convert qualities from the t0 tag to the probabilities of 1->0 error.
    // This function deals with t0 tag that encodes the probability of 1->0 error
    // in this case there is no nucleotide to place the error probability on, so we
    // place it on the neighboring bases and choose the **lower** error probability between the
    // neighbors (that's how T0 encoding works). The error is placed only on the 1-mer error assuming
    // that 2->0 errors are negligibly rare.
    private void parseZeroQuals(final double[] probs, final int flowIdx, final int qualOfs){
        if ((qualOfs == 0) | (qualOfs==probs.length)){ // do not report zero error probability on the edge of the read
            return;
        }

        double prob0 = Math.min(probs[qualOfs-1],probs[qualOfs]);
        // cases where t0 actually does not report anything. This awkward situation comes because
        // if fbargs.fillingValue is zero, the empty cell probability is maximalQuality / getMaxHmer()
        // while if not, the empty cell probability is fbargs.fillingValue.
        if (prob0 <= totalMinErrorProb * 3) {
            prob0 = 0;
        }
        flowMatrix[1][flowIdx] = Math.max(flowMatrix[1][flowIdx], prob0);
    }

    public String getFlowOrder() {
        return new String(Arrays.copyOfRange(flowOrder, 0, Math.min(fbargs.flowOrderCycleLength,flowOrder.length)));
    }


    public int getMaxHmer() {
        return maxHmer;
    }

    public int getNFlows() {
        return key.length;
    }

    public Direction getDirection(){
        return direction;
    }


    private int[] getAttributeAsIntArray(final String attributeName, final boolean isSigned) {
        ReadUtils.assertAttributeNameIsLegal(attributeName);
        final Object attributeValue = this.samRecord.getAttribute(attributeName);

        if (attributeValue == null) {
            return null;
        } else if (attributeValue instanceof byte[]) {
            final byte[] byteArrayAttributeValue = (byte[]) attributeValue;
            final int[] ret = new int[byteArrayAttributeValue.length];
            for (int i = 0; i < ret.length; i++)
                if (!isSigned) ret[i] = byteArrayAttributeValue[i]&0xff;
                else ret[i]=byteArrayAttributeValue[i]; //converting signed byte to unsigned
            return Arrays.copyOf(ret, ret.length);
        } else if ((attributeValue instanceof int[])) {
            final int[] ret = (int[]) attributeValue;
            return Arrays.copyOf(ret, ret.length);
        } else if  (attributeValue instanceof short[]) {
            final short [] shortArrayAttributeValue = (short[]) attributeValue;
            final int[] ret = new int[shortArrayAttributeValue.length];
            for (int i = 0 ; i < shortArrayAttributeValue.length; i++ )
                ret[i] = shortArrayAttributeValue[i];
            return Arrays.copyOf(ret, ret.length);
        }else {
            throw new GATKException.ReadAttributeTypeMismatch(attributeName, "integer array");
        }
    }

    public boolean isValid() {
        return validKey;
    }

    /**
     * get a specific cell from the flow matrix. Each cell contains the probability
     * for an hmer of the given length to appear the given position in the flow key
     *
     * @param flow - position in the flow key (index into key[])
     * @param hmer - length of the hmer
     * @return
     */
    public double getProb(final int flow, final int hmer) {
        double prob = flowMatrix[Math.min(hmer, maxHmer)][flow];
        return (prob <= 1) ? prob : 1;
    }

    /*
    * Legacy function from the time when the error probability were in flow space and when the read was clipped we had to
    * translate the clipping in the base space to the clipping in the flow space. Now does nothing (isBaseFormat() is true)
    * but is kept for R&D cases.
     */
    public void applyAlignment(){
        if ((getDirection() == Direction.SYNTHESIS) && ( isReverseStrand() )) {
            flipMatrix();
            ArrayUtils.reverse(key);
            flow2base = FlowBasedKeyCodec.getKeyToBase(key);
            SequenceUtil.reverseComplement(flowOrder);

        }

        final boolean isBase = isBaseFormat();
        final int[] basePair = {0, 0};
        final int[] clipLeftPair = !isBase ? findLeftClippingFromCigar() : basePair;
        final int[] clipRightPair = !isBase ? findRightClippingFromCigar() : basePair;
        final int clipLeft = clipLeftPair[0];
        final int leftHmerClip = clipLeftPair[1];
        final int clipRight = clipRightPair[0];
        final int rightHmerClip = clipRightPair[1];

        applyClipping(clipLeft, leftHmerClip, clipRight, rightHmerClip, false);

        setDirection(Direction.REFERENCE);

    }

    private boolean isBaseFormat() {
       return samRecord.hasAttribute(FLOW_MATRiX_OLD_TAG_TI) || samRecord.hasAttribute(FLOW_MATRIX_TAG_NAME);
    }

    private void fillFlowMatrix(final int [] kh, final int [] kf,
                                final double [] kdProbs ) {
        for ( int i = 0 ; i < kh.length; i++ ) {
            // normal matrix filling
            final int     pos = kf[i];
            final int     hmer = kh[i] & 0xff;
            if (hmer > maxHmer){
                flowMatrix[maxHmer][pos] = Math.max(flowMatrix[maxHmer][pos], kdProbs[i]);
            } else {
                flowMatrix[hmer][pos] = Math.max(flowMatrix[hmer][pos], kdProbs[i]);
            }
        }

    }

    // execute the matrix modifications
    private void implementMatrixMods(final int[] flowMatrixModsInstructions) {

        if ( flowMatrixModsInstructions != null ) {
            for (int hmer = 0; hmer < flowMatrixModsInstructions.length; hmer++) {
                final int hmer2 = flowMatrixModsInstructions[hmer];
                if (hmer2 != 0) {
                    for (int pos = 0; pos < flowMatrix[0].length; pos++) {

                        if (flowMatrix[hmer][pos] > flowMatrix[hmer2][pos]) {
                            flowMatrix[hmer2][pos] = flowMatrix[hmer][pos];
                        }

                        // if we are copying backwards, zero out source
                        if (hmer > hmer2)
                            flowMatrix[hmer][pos] = 0;
                    }
                }
            }
        }
    }


    private void flipMatrix() {
        for ( int i = 0 ; i < flowMatrix.length; i++) {
            ArrayUtils.reverse(flowMatrix[i]);
        }
    }

    private static int findFirstNonZero(final int[] array){
        int result = -1;
        for (int i = 0 ; i < array.length; i++){
            if (array[i]!=0) {
                result = i;
                break;
            }
        }
        return result;
    }

    private static int findLastNonZero(final int[] array){
        int result = -1;
        for (int i = array.length-1 ; i >= 0; i--){
            if (array[i]!=0) {
                result = i;
                break;
            }
        }
        return result;
    }

    private static void shiftColumnUp(final double[][] matrix, final int colnum, final int shift) {
        for (int i = 0; i < matrix.length - shift; i ++ ) {
            matrix[i][colnum] = matrix[i+shift][colnum];
        }
        for (int i = matrix.length - shift; i < matrix.length; i ++ ) {
            matrix[i][colnum] = 0;
        }

    }

    public void setDirection(final Direction dir ) {
        direction = dir;
    }

    //trims base-spaced reads. Usually not needed, but kept for completeness
    public void applyBaseClipping(final int clipLeftBase, final int clipRightBase, boolean spread){
        final int[] clipLeftPair = findLeftClipping(clipLeftBase);
        final int[] clipRightPair = findRightClipping(clipRightBase);
        final int clipLeft = clipLeftPair[0];
        final int leftHmerClip = clipLeftPair[1];
        final int clipRight = clipRightPair[0];
        final int rightHmerClip = clipRightPair[1];
        if (getLength() - clipLeftBase - clipRightBase < minimalReadLength) {
            baseClipped = true;
            validKey = false;
            trimLeftBase = clipLeftBase;
            trimRightBase = clipRightBase;
        } else {
            applyClipping(clipLeft, leftHmerClip, clipRight, rightHmerClip, spread);
            baseClipped = true;
            validKey = true;
            trimLeftBase = clipLeftBase;
            trimRightBase = clipRightBase;
        }
    }

    private void applyClipping(int clipLeft, final int leftHmerClip, int clipRight, final int rightHmerClip, final boolean spread){
        if ((clipLeft < 0) || (clipRight < 0)  || (clipLeft >= getKeyLength() ) || ( clipRight >= getKeyLength())) {
            throw new IllegalStateException(String.format("Weird read clip calculated: left/right/keyLength %d/%d/%d", clipLeft, clipRight, getKeyLength()));
        }

        if ((leftHmerClip < 0) || (rightHmerClip < 0)  || (leftHmerClip >= (getMaxHmer() + 2) ) || ( rightHmerClip >= (getMaxHmer() + 2))) {
            throw new IllegalStateException(String.format("Weird read clip calculated: left/right/maxHmer+2 %d/%d/%d", leftHmerClip, rightHmerClip, getMaxHmer() + 2));
        }

        final int originalLength = key.length;

        key[clipLeft]-=leftHmerClip;
        boolean shiftLeft = true;
        while (key[clipLeft] == 0) {
            clipLeft += 1 ;
            shiftLeft = false;
        }

        key[key.length - clipRight-1] -= rightHmerClip;
        boolean shiftRight = true;
        while (key[originalLength - 1- clipRight] == 0) {
            clipRight += 1 ;
            shiftRight = false;
        }

        key = Arrays.copyOfRange(key, clipLeft, originalLength - clipRight);
        flow2base = FlowBasedKeyCodec.getKeyToBase(key);
        flowOrder = Arrays.copyOfRange(flowOrder, clipLeft, originalLength - clipRight);

        final double [][] newFlowMatrix = new double[flowMatrix.length][originalLength - clipLeft - clipRight] ;
        for ( int i = 0 ; i < newFlowMatrix.length; i++) {
            newFlowMatrix[i] = Arrays.copyOfRange(flowMatrix[i], clipLeft, originalLength - clipRight);
        }

        flowMatrix = newFlowMatrix;
        if (shiftLeft) {
            shiftColumnUp(flowMatrix, 0, leftHmerClip);
        }

        if (shiftRight) {
            shiftColumnUp(flowMatrix, flowMatrix[0].length-1, rightHmerClip);
        }

        //Spread boundary flow probabilities for the boundary hmers of the read
        //in this case the value of the genome hmer is uncertain
        if ( spread ) {
            spreadFlowLengthProbsAcrossCountsAtFlow(findFirstNonZero(key));
            spreadFlowLengthProbsAcrossCountsAtFlow(findLastNonZero(key));
        }
    }

    private int[] findLeftClippingFromCigar() {
        final List cigar = getCigarElements();
        final int[] result = new int[2];
        if (cigar.size() == 0 ) {
            return result;
        }

        final CigarElement start = cigar.get(0);
        if (start.getOperator() != CigarOperator.H) {
            return result;
        }

        final int basesClipped = start.getLength();
        return findLeftClipping(basesClipped);
    }


    private int[] findLeftClipping(final int basesClipped){
        return FlowBasedReadUtils.findLeftClipping(basesClipped, flow2base, key);
    }

    private int[] findRightClippingFromCigar() {
        final List cigar = getCigarElements();
        final int[] result = new int[2];
        if (cigar.size() == 0 ) {
            result[0] = 0;
            result[1] = 0;
            return result;
        }

        final CigarElement end = cigar.get(cigar.size()-1);
        if (end.getOperator() != CigarOperator.H) {
            result[0] = 0;
            result[1] = 0;
            return result;
        }

        final int basesClipped = end.getLength();

        return findRightClipping(basesClipped);
    }

    private int[] findRightClipping(final int basesClipped) {
        final int[] rkey = new int[key.length];
        for (int i = 0 ; i < key.length; i++ ){
            rkey[i] = key[key.length-1-i];
        }
        final int[] rflow2base = FlowBasedKeyCodec.getKeyToBase(rkey);

        return FlowBasedReadUtils.findRightClipping(basesClipped, rflow2base, rkey);
    }


    public void writeKey(final FileWriter oos)
            throws IOException {
        for (int i = 0; i < key.length; i++)
            oos.write(key[i]+"\n");

    }

    /**
     * Flow matrix logger
     *
     * This is used exclusively for testing
     *
     * @param oos
     * @throws IOException
     */
    protected void writeMatrix(final OutputStreamWriter oos)
            throws IOException {
        final DecimalFormat formatter = new DecimalFormat("0.000000", DecimalFormatSymbols.getInstance(Locale.ENGLISH));

        final byte[]      bases = samRecord.getReadBases();
        int         basesOfs = 0;
        final byte[]      quals = samRecord.getBaseQualities();
        final byte[]      ti = samRecord.hasAttribute(FLOW_MATRiX_OLD_TAG_TI) ? samRecord.getByteArrayAttribute(FLOW_MATRiX_OLD_TAG_TI) : (new byte[key.length*3]);
        if ( isReverseStrand() )
        {
            ArrayUtils.reverse(quals);
            ArrayUtils.reverse(ti);
        }

        for ( int col = 0 ; col < key.length ; col++ ) {
            oos.write("C,R,F,B,Bi,Q,ti\n");
            final byte base = (key[col] != 0) ? (basesOfs < bases.length ? bases[basesOfs] : (byte)'?') : (byte)'.';
            final String bi = (key[col] != 0) ? Integer.toString(basesOfs) : ".";
            final String q = (key[col] != 0) ? Integer.toString(quals[basesOfs]) : ".";
            final String Ti = (key[col] != 0) ? Integer.toString(ti[basesOfs]) : ".";
            for (int row = 0; row < flowMatrix.length; row++) {
                final String s = formatter.format(flowMatrix[row][col]);
                oos.write(""
                        + col + ","
                        + row + ","
                        + key[col] + ","
                        + (char)base + ","
                        + bi + ","
                        + q + ","
                        + Ti + ","
                        + (isReverseStrand() ? "r" : ".") + " "
                        + s + "\n");
            }
            if ( key[col] != 0 )
                basesOfs +=  key[col];
            oos.write("\n");
        }

    }


    public byte [] getFlowOrderArray() {
        return flowOrder;
    }

    public int getKeyLength() {
        return key.length;
    }

    public int[] getKey() {
        return key;
    }

    /**
     * Number of total bases that the flow based key generates
     * @return number of bases
     */
    public int totalKeyBases()  {
        int sum = 0 ;
        for (int i = 0 ; i < key.length; i++){
            sum += key[i];
        }
        return sum;
    }

    public boolean isBaseClipped() {
        return baseClipped;
    }

    public int getTrimmedStart() {
        return trimLeftBase + getStart();
    }
    public int getTrimmedEnd() {
        return getEnd() - trimRightBase;
    }



    //functions that take care of simulating base format
    //they perform modifications on the flow matrix that are defined in applyFilteringFlowMatrix
    //this function was only applied when we tested what is the necessary information to be reported in the flow matrix
    private void applyFilteringFlowMatrix(){

        if (fbargs.disallowLargerProbs) {
            removeLargeProbs();
        }

        if (fbargs.removeLongerThanOneIndels) {
            removeLongIndels( key );
        }

        if (fbargs.removeOneToZeroProbs) {
            removeOneToZeroProbs(key);
        }

        if ((fbargs.lumpProbs)) {
            lumpProbs();
        }
        clipProbs();

        if (fbargs.symmetricIndels) {
            smoothIndels(key);
        }
        if (fbargs.onlyInsOrDel) {
            reportInsOrDel(key);
        }

        if ((fbargs.retainMaxNProbs)){
            reportMaxNProbsHmer(key);
        }

    }

    /**
     * clip probability values in a way that probability between perHmerMinErrorProb and 3*perHmerMinErrorProb
     * is automatically clipped to perHmerMinErrorProb. This is done to avoid issues with rounding of
     * small error probabilities in the basecalling
     */
    private void clipProbs() {
        double probabilityThreshold = perHmerMinErrorProb*3;
        for ( int i = 0 ; i < getMaxHmer(); i++ ) {
            for ( int j =0; j < getNFlows(); j++) {
                if ((flowMatrix[i][j] <= probabilityThreshold) &&
                        (key[j]!=i)) {
                    flowMatrix[i][j] = perHmerMinErrorProb;
                }
            }
        }
    }

    /**
     * remove probabilities larger than 1
     */
    private void removeLargeProbs(){
        for (int i = 0; i < getNFlows(); i++){
            for (int j = 0 ; j < getMaxHmer()+1; j++) {
                if (flowMatrix[j][i] > 1) {
                    flowMatrix[j][i] = 1;
                }
            }
        }
    }

    /**
     * This is vestigial and applies only to old formats
     * @param key_kh
     */
    private void removeLongIndels(final int [] key_kh ){
        for ( int i = 0 ; i < getNFlows(); i++ ) {
            for (int j = 0; j < getMaxHmer()+1; j++){
                if (Math.abs(j-key_kh[i])>1){
                    flowMatrix[j][i] = perHmerMinErrorProb;
                }
            }
        }
    }

    /**
     * This is vestigial and applies only to old formats
     * @param key_kh
     */
    private void removeOneToZeroProbs(final int [] key_kh) {
        for (int i = 0 ; i < getNFlows(); i++){
            if (key_kh[i] == 0){
                for (int j = 1; j < getMaxHmer()+1; j++){
                    flowMatrix[j][i] = perHmerMinErrorProb;
                }
            }
        }
    }


    /**
     * Quantize probability values according to fbargs.probabilityQuantization and fbargs.probabilityScalingFactor
     * @param kd_probs
     */

    private void quantizeProbs(final int [] kd_probs ) {
        final int nQuants = fbargs.probabilityQuantization;
        final double bin_size = 6*fbargs.probabilityScalingFactor/(float)nQuants;
        for ( int i = 0 ; i < kd_probs.length; i++) {
            if (kd_probs[i] <=0)
                continue;
            else {
                kd_probs[i] = (byte)(bin_size * (byte)(kd_probs[i]/bin_size)+1);
            }
        }
    }

    /**
     * Smooth out probabilities by averaging with neighbours
     * @param kr
     */
    private void smoothIndels(final int [] kr ) {
        for ( int i = 0 ; i < kr.length; i++ ){
            final int idx = kr[i];
            if (( idx > 1 ) && ( idx < maxHmer) ) {
                final double prob = (flowMatrix[idx - 1][i] + flowMatrix[idx + 1][i]) / 2;
                flowMatrix[idx - 1][i] = prob;
                flowMatrix[idx + 1][i] = prob;
            }
        }
    }

    /**
     * Only report probability of insertions or of deletions, never both
     * @param kr
     */
    private void reportInsOrDel(final int [] kr ) {
        for ( int i = 0 ; i < kr.length; i++ ){
            final int idx = kr[i];
            if (( idx > 1 ) && ( idx < maxHmer) ) {
                if ((flowMatrix[idx-1][i] > perHmerMinErrorProb) && (flowMatrix[idx+1][i] > perHmerMinErrorProb)) {
                    final int fixCell = flowMatrix[idx-1][i] > flowMatrix[idx+1][i] ? idx+1 : idx-1;
                    flowMatrix[fixCell][i] = perHmerMinErrorProb;
                }
            }
        }
    }

    /**
     * Combine all probabilities of insertions together and report them as probabilities of 1-mer insertion
     * Combine all probabilities of deletions together and report them as probabilities of 1-mer deletion
     */
    private void lumpProbs() {

        for (int i = 0; i < getMaxHmer(); i++) {
            for (int j = 0 ; j < getNFlows(); j ++ ) {
                final int fkey = key[j];
                if (flowMatrix[i][j]<=perHmerMinErrorProb) {
                    continue;
                } else {
                    if ( (i - fkey) < -1 ){
                        flowMatrix[fkey-1][j]+=flowMatrix[i][j];
                        flowMatrix[i][j] = perHmerMinErrorProb;
                    } else if ((i-fkey) > 1) {
                        flowMatrix[fkey+1][j]+=flowMatrix[i][j];
                        flowMatrix[i][j] = perHmerMinErrorProb;
                    }

                }

            }
        }

    }

    /*
    Given full vector of error probabilities retain only the probabilities that can be reported in the base format
    (N+1/2 highest error probabilities)
     */
    private void reportMaxNProbsHmer(final int [] key) {
        final double [] tmpContainer = new double[maxHmer];
        for (int i = 0 ; i < key.length;i++){

            for (int j = 0 ; j < tmpContainer.length; j++) {
                tmpContainer[j] = flowMatrix[j][i];
            }
            final int k = (key[i]+1)/2;
            final double kth_highest = findKthLargest(tmpContainer, k+1);
            for (int j = 0 ; j < maxHmer; j++)
                if (flowMatrix[j][i] < kth_highest)
                    flowMatrix[j][i] = perHmerMinErrorProb;
        }

    }


    private static double findKthLargest(final double[] nums, final int k) {
        final PriorityQueue q = new PriorityQueue(k);
        for(final double i: nums){
            q.offer(i);

            if(q.size()>k){
                q.poll();
            }
        }

        return q.peek();
    }

    private double[] phredToProb(final int [] kq) {
        final double [] result = new double[kq.length];
        for (int i = 0 ; i < kq.length; i++ ) {
            //disallow probabilities below fillingValue
            result[i] = Math.max(Math.pow(10, ((double)-kq[i])/fbargs.probabilityScalingFactor), perHmerMinErrorProb);
        }
        return result;
    }


    public static void setMinimalReadLength(int minimalReadLength) {
        FlowBasedRead.minimalReadLength = minimalReadLength;
    }

    // convert qualities and ti tag to flow matrix
    private void readVestigialFlowMatrixFromTI(final String _flowOrder) {

        vestigialOneShotLogger.warn("Vestigial read format detected: " + samRecord);

        // generate key (base to flow space)
        setDirection(Direction.REFERENCE);  // base is always in reference/alignment direction
        key = FlowBasedKeyCodec.baseArrayToKey(samRecord.getReadBases(), _flowOrder);
        flow2base = FlowBasedKeyCodec.getKeyToBase(key);
        flowOrder = FlowBasedKeyCodec.getFlowToBase(_flowOrder, key.length);

        // initialize matrix
        flowMatrix = new double[maxHmer+1][key.length];
        for (int i = 0 ; i < maxHmer+1; i++) {
            for (int j = 0 ; j < key.length; j++ ){
                flowMatrix[i][j] = perHmerMinErrorProb;
            }
        }

        // access qual, convert to flow representation
        final byte[]      quals = samRecord.getBaseQualities();
        final byte[]      ti = samRecord.getByteArrayAttribute(FLOW_MATRiX_OLD_TAG_TI);
        final double[]    probs = new double[quals.length];
        for ( int i = 0 ; i < quals.length ; i++ ) {
            final double q = quals[i];
            final double p = QualityUtils.qualToErrorProb(q);
            probs[i] = p*2;
        }

        // apply key and qual/ti to matrix
        int     qualOfs = 0;
        for ( int i = 0 ; i < key.length ; i++ ) {
            final int        run = key[i];

            // the probability is not divided by two for hmers of length 1
            if ( run == 1 ) {
                probs[qualOfs] = probs[qualOfs]/2;
            }

            //filling the probability for the called hmer (not reported by the quals
            if ( run <= maxHmer ) {
                flowMatrix[run][i] = (run > 0) ? (1 - probs[qualOfs]) : 1;
                //require a prob. at least 0.1
                flowMatrix[run][i] = Math.max(MINIMAL_CALL_PROB, flowMatrix[run][i]);

            }

            if ( run != 0 ) {
                if ( quals[qualOfs] != 40 ) {
                    final int     run1 = (ti[qualOfs] == 0) ? (run - 1) : (run + 1);
                    if (( run1 <= maxHmer ) && (run <= maxHmer)){
                        flowMatrix[run1][i] = probs[qualOfs] / flowMatrix[run][i];
                    }
                    if (run <= maxHmer) {
                        flowMatrix[run][i] /= flowMatrix[run][i]; // for comparison to the flow space - probabilities are normalized by the key's probability
                    }
                }
                qualOfs += run;
            }

        }

        //this is just for tests of all kinds of
        applyFilteringFlowMatrix();
    }

    // code for reading BAM format where the flow matrix is stored in sparse representation in kr,kf,kh and kd tags
    // used for development of the new basecalling, but not in production code
    private void readVestigialFlowMatrixFromKR(final String _flowOrder) {

        vestigialOneShotLogger.warn("Vestigial read format detected: " + samRecord);

        key = getAttributeAsIntArray(FLOW_MATRiX_OLD_TAG_KR, true);

        // creates a translation from flow # to base #
        flow2base = FlowBasedKeyCodec.getKeyToBase(key);

        // create a translation from
        flowOrder = FlowBasedKeyCodec.getFlowToBase(_flowOrder, key.length);

        flowMatrix = new double[maxHmer+1][key.length];
        for (int i = 0 ; i < maxHmer+1; i++) {
            for (int j = 0 ; j < key.length; j++ ){
                flowMatrix[i][j] = perHmerMinErrorProb;
            }
        }

        int [] kh = getAttributeAsIntArray(FLOW_MATRiX_OLD_TAG_KH, true);
        int [] kf = getAttributeAsIntArray(FLOW_MATRiX_OLD_TAG_KF, false);
        int [] kd = getAttributeAsIntArray(FLOW_MATRiX_OLD_TAG_KD, true);

        final int [] key_kh = key;
        final int [] key_kf = new int[key.length];
        for ( int i = 0 ; i < key_kf.length ; i++)
            key_kf[i] = i;
        final int [] key_kd = new int[key.length];

        kh = ArrayUtils.addAll(kh, key_kh);
        kf = ArrayUtils.addAll(kf, key_kf);
        kd = ArrayUtils.addAll(kd, key_kd);

        quantizeProbs(kd);

        final double [] kdProbs = phredToProb(kd);
        fillFlowMatrix( kh, kf, kdProbs);
        applyFilteringFlowMatrix();
    }
    //Finds the quality that is being set when the probability of error is very low
    private double estimateMinErrorProb(){
        final byte[] quals = samRecord.getBaseQualities();
        double maxQual = 0;

        for (int i = 0; i < quals.length; i++) {
            if (quals[i] > maxQual){
                maxQual = quals[i];
            }
        }

        // in the very rare case when there is no qualities
        if (maxQual==0){
            maxQual = 40;
        }
        return Math.pow(10, -maxQual / 10);
    }
}






© 2015 - 2025 Weber Informatics LLC | Privacy Policy