org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanJavaAligner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.utils.smithwaterman;

import com.google.common.collect.Lists;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.CigarBuilder;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Pairwise discrete smith-waterman alignment implemented in pure java
 *
 * ************************************************************************
 * ****                    IMPORTANT NOTE:                             ****
 * ****  This class assumes that all bytes come from UPPERCASED chars! ****
 * ************************************************************************
 */
public final class SmithWatermanJavaAligner implements SmithWatermanAligner {
    private static final SmithWatermanJavaAligner ALIGNER = new SmithWatermanJavaAligner();
    private long totalComputeTime = 0;

    /**
     * return the stateless singleton instance of SmithWatermanJavaAligner
     */
    public static SmithWatermanJavaAligner getInstance() {
        return ALIGNER;
    }

    /**
     * The state of a trace step through the matrix
     */
    protected enum State {
        MATCH,
        INSERTION,
        DELETION,
        CLIP
    }

    /**
     * Create a new SW pairwise aligner, this has no state so instead of creating new instances, we create a singleton which is
     * accessible via {@link #getInstance}
     */
    private SmithWatermanJavaAligner(){}

    /**
     * Aligns the alternate sequence to the reference sequence
     *
     * @param reference  ref sequence
     * @param alternate  alt sequence
     */
    @Override
    public SmithWatermanAlignment align(final byte[] reference, final byte[] alternate, final SWParameters parameters, final SWOverhangStrategy overhangStrategy) {
        long startTime = System.nanoTime();

        new String(reference);

        if ( reference == null || reference.length == 0 || alternate == null || alternate.length == 0 ) {
            throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation");
        }
        Utils.nonNull(parameters);
        Utils.nonNull(overhangStrategy);

        // avoid running full Smith-Waterman if there is an exact match of alternate in reference
        int matchIndex = -1;
        if (overhangStrategy == SWOverhangStrategy.SOFTCLIP || overhangStrategy == SWOverhangStrategy.IGNORE) {
            // Use a substring search to find an exact match of the alternate in the reference
            // NOTE: This approach only works for SOFTCLIP and IGNORE overhang strategies
            matchIndex = Utils.lastIndexOf(reference, alternate);
        }

        final SmithWatermanAlignment alignmentResult;

        if (matchIndex != -1) {
            // generate the alignment result when the substring search was successful
            alignmentResult = new SWPairwiseAlignmentResult(new Cigar(Collections.singletonList(new CigarElement(alternate.length, CigarOperator.M))), matchIndex);
        }
        else {
            // run full Smith-Waterman
            final int n = reference.length+1;
            final int m = alternate.length+1;
            final int[][] sw = new int[n][m];
            final int[][] btrack=new int[n][m];

            calculateMatrix(reference, alternate, sw, btrack, overhangStrategy, parameters);
            alignmentResult = calculateCigar(sw, btrack, overhangStrategy); // length of the segment (continuous matches, insertions or deletions)
        }

        totalComputeTime += System.nanoTime() - startTime;
        return alignmentResult;
    }

    /**
     * Calculates the SW matrices for the given sequences
     * @param reference  ref sequence
     * @param alternate  alt sequence
     * @param sw         the Smith-Waterman matrix to populate
     * @param btrack     the back track matrix to populate
     * @param overhangStrategy    the strategy to use for dealing with overhangs
     * @param parameters the set of weights to use to configure the alignment
     */
    private static void calculateMatrix(final byte[] reference, final byte[] alternate, final int[][] sw, final int[][] btrack,
                                        final SWOverhangStrategy overhangStrategy, final SWParameters parameters) {
        if ( reference.length == 0 || alternate.length == 0 ) {
            throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation");
        }

        final int ncol = sw[0].length;//alternate.length+1; formerly m
        final int nrow = sw.length;// reference.length+1; formerly n

        final int MATRIX_MIN_CUTOFF = (int) -1.0e8;   // never let matrix elements drop below this cutoff

        final int lowInitValue= Integer.MIN_VALUE/2;
        final int[] best_gap_v = new int[ncol+1];
        Arrays.fill(best_gap_v, lowInitValue);
        final int[] gap_size_v = new int[ncol+1];
        final int[] best_gap_h = new int[nrow+1];
        Arrays.fill(best_gap_h, lowInitValue);
        final int[] gap_size_h = new int[nrow+1];

        // we need to initialize the SW matrix with gap penalties if we want to keep track of indels at the edges of alignments
        if ( overhangStrategy == SWOverhangStrategy.INDEL || overhangStrategy == SWOverhangStrategy.LEADING_INDEL ) {
            // initialize the first row
            final int[] topRow=sw[0];
            topRow[1]= parameters.getGapOpenPenalty();
            int currentValue = parameters.getGapOpenPenalty();
            for ( int i = 2; i < topRow.length; i++ ) {
                currentValue += parameters.getGapExtendPenalty();
                topRow[i]=currentValue;
            }
            // initialize the first column
            sw[1][0]= parameters.getGapOpenPenalty();
            currentValue = parameters.getGapOpenPenalty();
            for ( int i = 2; i < sw.length; i++ ) {
                currentValue += parameters.getGapExtendPenalty();
                sw[i][0]=currentValue;
            }
        }
        // build smith-waterman matrix and keep backtrack info:
        int[] curRow=sw[0];

        //access is pricey if done enough times so we extract those out
        final int w_open = parameters.getGapOpenPenalty();
        final int w_extend = parameters.getGapExtendPenalty();
        final int w_match = parameters.getMatchValue();
        final int w_mismatch = parameters.getMismatchPenalty();

        //array length checks are expensive in tight loops so extract the length out
        for ( int i = 1, sw_length = sw.length; i < sw_length ; i++ ) {
            final byte a_base = reference[i-1]; // letter in a at the current pos
            final int[] lastRow=curRow;
            curRow=sw[i];
            final int[] curBackTrackRow=btrack[i];

            //array length checks are expensive in tight loops so extract the length out
            for ( int j = 1, curRow_length = curRow.length; j < curRow_length; j++) {
                final byte b_base = alternate[j-1]; // letter in b at the current pos
                // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base);
                final int step_diag = lastRow[j-1] + (a_base == b_base ? w_match : w_mismatch);

                // optimized "traversal" of all the matrix cells above the current one (i.e. traversing
                // all 'step down' events that would end in the current cell. The optimized code
                // does exactly the same thing as the commented out loop below. IMPORTANT:
                // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!!

                // if a gap (length 1) was just opened above, this is the cost of arriving to the current cell:
                int prev_gap = lastRow[j] + w_open;
                best_gap_v[j] += w_extend; // for the gaps that were already opened earlier, extending them by 1 costs w_extend
                if (  prev_gap > best_gap_v[j]  ) {
                    // opening a gap just before the current cell results in better score than extending by one
                    // the best previously opened gap. This will hold for ALL cells below: since any gap
                    // once opened always costs w_extend to extend by another base, we will always get a better score
                    // by arriving to any cell below from the gap we just opened (prev_gap) rather than from the previous best gap
                    best_gap_v[j] = prev_gap;
                    gap_size_v[j] = 1; // remember that the best step-down gap from above has length 1 (we just opened it)
                } else {
                    // previous best gap is still the best, even after extension by another base, so we just record that extension:
                    gap_size_v[j]++;
                }

                final int step_down = best_gap_v[j] ;
                final int kd = gap_size_v[j];

                // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing
                // all 'step right' events that would end in the current cell. The optimized code
                // does exactly the same thing as the commented out loop below. IMPORTANT:
                // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!!

                prev_gap =curRow[j-1]  + w_open; // what would it cost us to open length 1 gap just to the left from current cell
                best_gap_h[i] += w_extend; // previous best gap would cost us that much if extended by another base
                if ( prev_gap > best_gap_h[i] ) {
                    // newly opened gap is better (score-wise) than any previous gap with the same row index i; since
                    // gap penalty is linear with k, this new gap location is going to remain better than any previous ones
                    best_gap_h[i] = prev_gap;
                    gap_size_h[i] = 1;
                } else {
                    gap_size_h[i]++;
                }

                final int step_right = best_gap_h[i];
                final int ki = gap_size_h[i];

                //priority here will be step diagonal, step right, step down
                final boolean diagHighestOrEqual = (step_diag >= step_down)
                                                && (step_diag >= step_right);

                if ( diagHighestOrEqual ) {
                    curRow[j]= Math.max(MATRIX_MIN_CUTOFF, step_diag);
                    curBackTrackRow[j]=0;
                }
                else if(step_right>=step_down) { //moving right is the highest
                    curRow[j]= Math.max(MATRIX_MIN_CUTOFF, step_right);
                    curBackTrackRow[j]=-ki; // negative = horizontal
                }
                else  {
                    curRow[j]= Math.max(MATRIX_MIN_CUTOFF, step_down);
                    curBackTrackRow[j]= kd; // positive=vertical
                }
            }
        }
    }

    /*
     * Class to store the result of calculating the CIGAR from the back track matrix
     */
    private static final class SWPairwiseAlignmentResult implements SmithWatermanAlignment {
        private final Cigar cigar;
        private final int alignmentOffset;

        SWPairwiseAlignmentResult(final Cigar cigar, final int alignmentOffset) {
            this.cigar = cigar;
            this.alignmentOffset = alignmentOffset;
        }

        @Override
        public Cigar getCigar() {
            return cigar;
        }

        @Override
        public int getAlignmentOffset() {
            return alignmentOffset;
        }
    }

    /**
     * Calculates the CIGAR for the alignment from the back track matrix
     *
     * @param sw                   the Smith-Waterman matrix to use
     * @param btrack               the back track matrix to use
     * @param overhangStrategy    the strategy to use for dealing with overhangs
     * @return non-null SWPairwiseAlignmentResult object
     */
    private static SWPairwiseAlignmentResult calculateCigar(final int[][] sw, final int[][] btrack, final SWOverhangStrategy overhangStrategy) {
        // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
        int p1 = 0, p2 = 0;

        final int refLength = sw.length-1;
        final int altLength = sw[0].length-1;

        int maxscore = Integer.MIN_VALUE; // sw scores are allowed to be negative
        int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)

        // if we want to consider overhangs as legitimate operators, then just start from the corner of the matrix
        if ( overhangStrategy == SWOverhangStrategy.INDEL ) {
            p1 = refLength;
            p2 = altLength;
        } else {
            // look for the largest score on the rightmost column. we use >= combined with the traversal direction
            // to ensure that if two scores are equal, the one closer to diagonal gets picked
            //Note: this is not technically smith-waterman, as by only looking for max values on the right we are
            //excluding high scoring local alignments
            p2=altLength;

            for(int i=1;i= maxscore ) {
                    p1 = i;
                    maxscore = curScore;
               }
            }
            // now look for a larger score on the bottom-most row
            if ( overhangStrategy != SWOverhangStrategy.LEADING_INDEL ) {
                final int[] bottomRow=sw[refLength];
                for ( int j = 1 ; j < bottomRow.length; j++) {
                    final int curScore=bottomRow[j];
                    // data_offset is the offset of [n][j]
                    if ( curScore > maxscore ||
                            (curScore == maxscore && Math.abs(refLength - j) < Math.abs(p1 - p2) ) ) {
                        p1 = refLength;
                        p2 = j ;
                        maxscore = curScore;
                        segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
                    }
                }
            }
        }
        final List lce = new ArrayList<>(5);
        if ( segment_length > 0 && overhangStrategy == SWOverhangStrategy.SOFTCLIP ) {
            lce.add(makeElement(State.CLIP, segment_length));
            segment_length = 0;
        }

        // we will be placing all insertions and deletions into sequence b, so the states are named w/regard
        // to that sequence

        State state = State.MATCH;
        do {
            final int btr = btrack[p1][p2];
            final State new_state;
            int step_length = 1;
            if ( btr > 0 ) {
                new_state = State.DELETION;
                step_length = btr;
            } else if ( btr < 0 ) {
                new_state = State.INSERTION;
                step_length = (-btr);
            } else new_state = State.MATCH; // and step_length =1, already set above

            // move to next best location in the sw matrix:
            switch( new_state ) {
                case MATCH:  p1--; p2--; break; // move back along the diag in the sw matrix
                case INSERTION: p2 -= step_length; break; // move left
                case DELETION:  p1 -= step_length; break; // move up
            }

            // now let's see if the state actually changed:
            if ( new_state == state ) segment_length+=step_length;
            else {
                // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
                if (segment_length > 0) {
                    lce.add(makeElement(state, segment_length));
                }
                segment_length = step_length;
                state = new_state;
            }
        // next condition is equivalent to  while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2:
        } while ( p1 > 0 && p2 > 0 );

        // post-process the last segment we are still keeping;
        // NOTE: if reads "overhangs" the ref on the left (i.e. if p2>0) we are counting
        // those extra bases sticking out of the ref into the first cigar element if DO_SOFTCLIP is false;
        // otherwise they will be soft-clipped. For instance,
        // if read length is 5 and alignment starts at offset -2 (i.e. read starts before the ref, and only
        // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if
        // DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true.
        // The consumers need to check for the alignment offset and deal with it properly.
        final int alignment_offset;
        if ( overhangStrategy == SWOverhangStrategy.SOFTCLIP ) {
            lce.add(makeElement(state, segment_length));
            if ( p2 > 0 ) lce.add(makeElement(State.CLIP, p2));
            alignment_offset = p1;
        } else if ( overhangStrategy == SWOverhangStrategy.IGNORE ) {
            lce.add(makeElement(state, segment_length + p2));
            alignment_offset = p1 - p2;
        } else {  // overhangStrategy == OverhangStrategy.INDEL || overhangStrategy == OverhangStrategy.LEADING_INDEL

            // take care of the actual alignment
            lce.add(makeElement(state, segment_length));

            // take care of overhangs at the beginning of the alignment
            if ( p1 > 0 ) {
                lce.add(makeElement(State.DELETION, p1));
            } else if ( p2 > 0 ) {
                lce.add(makeElement(State.INSERTION, p2));
            }

            alignment_offset = 0;
        }

        return new SWPairwiseAlignmentResult(new Cigar(Lists.reverse(lce)), alignment_offset);
    }

    private static CigarElement makeElement(final State state, final int length) {
        CigarOperator op = null;
        switch (state) {
            case MATCH: op = CigarOperator.M; break;
            case INSERTION: op = CigarOperator.I; break;
            case DELETION: op = CigarOperator.D; break;
            case CLIP: op = CigarOperator.S; break;
        }
        return new CigarElement(length, op);
    }

    @Override
    public void close() {
        logger.info(String.format("Total compute time in java Smith-Waterman : %.2f sec", totalComputeTime * 1e-9));
    }
}