All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.transformers.DUSTReadTransformer Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.transformers;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import scala.Tuple2;

import java.util.*;

/**
 * Masks read bases and base qualities using the symmetric DUST algorithm
 */
public class DUSTReadTransformer implements ReadTransformer {
    public static final long serialVersionUID = 1L;
    private static final Logger logger = LogManager.getLogger(DUSTReadTransformer.class);

    private int MASK_PHRED = 15; //Phred score to mask low-complexity sequences with
    private int W = 64; //DUST window size
    private double T = 20.0f; //DUST score threshold

    private static final class PStruct {
        public int start = 0;
        public int finish = 0;
        public double score = 0;
    }

    private static final class IntTuple {
        public int a;
        public int b;
        public IntTuple(final int a, final int b) {this.a = a; this.b = b;}
    }


    public DUSTReadTransformer(final int dust_mask, final int dust_w, final double dust_t) {
        MASK_PHRED = dust_mask;
        W = dust_w;
        T = dust_t;
    }

    @Override
    public GATKRead apply(final GATKRead read) {

        List res = runDUST(read,W,T);

        //Mask the intervals and base qualities
        byte[] q_new = read.getBases();
        for (IntTuple interval : res) {
            for (int i = interval.a; i <= interval.b; i++) {
                q_new[i] = 'N';
            }
        }
        read.setBases(q_new);

        if (read.getBaseQualityCount() == read.getLength()) {
            byte[] qual_new = read.getBaseQualities();
            for (IntTuple interval : res) {
                for (int i = interval.a; i <= interval.b; i++) {
                    qual_new[i] = (byte)MASK_PHRED;
                }
            }
            read.setBaseQualities(qual_new);
        }

        return read;
    }

    /**
     * Wrapper method for runDUST so that we don't have to expose the IntTuple class.
     */
    public static Collection> sDUST(final GATKRead read, final int window, final double tscore) {
        Collection res = runDUST(read,window,tscore);
        List> res_tuple2 = new ArrayList<>(res.size());
        for (IntTuple tuple : res) {
            res_tuple2.add(new Tuple2<>(tuple.a,tuple.b));
        }
        return res_tuple2;
    }

    /**
     *   Implementation of the sDUST algorithm described in:
     //  Liebert et al.(2006). A Fast and Symmetric DUST Implementation to Mask Low-Complexity DNA Sequences.
     //  Journal of Computational Biology, 13(5), 1028 - 1040.
     */
    private static List runDUST(final GATKRead read, final int window, final double tscore) {

        final byte[] q = read.getBases();

        //Find convert to 2-bit and assign random bases to N's
        Random rng = null;

        for (int i = 0; i < q.length; i++) {
            if (q[i] == 'N' || q[i] == 'n') {
                if (rng == null) {rng = new Random(); rng.setSeed(0);}
                q[i] = (byte)rng.nextInt(4);
            } else {
                if (q[i] == 'A' || q[i] == 'a') {
                    q[i] = 0x0;
                } else if (q[i] == 'T' || q[i] == 't') {
                    q[i] = 0x1;
                } else if (q[i] == 'C' || q[i] == 'c') {
                    q[i] = 0x2;
                } else if (q[i] == 'G' || q[i] == 'g') {
                    q[i] = 0x3;
                } else {
                    logger.warn("Invalid base " + (char) q[i] + " in read " + read.getName() + ", assigning to random base.");
                    if (rng == null) {rng = new Random(); rng.setSeed(0);}
                    q[i] = (byte)rng.nextInt(4);
                }
            }
        }

        //Main sDUST algorithm as written in the pseudocode at the end of the paper
        //  Subroutines ADD_TRIPLET_INFO and REMOVE_TRIPLET_INFO are converted to 1-line statements
        //  SAVE_MASKED_REGIONS is implemented as a separate function
        //  SHIFT_WINDOW and FIND_PERFECT are written in place

        //List of low-complexity base intervals ("res" in pseudocode)
        final ArrayList final_intervals = new ArrayList<>();

        //List of perfect intervals ("P"). A perfect interval is a subsequence of the read at most length window and with
        //score t > tscore whose subsequences all have score t_sub <= t. The list is maintained in sorted order, first
        //ascending by start position, then descending by end position.
        final LinkedList perfect_intervals = new LinkedList<>();

        //Triplet counts in the largest suffix, v, of the current window, w ("cv[t]" in pseudocode)
        //cv[i] corresponds to the count of triplet i, where i is the decimal value of the concatenation of the 2-bit
        //represenations of the 3 bases (64 total possible, see the triplet() function)
        final short[] triplet_count_suffix = new short[64];

        //Triplet counts in w, analogous to cv ("cw[t]")
        final short[] triplet_count_window = new short[64];

        //Running score counts for w and v ("rw" and "rv")
        int running_score_suffix = 0;
        int running_score_window = 0;

        //List of triplets in w ("w")
        final LinkedList window_triplets = new LinkedList<>();

        //Number of triplets in v occuring at most 2*tscore times (Proposition 1 in manuscript)
        int L = 0;

        //Window start position
        int wstart;

        //Loop over all windows in the read
        for (int wfinish = 2; wfinish < read.getLength(); wfinish++) {
            wstart = Math.max(wfinish - window + 1, 0);

            save_masked_regions(perfect_intervals, final_intervals, wstart);

            //Triplet at end of the window
            int t = triplet(q[wfinish-2],q[wfinish-1],q[wfinish]);

            //Updates triplet counts and running scores of w and v, window_triplets, and L
            //Begin: SHIFT_WINDOW
            int s;
            if (window_triplets.size() >= window - 2) {
                s = window_triplets.pop();
                running_score_window -= --triplet_count_window[s]; //REMOVE_TRIPLET_INFO
                if (L > window_triplets.size()) {
                    L--;
                    running_score_suffix -= --triplet_count_suffix[s]; //REMOVE_TRIPLET_INFO
                }
            }
            window_triplets.add(t);
            L++;
            running_score_window += triplet_count_window[t]++; //ADD_TRIPLET_INFO
            running_score_suffix += triplet_count_suffix[t]++; //ADD_TRIPLET_INFO
            if (triplet_count_suffix[t]*10 > 2*tscore) {
                do {
                    s = window_triplets.get(window_triplets.size()-L);
                    running_score_suffix -= --triplet_count_suffix[s]; //REMOVE_TRIPLET_INFO
                    L--;
                } while (s != t);
            }
            //End: SHIFT_WINDOW

            if (running_score_window*10 > L*tscore) {

                //Adds perfect intervals in w that are suffixes of w to perfect_intervals
                //Begin: FIND_PERFECT
                final short[] c = triplet_count_suffix.clone();
                int r = running_score_suffix;
                ListIterator iter = perfect_intervals.listIterator();
                PStruct perf = iter.hasNext() ? iter.next() : null;
                double max_score = 0;
                for (int i = window_triplets.size() - L - 1; i >= 0; i--) {
                    t = window_triplets.get(i);
                    r += c[t];
                    c[t]++;
                    final double new_score = ((double)r)/(window_triplets.size() - i - 1);
                    if (new_score*10 > tscore) {
                        while (perf != null && perf.start >= i + wstart) {
                            max_score = Math.max(max_score,perf.score);
                            perf = iter.hasNext() ? iter.next() : null;
                        }
                        if (new_score >= max_score) {
                            max_score = new_score;
                            PStruct new_perf = new PStruct();
                            new_perf.start = i + wstart;
                            new_perf.finish = window_triplets.size() + 1 + wstart;
                            new_perf.score = new_score;
                            if (iter.hasPrevious()) {
                                iter.previous();
                            } else {
                                iter = perfect_intervals.listIterator();
                            }
                            iter.add(new_perf);
                            if (iter.hasNext()) {iter.next();}
                        }
                    }
                }
                //End: FIND_PERFECT

            }
        } //End of main loop


        //Save masked regions from final window
        wstart = Math.max(0,read.getLength() - window + 1);
        while (!perfect_intervals.isEmpty()) {
            save_masked_regions(perfect_intervals, final_intervals, wstart);
            wstart++;
        }

        return final_intervals;
    }

    /**
     * Saves masked intervals found in the previous window that do not overlap the current window.
     * Note that the LinkedList and ArrayList are modified.
     */
    private static void save_masked_regions(final LinkedList perfect_intervals,
                                            final ArrayList final_intervals,
                                            final int wstart) {
        if (!perfect_intervals.isEmpty()) {
            PStruct p1 = perfect_intervals.getLast();
            if (p1.start < wstart) {
                final int l = final_intervals.size();
                if (l > 0) {
                    IntTuple rt = final_intervals.get(l - 1);
                    if (p1.start < rt.b + 1) {
                        final IntTuple new_rt = new IntTuple(rt.a, Math.max(rt.b, p1.finish));
                        final_intervals.set(l - 1, new_rt);
                    } else {
                        final_intervals.add(new IntTuple(p1.start, p1.finish));
                    }
                } else {
                    final_intervals.add(new IntTuple(p1.start, p1.finish));
                }
                while (p1 != null && p1.start < wstart) {
                    perfect_intervals.removeLast();
                    p1 = perfect_intervals.isEmpty() ? null : perfect_intervals.getLast();
                }
            }
        }
    }

    /**
     * Returns integer 0-63 of a given triplet of 2-bit bases
     */
    private static int triplet(final byte b1, final byte b2, final byte b3) {
        return (b1 | (b2 << 2) | (b3 << 4));
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy