net.maizegenetics.dna.BaseEncoder Maven / Gradle / Ivy

package net.maizegenetics.dna;


/**
 * Utility class for encoding tags into longs.
 * 
 * Sequencing reads are chunked into 32bp and recorded in a 64-bit long.  Only 
 * A (00), C (01), G (10), T (11) are encoded.  Any other character sets the entire long to -1.
 * Missing data at the end is padded with poly-A or (0).  This missing end, is tracked
 * by the tag length attribute.
 * 

 * Some of these methods should be transitioned to {@link net.maizegenetics.dna.snp.NucleotideAlignmentConstants},
 * however, BaseEncoder only supports four states, while NucleotideAlignment includes gaps, insertions, and missing.
 * 
 * @author Ed Buckler
 */
public class BaseEncoder {

    /** defines the number of bases fitting with a long */
    public static final int chunkSize = 32;
    public static final int chunkSizeForInt = 16;
    /** defines the base order */
    public static final char[] bases = {'A', 'C', 'G', 'T'};

    private BaseEncoder() {
    }

    /**
     * Returns a long for a sequence in a String
     * @param seq
     * @return 2-bit encode sequence (-1 if an invalid sequence state is provided e.g. N)
     */
    public static long getLongFromSeq(String seq) {
        int seqLength = seq.length();
        long v = 0;
        for (int i = 0; i < seqLength; i++) {
            switch (seq.charAt(i)) {
                case 'A':
                case 'a':
                    v = v << 2;
                    break;
                case 'C':
                case 'c':
                    v = (v << 2) + (byte) 1;
                    break;
                case 'G':
                case 'g':
                    v = (v << 2) + (byte) 2;
                    break;
                case 'T':
                case 't':
                    v = (v << 2) + (byte) 3;
                    break;
                default:
                    return -1;
            }
        }
        if (seqLength == chunkSize) {
            return v;
        }
        if (seqLength > chunkSize) {
            return -1;
        }
        v = (v << (2 * (chunkSize - seqLength))); //if shorter fill with AAAA
        return v;
    }

    /**
     * @param seq A String containing a DNA sequence.
     * @return result A array of Long containing the binary representation of the sequence.
     * null if sequence length is not a multiple of BaseEncoder.chunksize.
     */
    public static long[] getLongArrayFromSeq(String seq) {
        if (seq.length() % chunkSize != 0) {
            return null;
        }
        long[] result = new long[seq.length() / chunkSize];
        for (int i = 0; i < result.length; i++) {
            result[i] = getLongFromSeq(seq.substring(i * chunkSize, (i + 1) * chunkSize));
        }
        return result;
    }
    
    /**
     * @param seq A String containing a DNA sequence.
     * @return result A array of Long containing the binary representation of the sequence.
     * if sequence length is shorter than padded Length adds A to the end.
     */
    public static long[] getLongArrayFromSeq(String seq, int paddedLength) {
        if(seq.length() chunkSize) {
            return -1;
        }
        // Comment out the shift so padding occurs at the front
        // of the sequence.  This is give smaller numbers and
        // makes SPARK machine learning happier.
        //v = (v << (2 * (chunkSize - seqLength))); //if shorter fill with AAAA (which is 0000)
        return v;
    }

    /**
     * Returns the reverse complement of a sequence already encoded in a 2-bit long.
     * 

     * Note: polyA is used represent unknown, but reverse complement will change it to polyT which does not mean the same
     * sometimes it is best to reverseComplement by text below
     * @param seq  2-bit encoded sequence
     * @param len  length of the sequence
     * @return  2-bit reverse complement
     */
    public static long getReverseComplement(long seq, byte len) {
        // if(seq==-1) return -1;
        long rev = 0;
        // byte b=0;
        long mask = 3;
        seq = ~seq;
        for (int i = 0; i < len; i++) {
            rev = (rev << 2) + (seq & mask);
            seq = seq >> 2;
            // System.out.println("v = " + v);
        }
        return rev;
    }

    /**
     * Returns the reverse complement of a sequence already encoded in a 2-bit long.
     * The entire long (32-bp) is reverse complemented.
     * 

     * Note: polyA is used represent unknown, but reverse complement will change it to polyT which does not mean the same
     * sometimes it is best to reverseComplement by text below
     * @param seq  2-bit encoded sequence
     * @return  2-bit reverse complement
     */
    public static long getReverseComplement(long seq) {
        return getReverseComplement(seq, (byte) chunkSize);
    }


    /**
     * Returns the reverse complement of a arrays of sequences already encoded in a 2-bit long.
     * 

     * Note: polyA is used represent unknown, but reverse complement will change it to polyT which does not mean the same
     * sometimes it is best to reverseComplement by text below
     * @param seq  array of 2-bit encoded sequences
     * @return  array of 2-bit reverse complements
     */
    public static long[] getReverseComplement(long[] seq) {
        long[] rev = new long[seq.length];
        for (int i = 0; i < rev.length; i++) {
            rev[i] = getReverseComplement(seq[seq.length - i - 1], (byte) chunkSize);
        }
        return rev;
    }

    /**
     * Returns a string based reverse complement.  Get around issues with the poly-A tailing in the 2-bit encoding approach.
     *
     * @param seq  DNA sequence
     * @return  reverse complement DNA sequence
     */
    public static String getReverseComplement(String seq) {
        StringBuilder sb = new StringBuilder(seq.length());
        for (int i = seq.length() - 1; i >= 0; i--) {
            sb.append(getComplementBase(seq.charAt(i)));
        }
        return sb.toString();
    }

    /**
     * Returns reverse complement for a sequence.
     * @param base
     * @return  reverse complement of base
     */
    public static char getComplementBase(char base) {
        switch (base) {
            case 'A':
                return 'T';
            case 'C':
                return 'G';
            case 'G':
                return 'C';
            case 'T':
                return 'A';
        }
        return 'N';
    }

    /**
     * Returns the byte {@link net.maizegenetics.dna.snp.NucleotideAlignmentConstants} representation
     * used by TASSEL for the 2-bit encoded long.
     * 

     * e.g. A > 2-bit encode 00 > byte (0)
     * @param val 2-bit encoded DNA sequence
     * @return array of bytes for the DNA sequence
     */
    public static byte[] getByteSeqFromLong(long val) {
        byte[] b = new byte[chunkSize];
        long mask = 3;
        for (int i = 0; i < chunkSize; i++) {
            b[chunkSize - i - 1] = (byte) (val & mask);
            val = val >> 2;
        }
        return b;
    }

    /**
     * Returns the byte {@link net.maizegenetics.dna.snp.NucleotideAlignmentConstants} representation
     * used by TASSEL for the 2-bit encoded long.
     * 

     * e.g. A > 2-bit encode 00 > byte (0)
     * @param valA array of 2-bit encoded DNA sequence
     * @return array of bytes for the DNA sequence
     */
    public static byte[] getByteSeqFromLong(long[] valA) {
        byte[] b = new byte[chunkSize * valA.length];
        long mask = 3;
        long val;
        for (int j = 0; j < valA.length; j++) {
            val = valA[j];
            for (int i = 0; i < chunkSize; i++) {
                b[(j * chunkSize) + chunkSize - i - 1] = (byte) (val & mask);
                val = val >> 2;
            }
        }
        return b;
    }

    /**
     * Returns the 2-bit encoded long represented by 32 bytes representing {@link net.maizegenetics.dna.snp.NucleotideAlignmentConstants}
     * representation.  It is padded by As if shorter than 32 bytes, -1 returned if longer than 32.
     * The byte array values must be 0-3. If the array contains a value outside that range returns -1.
     * 
     * @param b array of bytes encoding NucleotideAlignmentConstants
     * @return 2-bit encoded long
     */
    public static long getLongSeqFromByteArray(byte[] b) {
        //the byte array must be in 0-3 coding for A, C, G, T
        long v = 0;
        if (b.length > chunkSize) {
            return -1L;
        }
        for (int i = 0; i < b.length; i++) {
            if (b[i] > 3) return -1L;
            v = (v << 2) + b[i];
        }
        v = (v << (2*(chunkSize-b.length)));
        return v;
    }
    
//     /**
//     * Return a string representation of the 2-bit encoded long.
//     * @param val 2-bit encoded sequence
//     * @param len length of the sequence
//     * @return DNA sequence as a string
//     */ 
 //   public static String getSequenceFromLong(long val, byte len) {
 //       StringBuilder seq = new StringBuilder(chunkSize + 4);
 //       long mask = 3;
 //       for (int i = 0; i < len; i++) {
 //           byte base = (byte) (val & mask);
 //          seq.insert(0, bases[base]); 
 //           val = val >> 2;
 //       }
 //       return seq.toString();
 //   }

    /**
     * Return a string representation of the 2-bit encoded long. 
     * @param val 2-bit encoded sequence
     * @param len length of the sequence
     * @return DNA sequence as a string
     * Reworked from above. Using "append" vs "insert" results
     * in ~9% faster execution.
     */
    public static String getSequenceFromLong(long val, byte len) {
    	StringBuilder seq = new StringBuilder(chunkSize + 4);
    	long mask = 3L << 62;
    	for (int i = 0; i < len; i++) {
    		byte base = (byte) (((val & mask) >> 62) & 0x03);
    		seq.append(bases[base]);          
    		val = val << 2;
    	}
    	return seq.toString();
    }
	

     /**
     * Return a string representation of an array of 2-bit encoded longs.
     * @param val array of 2-bit encoded sequences
     * @return DNA sequence as a string
     */
    public static String getSequenceFromLong(long[] val) {
        StringBuilder seq = new StringBuilder();
        for (long v : val) {
            seq.append(getSequenceFromLong(v));
        }
        return seq.toString();
    }

//    /**
//     * Return a string representation of an array of 2-bit encoded longs.
//     * @param val array of 2-bit encoded sequences
//     * @return DNA sequence as a string
//     */
//    public static String getSequenceFromLong(long[] val, short length) {
//        StringBuilder seq = new StringBuilder();
//        for (long v : val) {
//            seq.append(getSequenceFromLong(v,(byte)Math.max(32,length)));
//            length-=32;
//        }
//        return seq.toString();
//    }

    /**
     * Split a 2-bit encoded long into 2 integers.
     * @param val 2-bit encoded long sequence
     * @return array of 2-bit encoded integers
     */
    public static int[] getIntFromLong(long val) {
        int[] ival = new int[2];
        ival[0] = (int) (val >> chunkSize);
        ival[1] = (int) (val);
        return ival;
    }

    /**
     * Return a string representation of the 2-bit encoded Integer (16bp).
     * @param val 2-bit encoded sequence
     * @return DNA sequence as a string
     */
    public static String getSequenceFromInt(int val) {
        StringBuilder seq = new StringBuilder(chunkSizeForInt + 1);
        long mask = 3;
        for (int i = 0; i < chunkSizeForInt; i++) {
            byte base = (byte) (val & mask);
            seq.insert(0, bases[base]);
            val = val >> 2;
        }
        return seq.toString();
    }

    /**
     * Returns the position of the first low quality positions based on a quality
     * fastq (?) string.
     * @param quality fastq quality string
     * @param minQual minimum quality threshold
     * @return position of first low quality position (quality length is returned is not low 
     * quality base is found.
     *
     *  S - Sanger        Phred+33,  raw reads typically (0, 40)
    X - Solexa        Solexa+64, raw reads typically (-5, 40)
    I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
    J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
    with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
    (Note: See discussion above).
    L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
     */
    public static int getFirstLowQualityPos(String quality, int minQual) {
        int qualInt = 0;
        for (int i = 0; i < quality.length(); i++) {
            qualInt = (int) quality.charAt(i) - 64;
            if (qualInt < minQual) {
                return i;
            }
        }
        return quality.length();
    }

    /**
     * Returns the position of the first low quality positions based on a quality
     * fastq (?) string.
     * @param quality fastq quality string
     * @param minQual minimum quality threshold
     * @return position of first low quality position (quality length is returned is not low
     * quality base is found.
     *
     *  S - Sanger        Phred+33,  raw reads typically (0, 40)
    X - Solexa        Solexa+64, raw reads typically (-5, 40)
    I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
    J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
    with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
    (Note: See discussion above).
    L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
     */
    public static int getFirstLowQualityPos(String quality, int minQual, int qualBase) {
        int qualInt = 0;
        for (int i = 0; i < quality.length(); i++) {
            qualInt = (int) quality.charAt(i) - qualBase;
            if (qualInt < minQual) {
                return i;
            }
        }
        return quality.length();
    }


    /**
     * Return a string representation of the 2-bit encoded long.
     * @param val 2-bit encoded sequence
     * @return DNA sequence as a string
     */
    public static String getSequenceFromLong(long val) {
        return getSequenceFromLong(val, (byte) chunkSize);
    }

    /**
     * Returns the number of bp differences between two 2-bit encoded longs.
     * Maximum divergence is used to save time when only interested in very similar 
     * sequences.
     * @param seq1 2-bit encoded sequence
     * @param seq2 2-bit encoded sequence
     * @param maxDivergence threshold for counting divergence upto
     * @return count of the divergence (above the maxDivergence, chunkSize is returned)
     */
    public static byte seqDifferences(long seq1, long seq2, int maxDivergence) {
        long mask = 3;
        byte cnt = 0;
        long diff = seq1 ^ seq2;
        for (int x = 0; x < chunkSize && cnt <= maxDivergence; x++) {
            if ((diff & mask) > 0) {
                cnt++;
            }
            diff = diff >> 2;
            // System.out.println("v = " + v);
        }
        if (cnt > maxDivergence) {
            cnt = (byte) chunkSize;
        }
        // if(x<(chunkSize-1)) cnt=(byte)chunkSize;  //if didn't get to the end of the sequence set to maximum
        return cnt;
    }

    
    /**
     * Returns the number of bp differences between two 2-bit encoded longs.
     * @param seq1 2-bit encoded sequence
     * @param seq2 2-bit encoded sequence
     * @return count of the divergence 
     */
    public static byte seqDifferences(long seq1, long seq2) {
        long mask = 3;
        byte cnt = 0;
        long diff = seq1 ^ seq2;
        for (int x = 0; x < chunkSize; x++) {
            if ((diff & mask) > 0) {
                cnt++;
            }
            diff = diff >> 2;
            // System.out.println("v = " + v);
        }
        return cnt;
    }
    /**
     * Returns the number of sequencing differences between two 2-bit encoded longs.
     * Maximum divergence is used to save time when only interested in very similar 
     * sequences.
     * @param seq1 2-bit encoded sequence
     * @param seq2 2-bit encoded sequence
     * @param lengthOfComp number of sites to compare
     * @param maxDivergence threshold for counting divergence upto
     * @return count of the divergence (above the maxDivergence, chunkSize is returned)
     */
    public static byte seqDifferencesForSubset(long seq1, long seq2, int lengthOfComp, int maxDivergence) {
        long mask = 3;
        byte cnt = 0;
        long diff = seq1 ^ seq2;
        diff = diff >> (2 * (chunkSize - lengthOfComp));  //shift to 5' end of sequence
        for (int x = 0; x < lengthOfComp && cnt < maxDivergence; x++) {
            if ((diff & mask) > 0) {
                cnt++;
            }
            diff = diff >> 2;
        }
        return cnt;
    }

    /**
     * Trim the poly-A off the sequence string
     * @param s input sequence
     * @return sequence with polyA removed
     */
    public static String removePolyAFromEnd(String s) {
        int index = s.length() - 1;
        while (s.charAt(index) == 'A') {
            index--;
            if (index < 1) {
                return null;
            }
        }
        return s.substring(0, index + 1);
    }
}