All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.samtools.util.SequenceUtil Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools.util;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.fastq.FastqConstants;

import java.io.File;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SequenceUtil {
    /** Byte typed variables for all normal bases. */
    public static final byte a = 'a', c = 'c', g = 'g', t = 't', n = 'n', A = 'A', C = 'C', G = 'G', T = 'T', N = 'N';

    public static final byte[] VALID_BASES_UPPER = new byte[]{A, C, G, T};
    public static final byte[] VALID_BASES_LOWER = new byte[]{a, c, g, t};
    private static final byte[] ACGTN_BASES = new byte[]{A, C, G, T, N};
    private static final String IUPAC_CODES_STRING = ".aAbBcCdDgGhHkKmMnNrRsStTvVwWyY";
    /**
     * A set of bases supported by BAM in reads, see http://samtools.github.io/hts-specs/SAMv1.pdf chapter 4.2 on 'seq' field.
     * Effectively these are upper cased IUPAC codes with equals sign ('=') and without dot ('.').
     */
    private static final byte[] BAM_READ_BASE_SET = "=ABCDGHKMNRSTVWY".getBytes();

    /**
     * A lookup table to find a corresponding BAM read base.
     */
    private static final byte[] bamReadBaseLookup = new byte[127];
    static {
        Arrays.fill(bamReadBaseLookup, N);
        for (final byte base: BAM_READ_BASE_SET) {
            bamReadBaseLookup[base] = base;
            bamReadBaseLookup[base + 32] = base;
        }
    }

    private static final byte A_MASK = 1;
    private static final byte C_MASK = 2;
    private static final byte G_MASK = 4;
    private static final byte T_MASK = 8;

    private static final byte[] bases = new byte[127];
    private static final byte NON_IUPAC_CODE = 0;
    /*
     * Definition of IUPAC codes:
     * http://www.bioinformatics.org/sms2/iupac.html
     */
    static {
        Arrays.fill(bases, NON_IUPAC_CODE);
        bases[A] = A_MASK;
        bases[C] = C_MASK;
        bases[G] = G_MASK;
        bases[T] = T_MASK;
        bases['M'] = A_MASK | C_MASK;
        bases['R'] = A_MASK | G_MASK;
        bases['W'] = A_MASK | T_MASK;
        bases['S'] = C_MASK | G_MASK;
        bases['Y'] = C_MASK | T_MASK;
        bases['K'] = G_MASK | T_MASK;
        bases['V'] = A_MASK | C_MASK | G_MASK;
        bases['H'] = A_MASK | C_MASK | T_MASK;
        bases['D'] = A_MASK | G_MASK | T_MASK;
        bases['B'] = C_MASK | G_MASK | T_MASK;
        bases['N'] = A_MASK | C_MASK | G_MASK | T_MASK;
        // Also store the bases in lower case
        for (int i = 'A'; i <= 'Z'; i++) {
            bases[(byte) i + 32] = bases[(byte) i];
        }
        bases['.'] = A_MASK | C_MASK | G_MASK | T_MASK;
    };


    /**
     * Calculate the reverse complement of the specified sequence
     * (Stolen from Reseq)
     *
     * @param sequenceData
     * @return reverse complement
     */
    public static String reverseComplement(final String sequenceData) {
        final byte[] bases = htsjdk.samtools.util.StringUtil.stringToBytes(sequenceData);
        reverseComplement(bases);
        return htsjdk.samtools.util.StringUtil.bytesToString(bases);
    }


    /**
     * Efficiently compare two IUPAC base codes, simply returning true if they are equal (ignoring case),
     * without considering the set relationships between ambiguous codes.
     */
    public static boolean basesEqual(final byte lhs, final byte rhs) {
        return (bases[lhs] == bases[rhs]);
    }

    /**
     * Efficiently compare two IUPAC base codes, one coming from a read sequence and the other coming from
     * a reference sequence, using the reference code as a 'pattern' that the read base must match.
     *
     * We take ambiguous codes into account, returning true if the set of possible bases
     * represented by the read value is a (non-strict) subset of the possible bases represented
     * by the reference value.
     *
     * Since the comparison is directional, make sure to pass read / ref codes in correct order.
     */
    public static boolean readBaseMatchesRefBaseWithAmbiguity(final byte readBase, final byte refBase) {
        return (bases[readBase] & bases[refBase]) == bases[readBase];
    }

    /**
     * returns true if the value of base represents a no call
     */
    public static boolean isNoCall(final byte base) {
        return base == 'N' || base == 'n' || base == '.';
    }

    /** Returns true if the byte is in [acgtACGT]. */
    public static boolean isValidBase(final byte b) {
        return isValidBase(b, VALID_BASES_UPPER) || isValidBase(b, VALID_BASES_LOWER);
    }

    private static boolean isValidBase(final byte b, final byte[] validBases) {
        for (final byte validBase : validBases) {
            if (b == validBase) return true;
        }
        return false;
    }

    /**
     * Check if the given base is one of upper case ACGTN */
    public static boolean isUpperACGTN(final byte base) {
        return isValidBase(base, ACGTN_BASES);
    }


    /** Returns all IUPAC codes as a string */
    public static String getIUPACCodesString() {
        return IUPAC_CODES_STRING;
    }

    /** Checks if the given base is a IUPAC code */
    public static boolean isIUPAC(final byte base) {
        return bases[base] != NON_IUPAC_CODE;
    }

    /** Calculates the fraction of bases that are G/C in the sequence */
    public static double calculateGc(final byte[] bases) {
        int gcs = 0;
        for (int i = 0; i < bases.length; ++i) {
            final byte b = bases[i];
            if (b == 'C' || b == 'G' || b == 'c' || b == 'g') ++gcs;
        }

        return gcs / (double) bases.length;
    }

    /** Check if the given base belongs to BAM read base set '=ABCDGHKMNRSTVWY' */
    public static boolean isBamReadBase(final byte base) {
        return isValidBase(base, BAM_READ_BASE_SET);
    }

    /** Update and return the given array of bases by upper casing and then replacing all non-BAM read bases with N */
    public static byte[] toBamReadBasesInPlace(final byte[] bases) {
        for (int i = 0; i < bases.length; i++)
            bases[i] = bamReadBaseLookup[bases[i]];
        return bases;
    }

    /**
     * default signature that forces the lists to be the same size
     *
     * @param s1 a list of sequence headers
     * @param s2 a second list of sequence headers
     */
    public static void assertSequenceListsEqual(final List s1, final List s2) {
        assertSequenceListsEqual(s1, s2, false);
    }
    /**
     * Throws an exception only if both (first) parameters are not null
     * optionally check that one list is a (nonempty) prefix of the other.
     *
     * @param s1 a list of sequence headers
     * @param s2 a second list of sequence headers
     * @param checkPrefixOnly a flag specifying whether to only look at the first records in the lists. This will then check that the
     * records of the smaller dictionary are equal to the records of the beginning of the larger dictionary, which can be useful since
     * sometimes different pipelines choose to use only the first contigs of a standard reference.
     */
    public static void assertSequenceListsEqual(final List s1, final List s2, final boolean checkPrefixOnly) {
        if (s1 != null && s2 != null) {

            final int sizeToTest;

            if (checkPrefixOnly) {
                sizeToTest = Math.min(s1.size(), s2.size());
                if (sizeToTest == 0) {
                    throw new SequenceListsDifferException("Neither of the dictionaries can be empty.");
                }
            } else {
                sizeToTest = s1.size();
                if (s1.size() != s2.size()) {
                    throw new SequenceListsDifferException(
                            "Sequence dictionaries are not the same size (" + s1.size() + ", " + s2.size() +
                                    ")");
                }
            }
            for (int i = 0; i < sizeToTest; ++i) {
                if (!s1.get(i).isSameSequence(s2.get(i))) {
                    String s1Attrs = "";
                    for (final java.util.Map.Entry entry : s1.get(i)
                            .getAttributes()) {
                        s1Attrs += "/" + entry.getKey() + "=" + entry.getValue();
                    }
                    String s2Attrs = "";
                    for (final java.util.Map.Entry entry : s2.get(i)
                            .getAttributes()) {
                        s2Attrs += "/" + entry.getKey() + "=" + entry.getValue();
                    }
                    throw new SequenceListsDifferException(
                            "Sequences at index " + i + " don't match: " +
                                    s1.get(i).getSequenceIndex() + "/" + s1.get(i).getSequenceLength() +
                                    "/" + s1.get(i).getSequenceName() + s1Attrs + " " +
                                    s2.get(i).getSequenceIndex() + "/" + s2.get(i).getSequenceLength() +
                                    "/" + s2.get(i).getSequenceName() + s2Attrs);
                }
            }
        }
    }

    public static class SequenceListsDifferException extends SAMException {
        public SequenceListsDifferException() {
        }

        public SequenceListsDifferException(final String s) {
            super(s);
        }

        public SequenceListsDifferException(final String s, final Throwable throwable) {
            super(s, throwable);
        }

        public SequenceListsDifferException(final Throwable throwable) {
            super(throwable);
        }
    }

    /**
     * Returns true if both parameters are null or equal, otherwise returns false
     *
     * @param s1 a list of sequence headers
     * @param s2 a second list of sequence headers
     */
    public static boolean areSequenceDictionariesEqual(final SAMSequenceDictionary s1, final SAMSequenceDictionary s2) {
        if (s1 == null && s2 == null) return true;
        if (s1 == null || s2 == null) return false;

        try {
            assertSequenceListsEqual(s1.getSequences(), s2.getSequences());
            return true;
        } catch (final SequenceListsDifferException e) {
            return false;
        }
    }

    /**
     * Throws an exception if both parameters are non-null and unequal.
     *
     * @param s1 a list of sequence headers
     * @param s2 a second list of sequence headers
     */
    public static void assertSequenceDictionariesEqual(final SAMSequenceDictionary s1, final SAMSequenceDictionary s2) {
        assertSequenceDictionariesEqual(s1, s2, false);
    }

    /**
     * Throws an exception if both (first) parameters are non-null and unequal (if checkPrefixOnly, checks prefix of lists only).
     *
     * @param s1 a list of sequence headers
     * @param s2 a second list of sequence headers
     * @param checkPrefixOnly a flag specifying whether to only look at the first records in the lists. This will then check that the
     * records of the smaller dictionary are equal to the records of the beginning of the larger dictionary, which can be useful since
     * sometimes different pipelines choose to use only the first contigs of a standard reference.
     */
    public static void assertSequenceDictionariesEqual(final SAMSequenceDictionary s1, final SAMSequenceDictionary s2, final boolean checkPrefixOnly) {
        if (s1 == null || s2 == null) return;
        assertSequenceListsEqual(s1.getSequences(), s2.getSequences(), checkPrefixOnly);
    }

    /**
     * Throws an exception if both parameters are non-null and unequal, including the filenames.
     */
    public static void assertSequenceDictionariesEqual(final SAMSequenceDictionary s1, final SAMSequenceDictionary s2,
                                                       final File f1, final File f2) {
        try {
            assertSequenceDictionariesEqual(s1, s2);
        } catch (final SequenceListsDifferException e) {
            throw new SequenceListsDifferException("In files " + f1.getAbsolutePath() + " and " + f2.getAbsolutePath(), e);
        }
    }

    /**
     * Create a simple ungapped cigar string, which might have soft clipping at either end
     *
     * @param alignmentStart raw aligment start, which may result in read hanging off beginning or end of read
     * @return cigar string that may have S operator at beginning or end, and has M operator for the rest of the read
     */
    public static String makeCigarStringWithPossibleClipping(final int alignmentStart, final int readLength, final int referenceSequenceLength) {
        int start = alignmentStart;
        int leftSoftClip = 0;
        if (start < 1) {
            leftSoftClip = 1 - start;
            start = 1;
        }
        int rightSoftClip = 0;
        if (alignmentStart + readLength > referenceSequenceLength + 1) {
            rightSoftClip = alignmentStart + readLength - referenceSequenceLength - 1;
        }
        // CIGAR is trivial because there are no indels or clipping in Gerald
        final int matchLength = readLength - leftSoftClip - rightSoftClip;
        if (matchLength < 1) {
            throw new SAMException("Unexpected cigar string with no M op for read.");
        }
        return makeSoftClipCigar(leftSoftClip) + Integer.toString(matchLength) + "M" + makeSoftClipCigar(rightSoftClip);
    }

    /**
     * Create a cigar string for a gapped alignment, which may have soft clipping at either end
     *
     * @param alignmentStart          raw alignment start, which may result in read hanging off beginning or end of read
     * @param readLength
     * @param referenceSequenceLength
     * @param indelPosition           number of matching bases before indel.  Must be > 0
     * @param indelLength             length of indel.  Positive for insertion, negative for deletion.
     * @return cigar string that may have S operator at beginning or end, has one or two M operators, and an I or a D.
     */
    public static String makeCigarStringWithIndelPossibleClipping(final int alignmentStart,
                                                                  final int readLength,
                                                                  final int referenceSequenceLength,
                                                                  final int indelPosition,
                                                                  final int indelLength) {
        int start = alignmentStart;
        int leftSoftClip = 0;
        if (start < 1) {
            leftSoftClip = 1 - start;
            start = 1;
        }
        int rightSoftClip = 0;
        final int alignmentEnd = alignmentStart + readLength - indelLength;
        if (alignmentEnd > referenceSequenceLength + 1) {
            rightSoftClip = alignmentEnd - referenceSequenceLength - 1;
        }
        if (leftSoftClip >= indelPosition) {
            throw new IllegalStateException("Soft clipping entire pre-indel match. leftSoftClip: " + leftSoftClip +
                    "; indelPosition: " + indelPosition);
        }
        // CIGAR is trivial because there are no indels or clipping in Gerald
        final int firstMatchLength = indelPosition - leftSoftClip;
        final int secondMatchLength = readLength - indelPosition - (indelLength > 0 ? indelLength : 0) - rightSoftClip;
        if (secondMatchLength < 1) {
            throw new SAMException("Unexpected cigar string with no M op for read.");
        }
        return makeSoftClipCigar(leftSoftClip) + Integer.toString(firstMatchLength) + "M" +
                Math.abs(indelLength) + (indelLength > 0 ? "I" : "D") +
                Integer.toString(secondMatchLength) + "M" +
                makeSoftClipCigar(rightSoftClip);
    }

    public static String makeSoftClipCigar(final int clipLength) {
        if (clipLength == 0) {
            return "";
        }
        return Integer.toString(clipLength) + "S";
    }

    /**
     * Helper method to handle the various use cases of base comparison.
     *
     * @param readBase the read base to match
     * @param refBase the reference base to match
     * @param negativeStrand set to true if the base to test is on the negative strand and should be reverse complemented (only applies if bisulfiteSequence is true)
     * @param bisulfiteSequence set to true if the base to match is a bisulfite sequence and needs to be converted
     * @param matchAmbiguousRef causes the match to return true when the read base is a subset of the possible IUPAC reference bases, but not the other way around
     * @return true if the bases match, false otherwise
     */
    private static boolean basesMatch(final byte readBase, final byte refBase, final boolean negativeStrand,
                                      final boolean bisulfiteSequence, final boolean matchAmbiguousRef) {
        if (bisulfiteSequence) {
            if (matchAmbiguousRef) return bisulfiteBasesMatchWithAmbiguity(negativeStrand, readBase, refBase);
            else return bisulfiteBasesEqual(negativeStrand, readBase, refBase);
        } else {
            if (matchAmbiguousRef) return readBaseMatchesRefBaseWithAmbiguity(readBase, refBase);
            else return basesEqual(readBase, refBase);
        }
    }

    /** Calculates the number of mismatches between the read and the reference sequence provided. */
    public static int countMismatches(final SAMRecord read, final byte[] referenceBases) {
        return countMismatches(read, referenceBases, 0, false);
    }

    /** Calculates the number of mismatches between the read and the reference sequence provided. */
    public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset) {
        return countMismatches(read, referenceBases, referenceOffset, false);
    }

    /**
     * Calculates the number of mismatches between the read and the reference sequence provided.
     *
     * @param referenceBases    Array of ASCII bytes that covers at least the the portion of the reference sequence
     *                          to which read is aligned from getReferenceStart to getReferenceEnd.
     * @param referenceOffset   0-based offset of the first element of referenceBases relative to the start
     *                          of that reference sequence.
     * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated
     *                          and C->T on the positive strand and G->A on the negative strand will not be counted
     *                          as mismatches.
     */
    public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset, final boolean bisulfiteSequence) {
        return countMismatches(read, referenceBases, referenceOffset, bisulfiteSequence, false);
    }

    public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset,
                                      final boolean bisulfiteSequence, final boolean matchAmbiguousRef) {
        try {
            int mismatches = 0;

            final byte[] readBases = read.getReadBases();

            for (final AlignmentBlock block : read.getAlignmentBlocks()) {
                final int readBlockStart = block.getReadStart() - 1;
                final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset;
                final int length = block.getLength();

                for (int i = 0; i < length; ++i) {
                    if (!basesMatch(readBases[readBlockStart + i], referenceBases[referenceBlockStart + i],
                            read.getReadNegativeStrandFlag(), bisulfiteSequence, matchAmbiguousRef)) {
                        ++mismatches;
                    }
                }
            }
            return mismatches;
        } catch (final Exception e) {
            throw new SAMException("Exception counting mismatches for read " + read, e);
        }
    }

    /**
     * Calculates the number of mismatches between the read and the reference sequence provided.
     *
     * @param referenceBases    Array of ASCII bytes that covers at least the the portion of the reference sequence
     *                          to which read is aligned from getReferenceStart to getReferenceEnd.
     * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated
     *                          and C->T on the positive strand and G->A on the negative strand will not be counted
     *                          as mismatches.
     */
    public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final boolean bisulfiteSequence) {
        return countMismatches(read, referenceBases, 0, bisulfiteSequence);
    }

    /**
     * Calculates the sum of qualities for mismatched bases in the read.
     *
     * @param referenceBases Array of ASCII bytes in which the 0th position in the array corresponds
     *                       to the first element of the reference sequence to which read is aligned.
     */
    public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] referenceBases) {
        return sumQualitiesOfMismatches(read, referenceBases, 0, false);
    }

    /**
     * Calculates the sum of qualities for mismatched bases in the read.
     *
     * @param referenceBases  Array of ASCII bytes that covers at least the the portion of the reference sequence
     *                        to which read is aligned from getReferenceStart to getReferenceEnd.
     * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start
     *                        of that reference sequence.
     */
    public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] referenceBases,
                                               final int referenceOffset) {
        return sumQualitiesOfMismatches(read, referenceBases, referenceOffset, false);
    }

    /**
     * Calculates the sum of qualities for mismatched bases in the read.
     *
     * @param referenceBases    Array of ASCII bytes that covers at least the the portion of the reference sequence
     *                          to which read is aligned from getReferenceStart to getReferenceEnd.
     * @param referenceOffset   0-based offset of the first element of referenceBases relative to the start
     *                          of that reference sequence.
     * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated
     *                          and C->T on the positive strand and G->A on the negative strand will not be counted
     *                          as mismatches.
     */
    public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] referenceBases,
                                               final int referenceOffset, final boolean bisulfiteSequence) {
        int qualities = 0;

        final byte[] readBases = read.getReadBases();
        final byte[] readQualities = read.getBaseQualities();

        if (read.getAlignmentStart() <= referenceOffset) {
            throw new IllegalArgumentException("read.getAlignmentStart(" + read.getAlignmentStart() +
                    ") <= referenceOffset(" + referenceOffset + ")");
        }

        for (final AlignmentBlock block : read.getAlignmentBlocks()) {
            final int readBlockStart = block.getReadStart() - 1;
            final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset;
            final int length = block.getLength();

            for (int i = 0; i < length; ++i) {
                if (!bisulfiteSequence) {
                    if (!basesEqual(readBases[readBlockStart + i], referenceBases[referenceBlockStart + i])) {
                        qualities += readQualities[readBlockStart + i];
                    }

                } else {
                    if (!bisulfiteBasesEqual(read.getReadNegativeStrandFlag(), readBases[readBlockStart + i],
                            referenceBases[referenceBlockStart + i])) {
                        qualities += readQualities[readBlockStart + i];
                    }
                }
            }
        }

        return qualities;
    }

    public static int countInsertedBases(final Cigar cigar) {
        int ret = 0;
        for (final CigarElement element : cigar.getCigarElements()) {
            if (element.getOperator() == CigarOperator.INSERTION) ret += element.getLength();
        }
        return ret;
    }

    public static int countDeletedBases(final Cigar cigar) {
        int ret = 0;
        for (final CigarElement element : cigar.getCigarElements()) {
            if (element.getOperator() == CigarOperator.DELETION) ret += element.getLength();
        }
        return ret;
    }

    public static int countInsertedBases(final SAMRecord read) {
        return countInsertedBases(read.getCigar());
    }

    public static int countDeletedBases(final SAMRecord read) {
        return countDeletedBases(read.getCigar());
    }

    /**
     * Calculates the predefined NM tag from the SAM spec: (# of mismatches + # of indels)
     * For the purposes for calculating mismatches, we do not yet support IUPAC ambiguous codes
     * (see readBaseMatchesRefBaseWithAmbiguity method).
     */
    public static int calculateSamNmTag(final SAMRecord read, final byte[] referenceBases) {
        return calculateSamNmTag(read, referenceBases, 0, false);
    }

    /**
     * Calculates the predefined NM tag from the SAM spec: (# of mismatches + # of indels)
     * For the purposes for calculating mismatches, we do not yet support IUPAC ambiguous codes
     * (see readBaseMatchesRefBaseWithAmbiguity method).
     *
     * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start
     *                        of that reference sequence.
     */
    public static int calculateSamNmTag(final SAMRecord read, final byte[] referenceBases,
                                        final int referenceOffset) {
        return calculateSamNmTag(read, referenceBases, referenceOffset, false);
    }

    /**
     * Calculates the predefined NM tag from the SAM spec: (# of mismatches + # of indels)
     * For the purposes for calculating mismatches, we do not yet support IUPAC ambiguous codes
     * (see readBaseMatchesRefBaseWithAmbiguity method).
     *
     * @param referenceOffset   0-based offset of the first element of referenceBases relative to the start
     *                          of that reference sequence.
     * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated
     *                          and C->T on the positive strand and G->A on the negative strand will not be counted
     *                          as mismatches.
     */
    public static int calculateSamNmTag(final SAMRecord read, final byte[] referenceBases,
                                        final int referenceOffset, final boolean bisulfiteSequence) {
        int samNm = countMismatches(read, referenceBases, referenceOffset, bisulfiteSequence, false);
        for (final CigarElement el : read.getCigar().getCigarElements()) {
            if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) {
                samNm += el.getLength();
            }
        }
        return samNm;
    }

    /**
     * Attempts to calculate the predefined NM tag from the SAM spec using the cigar string alone.
     * It may calculate incorrectly if ambiguous operators (Like M) are used.
     *
     * Needed for testing infrastructure: SAMRecordSetBuilder
     */
    public static int calculateSamNmTagFromCigar(final SAMRecord record) {
        int samNm = 0;
        for (final CigarElement el : record.getCigar().getCigarElements()) {
            if ( el.getOperator() == CigarOperator.X ||
                 el.getOperator() == CigarOperator.INSERTION ||
                 el.getOperator() == CigarOperator.DELETION) {
                samNm += el.getLength();
            }
        }
        return samNm;
    }

    /** Returns the complement of a single byte. */
    public static byte complement(final byte b) {
        switch (b) {
            case a:
                return t;
            case c:
                return g;
            case g:
                return c;
            case t:
                return a;
            case A:
                return T;
            case C:
                return G;
            case G:
                return C;
            case T:
                return A;
            default:
                return b;
        }
    }



    /**
     * Returns true if the bases are equal OR if the mismatch can be accounted for by
     * bisulfite treatment. C->T on the positive strand and G->A on the negative strand
     * do not count as mismatches.
     */
    public static boolean bisulfiteBasesEqual(final boolean negativeStrand, final byte read, final byte reference) {
        return (basesEqual(read, reference)) || (isBisulfiteConverted(read, reference, negativeStrand));
    }

    public static boolean bisulfiteBasesEqual(final byte read, final byte reference) {
        return bisulfiteBasesEqual(false, read, reference);
    }

    /**
     * Same as above, but use readBaseMatchesRefBaseWithAmbiguity instead of basesEqual.
     * Note that isBisulfiteConverted is not affected because it only applies when the
     * reference base is non-ambiguous.
     */
    public static boolean bisulfiteBasesMatchWithAmbiguity(final boolean negativeStrand, final byte read, final byte reference) {
        return (readBaseMatchesRefBaseWithAmbiguity(read, reference)) || (isBisulfiteConverted(read, reference, negativeStrand));
    }

    /**
     * Checks for bisulfite conversion, C->T on the positive strand and G->A on the negative strand.
     */
    public static boolean isBisulfiteConverted(final byte read, final byte reference, final boolean negativeStrand) {
        if (negativeStrand) {
            if (basesEqual(reference, G) && basesEqual(read, A)) {
                return true;
            }
        } else {
            if (basesEqual(reference, C) && basesEqual(read, T)) {
                return true;
            }
        }
        return false;
    }

    public static boolean isBisulfiteConverted(final byte read, final byte reference) {
        return isBisulfiteConverted(read, reference, false);
    }

    /*
     * Regexp for MD string.
     *
     * \G = end of previous match.
     * (?:[0-9]+) non-capturing (why non-capturing?) group of digits.  For this number of bases read matches reference.
     *  - or -
     * Single reference base for case in which reference differs from read.
     *  - or -
     * ^one or more reference bases that are deleted in read.
     *
     */
    static final Pattern mdPat = Pattern.compile("\\G(?:([0-9]+)|([ACTGNactgn])|(\\^[ACTGNactgn]+))");

    /**
     * Produce reference bases from an aligned SAMRecord with MD string and Cigar.
     *
     * @param rec                               Must contain non-empty CIGAR and MD attribute.
     * @param includeReferenceBasesForDeletions If true, include reference bases that are deleted in the read.
     *                                          This will make the returned array not line up with the read if there are deletions.
     * @return References bases corresponding to the read.  If there is an insertion in the read, reference contains
     * '-'.  If the read is soft-clipped, reference contains '0'.  If there is a skipped region and
     * includeReferenceBasesForDeletions==true, reference will have Ns for the skipped region.
     */
    public static byte[] makeReferenceFromAlignment(final SAMRecord rec, final boolean includeReferenceBasesForDeletions) {
        final String md = rec.getStringAttribute(SAMTag.MD.name());
        if (md == null) {
            throw new SAMException("Cannot create reference from SAMRecord with no MD tag, read: " + rec.getReadName());
        }
        // Not sure how long output will be, but it will be no longer than this.
        int maxOutputLength = 0;
        final Cigar cigar = rec.getCigar();
        if (cigar == null) {
            throw new SAMException("Cannot create reference from SAMRecord with no CIGAR, read: " + rec.getReadName());
        }
        for (final CigarElement cigarElement : cigar.getCigarElements()) {
            maxOutputLength += cigarElement.getLength();
        }
        final byte[] ret = new byte[maxOutputLength];
        int outIndex = 0;

        final Matcher match = mdPat.matcher(md);
        int curSeqPos = 0;

        int savedBases = 0;
        final byte[] seq = rec.getReadBases();
        for (final CigarElement cigEl : cigar.getCigarElements()) {
            final int cigElLen = cigEl.getLength();
            final CigarOperator cigElOp = cigEl.getOperator();


            if (cigElOp == CigarOperator.SKIPPED_REGION) {
                // We've decided that MD tag will not contain bases for skipped regions, as they
                // could be megabases long, so just put N in there if caller wants reference bases,
                // otherwise ignore skipped regions.
                if (includeReferenceBasesForDeletions) {
                    for (int i = 0; i < cigElLen; ++i) {
                        ret[outIndex++] = N;
                    }
                }
            }
            // If it consumes reference bases, it's either a match or a deletion in the sequence
            // read.  Either way, we're going to need to parse through the MD.
            else if (cigElOp.consumesReferenceBases()) {
                // We have a match region, go through the MD
                int basesMatched = 0;

                // Do we have any saved matched bases?
                while ((savedBases > 0) && (basesMatched < cigElLen)) {
                    ret[outIndex++] = seq[curSeqPos++];
                    savedBases--;
                    basesMatched++;
                }

                while (basesMatched < cigElLen) {
                    boolean matched = match.find();
                    if (matched) {
                        String mg;
                        if (((mg = match.group(1)) != null) && (!mg.isEmpty())) {
                            // It's a number , meaning a series of matches
                            final int num = Integer.parseInt(mg);
                            for (int i = 0; i < num; i++) {
                                if (basesMatched < cigElLen) {
                                    ret[outIndex++] = seq[curSeqPos++];
                                } else {
                                    savedBases++;
                                }
                                basesMatched++;
                            }
                        } else if (((mg = match.group(2)) != null) && (!mg.isEmpty())) {
                            // It's a single nucleotide, meaning a mismatch
                            if (basesMatched < cigElLen) {
                                ret[outIndex++] = StringUtil.charToByte(mg.charAt(0));
                                curSeqPos++;
                            } else {
                                throw new IllegalStateException("Should never happen.");
                            }
                            basesMatched++;
                        } else if (((mg = match.group(3)) != null) && (!mg.isEmpty())) {
                            // It's a deletion, starting with a caret
                            // don't include caret
                            if (includeReferenceBasesForDeletions) {
                                final byte[] deletedBases = StringUtil.stringToBytes(mg);
                                System.arraycopy(deletedBases, 1, ret, outIndex, deletedBases.length - 1);
                                outIndex += deletedBases.length - 1;
                            }
                            basesMatched += mg.length() - 1;

                            // Check just to make sure.
                            if (basesMatched != cigElLen) {
                                throw new SAMException("Got a deletion in CIGAR (" + cigar + ", deletion " + cigElLen +
                                        " length) with an unequal ref insertion in MD (" + md + ", md " + basesMatched + " length");
                            }
                            if (cigElOp != CigarOperator.DELETION) {
                                throw new SAMException("Got an insertion in MD (" + md + ") without a corresponding deletion in cigar (" + cigar + ")");
                            }

                        } else {
                            matched = false;
                        }
                    }

                    if (!matched) {
                        throw new SAMException("Illegal MD pattern: " + md + " for read " + rec.getReadName() +
                                " with CIGAR " + rec.getCigarString());
                    }
                }

            } else if (cigElOp.consumesReadBases()) {
                // We have an insertion in read
                for (int i = 0; i < cigElLen; i++) {
                    final char c = (cigElOp == CigarOperator.SOFT_CLIP) ? '0' : '-';
                    ret[outIndex++] = StringUtil.charToByte(c);
                    curSeqPos++;
                }
            } else {
                // It's an op that consumes neither read nor reference bases.  Do we just ignore??
            }

        }
        if (outIndex < ret.length) {
            final byte[] shorter = new byte[outIndex];
            System.arraycopy(ret, 0, shorter, 0, outIndex);
            return shorter;
        }
        return ret;
    }

    /** Reverses and complements the bases in place. */
    public static void reverseComplement(final byte[] bases) {
        reverseComplement(bases, 0, bases.length);
    }

    /** Reverses the quals in place. */
    public static void reverseQualities(final byte[] quals) {
        reverse(quals, 0, quals.length);
    }

    public static void reverse(final byte[] array, final int offset, final int len) {
        final int lastIndex = len - 1;

        int i, j;
        for (i = offset, j = offset + lastIndex; i < j; ++i, --j) {
            final byte tmp = array[i];
            array[i] = array[j];
            array[j] = tmp;
        }
        if (len % 2 == 1) {
            array[i] = array[i];
        }
    }

    public static void reverseComplement(final byte[] bases, final int offset, final int len) {
        final int lastIndex = len - 1;

        int i, j;
        for (i = offset, j = offset + lastIndex; i < j; ++i, --j) {
            final byte tmp = complement(bases[i]);
            bases[i] = complement(bases[j]);
            bases[j] = tmp;
        }
        if (len % 2 == 1) {
            bases[i] = complement(bases[i]);
        }
    }

    public static String calculateMD5String(final byte[] data)
            throws NoSuchAlgorithmException {
        return SequenceUtil.calculateMD5String(data, 0, data.length);
    }

    public static String calculateMD5String(final byte[] data, final int offset, final int len) {
        final byte[] digest = calculateMD5(data, offset, len);
        return String.format("%032x", new BigInteger(1, digest));
    }

    public static byte[] calculateMD5(final byte[] data, final int offset, final int len) {
        final MessageDigest md5_MessageDigest;
        try {
            md5_MessageDigest = MessageDigest.getInstance("MD5");
            md5_MessageDigest.reset();

            md5_MessageDigest.update(data, offset, len);
            return md5_MessageDigest.digest();
        } catch (final NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Calculate MD and NM similarly to Samtools, except that N->N is a match.
     *
     * @param record Input record for which to calculate NM and MD.
     *               The appropriate tags will be added/updated in the record
     * @param ref    The reference bases for the sequence to which the record is mapped
     * @param calcMD A flag indicating whether to update the MD tag in the record
     * @param calcNM A flag indicating whether to update the NM tag in the record
     */
    public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref,
                                            final boolean calcMD, final boolean calcNM) {
        if (!calcMD && !calcNM)
            return;

        final Cigar cigar = record.getCigar();
        final List cigarElements = cigar.getCigarElements();
        final byte[] seq = record.getReadBases();
        final int alignmentStart = record.getAlignmentStart() - 1;
        int cigarIndex, blockRefPos, blockReadStart, matchCount = 0;
        int nmCount = 0;
        final StringBuilder mdString = new StringBuilder();

        final int nElements = cigarElements.size();
        for (cigarIndex = blockReadStart = 0, blockRefPos = alignmentStart; cigarIndex < nElements; ++cigarIndex) {
            final CigarElement ce = cigarElements.get(cigarIndex);
            int inBlockOffset;
            final int blockLength = ce.getLength();
            final CigarOperator op = ce.getOperator();
            if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ
                    || op == CigarOperator.X) {
                for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
                    final int readOffset = blockReadStart + inBlockOffset;

                    if (ref.length <= blockRefPos + inBlockOffset) break; // out of boundary

                    final byte readBase = seq[readOffset];
                    final byte refBase = ref[blockRefPos + inBlockOffset];

                    if ((bases[readBase] == bases[refBase]) || readBase == 0) {
                        // a match
                        ++matchCount;
                    } else {
                        mdString.append(matchCount);
                        mdString.appendCodePoint(refBase);
                        matchCount = 0;
                        ++nmCount;
                    }
                }
                if (inBlockOffset < blockLength) break;
                blockRefPos += blockLength;
                blockReadStart += blockLength;
            } else if (op == CigarOperator.DELETION) {
                mdString.append(matchCount);
                mdString.append('^');
                for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
                    if (ref[blockRefPos + inBlockOffset] == 0) break;
                    mdString.appendCodePoint(ref[blockRefPos + inBlockOffset]);
                }
                matchCount = 0;
                if (inBlockOffset < blockLength) break;
                blockRefPos += blockLength;
                nmCount += blockLength;
            } else if (op == CigarOperator.INSERTION
                    || op == CigarOperator.SOFT_CLIP) {
                blockReadStart += blockLength;
                if (op == CigarOperator.INSERTION) nmCount += blockLength;
            } else if (op == CigarOperator.SKIPPED_REGION) {
                blockRefPos += blockLength;
            }
        }
        mdString.append(matchCount);

        if (calcMD) record.setAttribute(SAMTag.MD, mdString.toString());
        if (calcNM) record.setAttribute(SAMTag.NM, nmCount);
    }

    public static byte upperCase(final byte base) {
        return base >= a ? (byte) (base - (a - A)) : base;
    }

    public static byte[] upperCase(final byte[] bases) {
        for (int i = 0; i < bases.length; i++)
            bases[i] = upperCase(bases[i]);
        return bases;
    }

    /** Generates all possible unambiguous kmers (upper-case) of length and returns them as byte[]s. */
    public static List generateAllKmers(final int length) {
        final List sofar = new LinkedList<>();

        if (sofar.isEmpty()) {
            sofar.add(new byte[length]);
        }

        while (true) {
            final byte[] bs = sofar.remove(0);
            int indexOfNextBase = -1;
            for (int i = 0; i < bs.length; ++i) {
                if (bs[i] == 0) {
                    indexOfNextBase = i;
                    break;
                }
            }

            if (indexOfNextBase == -1) {
                sofar.add(bs);
                break;
            } else {
                for (final byte b : VALID_BASES_UPPER) {
                    final byte[] next = Arrays.copyOf(bs, bs.length);
                    next[indexOfNextBase] = b;
                    sofar.add(next);
                }
            }
        }

        return sofar;
    }

    /**
     * Returns a read name from a FASTQ header string suitable for use in a SAM/BAM file.  Any letters after the first space are ignored.
     * Ths method also strips trailing "/1" or "/2" so that paired end reads have the same name.
     *
     * @param fastqHeader the header from a {@link htsjdk.samtools.fastq.FastqRecord}.
     * @return a read name appropriate for output in a SAM/BAM file.
     */
    // Read names cannot contain blanks
    public static String getSamReadNameFromFastqHeader(final String fastqHeader) {
        final int idx = fastqHeader.indexOf(" ");
        String readName = (idx == -1) ? fastqHeader : fastqHeader.substring(0,idx);

        // NOTE: the while loop isn't necessarily the most efficient way to handle this but we don't
        // expect this to ever happen more than once, just trapping pathological cases
        while ((readName.endsWith(FastqConstants.FIRST_OF_PAIR) || readName.endsWith(FastqConstants.SECOND_OF_PAIR))) {
            // If this is an unpaired run we want to make sure that "/1" isn't tacked on the end of the read name,
            // as this can cause problems down the road (ex. in Picard's MergeBamAlignment).
            readName = readName.substring(0, readName.length() - 2);
        }

        return readName;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy