All Downloads are FREE. Search and download functionalities are using the official Maven repository.

htsjdk.samtools.SAMUtils Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CigarUtil;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.RuntimeEOFException;
import htsjdk.samtools.util.StringUtil;
import htsjdk.tribble.annotation.Strand;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import java.util.regex.Pattern;

/**
 * Utilty methods.
 */
public final class SAMUtils {
    /**
     * regex for semicolon, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)}
     */
    private static final Pattern SEMICOLON_PAT = Pattern.compile("[;]");
    /**
     * regex for comma, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)}
     */
    private static final Pattern COMMA_PAT = Pattern.compile("[,]");

    // Representation of bases, one for when in low-order nybble, one for when in high-order nybble.
    private static final byte COMPRESSED_EQUAL_LOW = 0;
    private static final byte COMPRESSED_A_LOW = 1;
    private static final byte COMPRESSED_C_LOW = 2;
    private static final byte COMPRESSED_M_LOW = 3;
    private static final byte COMPRESSED_G_LOW = 4;
    private static final byte COMPRESSED_R_LOW = 5;
    private static final byte COMPRESSED_S_LOW = 6;
    private static final byte COMPRESSED_V_LOW = 7;
    private static final byte COMPRESSED_T_LOW = 8;
    private static final byte COMPRESSED_W_LOW = 9;
    private static final byte COMPRESSED_Y_LOW = 10;
    private static final byte COMPRESSED_H_LOW = 11;
    private static final byte COMPRESSED_K_LOW = 12;
    private static final byte COMPRESSED_D_LOW = 13;
    private static final byte COMPRESSED_B_LOW = 14;
    private static final byte COMPRESSED_N_LOW = 15;
    private static final byte COMPRESSED_EQUAL_HIGH = COMPRESSED_EQUAL_LOW << 4;
    private static final byte COMPRESSED_A_HIGH = COMPRESSED_A_LOW << 4;
    private static final byte COMPRESSED_C_HIGH = COMPRESSED_C_LOW << 4;
    private static final byte COMPRESSED_G_HIGH = COMPRESSED_G_LOW << 4;
    private static final byte COMPRESSED_T_HIGH = (byte) (COMPRESSED_T_LOW << 4);
    private static final byte COMPRESSED_N_HIGH = (byte) (COMPRESSED_N_LOW << 4);

    private static final byte COMPRESSED_M_HIGH = (byte) (COMPRESSED_M_LOW << 4);
    private static final byte COMPRESSED_R_HIGH = (byte) (COMPRESSED_R_LOW << 4);
    private static final byte COMPRESSED_S_HIGH = (byte) (COMPRESSED_S_LOW << 4);
    private static final byte COMPRESSED_V_HIGH = (byte) (COMPRESSED_V_LOW << 4);
    private static final byte COMPRESSED_W_HIGH = (byte) (COMPRESSED_W_LOW << 4);
    private static final byte COMPRESSED_Y_HIGH = (byte) (COMPRESSED_Y_LOW << 4);
    private static final byte COMPRESSED_H_HIGH = (byte) (COMPRESSED_H_LOW << 4);
    private static final byte COMPRESSED_K_HIGH = (byte) (COMPRESSED_K_LOW << 4);
    private static final byte COMPRESSED_D_HIGH = (byte) (COMPRESSED_D_LOW << 4);
    private static final byte COMPRESSED_B_HIGH = (byte) (COMPRESSED_B_LOW << 4);

    private static final byte[] COMPRESSED_LOOKUP_TABLE = {
            '=',
            'A',
            'C',
            'M',
            'G',
            'R',
            'S',
            'V',
            'T',
            'W',
            'Y',
            'H',
            'K',
            'D',
            'B',
            'N'
    };

    public static final int MAX_PHRED_SCORE = 93;

    /**
     * Convert from a byte array containing =AaCcGgTtNnMmRrSsVvWwYyHhKkDdBb represented as ASCII, to a byte array half as long,
     * with for example, =, A, C, G, T converted to 0, 1, 2, 4, 8, 15.
     *
     * @param readBases Bases as ASCII bytes.
     * @return New byte array with bases represented as nybbles, in BAM binary format.
     */
    static byte[] bytesToCompressedBases(final byte[] readBases) {
        final byte[] compressedBases = new byte[(readBases.length + 1) / 2];
        int i;
        for (i = 1; i < readBases.length; i += 2) {
            compressedBases[i / 2] = (byte) (charToCompressedBaseHigh(readBases[i - 1]) |
                    charToCompressedBaseLow(readBases[i]));
        }
        // Last nybble
        if (i == readBases.length) {
            compressedBases[i / 2] = charToCompressedBaseHigh(readBases[i - 1]);
        }
        return compressedBases;
    }

    /**
     * Convert from a byte array with bases stored in nybbles, with for example,=, A, C, G, T, N represented as 0, 1, 2, 4, 8, 15,
     * to a a byte array containing =AaCcGgTtNn represented as ASCII.
     *
     * @param length           Number of bases (not bytes) to convert.
     * @param compressedBases  Bases represented as nybbles, in BAM binary format.
     * @param compressedOffset Byte offset in compressedBases to start.
     * @return New byte array with bases as ASCII bytes.
     */
    public static byte[] compressedBasesToBytes(final int length, final byte[] compressedBases, final int compressedOffset) {
        final byte[] ret = new byte[length];
        int i;
        for (i = 1; i < length; i += 2) {
            final int compressedIndex = i / 2 + compressedOffset;
            ret[i - 1] = compressedBaseToByteHigh(compressedBases[compressedIndex]);
            ret[i] = compressedBaseToByteLow(compressedBases[compressedIndex]);
        }
        // Last nybble
        if (i == length) {
            ret[i - 1] = compressedBaseToByteHigh(compressedBases[i / 2 + compressedOffset]);
        }
        return ret;
    }

    /**
     * Convert from ASCII byte to BAM nybble representation of a base in low-order nybble.
     *
     * @param base One of =AaCcGgTtNnMmRrSsVvWwYyHhKkDdBb.
     * @return Low-order nybble-encoded equivalent.
     * @throws IllegalArgumentException if the base is not one of =AaCcGgTtNnMmRrSsVvWwYyHhKkDdBb.
     */
    private static byte charToCompressedBaseLow(final byte base) {
        switch (base) {
            case '=':
                return COMPRESSED_EQUAL_LOW;
            case 'a':
            case 'A':
                return COMPRESSED_A_LOW;
            case 'c':
            case 'C':
                return COMPRESSED_C_LOW;
            case 'g':
            case 'G':
                return COMPRESSED_G_LOW;
            case 't':
            case 'T':
                return COMPRESSED_T_LOW;
            case 'n':
            case 'N':
            case '.':
                return COMPRESSED_N_LOW;

            // IUPAC ambiguity codes
            case 'M':
            case 'm':
                return COMPRESSED_M_LOW;
            case 'R':
            case 'r':
                return COMPRESSED_R_LOW;
            case 'S':
            case 's':
                return COMPRESSED_S_LOW;
            case 'V':
            case 'v':
                return COMPRESSED_V_LOW;
            case 'W':
            case 'w':
                return COMPRESSED_W_LOW;
            case 'Y':
            case 'y':
                return COMPRESSED_Y_LOW;
            case 'H':
            case 'h':
                return COMPRESSED_H_LOW;
            case 'K':
            case 'k':
                return COMPRESSED_K_LOW;
            case 'D':
            case 'd':
                return COMPRESSED_D_LOW;
            case 'B':
            case 'b':
                return COMPRESSED_B_LOW;
            default:
                throw new IllegalArgumentException("Bad base passed to charToCompressedBaseLow: " + Character.toString((char) base) + "(" + base + ")");
        }
    }

    /**
     * Convert from ASCII byte to BAM nybble representation of a base in high-order nybble.
     *
     * @param base One of =AaCcGgTtNnMmRrSsVvWwYyHhKkDdBb.
     * @return High-order nybble-encoded equivalent.
     * @throws IllegalArgumentException if the base is not one of =AaCcGgTtNnMmRrSsVvWwYyHhKkDdBb.
     */
    private static byte charToCompressedBaseHigh(final byte base) {
        switch (base) {
            case '=':
                return COMPRESSED_EQUAL_HIGH;
            case 'a':
            case 'A':
                return COMPRESSED_A_HIGH;
            case 'c':
            case 'C':
                return COMPRESSED_C_HIGH;
            case 'g':
            case 'G':
                return COMPRESSED_G_HIGH;
            case 't':
            case 'T':
                return COMPRESSED_T_HIGH;
            case 'n':
            case 'N':
            case '.':
                return COMPRESSED_N_HIGH;

            // IUPAC ambiguity codes
            case 'M':
            case 'm':
                return COMPRESSED_M_HIGH;
            case 'R':
            case 'r':
                return COMPRESSED_R_HIGH;
            case 'S':
            case 's':
                return COMPRESSED_S_HIGH;
            case 'V':
            case 'v':
                return COMPRESSED_V_HIGH;
            case 'W':
            case 'w':
                return COMPRESSED_W_HIGH;
            case 'Y':
            case 'y':
                return COMPRESSED_Y_HIGH;
            case 'H':
            case 'h':
                return COMPRESSED_H_HIGH;
            case 'K':
            case 'k':
                return COMPRESSED_K_HIGH;
            case 'D':
            case 'd':
                return COMPRESSED_D_HIGH;
            case 'B':
            case 'b':
                return COMPRESSED_B_HIGH;
            default:
                throw new IllegalArgumentException("Bad base passed to charToCompressedBaseHigh: " + Character.toString((char) base) + "(" + base + ")");
        }
    }

    /**
     * Returns the byte corresponding to a certain nybble
     *
     * @param base One of COMPRESSED_*_LOW, a low-order nybble encoded base.
     * @return ASCII base, one of =ACGTNMRSVWYHKDB.
     * @throws IllegalArgumentException if the base is not one of =ACGTNMRSVWYHKDB.
     */
    private static byte compressedBaseToByte(byte base) {
        try {
            return COMPRESSED_LOOKUP_TABLE[base];
        } catch (IndexOutOfBoundsException e) {
            throw new IllegalArgumentException("Bad base passed to charToCompressedBase: " + Character.toString((char) base) + "(" + base + ")");
        }
    }

    /**
     * Convert from BAM nybble representation of a base in low-order nybble to ASCII byte.
     *
     * @param base One of COMPRESSED_*_LOW, a low-order nybble encoded base.
     * @return ASCII base, one of ACGTN=.
     */
    private static byte compressedBaseToByteLow(final int base) {
        return compressedBaseToByte((byte) (base & 0xf));
    }

    /**
     * Convert from BAM nybble representation of a base in high-order nybble to ASCII byte.
     *
     * @param base One of COMPRESSED_*_HIGH, a high-order nybble encoded base.
     * @return ASCII base, one of ACGTN=.
     */
    private static byte compressedBaseToByteHigh(final int base) {
        return compressedBaseToByte((byte) ((base >> 4) & 0xf));
    }

    /**
     * Convert bases in place into canonical form, upper case, and with no-call represented as N.
     *
     * @param bases byte array of bases to "normalize", in place.
     */
    static void normalizeBases(final byte[] bases) {
        for (int i = 0; i < bases.length; ++i) {
            bases[i] = StringUtil.toUpperCase(bases[i]);
            if (bases[i] == '.') {
                bases[i] = 'N';
            }
        }
    }

    /**
     * Convert an array of bytes, in which each byte is a binary phred quality score, to
     * printable ASCII representation of the quality scores, ala FASTQ format.
     * 

* Equivalent to phredToFastq(data, 0, data.length) * * @param data Array of bytes in which each byte is a binar phred score. * @return String with ASCII representation of those quality scores. */ public static String phredToFastq(final byte[] data) { if (data == null) { return null; } return phredToFastq(data, 0, data.length); } /** * Convert an array of bytes, in which each byte is a binary phred quality score, to * printable ASCII representation of the quality scores, ala FASTQ format. * * @param buffer Array of bytes in which each byte is a binar phred score. * @param offset Where in buffer to start conversion. * @param length How many bytes of buffer to convert. * @return String with ASCII representation of those quality scores. */ public static String phredToFastq(final byte[] buffer, final int offset, final int length) { final char[] chars = new char[length]; for (int i = 0; i < length; i++) { chars[i] = phredToFastq(buffer[offset + i] & 0xFF); } return new String(chars); } /** * Convert a single binary phred score to printable ASCII representation, ala FASTQ format. * * @param phredScore binary phred score. * @return Printable ASCII representation of phred score. */ public static char phredToFastq(final int phredScore) { if (phredScore < 0 || phredScore > MAX_PHRED_SCORE) { throw new IllegalArgumentException("Cannot encode phred score: " + phredScore); } return (char) (33 + phredScore); } /** * Convert a string with phred scores in printable ASCII FASTQ format to an array * of binary phred scores. * * @param fastq Phred scores in FASTQ printable ASCII format. * @return byte array of binary phred scores in which each byte corresponds to a character in the input string. */ public static byte[] fastqToPhred(final String fastq) { if (fastq == null) { return null; } final int length = fastq.length(); final byte[] scores = new byte[length]; for (int i = 0; i < length; i++) { scores[i] = (byte) fastqToPhred(fastq.charAt(i)); } return scores; } /** * Converts printable qualities in Sanger fastq format to binary phred scores. */ public static void fastqToPhred(final byte[] fastq) { for (int i = 0; i < fastq.length; ++i) { fastq[i] = (byte) fastqToPhred((char) (fastq[i] & 0xff)); } } /** * Convert a single printable ASCII FASTQ format phred score to binary phred score. * * @param ch Printable ASCII FASTQ format phred score. * @return Binary phred score. */ public static int fastqToPhred(final char ch) { if (ch < 33 || ch > 126) { throw new IllegalArgumentException("Invalid fastq character: " + ch); } return (ch - 33); } /** * calculate the bin given an alignment in [beg,end) * Copied from SAM spec. * * @param beg 0-based start of read (inclusive) * @param end 0-based end of read (exclusive) * @deprecated Use {@link GenomicIndexUtil#regionToBin} */ @Deprecated static int reg2bin(final int beg, final int end) { return GenomicIndexUtil.regionToBin(beg, end); } /** * Handle a list of validation errors according to the validation stringency. * * @param validationErrors List of errors to report, or null if there are no errors. * @param samRecordIndex Record number of the SAMRecord corresponding to the validation errors, or -1 if * the record number is not known. * @param validationStringency If STRICT, throw a SAMFormatException. If LENIENT, print the validation * errors to stderr. If SILENT, do nothing. */ public static void processValidationErrors(final List validationErrors, final long samRecordIndex, final ValidationStringency validationStringency) { if (validationErrors != null && !validationErrors.isEmpty()) { for (final SAMValidationError validationError : validationErrors) { validationError.setRecordNumber(samRecordIndex); } if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException("SAM validation error: " + validationErrors.get(0)); } else if (validationStringency == ValidationStringency.LENIENT) { for (final SAMValidationError error : validationErrors) { System.err.println("Ignoring SAM validation error: " + error); } } } } public static void processValidationError(final SAMValidationError validationError, final ValidationStringency validationStringency) { if (validationStringency == ValidationStringency.STRICT) { throw new SAMFormatException("SAM validation error: " + validationError); } else if (validationStringency == ValidationStringency.LENIENT) { System.err.println("Ignoring SAM validation error: " + validationError); } } private static final SAMHeaderRecordComparator HEADER_RECORD_COMPARATOR = new SAMHeaderRecordComparator<>( SAMReadGroupRecord.PLATFORM_UNIT_TAG, SAMReadGroupRecord.LIBRARY_TAG, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG, SAMReadGroupRecord.SEQUENCING_CENTER_TAG, SAMReadGroupRecord.PLATFORM_TAG, SAMReadGroupRecord.DESCRIPTION_TAG, SAMReadGroupRecord.READ_GROUP_ID_TAG // We don't actually want to compare with ID but it's suitable // "just in case" since it's the only one that's actually required ); /** * Calculate a hash code from identifying information in the RG (read group) records in a SAM file's * header. This hash code changes any time read groups are added or removed. Comparing one file's * hash code to another's tells you if the read groups in the BAM files are different. */ public static String calculateReadGroupRecordChecksum(final File input, final File referenceFasta) { final String ENCODING = "UTF-8"; final MessageDigest digest; try { digest = MessageDigest.getInstance("MD5"); } catch (final NoSuchAlgorithmException nsae) { throw new Error("No MD5 algorithm was available in a Java JDK? Unheard-of!"); } // Sort the read group records by their first final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(input); final List sortedRecords = new ArrayList<>(reader.getFileHeader().getReadGroups()); Collections.sort(sortedRecords, HEADER_RECORD_COMPARATOR); for (final SAMReadGroupRecord rgRecord : sortedRecords) { final TreeMap sortedAttributes = new TreeMap<>(); for (final Map.Entry attributeEntry : rgRecord.getAttributes()) { sortedAttributes.put(attributeEntry.getKey(), attributeEntry.getValue()); } try { for (final Map.Entry sortedEntry : sortedAttributes.entrySet()) { if (!sortedEntry.getKey().equals(SAMReadGroupRecord.READ_GROUP_ID_TAG)) { // Redundant check, safety first digest.update(sortedEntry.getKey().getBytes(ENCODING)); digest.update(sortedEntry.getValue().getBytes(ENCODING)); } } } catch (final UnsupportedEncodingException uee) { throw new Error("No " + ENCODING + "!? WTH?"); } } // Convert to a String and pad to get the full 32 chars. final StringBuilder hashText = new StringBuilder((new BigInteger(1, digest.digest())).toString(16)); while (hashText.length() < 32) hashText.insert(0, "0"); CloserUtil.close(reader); return hashText.toString(); } /** * Chains program in front of the first "head" item in the list of * SAMProgramRecords in header. This method should not be used * when there are multiple chains of program groups in a header, only when * it can safely be assumed that there is only one chain. It correctly handles * the case where program has already been added to the header, so * it can be used whether creating a SAMProgramRecord with a constructor or when * calling SAMFileHeader.createProgramRecord(). */ public static void chainSAMProgramRecord(final SAMFileHeader header, final SAMProgramRecord program) { final List pgs = header.getProgramRecords(); if (!pgs.isEmpty()) { final List referencedIds = new ArrayList<>(); for (final SAMProgramRecord pg : pgs) { if (pg.getPreviousProgramGroupId() != null) { referencedIds.add(pg.getPreviousProgramGroupId()); } } for (final SAMProgramRecord pg : pgs) { // if record being chained has already been added, ignore it if (pg.getProgramGroupId().equals(program.getProgramGroupId())) { continue; } if (!referencedIds.contains(pg.getProgramGroupId())) { program.setPreviousProgramGroupId(pg.getProgramGroupId()); break; } } } } /** * Strip mapping information from a SAMRecord. *

* WARNING: by clearing the secondary and supplementary flags, * this may have the affect of producing multiple distinct records with the * same read name and flags, which may lead to invalid SAM/BAM output. * Callers of this method should make sure to deal with this issue. */ public static void makeReadUnmapped(final SAMRecord rec) { if (rec.getReadNegativeStrandFlag()) { rec.reverseComplement(true); rec.setReadNegativeStrandFlag(false); } rec.setDuplicateReadFlag(false); rec.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); rec.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); rec.setInferredInsertSize(0); rec.setSecondaryAlignment(false); rec.setSupplementaryAlignmentFlag(false); rec.setProperPairFlag(false); rec.setReadUnmappedFlag(true); } /** * Strip mapping information from a SAMRecord, but preserve it in the 'O' tags if it isn't already set. */ public static void makeReadUnmappedWithOriginalTags(final SAMRecord rec) { if (!hasOriginalMappingInformation(rec)) { rec.setAttribute(SAMTag.OP, rec.getAlignmentStart()); rec.setAttribute(SAMTag.OC, rec.getCigarString()); rec.setAttribute(SAMTag.OF, rec.getFlags()); rec.setAttribute(SAMTag.OR, rec.getReferenceName()); } makeReadUnmapped(rec); } /** * See if any tags pertaining to original mapping information have been set. */ public static boolean hasOriginalMappingInformation(final SAMRecord rec) { return rec.getAttribute(SAMTag.OP) != null || rec.getAttribute(SAMTag.OC) != null || rec.getAttribute(SAMTag.OF) != null || rec.getAttribute(SAMTag.OR) != null; } /** * Determines if a cigar has any element that both consumes read bases and consumes reference bases * (e.g. is not all soft-clipped) */ public static boolean cigarMapsNoBasesToRef(final Cigar cigar) { for (final CigarElement el : cigar.getCigarElements()) { if (el.getOperator().consumesReadBases() && el.getOperator().consumesReferenceBases()) { return false; } } return true; } /** * Tests if the provided record is mapped entirely beyond the end of the reference (i.e., the alignment start is greater than the * length of the sequence to which the record is mapped). * * @param record must not have a null SamFileHeader */ public static boolean recordMapsEntirelyBeyondEndOfReference(final SAMRecord record) { if (record.getHeader() == null) { throw new SAMException("A non-null SAMHeader is required to resolve the mapping position: " + record.getReadName()); } else { return record.getHeader().getSequence(record.getReferenceIndex()).getSequenceLength() < record.getAlignmentStart(); } } /** * @return negative if mapq1 < mapq2, etc. * Note that MAPQ(0) < MAPQ(255) < MAPQ(1) */ public static int compareMapqs(final int mapq1, final int mapq2) { if (mapq1 == mapq2) return 0; if (mapq1 == 0) return -1; else if (mapq2 == 0) return 1; else if (mapq1 == 255) return -1; else if (mapq2 == 255) return 1; else return mapq1 - mapq2; } /** * Hokey algorithm for combining two MAPQs into values that are comparable, being cognizant of the fact * that in MAPQ world, 1 > 255 > 0. In this algorithm, 255 is treated as if it were 0.01, so that * CombinedMapq(1,0) > CombinedMapq(255, 255) > CombinedMapq(0, 0). * The return value should not be used for anything other than comparing to the return value of other * invocations of this method. */ public static int combineMapqs(int m1, int m2) { if (m1 == 255) { m1 = 1; } else { m1 *= 100; } if (m2 == 255) { m2 = 1; } else { m2 *= 100; } return m1 + m2; } public static long findVirtualOffsetOfFirstRecordInBam(final Path bamFile) { try (SeekableStream ss = new SeekablePathStream(bamFile)){ return BAMFileReader.findVirtualOffsetOfFirstRecord(ss); } catch (final IOException ioe) { throw new RuntimeEOFException(ioe); } } /** * Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file * offset after skipping over the text header and the sequence records. */ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) { try { return BAMFileReader.findVirtualOffsetOfFirstRecord(bamFile); } catch (final IOException ioe) { throw new RuntimeEOFException(ioe); } } /** * Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file * offset after skipping over the text header and the sequence records. */ public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) { try { return BAMFileReader.findVirtualOffsetOfFirstRecord(seekableStream); } catch (final IOException ioe) { throw new RuntimeEOFException(ioe); } } /** * Given a Cigar, Returns blocks of the sequence that have been aligned directly to the * reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference) * are not represented in the alignment blocks. * * @param cigar The cigar containing the alignment information * @param alignmentStart The start (1-based) of the alignment * @param cigarTypeName The type of cigar passed - for error logging. * @return List of alignment blocks */ public static List getAlignmentBlocks(final Cigar cigar, final int alignmentStart, final String cigarTypeName) { if (cigar == null) return Collections.emptyList(); final List alignmentBlocks = new ArrayList<>(); int readBase = 1; int refBase = alignmentStart; for (final CigarElement e : cigar.getCigarElements()) { switch (e.getOperator()) { case H: break; // ignore hard clips case P: break; // ignore pads case S: readBase += e.getLength(); break; // soft clip read bases case N: refBase += e.getLength(); break; // reference skip case D: refBase += e.getLength(); break; case I: readBase += e.getLength(); break; case M: case EQ: case X: final int length = e.getLength(); alignmentBlocks.add(new AlignmentBlock(readBase, refBase, length)); readBase += length; refBase += length; break; default: throw new IllegalStateException("Case statement didn't deal with " + cigarTypeName + " op: " + e.getOperator() + "in CIGAR: " + cigar); } } return Collections.unmodifiableList(alignmentBlocks); } /** * @param alignmentStart The start (1-based) of the alignment * @param cigar The cigar containing the alignment information * @return the alignment start (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) * then this method will return 96. *

* Invalid to call on an unmapped read. * Invalid to call with cigar = null */ public static int getUnclippedStart(final int alignmentStart, final Cigar cigar) { int unClippedStart = alignmentStart; for (final CigarElement cig : cigar.getCigarElements()) { final CigarOperator op = cig.getOperator(); if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { unClippedStart -= cig.getLength(); } else { break; } } return unClippedStart; } /** * @param alignmentEnd The end (1-based) of the alignment * @param cigar The cigar containing the alignment information * @return the alignment end (1-based, inclusive) adjusted for clipped bases. For example if the read * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) * then this method will return 107. *

* Invalid to call on an unmapped read. * Invalid to call with cigar = null */ public static int getUnclippedEnd(final int alignmentEnd, final Cigar cigar) { int unClippedEnd = alignmentEnd; final List cigs = cigar.getCigarElements(); for (int i = cigs.size() - 1; i >= 0; --i) { final CigarElement cig = cigs.get(i); final CigarOperator op = cig.getOperator(); if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { unClippedEnd += cig.getLength(); } else { break; } } return unClippedEnd; } /** * Returns the Mate Cigar String as stored in the attribute 'MC'. * * @param rec the SAM record * @return Mate Cigar String, or null if there is none. */ public static String getMateCigarString(final SAMRecord rec) { return rec.getStringAttribute(SAMTag.MC); } /** * Returns the Mate Cigar or null if there is none. * * @param rec the SAM record * @param withValidation true if we are to validate the mate cigar before returning, false otherwise. * @return Cigar object for the read's mate, or null if there is none. */ public static Cigar getMateCigar(final SAMRecord rec, final boolean withValidation) { final String mateCigarString = getMateCigarString(rec); Cigar mateCigar = null; if (mateCigarString != null) { mateCigar = TextCigarCodec.decode(mateCigarString); if (withValidation && rec.getValidationStringency() != ValidationStringency.SILENT) { final List alignmentBlocks = getAlignmentBlocks(mateCigar, rec.getMateAlignmentStart(), "mate cigar"); SAMUtils.processValidationErrors(validateCigar(rec, mateCigar, rec.getMateReferenceIndex(), alignmentBlocks, -1, "Mate CIGAR"), -1L, rec.getValidationStringency()); } } return mateCigar; } /** * Returns the Mate Cigar or null if there is none. No validation is done on the returned cigar. * * @param rec the SAM record * @return Cigar object for the read's mate, or null if there is none. */ public static Cigar getMateCigar(final SAMRecord rec) { return getMateCigar(rec, false); } /** * @param rec the SAM record * @return number of cigar elements (number + operator) in the mate cigar string. */ public static int getMateCigarLength(final SAMRecord rec) { final Cigar mateCigar = getMateCigar(rec); return (mateCigar != null) ? mateCigar.numCigarElements() : 0; } /** * This method uses the MateCigar value as determined from the attribute MC. It must be non-null. * * @param rec the SAM record * @return 1-based inclusive rightmost position of the clipped mate sequence, or 0 read if unmapped. */ public static int getMateAlignmentEnd(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) { throw new RuntimeException("getMateAlignmentEnd called on an unmapped mate: " + rec); } final Cigar mateCigar = SAMUtils.getMateCigar(rec); if (mateCigar == null) { throw new SAMException("Mate CIGAR (Tag MC) not found:" + rec); } return CoordMath.getEnd(rec.getMateAlignmentStart(), mateCigar.getReferenceLength()); } /** * @param rec the SAM record * @return the mate alignment start (1-based, inclusive) adjusted for clipped bases. For example if the mate * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) * then this method will return 96. *

* Invalid to call on an unmapped read. */ public static int getMateUnclippedStart(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) throw new RuntimeException("getMateUnclippedStart called on an unmapped mate: " + rec); final Cigar mateCigar = getMateCigar(rec); if (mateCigar == null) { throw new SAMException("Mate CIGAR (Tag MC) not found: " + rec); } return SAMUtils.getUnclippedStart(rec.getMateAlignmentStart(), mateCigar); } /** * @param rec the SAM record * @return the mate alignment end (1-based, inclusive) adjusted for clipped bases. For example if the mate * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) * then this method will return 107. *

* Invalid to call on an unmapped read. */ public static int getMateUnclippedEnd(final SAMRecord rec) { if (rec.getMateUnmappedFlag()) { throw new RuntimeException("getMateUnclippedEnd called on an unmapped mate: " + rec); } final Cigar mateCigar = SAMUtils.getMateCigar(rec); if (mateCigar == null) { throw new SAMException("Mate CIGAR (Tag MC) not found: " + rec); } return SAMUtils.getUnclippedEnd(getMateAlignmentEnd(rec), mateCigar); } /** * @param rec the SAM record * Returns blocks of the mate sequence that have been aligned directly to the * reference sequence. Note that clipped portions of the mate and inserted and * deleted bases (vs. the reference) are not represented in the alignment blocks. */ public static List getMateAlignmentBlocks(final SAMRecord rec) { return getAlignmentBlocks(getMateCigar(rec), rec.getMateAlignmentStart(), "mate cigar"); } /** * Run all validations of the mate's CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. * * @param rec the SAM record * @param cigar The cigar containing the alignment information * @param referenceIndex The reference index * @param alignmentBlocks The alignment blocks (parsed from the cigar) * @param recordNumber For error reporting. -1 if not known. * @param cigarTypeName For error reporting. "Read CIGAR" or "Mate Cigar" * @return List of errors, or null if no errors. */ public static List validateCigar(final SAMRecord rec, final Cigar cigar, final Integer referenceIndex, final List alignmentBlocks, final long recordNumber, final String cigarTypeName) { // Don't know line number, and don't want to force read name to be decoded. List ret = cigar.isValid(rec.getReadName(), recordNumber); if (referenceIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { SAMFileHeader samHeader = rec.getHeader(); if (null == samHeader) { if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.MISSING_HEADER, cigarTypeName + " A non-null SAMHeader is required to validate cigar elements for: ", rec.getReadName(), recordNumber)); } else { final SAMSequenceRecord sequence = samHeader.getSequence(referenceIndex); final int referenceSequenceLength = sequence.getSequenceLength(); for (final AlignmentBlock alignmentBlock : alignmentBlocks) { if (alignmentBlock.getReferenceStart() + alignmentBlock.getLength() - 1 > referenceSequenceLength) { if (ret == null) ret = new ArrayList<>(); ret.add(new SAMValidationError(SAMValidationError.Type.CIGAR_MAPS_OFF_REFERENCE, cigarTypeName + " M operator maps off end of reference", rec.getReadName(), recordNumber)); break; } } } } return ret; } /** * Run all validations of the mate's CIGAR. These include validation that the CIGAR makes sense independent of * placement, plus validation that CIGAR + placement yields all bases with M operator within the range of the reference. * * @param rec the SAM record * @param recordNumber For error reporting. -1 if not known. * @return List of errors, or null if no errors. */ public static List validateMateCigar(final SAMRecord rec, final long recordNumber) { List ret = null; if (rec.getValidationStringency() != ValidationStringency.SILENT) { if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { // The mateCigar will be defined if the mate is mapped if (getMateCigarString(rec) != null) { ret = SAMUtils.validateCigar(rec, getMateCigar(rec), rec.getMateReferenceIndex(), getMateAlignmentBlocks(rec), recordNumber, "Mate CIGAR"); } } else { if (getMateCigarString(rec) != null) { ret = new ArrayList<>(); if (!rec.getReadPairedFlag()) { // If the read is not paired, and the Mate Cigar String (MC Attribute) exists, that is a validation error ret.add(new SAMValidationError(SAMValidationError.Type.MATE_CIGAR_STRING_INVALID_PRESENCE, "Mate CIGAR String (MC Attribute) present for a read that is not paired", rec.getReadName(), recordNumber)); } else { // will hit here if rec.getMateUnmappedFlag() is true // If the Mate is unmapped, and the Mate Cigar String (MC Attribute) exists, that is a validation error. ret.add(new SAMValidationError(SAMValidationError.Type.MATE_CIGAR_STRING_INVALID_PRESENCE, "Mate CIGAR String (MC Attribute) present for a read whose mate is unmapped", rec.getReadName(), recordNumber)); } } } } return ret; } /** * Checks to see if it is valid for this record to have a mate CIGAR (MC) and then if there is a mate CIGAR available. This is done by * checking that this record is paired, its mate is mapped, and that it returns a non-null mate CIGAR. * * @param rec * @return */ public static boolean hasMateCigar(SAMRecord rec) { // NB: use getMateCigarString rather than getMateCigar to avoid validation. return (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && null != SAMUtils.getMateCigarString(rec)); } /** * Returns a string that is the the read group ID and read name separated by a colon. This is meant to canonically * identify a given record within a set of records. * * @param record SAMRecord for which "canonical" read name is requested * @return The record's readgroup-id (if non-null) and the read name, separated by a colon, ':' */ public static String getCanonicalRecordName(final SAMRecord record) { String name = record.getStringAttribute(ReservedTagConstants.READ_GROUP_ID); if (null == name) name = record.getReadName(); else name = name + ":" + record.getReadName(); return name; } /** * Returns the number of bases that need to be clipped due to overlapping pairs. If the record is not paired, * or the given record's start position is greater than its mate's start position, zero is automatically returned. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * * @param rec SAMRecord that needs clipping due to overlapping pairs. * @return the number of bases at the end of the read that need to be clipped such that there would be no overlapping bases with its mate. * Read bases include only those from insertion, match, or mismatch Cigar operators. */ public static int getNumOverlappingAlignedBasesToClip(final SAMRecord rec) { // NB: ignores how to handle supplemental records when present for both ends by just using the mate information in the record. if (!rec.getReadPairedFlag() || rec.getReadUnmappedFlag() || rec.getMateUnmappedFlag()) return 0; // Only clip records that are left-most in genomic order and overlapping. if (rec.getMateAlignmentStart() < rec.getAlignmentStart()) return 0; // right-most, so ignore. else if (rec.getMateAlignmentStart() == rec.getAlignmentStart() && rec.getFirstOfPairFlag()) return 0; // same start, so pick the first end // Find the number of read bases after the given mate's alignment start. int numBasesToClip = 0; final int refStartPos = rec.getMateAlignmentStart(); // relative reference position after which we should start clipping final Cigar cigar = rec.getCigar(); int refPos = rec.getAlignmentStart(); for (final CigarElement el : cigar.getCigarElements()) { final CigarOperator operator = el.getOperator(); final int refBasesLength = operator.consumesReferenceBases() ? el.getLength() : 0; if (refStartPos <= refPos + refBasesLength - 1) { // add to clipped bases if (operator == CigarOperator.MATCH_OR_MISMATCH) { // M if (refStartPos < refPos) numBasesToClip += refBasesLength; // use all of the bases else numBasesToClip += (refPos + refBasesLength) - refStartPos; // since the mate's alignment start can be in the middle of a cigar element } else if (operator == CigarOperator.SOFT_CLIP || operator == CigarOperator.HARD_CLIP || operator == CigarOperator.PADDING || operator == CigarOperator.SKIPPED_REGION) { // ignore } else { // ID numBasesToClip += operator.consumesReadBases() ? el.getLength() : 0; // clip all the bases in the read from this operator } } refPos += refBasesLength; } if (numBasesToClip < 0) return 0; // left-most but not overlapping return numBasesToClip; } /** * Returns a (possibly new) record that has been clipped if input is a mapped paired and has overlapping bases with its mate. * See {@link #getNumOverlappingAlignedBasesToClip(SAMRecord)} for how the number of overlapping bases is computed. * NB: this does not properly consider a cigar like: 100M20S10H. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * * @param record the record from which to clip bases. * @param noSideEffects if true a modified clone of the original record is returned, otherwise we modify the record directly. * @return a (possibly new) record that has been clipped */ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, final boolean noSideEffects) { return clipOverlappingAlignedBases(record, getNumOverlappingAlignedBasesToClip(record), noSideEffects); } /** * Returns a (possibly new) SAMRecord with the given number of bases soft-clipped at the end of the read if is a mapped * paired and has overlapping bases with its mate. * NB: this does not properly consider a cigar like: 100M20S10H. * NB: This method assumes that the record's mate is not contained within the given record's alignment. * * @param record the record from which to clip bases. * @param numOverlappingBasesToClip the number of bases to clip at the end of the read. * @param noSideEffects if true a modified clone of the original record is returned, otherwise we modify the record directly. * @return Returns a (possibly new) SAMRecord with the given number of bases soft-clipped */ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, final int numOverlappingBasesToClip, final boolean noSideEffects) { // NB: ignores how to handle supplemental records when present for both ends by just using the mate information in the record. if (numOverlappingBasesToClip <= 0 || record.getReadUnmappedFlag() || record.getMateUnmappedFlag()) { return record; } try { final SAMRecord rec = noSideEffects ? ((SAMRecord) record.clone()) : record; // watch out for when the second read overlaps all of the first read if (rec.getMateAlignmentStart() <= rec.getAlignmentStart()) { // make it unmapped rec.setReadUnmappedFlag(true); return rec; } // 1-based index of first base in read to clip. int clipFrom = rec.getReadLength() - numOverlappingBasesToClip + 1; // we have to check if the last cigar element is soft-clipping, so we can subtract that from clipFrom final CigarElement cigarElement = rec.getCigar().getCigarElement(rec.getCigarLength() - 1); if (CigarOperator.SOFT_CLIP == cigarElement.getOperator()) clipFrom -= cigarElement.getLength(); // FIXME: does not properly consider a cigar like: 100M20S10H // clip it, clip it good rec.setCigar(new Cigar(CigarUtil.softClipEndOfRead(clipFrom, rec.getCigar().getCigarElements()))); return rec; } catch (final CloneNotSupportedException e) { throw new SAMException(e.getMessage(), e); } } /** * Checks if a long attribute value is within the allowed range of a 32-bit unsigned integer. * * @param value a long value to check * @return true if value is >= 0 and <= {@link BinaryCodec#MAX_UINT}, and false otherwise */ public static boolean isValidUnsignedIntegerAttribute(long value) { return value >= 0 && value <= BinaryCodec.MAX_UINT; } /** * Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined * in the SAM specification. * The name, sequence and qualities, mate data are copied from the original record. * * @param record must be non null and must have a non-null associated header. * @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing. */ public static List getOtherCanonicalAlignments(final SAMRecord record) { if (record == null) throw new IllegalArgumentException("record is null"); if (record.getHeader() == null) throw new IllegalArgumentException("record.getHeader() is null"); /* extract value of SA tag */ final Object saValue = record.getAttribute(SAMTag.SA); if (saValue == null) return Collections.emptyList(); if (!(saValue instanceof String)) throw new SAMException( "Expected a String for attribute 'SA' but got " + saValue.getClass() + ". Record: " + record); final SAMRecordFactory samReaderFactory = new DefaultSAMRecordFactory(); /* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a * semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+. * Each element in the list represents a part of the chimeric alignment. * Conventionally, at a supplementary line, the 1st element points to the primary line. */ /* break string using semicolon */ final String semiColonStrs[] = SEMICOLON_PAT.split((String) saValue); /* the result list */ final List alignments = new ArrayList<>(semiColonStrs.length); /* base SAM flag */ int record_flag = record.getFlags(); record_flag &= ~SAMFlag.PROPER_PAIR.flag; record_flag &= ~SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; record_flag &= ~SAMFlag.READ_REVERSE_STRAND.flag; for (int i = 0; i < semiColonStrs.length; ++i) { final String semiColonStr = semiColonStrs[i]; /* ignore empty string */ if (semiColonStr.isEmpty()) continue; /* break string using comma */ final String commaStrs[] = COMMA_PAT.split(semiColonStr); if (commaStrs.length != 6) throw new SAMException("Bad 'SA' attribute in " + semiColonStr + ". Record: " + record); /* create the new record */ final SAMRecord otherRec = samReaderFactory.createSAMRecord(record.getHeader()); /* copy fields from the original record */ otherRec.setReadName(record.getReadName()); otherRec.setReadBases(record.getReadBases()); otherRec.setBaseQualities(record.getBaseQualities()); if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { otherRec.setMateReferenceIndex(record.getMateReferenceIndex()); otherRec.setMateAlignmentStart(record.getMateAlignmentStart()); } /* get reference sequence */ final int tid = record.getHeader().getSequenceIndex(commaStrs[0]); if (tid == -1) throw new SAMException("Unknown contig in " + semiColonStr + ". Record: " + record); otherRec.setReferenceIndex(tid); /* fill POS */ final int alignStart; try { alignStart = Integer.parseInt(commaStrs[1]); } catch (final NumberFormatException err) { throw new SAMException("bad POS in " + semiColonStr + ". Record: " + record, err); } otherRec.setAlignmentStart(alignStart); /* set TLEN */ if (record.getReadPairedFlag() && !record.getMateUnmappedFlag() && record.getMateReferenceIndex() == tid) { otherRec.setInferredInsertSize(record.getMateAlignmentStart() - alignStart); } /* set FLAG */ int other_flag = record_flag; other_flag |= (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag); /* spec: Conventionally, at a supplementary line, the 1st element points to the primary line */ if (!(record.getSupplementaryAlignmentFlag() && i == 0)) { other_flag |= SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; } otherRec.setFlags(other_flag); /* set CIGAR */ otherRec.setCigar(TextCigarCodec.decode(commaStrs[3])); /* set MAPQ */ try { otherRec.setMappingQuality(Integer.parseInt(commaStrs[4])); } catch (final NumberFormatException err) { throw new SAMException("bad MAPQ in " + semiColonStr + ". Record: " + record, err); } /* fill NM */ try { if (!commaStrs[5].equals("*")) { otherRec.setAttribute(SAMTag.NM, Integer.parseInt(commaStrs[5])); } } catch (final NumberFormatException err) { throw new SAMException("bad NM in " + semiColonStr + ". Record: " + record, err); } /* if strand is not the same: reverse-complement */ if (otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag()) { otherRec.reverseComplement(true); } /* add the alignment */ alignments.add(otherRec); } return alignments; } /** * @deprecated because the method does the exact opposite of what it says. Use the correctly named * isReferenceSequenceIncompatibleWithBAI() instead. */ @Deprecated public static boolean isReferenceSequenceCompatibleWithBAI(final SAMSequenceRecord sequence) { return isReferenceSequenceIncompatibleWithBAI(sequence); } /** * Checks if reference sequence is compatible with BAI indexing format. * @param sequence reference sequence. */ public static boolean isReferenceSequenceIncompatibleWithBAI(final SAMSequenceRecord sequence) { return sequence.getSequenceLength() > GenomicIndexUtil.BIN_GENOMIC_SPAN; } /** * Function to create the OA tag value from a record. The OA tag contains the mapping information * of a record encoded as a comma-separated string (REF,POS,STRAND,CIGAR,MAPPING_QUALITY,NM_TAG_VALUE) * @param record to use for generating the OA tag * @return the OA tag string value */ public static String calculateOATagValue(SAMRecord record) { if (record.getReferenceName().contains(",")) { throw new SAMException(String.format("Reference name for record %s contains a comma character.", record.getReadName())); } final String oaValue; if (record.getReadUnmappedFlag()) { oaValue = String.format(Locale.US, "*,0,%s,*,%s,", record.getReadNegativeStrandFlag() ? Strand.NEGATIVE : Strand.POSITIVE, record.getMappingQuality()); } else { oaValue = String.format(Locale.US, "%s,%s,%s,%s,%s,%s", record.getReferenceName(), record.getAlignmentStart(), record.getReadNegativeStrandFlag() ? Strand.NEGATIVE : Strand.POSITIVE, record.getCigarString(), record.getMappingQuality(), Optional.ofNullable(record.getAttribute(SAMTag.NM)).orElse("")); } return oaValue; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy