All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.testutils.SamAssertionUtils Maven / Gradle / Ivy

package org.broadinstitute.hellbender.testutils;

import com.google.common.collect.Sets;
import htsjdk.samtools.*;
import org.apache.commons.io.FilenameUtils;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.testng.Assert;
import picard.sam.SortSam;
import picard.sam.util.SamComparison;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Path;
import java.util.*;

/**
 * Collection of utilities for making common assertions about SAM files for unit testing purposes.
 */
public final class SamAssertionUtils {

    private static SamReader getReader(final File sam, final ValidationStringency validationStringency, final File reference) {
        return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam);
    }
    private static SamReader getReader(final Path sam, final ValidationStringency validationStringency, final Path reference) {
        return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam);
    }
    /**
     *  causes an exception if the given sam files aren't equal
     *  @param actualSam the actual file
     *  @param expectedSam the expected file
     *  @param validationStringency how stringently do we validate the files
     *  @param reference is allowed to be null
     */
    public static void assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency, final File reference) throws IOException {
        assertSamsEqual(actualSam.toPath(), expectedSam.toPath(), validationStringency,
                (null==reference?null:reference.toPath()));
    }
    public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final ValidationStringency validationStringency, final Path reference) throws IOException {
        final String equalStringent = samsEqualStringent(actualSam, expectedSam, validationStringency, reference);
        Assert.assertNull(equalStringent, "SAM file " + actualSam.toUri().toString() + " differs from expected output:" + expectedSam.toUri().toString() + " " + equalStringent);
    }

    /**
     * causes an exception if the given sam files aren't equal
     *  @param actualSam the actual file
     *  @param expectedSam the expected file
     *  @param validationStringency how stringently do we validate the files
     */
    public static void assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency) throws IOException {
        assertSamsEqual(actualSam, expectedSam, validationStringency, null);
    }

    /**
     * causes an exception if the given sam files aren't equal
     *  @param actualSam the actual file
     *  @param expectedSam the expected file
     *  @param reference is allowed to be null
     */
    public static void assertSamsEqual(final File actualSam, final File expectedSam, final File reference) throws IOException {
        assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference);
    }
    public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final Path reference) throws IOException {
        assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference);
    }

    /**
     * causes an exception if the given sam files aren't equal
     *  @param actualSam the actual file
     *  @param expectedSam the expected file
     */
    public static void assertSamsEqual(final File actualSam, final File expectedSam) throws IOException {
        assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, null);
    }

    /**
     * causes an exception if the given sam isn't valid
     * @param reference is allowed to be null
     */
    public static void assertSamValid(final File sam, final ValidationStringency validationStringency, final File reference) throws IOException {
        assertCRAMContentsIfCRAM(sam);
        try (final SamReader samReader = getReader(sam, validationStringency, reference)) {
            final SamFileValidator validator = new SamFileValidator(new PrintWriter(System.out), 8000);
            validator.setIgnoreWarnings(true);
            validator.setVerbose(true, 1000);
            validator.setErrorsToIgnore(Collections.singletonList(SAMValidationError.Type.MISSING_READ_GROUP));
            final boolean validated = validator.validateSamFileVerbose(samReader, null);
            Assert.assertTrue(validated, "SAM file validation failed");
        }
    }

    /**
     * causes an exception if the given sam isn't valid
     */
    public static void assertSamValid(final File sam, final ValidationStringency validationStringency) throws IOException {
        assertSamValid(sam, validationStringency, null);
    }

    /**
     * causes an exception if the given sam isn't valid
     * @param reference is allowed to be null
     * the default ValidationStringency value for this method is LENIENT
     */
    public static void assertSamValid(final File sam, final File reference) throws IOException {
        assertSamValid(sam, ValidationStringency.LENIENT, reference);
    }

    /**
     * causes an exception if the given sam isn't valid
     * the default ValidationStringency value for this method is LENIENT
     */
    public static void assertSamValid(final File sam) throws IOException {
        assertSamValid(sam, ValidationStringency.LENIENT, null);
    }

    /**
     * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes).
     * Returns null if the files are considered equals and returns a String describing the reason for comparison failure.
     * The lenient comparison only checks headers and alignment info {@link SamComparison}. Compares headers, and if headers are compatible enough, compares SAMRecords,
     * looking only at basic alignment info.
     */
    public static String samsEqualLenient(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException {
        assertCRAMContentsIfCRAM(actualSam);
        assertCRAMContentsIfCRAM(expectedSam);
        try(final SamReader reader1 = getReader(actualSam, validation, reference);
            final SamReader reader2 = getReader(expectedSam, validation, reference)) {

            final SamComparison comparison = new SamComparison(reader1, reader2);
            return comparison.areEqual() ? null : "SamComparison fails";
        }
    }

    /**
     * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes)
     * Comparing by MD5s is too strict and comparing by SamComparison is too lenient. So we need this method.
     *
     * This differs from a byte-to-byte comparison:
     * - @PG and @CO lines in headers are ignored in the comparison.
     * - each read in the actual file are allowed to have a superset of the attributes of the corresponding read in the expected set
     * @return null if equal or message string if not equal.
     */
    public static String samsEqualStringent(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException {
        return samsEqualStringent(actualSam.toPath(), expectedSam.toPath(), validation, (null==reference?null:reference.toPath()));
    }

    /**
     * Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes)
     * Comparing by MD5s is too strict and comparing by SamComparison is too lenient. So we need this method.
     *
     * This differs from a byte-to-byte comparison:
     * - @PG and @CO lines in headers are ignored in the comparison.
     * - each read in the actual file are allowed to have a superset of the attributes of the corresponding read in the expected set
     * @return null if equal or message string if not equal.
     */
    public static String samsEqualStringent(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
        if (sameMD5s(actualSam, expectedSam)) {
            return null;
        }

        //  verify that CRAM files have CRAM contents
        assertCRAMContentsIfCRAM(actualSam);
        assertCRAMContentsIfCRAM(expectedSam);

        String msg = equalHeadersIgnoreCOandPG(actualSam, expectedSam, validation, reference);
        if (msg != null) { return msg; }

        //At this point we know that the files are not byte-wise identical, but are equal according to SamComparison and their headers are equal
        //So we iterate over reads and compare them one by one.
        return compareReads(actualSam, expectedSam, validation, reference);
    }


    private static boolean sameMD5s(final File actualSam, final File expectedSam) throws IOException {
        final String fileMD5_1 = Utils.calculateFileMD5(actualSam);
        final String fileMD5_2 = Utils.calculateFileMD5(expectedSam);
        return fileMD5_1.equals(fileMD5_2);
    }


    private static boolean sameMD5s(final Path actualSam, final Path expectedSam) throws IOException {
        final String fileMD5_1 = Utils.calculatePathMD5(actualSam);
        final String fileMD5_2 = Utils.calculatePathMD5(expectedSam);
        return fileMD5_1.equals(fileMD5_2);
    }

    private static String compareReads(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
        try(final SamReader reader1 = getReader(actualSam, validation, reference);
            final SamReader reader2 = getReader(expectedSam, validation, reference)) {
            final SAMRecordIterator it1 = reader1.iterator();
            final SAMRecordIterator it2 = reader2.iterator();
            while (it1.hasNext() && it2.hasNext()) {
                final SAMRecord read1 = it1.next();
                final SAMRecord read2 = it2.next();
                final String eqMessage = readsEqualAllowAddingAttributes(read1, read2);
                if (eqMessage != null){
                    return eqMessage;
                }
            }
            if (it1.hasNext() || it2.hasNext()) {
                //at least one has no more records (because the while loop is done) and at least one does have records. So we're not equal.
                return "Not the same number of reads";
            }
            return null;
        }
    }

    private static String equalHeadersIgnoreCOandPG(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
        try(final SamReader reader1 = getReader(actualSam, validation, reference);
                final SamReader reader2 = getReader(expectedSam, validation, reference)){

            final SAMFileHeader h1 = reader1.getFileHeader();
            final SAMFileHeader h2 = reader2.getFileHeader();
            String msg;

            //Note: we allow the versions to differ

            msg = compareValues(h1.getCreator(), h2.getCreator(), "File creator");
            if (msg != null) { return msg; }

            msg = compareValues(h1.getAttribute("SO"), h2.getAttribute("SO"), "Sort order");
            if (msg != null) { return msg; }

            if (! Objects.equals(h1.getSequenceDictionary(), h2.getSequenceDictionary())){
                return "Different Sequence dictionaries";
            }

            msg = compareReadGroups(h1, h2);
            if (msg != null) { return msg; }

            return msg;
        }
    }

    private static String compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2) {
        final List l1 = h1.getReadGroups();
        final List l2 = h2.getReadGroups();
        final String msg = compareValues(l1.size(), l2.size(), "Number of read groups");
        if (msg != null){ return msg; }

        for (int i = 0; i < l1.size(); ++i) {
            if (! Objects.equals(l1.get(i), l2.get(i))){
                 return "Read group records different:" + l1.get(i) + " vs " + l2.get(i);
            }
        }
        return null;
    }

    private static  String compareValues(final T v1, final T v2, final String label) {
        boolean eq;
        if ((v1!=null) && (v2!=null) && (v1.getClass().isArray()) && (v2.getClass().isArray())) {
            eq = Arrays.equals((byte[]) v1,(byte [])v2);
        } else {
            eq = Objects.equals(v1, v2);
        }
        if (eq) {
            return null;
        } else {
            final String s1 = String.valueOf(v1);
            final String s2 = String.valueOf(v2);
            return label + " differs. File 1: " + s1 + " File 2: " + s2;
        }
    }

    /**
     * Compares the reads but ignores order of attributes.
     * Also allows actualRead to have a superset of attributes of expectedRead.
     */
    private static String readsEqualAllowAddingAttributes(final SAMRecord actualRead, final SAMRecord expectedRead) {
        final String actualName = actualRead.getReadName();
        final String expectedName = expectedRead.getReadName();

        String msg;

        msg = compareValues(actualName, expectedName, "name");
        if (msg != null){ return msg; }

        final String readNames = "actualName:" + actualName + " expectedName:" + expectedName;

        msg = compareValues(SAMFlag.getFlags(actualRead.getFlags()), SAMFlag.getFlags(expectedRead.getFlags()), readNames + " getFlags");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getInferredInsertSize(), expectedRead.getInferredInsertSize(), readNames + " getInferredInsertSize");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getMappingQuality(), expectedRead.getMappingQuality(), readNames + " getMappingQuality");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getMateReferenceIndex(), expectedRead.getMateReferenceIndex(), readNames + "getMateReferenceIndex");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getMateAlignmentStart(), expectedRead.getMateAlignmentStart(), readNames + "getMateAlignmentStart");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getReferenceIndex(), expectedRead.getReferenceIndex(), readNames + " getReferenceIndex");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getAlignmentStart(), expectedRead.getAlignmentStart(), readNames + " getAlignmentStart");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getCigar(), expectedRead.getCigar(), readNames + " getCigar");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getReferenceName(), expectedRead.getReferenceName(), readNames + " getReferenceName");
        if (msg != null){ return msg; }

        msg = compareValues(actualRead.getMateReferenceName(), expectedRead.getMateReferenceName(), readNames + " getMateReferenceName");
        if (msg != null){ return msg; }

        if (!Arrays.equals(actualRead.getReadBases(), expectedRead.getReadBases())){
            return "getReadBases different actualRead:" + actualName + " expectedRead:" + expectedName + " (" + Arrays.toString(actualRead.getReadBases()) + " vs " + Arrays.toString(expectedRead.getReadBases()) + ")";
        }
        if (!Arrays.equals(actualRead.getBaseQualities(), expectedRead.getBaseQualities())){
            return "getBaseQualities different actualRead:" + actualName + " expectedRead:" + expectedName + " (" + Arrays.toString(actualRead.getBaseQualities()) + " vs " + Arrays.toString(expectedRead.getBaseQualities()) + ")";
        }
        return compareReadAttributes(actualRead, expectedRead);
    }

    private static String compareReadAttributes(final SAMRecord actualRead, final SAMRecord expectedRead) {
        final String actualName = actualRead.getReadName();
        final String expectedName = expectedRead.getReadName();
        final String readNames = "actualName:" + actualName + " expectedName:" + expectedName;

        String msg;
        final List actualAttributes = actualRead.getAttributes();
        final List expectedAttributes = expectedRead.getAttributes();

        //We want to compare attributes regardless of order, so we put them in a map
        final Map actualAttributesByName = new LinkedHashMap<>();
        final Map expectedAttributesByName = new LinkedHashMap<>();

        for (final SAMRecord.SAMTagAndValue samTagAndValue : actualAttributes) {
            actualAttributesByName.put(samTagAndValue.tag, samTagAndValue.value);
        }
        for (final SAMRecord.SAMTagAndValue samTagAndValue : expectedAttributes) {
            expectedAttributesByName.put(samTagAndValue.tag, samTagAndValue.value);
        }

        final Sets.SetView attrDiff = Sets.difference(expectedAttributesByName.keySet(), actualAttributesByName.keySet());
        if (!attrDiff.isEmpty()){
            final StringBuilder sb= new StringBuilder();
            sb.append("expected read contains attributes that actual read lacks: " + readNames + " " + attrDiff + "\n");
            for (final String attr : attrDiff) {
                sb.append(attr + " " + expectedAttributesByName.get(attr) + "\n");
            }
            return sb.toString();
        }

        for (int i = 0; i < expectedAttributesByName.size(); i++) {
            final String expectedTag = expectedAttributes.get(i).tag;
            final Object expectedValue = expectedAttributesByName.get(expectedTag);
            final Object actualValue = actualAttributesByName.get(expectedTag);

            msg = compareValues(actualValue, expectedValue, readNames + " attribute " + expectedTag);
            if (msg != null){ return msg; }
        }
        return null;
    }

    /**
     * Compares the two given bam files, optionally sorting them before comparison.
     * The sorting is helpful to compare files that are different but equivalent (eg read pairs with same coordinates get reordered).
     */
    public static void assertEqualBamFiles(
            final File resultFile,
            final File expectedFile,
            final boolean compareBamFilesSorted,
            final ValidationStringency stringency) throws IOException {
        assertEqualBamFiles(resultFile, expectedFile, null, compareBamFilesSorted, stringency);
    }

    /**
     * Compares the two given bam files, optionally sorting them before comparison.
     * The sorting is helpful to compare files that are different but equivalent (eg read pairs with same coordinates get reordered).
     */
    public static void assertEqualBamFiles(
            final File resultFile,
            final File expectedFile,
            final File reference,
            final boolean compareBamFilesSorted,
            final ValidationStringency stringency) throws IOException {

        if (compareBamFilesSorted) {
            final File resultFileSorted= BaseTest.createTempFile("resultsFileSorted", "." + FilenameUtils.getExtension(resultFile.getName()));
            final File expectedFileSorted = BaseTest.createTempFile("expectedFileSorted", "." + FilenameUtils.getExtension(expectedFile.getName()));

            sortSam(resultFile, resultFileSorted, reference, stringency);
            sortSam(expectedFile, expectedFileSorted, reference, stringency);

            assertSamsEqual(resultFileSorted, expectedFileSorted, stringency, reference);
        } else {
            assertSamsEqual(resultFile, expectedFile, stringency, reference);
        }
    }

    /**
     * Compares the two given bam files, optionally sorting them before comparison.
     * The sorting is helpful to compare files that are different but equivalent (eg read pairs with same coordinates get reordered).
     */
    public static void assertEqualBamFiles(
            final Path resultFile,
            final Path expectedFile,
            final Path reference,
            final ValidationStringency stringency) throws IOException {

            assertSamsEqual(resultFile, expectedFile, stringency, reference);

    }

    /**
     * Validate/assert that the contents are CRAM if the extension is .cram
     */
    public static void assertCRAMContentsIfCRAM(final File putativeCRAMFile) {
        Path path = (null==putativeCRAMFile?null:putativeCRAMFile.toPath());
        assertCRAMContentsIfCRAM(path);
    }

    /**
     * Validate/assert that the contents are CRAM if the extension is .cram
     */
    public static void assertCRAMContentsIfCRAM(final Path putativeCRAMPath) {
        if (new GATKPath(putativeCRAMPath.toUri().toString()).isCram()) {
            assertCRAMContents(putativeCRAMPath);
        }
    }

    /**
     * Unconditionally validate/assert that the contents are CRAM
     */
    public static void assertCRAMContents(final Path putativeCRAMPath) {
        Assert.assertTrue(ReadUtils.hasCRAMFileContents(putativeCRAMPath), "should have had CRAM contents: " + putativeCRAMPath.toUri().toString());
    }

    private static void sortSam(final File input, final File output, final File reference, final ValidationStringency stringency) {
        final SortSam sort = new SortSam();

        // We can't use ArgumentsBuilder since it assumes GATK argument names, but we're running a Picard
        // tool, which uses upper case argument names.
        final List args = new ArrayList<>(6);
        args.add("-I");
        args.add(input.getAbsolutePath());
        args.add("-O");
        args.add(output.getAbsolutePath());
        args.add("-SO");
        args.add(SAMFileHeader.SortOrder.coordinate.name());
        args.add("--VALIDATION_STRINGENCY");
        args.add(stringency.name());
        if (reference != null) {
            args.add("--REFERENCE_SEQUENCE");
            args.add(reference.getAbsolutePath());
        }

        int returnCode  = sort.instanceMain(args.toArray(new String[0]));
        if (returnCode != 0) {
            throw new RuntimeException("Failure running SortSam on inputs");
        }
    }

    /**
     * Get the program records (@PG) in the BAM file header
     *
     * @param bamFile   the BAM file
     * @return  program records from the BAN file header
     * @throws IOException if it cannot close the BAM file
     */
    private static List getProgramRecords(final File bamFile) throws IOException {
        try(final SamReader bamInReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamFile)) {
            return bamInReader.getFileHeader().getProgramRecords();
        }
    }

    /**
     * Assert the output BAM file header contains the input BAM file header Program Records (@PG)
     * @param inputBam  input BAM file
     * @param outputBam output BAM file
     */
    public static void assertOutBamContainsInBamProgramRecords(final File inputBam, final File outputBam) throws IOException {
        final List bamInProgramRecords = getProgramRecords(inputBam);
        final List bamOutProgramRecords = getProgramRecords(outputBam);
        Assert.assertTrue(bamOutProgramRecords.containsAll(bamInProgramRecords));
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy