All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.filediagnostics.CRAMIssue8768Analyzer Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.filediagnostics;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.build.CramIO;
import htsjdk.samtools.cram.ref.ReferenceContext;
import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.cram.structure.*;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.Tuple;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
import org.broadinstitute.hellbender.utils.tsv.TableWriter;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.*;

/**
 * A diagnostic class that analyzes a CRAM input file to look for conditions that trigger issue 8768.
 */
public class CRAMIssue8768Analyzer extends HTSAnalyzer {
    private final ReferenceSource referenceSource;
    private final OutputStream outputStream;
    private final CompressorCache compressorCache = new CompressorCache();
    private final boolean verbose;
    private final boolean echoToConsole;
    private final double mismatchRateThreshold;
    private final GATKPath tsvOutputPath;
    private int retCode = 0;

    private SAMFileHeader samHeader = null;

    // everything we need to record for a bad contig
    private record BadContig(
            String contigName,
            List badContainers
    ) { }

    // everything we need to record for a (good or bad) container
    private record ContainerStats(
            int containerOrdinal, // container ordinal # within the contig
            boolean isBad,        // true if this container is bad
            AlignmentContext alignmentContext,  // reference ID, alignment start, alignment span
            long misMatchCount,   // count of mismatched bases
            double misMatchRate   // rate of mismatched bases (mismatches/total bases)
    ) { }

    public CRAMIssue8768Analyzer(
            final GATKPath inputPath,
            final GATKPath textOutputPath,
            final GATKPath tsvOutputPath,
            final GATKPath referencePath,
            final double mismatchRateThreshold,
            final boolean verbose,
            final boolean echoToConsole) {
        super(inputPath, textOutputPath);
        this.verbose = verbose;
        this.echoToConsole = echoToConsole;
        this.tsvOutputPath = tsvOutputPath;
        this.mismatchRateThreshold = mismatchRateThreshold;

        referenceSource = new ReferenceSource(referencePath.toPath());
        try {
            outputStream = Files.newOutputStream(this.outputPath.toPath());
        } catch (final IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    @Override
    public void close() throws IOException {
        if (outputStream != null) {
            outputStream.close();
        }
    }

    public int getRetCode() {
        return retCode;
    }
    protected void emitln(final String s) {
        try {
            outputStream.write(s.getBytes());
            outputStream.write('\n');
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Some background:
     *
     * Each CRAM container has an associated htsjdk AlignmentContext that models if/how the contents of the
     * container relates to the reference. An AlignmentContext includes a ReferenceContext, and depending on the
     * type of the ReferenceContext, possibly an alignment start and alignment span. There are 3 possible types of
     * ReferenceContexts:
     *
     * 1. SINGLE_REF: The container contains only reads that are mapped to a single reference contig, in which case
     * the referenceID for that ReferenceContext is the contig ID for the associated contig. This is the most
     * common case, and is the only type of ReferenceContext for which the alignment start and span are meaningful.
     * Call isMappedSingleRef() to determine if the ReferenceContext is SINGLE_REF.
     *
     * Note that it is an error (the code throws) to you attempt to query a ReferenceContext for it's contig ID if
     * the ReferenceContext is not SINGLE_REF.
     *
     * 2. MULTI_REF: The container contains reads that are mapped to more than one reference contig. This is an
     * optimization used primarily when there aren't enough reads mapped to a single reference contig to justify
     * putting the reads into a separate container. Reads in these containers are not reference compressed, and
     * AlignmentContexts for MULTI_REF containers have no meaningful start/span values. Call isMappedMultiRef() to
     * determine if the ReferenceContext is MULTI_REF.
     *
     * 3. UNMAPPED_UNPLACED: The container contains only unmapped unplaced reads. start/span are irrelevant. Call
     * isUnmappedUnplaced() to determine if the ReferenceContext is UNMAPPED_UNPLACED.
     */
    public void doAnalysis() {
        final Map badContigs = new LinkedHashMap<>(); // contig name, BadContig
        final List goodContainers = new ArrayList<>(); // good containers, for all contigs
        int containerOrdinalForContig = 0;
        final int NUMBER_OF_GOOD_CONTAINERS_PER_CONTIG_TO_REPORT = 4;
        int nGoodContainersReportedForContig = 0;

        // these are HTSJDK CRAM container alignment contexts, not the GATK kind you're thinking of
        AlignmentContext previousAlignmentContext = null;

        try (final SeekablePathStream seekableStream = new SeekablePathStream(this.inputPath.toPath())) {
            List badContainersForContig = new ArrayList<>();
            final CramHeader cramHeader = analyzeCRAMHeader(seekableStream);
            samHeader = Container.readSAMFileHeaderContainer(
                    cramHeader.getCRAMVersion(),
                    seekableStream,
                    inputPath.getRawInputString());

            // iterate through the containers looking for ones with alignment spans that trigger the issue
            for (boolean isEOF = false; !isEOF;) {
                final Container container = new Container(
                        cramHeader.getCRAMVersion(),
                        seekableStream,
                        seekableStream.position());
                containerOrdinalForContig++;

                // reject CRAMs with properties that clearly indicate they were not written by GATK/Picard
                if (isForeignCRAM(container)) {
                    return;
                }

                if (previousAlignmentContext == null) {
                    // first container in the whole file can't be bad
                    recordContainerStats(goodContainers, false, container, containerOrdinalForContig);
                    nGoodContainersReportedForContig++;
                } else if (!previousAlignmentContext.getReferenceContext().equals(
                        container.getAlignmentContext().getReferenceContext())) {
                    // this is the first container for a new reference context; emit any bad containers accumulated
                    // for the previous reference context/contig, and reset state for the next one
                    if (badContainersForContig.size() > 0) {
                        recordContigStats(badContigs, badContainersForContig, previousAlignmentContext);
                        badContainersForContig = new ArrayList<>();
                    }
                    containerOrdinalForContig = 1;
                    // the first container for a reference context is never bad, so always add it to the good list
                    recordContainerStats(goodContainers, false, container, containerOrdinalForContig);
                    nGoodContainersReportedForContig = 1;
                } else if (previousAlignmentContext.getReferenceContext().isMappedSingleRef() &&
                        (previousAlignmentContext.getAlignmentStart() == 1)) {
                    // we're on the same reference context as the previous container, and the previous container
                    // was mapped to position 1, so if this container is mapped, it's a candidate for bad, whether
                    // it starts at position 1 (the rare case where there is more than one bad container) or not
                    // (the common case where this is the one bad container for this contig)
                    recordContainerStats(badContainersForContig, true, container, containerOrdinalForContig);
                } else {
                    // we're on the same reference context as the previous container, and the previous container
                    // was NOT mapped to position 1, so this container is not bad - add it to the list of good
                    // containers
                    if (verbose || nGoodContainersReportedForContig < NUMBER_OF_GOOD_CONTAINERS_PER_CONTIG_TO_REPORT) {
                        recordContainerStats(goodContainers, false,container, containerOrdinalForContig);
                        nGoodContainersReportedForContig++;
                    }
                }

                previousAlignmentContext = new AlignmentContext(
                        container.getAlignmentContext().getReferenceContext(),
                        container.getAlignmentContext().getAlignmentStart(),
                        container.getAlignmentContext().getAlignmentSpan());
                isEOF = container.isEOF();
            }
        }
        catch (IOException e) {
            throw new RuntimeIOException(e);
        }

        retCode = printTextResults(badContigs, goodContainers);
        if (tsvOutputPath != null) {
            printTSVResults(badContigs, goodContainers, tsvOutputPath);
        }
    }

    /**
     * Display metadata for a CRAM file header.
     */
    private CramHeader analyzeCRAMHeader(InputStream is) {
        final CramHeader cramHeader = CramIO.readCramHeader(is);
        emitln("CRAM File Name: " + inputPath.toPath().getFileName());
        emitln("CRAM Version: " + cramHeader.getCRAMVersion().toString());
        emitln("CRAM ID Contents: " + String.format("%s", Base64.getEncoder().encodeToString(cramHeader.getId())));
        return cramHeader;
    }

    // reject any inputs that have containers that are reference-less; have multiple slices per container;
    // or have slices with an embedded reference, since these indicate that the file was not written by GATK/Picard.
    // it is in theory possible that the file could have been written by some other client of htsjdk (i.e., the
    // htsjdk tests can write such file), but analyzing such files is beyond the scope of this tool
    private boolean isForeignCRAM(final Container container) {
        final List slices = container.getSlices();
        if (slices.size() > 1 ) {
            emitln("Multi-slice container detected. This file was not written by GATK or Picard.");
            return true;
        } else if (container.getAlignmentContext().getReferenceContext().isMappedSingleRef() &&
                !container.getCompressionHeader().isReferenceRequired()) {
            emitln("Reference-less container detected. This file was not written by GATK or Picard.");
            return true;
        }
        for (final Slice slice : slices) {
            if (slice.getEmbeddedReferenceContentID() != Slice.EMBEDDED_REFERENCE_ABSENT_CONTENT_ID) {
                emitln(String.format("Embedded reference block (ID %d) detected. This file was not written by GATK or Picard.",
                        slice.getEmbeddedReferenceContentID()));
                return true;
            }
        }
        return false;
    }

    private void recordContainerStats(
            final List targetList,
            final boolean isBad,
            final Container container,
            final int containerOrdinal) {
        // don't even try to compute stats for unmapped-unplaced containers or multi-ref containers
        if (container.getAlignmentContext().getReferenceContext().isMappedSingleRef()) {
            final Tuple containerStats = analyzeContainerBaseMismatches(container);
            targetList.add(new ContainerStats(
                    containerOrdinal,
                    isBad,
                    container.getAlignmentContext(),
                    containerStats.a,   // mismatches
                    containerStats.b)); // mismatchPercent
        }
    }

    private void recordContigStats(
            final Map badContigs,
            final List badContainers,
            final AlignmentContext previousAlignmentContext) {
        if (null != badContigs.putIfAbsent(
                previousAlignmentContext.getReferenceContext().toString(),
                new BadContig(
                        previousAlignmentContext.getReferenceContext().toString(),
                        badContainers))) {
            throw new IllegalStateException(
                    String.format(
                            "Attempt to add a bad contig (%s) more than once",
                            previousAlignmentContext.getReferenceContext().toString()));
        }
    }

    /**
     * Analyze base mismatches CRAM file container.
     * return true if container is EOF container
     */
    private Tuple analyzeContainerBaseMismatches(final Container container) {
        final SAMSequenceDictionary sequenceDictionary = samHeader.getSequenceDictionary();
        final List actualSAMRecords = container.getSAMRecords(
                ValidationStringency.LENIENT,
                new CRAMReferenceRegion(referenceSource, samHeader.getSequenceDictionary()),
                compressorCache,
                samHeader);

        final CRAMReferenceRegion localReferenceRegion = new CRAMReferenceRegion(referenceSource, sequenceDictionary);
        // SequenceUtil.countMismatches wants the full contig's reference bases
        localReferenceRegion.fetchReferenceBases(container.getAlignmentContext().getReferenceContext().getReferenceContextID());
        final byte referenceBases[] = localReferenceRegion.getCurrentReferenceBases();

        long totalBases = 0;
        long misMatches = 0;
        for (SAMRecord samRec : actualSAMRecords) {
            totalBases += (long) samRec.getReadLength();
            misMatches += (long) SequenceUtil.countMismatches(samRec, referenceBases);
        }
        return new Tuple<>(totalBases, misMatches/(double) totalBases);
    }

    private int printTextResults(final Map badContigs, final List goodContainers) {
        int retCode;
        if (badContigs.isEmpty()) {
            final String NO_CORRUPT_CONTAINERS = "\n**********************NO CORRUPT CONTAINERS DETECTED**********************";
            emitln(NO_CORRUPT_CONTAINERS);
            // always emit the results summary to console
            System.out.println(NO_CORRUPT_CONTAINERS);
            retCode = 0;
        } else {
            final String POSSIBLE_CORRUPT_CONTAINERS = "\n**********************!!!!!Possible CORRUPT CONTAINERS DETECTED!!!!!**********************:\n";
            emitln(POSSIBLE_CORRUPT_CONTAINERS);
            // always emit the results summary to console
            System.out.println(POSSIBLE_CORRUPT_CONTAINERS);
            retCode = 1;
        }

        // before we print out the containers, print out the stats for both the good and the bad containers
        final int totalGoodContainers = goodContainers.size();
        final double sumOfGoodMismatchRates = goodContainers.stream().mapToDouble(c -> c.misMatchRate).sum();
        final double averageGoodMismatchRate = sumOfGoodMismatchRates / totalGoodContainers;
        final String avgGoodMismatchStr = String.format("Average mismatch rate for presumed good containers: %f", averageGoodMismatchRate);
        emitln(avgGoodMismatchStr);
        if (echoToConsole) {
            System.out.println(avgGoodMismatchStr);
        }

        if (!badContigs.isEmpty()) {
            final int totalBadContainers = badContigs.values().stream().mapToInt(bc -> bc.badContainers().size()).sum();
            final double sumOfBadMismatchRates = badContigs.values().stream().mapToDouble(
                    bc -> bc.badContainers().stream().mapToDouble(c -> c.misMatchRate).sum()).sum();
            final double averageBadMismatchRate = sumOfBadMismatchRates / totalBadContainers;
            final String avgBadMismatchStr = String.format("Average mismatch rate for suspected bad containers: %f", averageBadMismatchRate);
            emitln(avgBadMismatchStr);
            if (echoToConsole) {
                System.out.println(avgBadMismatchStr);
            }

            if (averageBadMismatchRate > mismatchRateThreshold) {
                final String exceedThresholdStr = String.format(
                        "The average base mismatch rate of %f for suspected bad containers exceeds the threshold rate of %1.2f, and indicates this file may be corrupt.",
                        averageBadMismatchRate,
                        mismatchRateThreshold);
                emitln(exceedThresholdStr);
                if (echoToConsole) {
                    System.out.println(exceedThresholdStr);
                }
            }

            // now emit the list of corrupt containers for each bad contig
            emitln("\nSuspected CORRUPT Containers:");
            for (final Map.Entry entry : badContigs.entrySet()) {
                for (final ContainerStats badContainer : entry.getValue().badContainers()) {
                    final String badStatStr = String.format("  Ordinal: %d (%s) Mismatch Rate/Count: %f/%d",
                            badContainer.containerOrdinal,
                            badContainer.alignmentContext.toString(),
                            badContainer.misMatchRate,
                            badContainer.misMatchCount);
                    emitln(badStatStr);
                    if (echoToConsole) {
                        System.out.println(badStatStr);
                    }
                }
            }
        }

        emitln("\nPresumed GOOD Containers:");
        int lastContig = ReferenceContext.UNINITIALIZED_REFERENCE_ID;
        for (final ContainerStats goodContainer : goodContainers) {
            if (lastContig != ReferenceContext.UNINITIALIZED_REFERENCE_ID &&
                    lastContig != goodContainer.alignmentContext.getReferenceContext().getReferenceContextID()) {
                emitln("");
                if (echoToConsole) {
                    System.out.println("");
                }
            }
            lastContig = goodContainer.alignmentContext.getReferenceContext().getReferenceContextID();
            final String goodDetailStr = String.format("  Ordinal: %d (%s) Mismatch Rate/Count: %f/%d",
                    goodContainer.containerOrdinal,
                    goodContainer.alignmentContext.toString(),
                    goodContainer.misMatchRate,
                    goodContainer.misMatchCount);
            emitln(goodDetailStr);
            if (echoToConsole) {
                System.out.println(goodDetailStr);
            }
        }
        return retCode;
    }

    // write the results out to a machine-readable tsv file
    private void printTSVResults(
            final Map badContigs,
            final List goodContainers,
            final GATKPath tsvOutputPath) {
        // File name, contig name, container ordinal, good or bad, mismatch rate
        final TableColumnCollection columnNames = new TableColumnCollection(
                "file_name",    // file name
                "contig_name",          // contig name
                "container_ordinal",    // container ordinal
                "container_is_bad",     // good or bad, 1 or 0
                "mismatch_rate",        // mismatch rate (double)
                "alignment_start",      // alignment start (int)
                "alignment_span"        // alignment span (int)
        );

        try (final TableWriter tsvWriter = new TableWriter<>(tsvOutputPath.toPath(), columnNames) {
            @Override
            protected void composeLine(final ContainerStats containerStats, final DataLine dataLine) {
                dataLine.set("file_name", inputPath.toPath().getFileName().toString())
                        .set("contig_name", samHeader.getSequenceDictionary().getSequence(containerStats.alignmentContext.getReferenceContext().getReferenceContextID()).getSequenceName())
                        .set("container_ordinal", containerStats.containerOrdinal)
                        .set("container_is_bad", containerStats.isBad ? 1 : 0)
                        .set("mismatch_rate", containerStats.misMatchRate)
                        .set("alignment_start", containerStats.alignmentContext.getAlignmentStart())
                        .set("alignment_span", containerStats.alignmentContext.getAlignmentSpan());
                }
            })
        {
            tsvWriter.writeHeaderIfApplies();
            if (badContigs.isEmpty()) {
                tsvWriter.writeComment("No bad containers detected");
            } else {
                tsvWriter.writeComment("Bad containers:");
                for (final Map.Entry entry : badContigs.entrySet()) {
                    tsvWriter.writeAllRecords(entry.getValue().badContainers());
                }
            }
            if (goodContainers.isEmpty()) {
                tsvWriter.writeComment("No good mapped containers detected");
            } else {
                tsvWriter.writeComment("Good containers:");
                tsvWriter.writeAllRecords(goodContainers);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy