All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.sam.markduplicates.CheckDuplicateMarking Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
package picard.sam.markduplicates;

import htsjdk.samtools.BAMRecordCodec;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordQueryNameComparator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SortingCollection;
import org.apache.commons.io.output.NullOutputStream;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Iterator;

@CommandLineProgramProperties(
        summary = CheckDuplicateMarking.USAGE_DETAILS,
        oneLineSummary = CheckDuplicateMarking.USAGE_SUMMARY,
        programGroup = DiagnosticsAndQCProgramGroup.class)

@DocumentedFeature(enable = false)
public class CheckDuplicateMarking extends CommandLineProgram {
    static final String USAGE_SUMMARY = "Checks the consistency of duplicate markings.";
    static final String USAGE_DETAILS = "This tool checks that all reads with the same queryname have their duplicate marking flags set the same way. " +
            "NOTE: This tool does NOT check that the duplicate marking is correct. The ONLY thing that it checks is that the 0x400 bit-flags of records " +
            "with the same queryname are equal.";

    @Argument(doc = "Input BAM or SAM file to check.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
    public File INPUT;

    @Argument(doc = "Output file into which bad querynames will be placed (if not null).", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional = true)
    public File OUTPUT = null;

    @Argument(doc = "Which reads of the same name should be checked to have same duplicate marking.")
    public Mode MODE = Mode.ALL;

    private String currentReadName = "";
    private boolean currentReadDuplicateMarked = false;

    private static final Log log = Log.getInstance(CheckDuplicateMarking.class);
    private final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Checked.");

    private static final int NUM_WARNINGS = 100;
    private int numBadRecords = 0;

    public enum Mode implements CommandLineParser.ClpEnum {
        ALL("Check all reads."),
        PRIMARY_ONLY("Check primary alignments."),
        PRIMARY_MAPPED_ONLY("Check mapped alignments."),
        PRIMARY_PROPER_PAIR_ONLY("Check mapped alignments.");

        private final String message;

        Mode(final String message) {
            this.message = message;
        }

        public String getHelpDoc() {
            return message;
        }
    }

    @Override
    protected int doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        if (OUTPUT != null) {
            IOUtil.assertFileIsWritable(OUTPUT);
        }

        try (SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT)) {

            checkDuplicateMarkingsInIterator(getSortedRecordsFromReader(reader));
            if (numBadRecords > 0) {
                log.error("Found " + numBadRecords + " records that do not agree on their duplicate flag.");
            } else {
                log.info("All records' duplicate markings agree.");
            }

        } catch (IOException e) {
            throw new PicardException("Error while reading input file " + INPUT, e);
        }

        return numBadRecords > 0 ? 1 : 0;
    }

    private Iterator getSortedRecordsFromReader(final SamReader reader) {
        if (reader.getFileHeader().getSortOrder() == SAMFileHeader.SortOrder.queryname) {
            return reader.iterator();
        }

        log.info("Input file isn't queryname sorted. Sorting into temp space.");

        final ProgressLogger sortProgress = new ProgressLogger(log, (int) 1e6, "Read into sorter");
        final SortingCollection alignmentSorter = SortingCollection.newInstance(SAMRecord.class,
                new BAMRecordCodec(reader.getFileHeader()),
                new SAMRecordQueryNameComparator(),
                MAX_RECORDS_IN_RAM);

        for (final SAMRecord rec : reader) {
            alignmentSorter.add(rec);
            sortProgress.record(rec);
        }
        final CloseableIterator iterator = alignmentSorter.iterator();

        alignmentSorter.cleanup();

        return iterator;
    }

    private boolean checkAndTallyRecordDuplicateMarking(final SAMRecord rec) {
        if (!rec.getReadName().equals(currentReadName)) {
            // this case the queryname changed, and thus there's no comparison to make
            currentReadName = rec.getReadName();
            currentReadDuplicateMarked = rec.getDuplicateReadFlag();
        } else if (rec.getDuplicateReadFlag() != currentReadDuplicateMarked) {
            // Here the current queryname is the same, but the duplicate flag doesn't match the first record with that queryname
            numBadRecords++;

            if (numBadRecords <= NUM_WARNINGS) {
                log.warn(() -> "Reads with queryname " + currentReadName + " have different duplicate flags (at " +
                        rec.getContig() + ":" + rec.getStart() + ")");
            }

            if (numBadRecords == NUM_WARNINGS) {
                log.warn("Further warnings will be suppressed.");
            }

            return false;
        }
        return true;
    }

    private void checkDuplicateMarkingsInIterator(final Iterator iterator) throws IOException {
        try (PrintWriter writer = OUTPUT == null ?
                new PrintWriter(NullOutputStream.NULL_OUTPUT_STREAM) :
                new PrintWriter(new FileWriter(OUTPUT))) {

            while (iterator.hasNext()) {
                final SAMRecord rec = iterator.next();

                if (MODE != Mode.ALL && rec.isSecondaryOrSupplementary()) {
                    continue;
                }

                if (MODE == Mode.PRIMARY_MAPPED_ONLY && rec.getReadUnmappedFlag()) {
                    continue;
                }

                if (MODE == Mode.PRIMARY_PROPER_PAIR_ONLY && !rec.getProperPairFlag()) {
                    continue;
                }

                if (!checkAndTallyRecordDuplicateMarking(rec)) {
                    writer.println(rec.getReadName());
                }
                progress.record(rec);
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy