picard.sam.markduplicates.MarkDuplicatesWithMateCigar Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;
import picard.PicardException;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IterableAdapter;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.*;
import picard.cmdline.programgroups.SamOrBam;
import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;
import java.util.*;
/**
* An even better duplication marking algorithm that handles all cases including clipped
* and gapped alignments.
*
* This tool differs with MarkDuplicates as it may break ties differently. Furthermore,
* as it is a one-pass algorithm, it cannot know the program records contained in the file
* that should be chained in advance. Therefore it will only be able to examine the header
* to attempt to infer those program group records that have no associated previous program
* group record. If a read is encountered without a program record, or not one as previously
* defined, it will not be updated.
*
* This tool will also not work with alignments that have large gaps or skips, such as those
* from RNA-seq data. This is due to the need to buffer small genomic windows to ensure
* integrity of the duplicate marking, while large skips (ex. skipping introns) in the
* alignment records would force making that window very large, thus exhausting memory.
*
* @author Nils Homer
*/
@CommandLineProgramProperties(
usage = MarkDuplicatesWithMateCigar.USAGE_SUMMARY + MarkDuplicatesWithMateCigar.USAGE_DETAILS,
usageShort = MarkDuplicatesWithMateCigar.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class MarkDuplicatesWithMateCigar extends AbstractMarkDuplicatesCommandLineProgram {
static final String USAGE_SUMMARY = "Identifies duplicate reads, accounting for mate CIGAR. ";
static final String USAGE_DETAILS = "This tool locates and tags duplicate reads (both PCR and optical) in a BAM or SAM file, where " +
"duplicate reads are defined as originating from the same original fragment of DNA, taking into account the CIGAR string of " +
"read mates.
" +
"" +
"It is intended as an improvement upon the original MarkDuplicates algorithm, from which it differs in several ways, including" +
"differences in how it breaks ties. It may be the most effective duplicate marking program available, as it handles all cases " +
"including clipped and gapped alignments and locates duplicate molecules using mate cigar information. However, please note " +
"that it is not yet used in the Broad's production pipeline, so use it at your own risk.
" +
"" +
"Note also that this tool will not work with alignments that have large gaps or deletions, such as those from RNA-seq data. " +
"This is due to the need to buffer small genomic windows to ensure integrity of the duplicate marking, while large skips " +
"(ex. skipping introns) in the alignment records would force making that window very large, thus exhausting memory.
" +
"" +
"Note: Metrics labeled as percentages are actually expressed as fractions!
" +
"Usage example:
" +
"" +
"java -jar picard.jar MarkDuplicatesWithMateCigar \\
" +
" I=input.bam \\
" +
" O=mark_dups_w_mate_cig.bam \\
" +
" M=mark_dups_w_mate_cig_metrics.txt" +
"
" +
"
";
private final Log log = Log.getInstance(MarkDuplicatesWithMateCigar.class);
@Option(doc = "The minimum distance to buffer records to account for clipping on the 5' end of the records. " +
"For a given alignment, this parameter controls the width of the window to search for duplicates of that alignment. " +
"Due to 5' read clipping, duplicates do not necessarily have the same 5' alignment coordinates, so the algorithm " +
"needs to search around the neighborhood. For single end sequencing data, the neighborhood is only determined by " +
"the amount of clipping (assuming no split reads), thus setting MINIMUM_DISTANCE to twice the sequencing read length " +
"should be sufficient. For paired end sequencing, the neighborhood is also determined by the fragment insert size, " +
"so you may want to set MINIMUM_DISTANCE to something like twice the 99.5% percentile of the fragment insert size " +
"distribution (see CollectInsertSizeMetrics). Or you can set this number to -1 to use either a) twice the first read's " +
"read length, or b) 100, whichever is smaller. Note that the larger the window, the greater the RAM requirements, so " +
"you could run into performance limitations if you use a value that is unnecessarily large.", optional = true)
public int MINIMUM_DISTANCE = -1;
@Option(doc = "Skip record pairs with no mate cigar and include them in the output.")
boolean SKIP_PAIRS_WITH_NO_MATE_CIGAR = true;
@Option(doc = "The block size for use in the coordinate-sorted record buffer.", optional = true)
public int BLOCK_SIZE = 100000;
/** Warnings that will only be emitted once */
private boolean warnedNullProgramRecords = false;
private boolean warnedMissingProgramRecords = false;
/** Stock main method. */
public static void main(final String[] args) {
new MarkDuplicatesWithMateCigar().instanceMainWithExit(args);
}
/**
* Main work method.
*/
protected int doWork() {
IOUtil.assertInputsAreValid(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(METRICS_FILE);
// Open the inputs
final SamHeaderAndIterator headerAndIterator = openInputs();
final SAMFileHeader header = headerAndIterator.header;
// Create the output header
final SAMFileHeader outputHeader = header.clone();
if (outputHeader.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new PicardException("This program requires inputs in coordinate SortOrder");
}
COMMENT.forEach(outputHeader::addComment);
// Since this is one-pass, unlike MarkDuplicates, we cannot only chain together program
// group records we have seen, we have to assume all of them may be seen. We can perhaps
// filter out any program groups which have been referenced previously.
setPGIdsSeen(outputHeader);
// Key: previous PG ID on a SAM Record (or null). Value: New PG ID to replace it.
final Map chainedPgIds = getChainedPgIds(outputHeader);
// Open the output
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outputHeader,
true,
OUTPUT);
// Create the mark duplicate iterator. The duplicate marking is handled by the iterator, conveniently.
final MarkDuplicatesWithMateCigarIterator iterator = new MarkDuplicatesWithMateCigarIterator(headerAndIterator.header,
headerAndIterator.iterator,
this.opticalDuplicateFinder,
this.DUPLICATE_SCORING_STRATEGY,
this.MINIMUM_DISTANCE,
this.REMOVE_DUPLICATES,
this.SKIP_PAIRS_WITH_NO_MATE_CIGAR,
this.MAX_RECORDS_IN_RAM,
this.BLOCK_SIZE,
this.TMP_DIR);
// progress logger!
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
// Go through the records
for (final SAMRecord record : new IterableAdapter(iterator)) {
if (progress.record(record)) {
iterator.logMemoryStats(log);
}
// Update the program record if necessary
updateProgramRecord(record, chainedPgIds);
// Write the alignment
out.addAlignment(record);
}
// remember to close the inputs
iterator.close();
out.close();
// For convenience to reference
final Histogram opticalDupesByLibraryId = iterator.getOpticalDupesByLibraryId();
// Log info
log.info("Processed " + progress.getCount() + " records");
log.info("Found " + iterator.getNumRecordsWithNoMateCigar() + " records with no mate cigar optional tag.");
log.info("Marking " + iterator.getNumDuplicates() + " records as duplicates.");
log.info("Found " + ((long) opticalDupesByLibraryId.getSumOfValues()) + " optical duplicate clusters."); // cast as long due to returning a double
// Write out the metrics
finalizeAndWriteMetrics(iterator.getLibraryIdGenerator());
return 0;
}
/**
* Updates the program record if necessary.
*/
private void updateProgramRecord(final SAMRecord record, final Map chainedPgIds) {
if (PROGRAM_RECORD_ID != null) {
final String pgId = record.getStringAttribute(SAMTag.PG.name());
if (null == pgId) {
if (!warnedNullProgramRecords) {
warnedNullProgramRecords = true;
log.warn("Encountered a record with no program record, program group chaining will not occur for this read: " + record);
} // else already warned!
} else if (!chainedPgIds.containsKey(pgId)) {
if (!warnedMissingProgramRecords) {
warnedMissingProgramRecords = true;
log.warn("Encountered a record with an intermediate program record, program group chaining will not occur for this read: " + record);
} // else already warned!
} else {
record.setAttribute(SAMTag.PG.name(), chainedPgIds.get(pgId));
}
}
}
/**
* Generate the list of program records seen in the SAM file, approximating this with those in the header that were not
* themselves mentioned elsewhere.
*/
private void setPGIdsSeen(final SAMFileHeader header) {
final Set pgIdsSeenAsPrevious = new HashSet();
// get all program record ids that are mentioned as previously seen
for (final SAMProgramRecord samProgramRecord : header.getProgramRecords()) {
final String previousProgramGroupID = samProgramRecord.getPreviousProgramGroupId();
if (null != previousProgramGroupID) pgIdsSeenAsPrevious.add(previousProgramGroupID);
}
// ignore those that were previously seen
for (final SAMProgramRecord samProgramRecord : header.getProgramRecords()) {
final String pgId = samProgramRecord.getId();
if (!pgIdsSeenAsPrevious.contains(pgId)) this.pgIdsSeen.add(pgId);
}
}
}