
package org.broadinstitute.hellbender.tools.spark.validation;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import htsjdk.samtools.SAMFileHeader;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.LibraryIdGenerator;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import org.broadinstitute.hellbender.engine.TraversalParameters;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
import scala.Tuple2;
import java.util.*;
/**
 * Determine if two potentially identical BAMs have the same duplicate reads. This tool is useful for checking if two
 * BAMs that seem identical have the same reads marked as duplicates.
 *
 * It fails if at least one of the following conditions is true of the two BAM files:
 *
 * - Different number of primary mapped reads
 * - Different number of duplicate reads (as indicated by the SAM record flag)
 * - Different reads mapped to some position in the reference
 *
 * The tool gathers the mapped reads together into groups that belong to the same library and map to the same
 * position and strand in the reference. If the tool does not fail, then it reports the number of these groups with
 * the following properties (sketched as an enum below):
 *
 * - SIZE_UNEQUAL: different number of reads
 * - EQUAL: same reads and same duplicates
 * - READ_MISMATCH: reads with different names
 * - DIFFERENT_REPRESENTATIVE_READ: same reads and number of duplicates, but one or more duplicate reads are different
 * - DIFF_NUM_DUPES: same reads but different number of duplicates
 *
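 * These categories can be modeled as a small enum; a minimal sketch (names taken from the list
 * above; the ordering is illustrative):
 *
 *     enum MatchType {
 *         SIZE_UNEQUAL, EQUAL, READ_MISMATCH, DIFFERENT_REPRESENTATIVE_READ, DIFF_NUM_DUPES
 *     }
 *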
 * Input
 *
 * - Two BAM files
 *
 * Output
 *
 * Results are printed to the screen.
 *
 * Usage example
 *
 *     gatk CompareDuplicatesSpark \
 *       -I input_reads_1.bam \
 *       -I2 input_reads_2.bam
 *
 * This tool can be run without explicitly specifying Spark options; the example command above will
 * run locally. See Tutorial#10060 for an example of how to set up and run a Spark tool on a cloud
 * Spark cluster; a sketch of a cluster invocation follows.
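 *
 * A minimal sketch of a cluster run (assuming the standard GATK Spark pass-through arguments
 * after the "--" separator; the master URL is a placeholder):
 *
 *     gatk CompareDuplicatesSpark \
 *       -I input_reads_1.bam \
 *       -I2 input_reads_2.bam \
 *       -- \
 *       --spark-runner SPARK \
 *       --spark-master <spark-master-url>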
 */
@DocumentedFeature
@CommandLineProgramProperties(summary = "Determine if two potentially identical BAMs have the same duplicate reads. " +
"This tool is useful for checking if two BAMs that seem identical have the same reads marked as duplicates.",
oneLineSummary = "Determine if two potentially identical BAMs have the same duplicate reads",
programGroup = DiagnosticsAndQCProgramGroup.class)
@BetaFeature
public final class CompareDuplicatesSpark extends GATKSparkTool {
    private static final long serialVersionUID = 1L;

    public static final String INPUT_2_LONG_NAME = "input2";
    public static final String INPUT_2_SHORT_NAME = "I2";
    public static final String PRINT_SUMMARY_LONG_NAME = "print-summary";
    public static final String THROW_ON_DIFF_LONG_NAME = "throw-on-diff";

    @Override
    public boolean requiresReads() { return true; }

    @Argument(doc = "The second BAM", shortName = INPUT_2_SHORT_NAME, fullName = INPUT_2_LONG_NAME, optional = false)
    protected String input2;

    @Argument(doc = "Print a summary", fullName = PRINT_SUMMARY_LONG_NAME, optional = true)
    protected boolean printSummary = true;

    @Argument(doc = "Throw an error if any differences were found", fullName = THROW_ON_DIFF_LONG_NAME, optional = true)
    protected boolean throwOnDiff = false;

    @Argument(doc = "If specified, write a BAM containing all the mismatching duplicate groups from the first input",
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = true)
    protected String output;

    @Argument(doc = "If specified, write a BAM containing all the mismatching duplicate groups from the second input",
            shortName = "O2", fullName = "output2", optional = true)
    protected String output2;
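
    // Use ALLOW_ALL_READS so that neither input is filtered: the comparison should see
    // exactly the records each BAM contains.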
    @Override
    public List<ReadFilter> getDefaultReadFilters() {
        return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS);
    }
    @Override
    protected void runTool(final JavaSparkContext ctx) {
        if (hasOutputSpecified()) {
            // Check that both outputs are specified, given that at least one was.
            if (output == null || output2 == null) {
                throw new IllegalArgumentException("Arguments '--output' and '--output2' must either both be specified (to write mismatch bams) or both be omitted");
            }
        }
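
        // Load the first input through the standard engine plumbing; strip every read attribute
        // except the read group so that tag differences cannot affect the comparison.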
        JavaRDD<GATKRead> firstReads = removeNonReadGroupAttributes(getReads());

        ReadsSparkSource readsSource2 = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency());
        TraversalParameters traversalParameters;
        if (hasUserSuppliedIntervals()) {
            traversalParameters = intervalArgumentCollection.getTraversalParameters(getHeaderForReads().getSequenceDictionary());
        } else {
            traversalParameters = null;
        }
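
        // The second input is read directly from a ReadsSparkSource, applying the same
        // user-supplied intervals (if any) that the engine already applied to the first input.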
        JavaRDD<GATKRead> secondReads = removeNonReadGroupAttributes(
                readsSource2.getParallelReads(new GATKPath(input2), null, traversalParameters, bamPartitionSplitSize, useNio));
        // Start by verifying that we have the same number of reads and duplicates in each BAM.
        long firstBamSize = firstReads.count();
        long secondBamSize = secondReads.count();
        if (firstBamSize != secondBamSize) {
            throw new UserException("input bams have different numbers of mapped reads: "
                    + firstBamSize + "," + secondBamSize);
        }
        System.out.println("processing bams with " + firstBamSize + " mapped reads");

        long firstDupesCount = firstReads.filter(GATKRead::isDuplicate).count();
        long secondDupesCount = secondReads.filter(GATKRead::isDuplicate).count();
        if (firstDupesCount != secondDupesCount) {
            System.out.println("BAMs have different numbers of total duplicates: " + firstDupesCount + "," + secondDupesCount);
        }
        System.out.println("first and second: " + firstDupesCount + "," + secondDupesCount);
        Broadcast<SAMFileHeader> bHeader = ctx.broadcast(getHeaderForReads());
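
        // The rest of the tool groups reads as described in the class documentation. A minimal
        // sketch of that grouping (illustrative only: a plain String key stands in for the real
        // MarkDuplicates key, and the read group stands in for the library):
        JavaPairRDD<String, Iterable<GATKRead>> groupedSketch = firstReads.mapToPair(read ->
                new Tuple2<>(read.getReadGroup() + "|" + read.getContig() + "|" + read.getStart()
                        + "|" + (read.isReverseStrand() ? "-" : "+"), read))
                .groupByKey();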