// picard.analysis.directed.CollectHsMetrics Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.analysis.directed;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.StringUtil;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.analysis.MetricAccumulationLevel;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import java.io.File;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
/**
* This tool takes a SAM/BAM file input and collects metrics that are specific for sequence
* datasets generated through hybrid-selection. Hybrid-selection (HS) is the most commonly used technique to capture
* exon-specific sequences for targeted sequencing experiments such as exome sequencing; for more information, please
* see the corresponding GATK Dictionary entry.
*
* This tool requires an aligned SAM or BAM file as well as bait and target interval files in Picard interval_list format.
* You should use the bait and interval files that correspond to the capture kit that was used to generate the capture
* libraries for sequencing, which can generally be obtained from the kit manufacturer. If the baits and target
* intervals are provided in BED format, you can convert them to the Picard interval_list format using Picard's
* BedToInterval tool.
*
* If a reference sequence is provided, this program will calculate both AT_DROPOUT and GC_DROPOUT metrics. Dropout
* metrics are an attempt to measure the reduced representation of reads, in regions that deviate from 50% G/C content.
* This reduction in the number of aligned reads is due to the increased numbers of errors associated with sequencing
* regions with excessive or deficient numbers of G/C bases, ultimately leading to poor mapping efficiencies and low
* coverage in the affected regions.
*
* If you are interested in getting G/C content and mean sequence depth information for every target interval, use the
* PER_TARGET_COVERAGE option.
*
* Note: Metrics labeled as percentages are actually expressed as fractions!
*
* Usage Example:
*
* java -jar picard.jar CollectHsMetrics \\
* I=input_reads.bam \\
* O=output_hs_metrics.txt \\
* R=reference.fasta \\
* BAIT_INTERVALS=bait.interval_list \\
* TARGET_INTERVALS=target.interval_list
*
* Please see
* CollectHsMetrics for
* detailed descriptions of the output metrics produced by this tool.
*
*
* See {@link HsMetricCollector} and {@link CollectTargetedMetrics} for more details.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
summary = CollectHsMetrics.USAGE_SUMMARY + CollectHsMetrics.USAGE_DETAILS,
oneLineSummary = CollectHsMetrics.USAGE_SUMMARY,
programGroup = DiagnosticsAndQCProgramGroup.class
)
@DocumentedFeature
public class CollectHsMetrics extends CollectTargetedMetrics {
static final String USAGE_SUMMARY = "Collects hybrid-selection (HS) metrics for a SAM or BAM file. ";
static final String USAGE_DETAILS = "This tool takes a SAM/BAM file input and collects metrics that are specific for sequence "+
"datasets generated through hybrid-selection. Hybrid-selection (HS) is the most commonly used technique to capture "+
"exon-specific sequences for targeted sequencing experiments such as exome sequencing; for more information, please " +
"see the corresponding GATK Dictionary entry.
"+
"This tool requires an aligned SAM or BAM file as well as bait and target interval files in Picard interval_list format. " +
"You should use the bait and interval files that correspond to the capture kit that was used to generate the capture " +
"libraries for sequencing, which can generally be obtained from the kit manufacturer. If the baits and target " +
"intervals are provided in BED format, you can convert them to the Picard interval_list format using Picard's " +
"BedToInterval tool.
" +
"If a reference sequence is provided, this program will calculate both AT_DROPOUT and GC_DROPOUT metrics. Dropout " +
"metrics are an attempt to measure the reduced representation of reads, in regions that deviate from 50% G/C content. " +
"This reduction in the number of aligned reads is due to the increased numbers of errors associated with sequencing " +
"regions with excessive or deficient numbers of G/C bases, ultimately leading to poor mapping efficiencies and low" +
"coverage in the affected regions.
" +
"If you are interested in getting G/C content and mean sequence depth information for every target interval, use the " +
"PER_TARGET_COVERAGE option.
" +
"Note: Metrics labeled as percentages are actually expressed as fractions!
" +
"Usage Example:
" +
"" +
"java -jar picard.jar CollectHsMetrics \\
" +
" I=input_reads.bam \\
" +
" O=output_hs_metrics.txt \\
" +
" R=reference.fasta \\
" +
" BAIT_INTERVALS=bait.interval_list \\
" +
" TARGET_INTERVALS=target.interval_list" +
"
" +
"Please see " +
"CollectHsMetrics for " +
"detailed descriptions of the output metrics produced by this tool.
" +
"
"
;
@Argument(shortName = "BI", doc = "An interval list file that contains the locations of the baits used.", minElements=1)
public List BAIT_INTERVALS;
@Argument(shortName = "N", doc = "Bait set name. If not provided it is inferred from the filename of the bait intervals.", optional = true)
public String BAIT_SET_NAME;
public CollectHsMetrics() {
// Override inherited default values
MINIMUM_MAPPING_QUALITY = 20;
MINIMUM_BASE_QUALITY = 20;
CLIP_OVERLAPPING_READS = true;
}
@Override
protected IntervalList getProbeIntervals() {
for (final File file : BAIT_INTERVALS) IOUtil.assertFileIsReadable(file);
return IntervalList.fromFiles(BAIT_INTERVALS);
}
@Override
protected String getProbeSetName() {
if (BAIT_SET_NAME != null) {
return BAIT_SET_NAME;
} else {
final SortedSet baitSetNames = new TreeSet();
for (final File file : BAIT_INTERVALS) {
baitSetNames.add(CollectTargetedMetrics.renderProbeNameFromFile(file));
}
return StringUtil.join(".", baitSetNames);
}
}
@Override
protected HsMetricCollector makeCollector(final Set accumulationLevels,
final List samRgRecords,
final ReferenceSequenceFile refFile,
final File perTargetCoverage,
final File perBaseCoverage,
final IntervalList targetIntervals,
final IntervalList probeIntervals,
final String probeSetName,
final int nearProbeDistance) {
return new HsMetricCollector(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance,
MINIMUM_MAPPING_QUALITY, MINIMUM_BASE_QUALITY, CLIP_OVERLAPPING_READS, true, INCLUDE_INDELS, COVERAGE_CAP, SAMPLE_SIZE);
}
}