picard.analysis.CollectRnaSeqMetrics Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.analysis;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.analysis.directed.RnaSeqMetricsCollector;
import picard.annotation.Gene;
import picard.annotation.GeneAnnotationReader;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import picard.util.RExecutor;
import java.io.File;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@CommandLineProgramProperties(
summary = CollectRnaSeqMetrics.USAGE_SUMMARY + CollectRnaSeqMetrics.USAGE_DETAILS,
oneLineSummary = CollectRnaSeqMetrics.USAGE_SUMMARY,
programGroup = DiagnosticsAndQCProgramGroup.class
)
@DocumentedFeature
public class CollectRnaSeqMetrics extends SinglePassSamProgram {
static final String USAGE_SUMMARY = "Produces RNA alignment metrics for a SAM or BAM file. ";
static final String USAGE_DETAILS = "This tool takes a SAM/BAM file containing the aligned reads from an RNAseq experiment "+
"and produces metrics describing the distribution of the bases within the transcripts. It calculates the total numbers and the "+
"fractions of nucleotides within specific genomic regions including untranslated regions (UTRs), introns, intergenic sequences "+
"(between discrete genes), and peptide-coding sequences (exons). This tool also determines the numbers of bases that pass quality filters "+
"that are specific to Illumina data (PF_BASES). For more information please see the corresponding GATK "+
"Dictionary entry.
" +
"Other metrics include the median coverage (depth), the ratios of 5 prime /3 prime-biases, and the numbers of reads with the "+
"correct/incorrect strand designation. The 5 prime /3 prime-bias results from errors introduced by reverse transcriptase enzymes "+
"during library construction, ultimately leading to the over-representation of either the 5 prime or 3 prime ends of transcripts. "+
"Please see the CollectRnaSeqMetrics "+
"definitions "+
"for details on how these biases are calculated.
" +
"The sequence input must be a valid SAM/BAM file containing RNAseq data aligned by an RNAseq-aware genome aligner such a "+
"STAR or TopHat. "+
"The tool also requires a REF_FLAT file, a tab-delimited file containing information about the location of RNA transcripts, "+
"exon start and stop sites, etc. For an example refFlat file for GRCh38, see refFlat.txt.gz at "+
"http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database. "+
"The first five lines of the tab-limited text file appear as follows.
"+
"" +
"DDX11L1 NR_046018 chr1 + 11873 14409 14409 14409 3 11873,12612,13220, 12227,12721,14409," +
"WASH7P NR_024540 chr1 - 14361 29370 29370 29370 11 14361,14969,15795,16606,16857,17232,17605,17914,18267,24737,29320, 14829,15038,15947,16765,17055,17368,17742,18061,18366,24891,29370," +
"DLGAP2-AS1 NR_103863 chr8_KI270926v1_alt - 33083 35050 35050 35050 3 33083,33761,35028, 33281,33899,35050," +
"MIR570 NR_030296 chr3 + 195699400 195699497 195699497 195699497 1 195699400, 195699497," +
"MIR548A3 NR_030330 chr8 - 104484368 104484465 104484465 104484465 1 104484368, 104484465," +
"
" +
"Note: Metrics labeled as percentages are actually expressed as fractions!
"+
"Usage example:
"+
"" +
"java -jar picard.jar CollectRnaSeqMetrics \\
" +
" I=input.bam \\
" +
" O=output.RNA_Metrics \\
" +
" REF_FLAT=ref_flat.txt \\
" +
" STRAND=SECOND_READ_TRANSCRIPTION_STRAND \\
" +
" RIBOSOMAL_INTERVALS=ribosomal.interval_list" +
"
" +
"Please see the CollectRnaSeqMetrics " +
"definitions " +
"for a complete description of the metrics produced by this tool." +
"
"
;
private static final Log LOG = Log.getInstance(CollectRnaSeqMetrics.class);
@Argument(doc="Gene annotations in refFlat form. Format described here: http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat")
public File REF_FLAT;
@Argument(doc="Location of rRNA sequences in genome, in interval_list format. " +
"If not specified no bases will be identified as being ribosomal. " +
"Format described here:", optional = true)
public File RIBOSOMAL_INTERVALS;
@Argument(shortName = "STRAND", doc="For strand-specific library prep. " +
"For unpaired reads, use FIRST_READ_TRANSCRIPTION_STRAND if the reads are expected to be on the transcription strand.")
public RnaSeqMetricsCollector.StrandSpecificity STRAND_SPECIFICITY;
@Argument(doc="When calculating coverage based values (e.g. CV of coverage) only use transcripts of this length or greater.")
public int MINIMUM_LENGTH = 500;
@Argument(doc="The PDF file to write out a plot of normalized position vs. coverage.", shortName="CHART", optional = true)
public File CHART_OUTPUT;
@Argument(doc="If a read maps to a sequence specified with this option, all the bases in the read are counted as ignored bases. " +
"These reads are not counted towards any metrics, except for the PF_BASES field.", optional = true)
public Set IGNORE_SEQUENCE = new HashSet();
@Argument(doc="This percentage of the length of a fragment must overlap one of the ribosomal intervals for a read or read pair to be considered rRNA.")
public double RRNA_FRAGMENT_PERCENTAGE = 0.8;
@Argument(shortName="LEVEL", doc="The level(s) at which to accumulate metrics. ")
public Set METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS);
private RnaSeqMetricsCollector collector;
/**
* A subtitle for the plot, usually corresponding to a library.
*/
private String plotSubtitle = "";
@Override
protected String[] customCommandLineValidation() {
// No ribosomal intervals file and rRNA fragment percentage = 0
if ( RIBOSOMAL_INTERVALS == null && RRNA_FRAGMENT_PERCENTAGE == 0 ) {
throw new PicardException("Must use a RIBOSOMAL_INTERVALS file if RRNA_FRAGMENT_PERCENTAGE = 0.0");
}
return super.customCommandLineValidation();
}
@Override
protected void setup(final SAMFileHeader header, final File samFile) {
if (CHART_OUTPUT != null) IOUtil.assertFileIsWritable(CHART_OUTPUT);
final OverlapDetector geneOverlapDetector = GeneAnnotationReader.loadRefFlat(REF_FLAT, header.getSequenceDictionary());
LOG.info("Loaded " + geneOverlapDetector.getAll().size() + " genes.");
final Long ribosomalBasesInitialValue = RIBOSOMAL_INTERVALS != null ? 0L : null;
final OverlapDetector ribosomalSequenceOverlapDetector = RnaSeqMetricsCollector.makeOverlapDetector(samFile, header, RIBOSOMAL_INTERVALS, LOG);
final HashSet ignoredSequenceIndices = RnaSeqMetricsCollector.makeIgnoredSequenceIndicesSet(header, IGNORE_SEQUENCE);
collector = new RnaSeqMetricsCollector(METRIC_ACCUMULATION_LEVEL, header.getReadGroups(), ribosomalBasesInitialValue,
geneOverlapDetector, ribosomalSequenceOverlapDetector, ignoredSequenceIndices, MINIMUM_LENGTH, STRAND_SPECIFICITY, RRNA_FRAGMENT_PERCENTAGE,
true);
// If we're working with a single library, assign that library's name as a suffix to the plot title
final List readGroups = header.getReadGroups();
if (readGroups.size() == 1) {
this.plotSubtitle = readGroups.get(0).getLibrary();
if (null == this.plotSubtitle) this.plotSubtitle = "";
}
}
@Override
protected void acceptRead(final SAMRecord rec, final ReferenceSequence refSeq) {
collector.acceptRecord(rec, refSeq);
}
@Override
protected void finish() {
collector.finish();
final MetricsFile file = getMetricsFile();
collector.addAllLevelsToFile(file);
file.write(OUTPUT);
boolean atLeastOneHistogram = false;
for (final Histogram histo : file.getAllHistograms()) {
atLeastOneHistogram = atLeastOneHistogram || !histo.isEmpty();
}
// Generate the coverage by position plot
if (CHART_OUTPUT != null && atLeastOneHistogram) {
final int rResult = RExecutor.executeFromClasspath("picard/analysis/rnaSeqCoverage.R",
OUTPUT.getAbsolutePath(),
CHART_OUTPUT.getAbsolutePath(),
INPUT.getName(),
this.plotSubtitle);
if (rResult != 0) {
throw new PicardException("Problem invoking R to generate plot.");
}
}
}
}