All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.analysis.CollectRnaSeqMetrics Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2011 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.analysis;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.analysis.directed.RnaSeqMetricsCollector;
import picard.annotation.Gene;
import picard.annotation.GeneAnnotationReader;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
import picard.util.RExecutor;

import java.io.File;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

@CommandLineProgramProperties(
        summary = CollectRnaSeqMetrics.USAGE_SUMMARY + CollectRnaSeqMetrics.USAGE_DETAILS,
        oneLineSummary = CollectRnaSeqMetrics.USAGE_SUMMARY,
        programGroup = DiagnosticsAndQCProgramGroup.class
)
@DocumentedFeature
public class CollectRnaSeqMetrics extends SinglePassSamProgram {
static final String USAGE_SUMMARY = "Produces RNA alignment metrics for a SAM or BAM file.  ";
static final String USAGE_DETAILS = "

This tool takes a SAM/BAM file containing the aligned reads from an RNAseq experiment "+ "and produces metrics describing the distribution of the bases within the transcripts. It calculates the total numbers and the "+ "fractions of nucleotides within specific genomic regions including untranslated regions (UTRs), introns, intergenic sequences "+ "(between discrete genes), and peptide-coding sequences (exons). This tool also determines the numbers of bases that pass quality filters "+ "that are specific to Illumina data (PF_BASES). For more information please see the corresponding GATK "+ "Dictionary entry.

" + "

Other metrics include the median coverage (depth), the ratios of 5 prime /3 prime-biases, and the numbers of reads with the "+ "correct/incorrect strand designation. The 5 prime /3 prime-bias results from errors introduced by reverse transcriptase enzymes "+ "during library construction, ultimately leading to the over-representation of either the 5 prime or 3 prime ends of transcripts. "+ "Please see the CollectRnaSeqMetrics "+ "definitions "+ "for details on how these biases are calculated.

" + "

The sequence input must be a valid SAM/BAM file containing RNAseq data aligned by an RNAseq-aware genome aligner such a "+ "STAR or TopHat. "+ "The tool also requires a REF_FLAT file, a tab-delimited file containing information about the location of RNA transcripts, "+ "exon start and stop sites, etc. For an example refFlat file for GRCh38, see refFlat.txt.gz at "+ "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database. "+ "The first five lines of the tab-limited text file appear as follows.

"+ "
" +
"DDX11L1	NR_046018	chr1	+	11873	14409	14409	14409	3	11873,12612,13220,	12227,12721,14409," +
"WASH7P	NR_024540	chr1	-	14361	29370	29370	29370	11	14361,14969,15795,16606,16857,17232,17605,17914,18267,24737,29320,	14829,15038,15947,16765,17055,17368,17742,18061,18366,24891,29370," +
"DLGAP2-AS1	NR_103863	chr8_KI270926v1_alt	-	33083	35050	35050	35050	3	33083,33761,35028,	33281,33899,35050," +
"MIR570	NR_030296	chr3	+	195699400	195699497	195699497	195699497	1	195699400,	195699497," +
"MIR548A3	NR_030330	chr8	-	104484368	104484465	104484465	104484465	1	104484368,	104484465," +
"
" + "

Note: Metrics labeled as percentages are actually expressed as fractions!

"+ "

Usage example:

"+ "
" +
"java -jar picard.jar CollectRnaSeqMetrics \\
" + " I=input.bam \\
" + " O=output.RNA_Metrics \\
" + " REF_FLAT=ref_flat.txt \\
" + " STRAND=SECOND_READ_TRANSCRIPTION_STRAND \\
" + " RIBOSOMAL_INTERVALS=ribosomal.interval_list" + "
" + "Please see the CollectRnaSeqMetrics " + "definitions " + "for a complete description of the metrics produced by this tool." + "
" ; private static final Log LOG = Log.getInstance(CollectRnaSeqMetrics.class); @Argument(doc="Gene annotations in refFlat form. Format described here: http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat") public File REF_FLAT; @Argument(doc="Location of rRNA sequences in genome, in interval_list format. " + "If not specified no bases will be identified as being ribosomal. " + "Format described here:", optional = true) public File RIBOSOMAL_INTERVALS; @Argument(shortName = "STRAND", doc="For strand-specific library prep. " + "For unpaired reads, use FIRST_READ_TRANSCRIPTION_STRAND if the reads are expected to be on the transcription strand.") public RnaSeqMetricsCollector.StrandSpecificity STRAND_SPECIFICITY; @Argument(doc="When calculating coverage based values (e.g. CV of coverage) only use transcripts of this length or greater.") public int MINIMUM_LENGTH = 500; @Argument(doc="The PDF file to write out a plot of normalized position vs. coverage.", shortName="CHART", optional = true) public File CHART_OUTPUT; @Argument(doc="If a read maps to a sequence specified with this option, all the bases in the read are counted as ignored bases. " + "These reads are not counted towards any metrics, except for the PF_BASES field.", optional = true) public Set IGNORE_SEQUENCE = new HashSet(); @Argument(doc="This percentage of the length of a fragment must overlap one of the ribosomal intervals for a read or read pair to be considered rRNA.") public double RRNA_FRAGMENT_PERCENTAGE = 0.8; @Argument(shortName="LEVEL", doc="The level(s) at which to accumulate metrics. ") public Set METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS); private RnaSeqMetricsCollector collector; /** * A subtitle for the plot, usually corresponding to a library. */ private String plotSubtitle = ""; @Override protected String[] customCommandLineValidation() { // No ribosomal intervals file and rRNA fragment percentage = 0 if ( RIBOSOMAL_INTERVALS == null && RRNA_FRAGMENT_PERCENTAGE == 0 ) { throw new PicardException("Must use a RIBOSOMAL_INTERVALS file if RRNA_FRAGMENT_PERCENTAGE = 0.0"); } return super.customCommandLineValidation(); } @Override protected void setup(final SAMFileHeader header, final File samFile) { if (CHART_OUTPUT != null) IOUtil.assertFileIsWritable(CHART_OUTPUT); final OverlapDetector geneOverlapDetector = GeneAnnotationReader.loadRefFlat(REF_FLAT, header.getSequenceDictionary()); LOG.info("Loaded " + geneOverlapDetector.getAll().size() + " genes."); final Long ribosomalBasesInitialValue = RIBOSOMAL_INTERVALS != null ? 0L : null; final OverlapDetector ribosomalSequenceOverlapDetector = RnaSeqMetricsCollector.makeOverlapDetector(samFile, header, RIBOSOMAL_INTERVALS, LOG); final HashSet ignoredSequenceIndices = RnaSeqMetricsCollector.makeIgnoredSequenceIndicesSet(header, IGNORE_SEQUENCE); collector = new RnaSeqMetricsCollector(METRIC_ACCUMULATION_LEVEL, header.getReadGroups(), ribosomalBasesInitialValue, geneOverlapDetector, ribosomalSequenceOverlapDetector, ignoredSequenceIndices, MINIMUM_LENGTH, STRAND_SPECIFICITY, RRNA_FRAGMENT_PERCENTAGE, true); // If we're working with a single library, assign that library's name as a suffix to the plot title final List readGroups = header.getReadGroups(); if (readGroups.size() == 1) { this.plotSubtitle = readGroups.get(0).getLibrary(); if (null == this.plotSubtitle) this.plotSubtitle = ""; } } @Override protected void acceptRead(final SAMRecord rec, final ReferenceSequence refSeq) { collector.acceptRecord(rec, refSeq); } @Override protected void finish() { collector.finish(); final MetricsFile file = getMetricsFile(); collector.addAllLevelsToFile(file); file.write(OUTPUT); boolean atLeastOneHistogram = false; for (final Histogram histo : file.getAllHistograms()) { atLeastOneHistogram = atLeastOneHistogram || !histo.isEmpty(); } // Generate the coverage by position plot if (CHART_OUTPUT != null && atLeastOneHistogram) { final int rResult = RExecutor.executeFromClasspath("picard/analysis/rnaSeqCoverage.R", OUTPUT.getAbsolutePath(), CHART_OUTPUT.getAbsolutePath(), INPUT.getName(), this.plotSubtitle); if (rResult != 0) { throw new PicardException("Problem invoking R to generate plot."); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy