picard.analysis.rrbsQc.R Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.
There is a newer version: 3.2.0
Show newest version
##
## The MIT License
##
## Copyright (c) 2013 The Broad Institute
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to deal
## in the Software without restriction, including without limitation the rights
## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
## copies of the Software, and to permit persons to whom the Software is
## furnished to do so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in
## all copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
## THE SOFTWARE.
##

args = commandArgs(trailingOnly=TRUE)
opt = list(details.fn=args[1], summary.fn=args[2], output.fn=args[3])

read_metrics_file = function(metrics.fn) {
  contents = read.delim(metrics.fn, comment.char="#", stringsAsFactors=FALSE)
  return(contents)
}

equals_or_is_na = function(x1, x2) {
  if (is.na(x1)) {
    return(is.na(x2))
  } else {
    return(x1 == x2)
  }
}

details = read_metrics_file(opt$details.fn)
summary = read_metrics_file(opt$summary.fn)

pdf(opt$output.fn)
par(mfrow=c(2,2), oma=c(0,0,2,0))

for (i in seq_len(nrow(summary))) {
  cur_summary = summary[i, ]
  cur_sample = cur_summary[1, "SAMPLE"]
  cur_library = cur_summary[1, "LIBRARY"]
  cur_read_group = cur_summary[1, "READ_GROUP"]
  cur_details = details[which((equals_or_is_na(cur_library, details[, "LIBRARY"]) &
    (equals_or_is_na(cur_sample, details[, "SAMPLE"])) &
    (equals_or_is_na(cur_read_group, details[, "READ_GROUP"])))), ]
  
  
  ## Plot conversion rates
  cpg.converted = sum(cur_details$CONVERTED_SITES)
  cpg.seen = sum(cur_details$TOTAL_SITES)
  cpg.conversion = cpg.converted / cpg.seen
  total.conversion = (cpg.converted + cur_summary$NON_CPG_CONVERTED_BASES) / (cpg.seen + cur_summary$NON_CPG_BASES)

  barplot(c("non-CpG"=cur_summary$PCT_NON_CPG_BASES_CONVERTED, "Combined"=total.conversion, "CpG"=cpg.conversion),
          ylim=c(0.95, 1), ylab="% Conversion", xlab="Distribution", main="Bisulfite Conversion Rate",
          col="blue", xpd=FALSE)
  abline(h=0.995, col="grey")
  
  ## Plot histogram of CpG counts by conversion rate
  hist(cur_details$PCT_CONVERTED, 10, xlab="Conversion Rate Of CpGs", ylab="# CpGs",
       main="CpG Conversion Rate Distribution", col="blue")
  
  ## Plot pie chart showing distribution of CpG coverage
  coverage_breaks = c(0, 1, 5, 10, 25, 50, 100, Inf)
  coverage_cut = cut(cur_details$TOTAL_SITES, coverage_breaks)
  cpg_coverage = split(cur_details$TOTAL_SITES, coverage_cut)
  coverages = sapply(cpg_coverage, length)[2:7]
  names(coverages) = paste(">=", c(1, 5, 10, 25, 50, 100), sep="")
  ## If we have 0s all across the pie chart will be effectively meaningless but put in a 100% >= 0 field instead
  ## to avoid an error on pie(). Normally it'd just be a pain to see these, but ...
  if (all(coverages == 0)) {
    coverages = c("No Coverage"=1)
  }
  color_ramp = colorRampPalette(c("white", "#538ED5", "blue"), bias=1, space="Lab")
  colors = color_ramp(length(coverages))[2:length(coverages)]
  pie(coverages, main="Distribution Of CpGs By Coverage", col=colors, clockwise=TRUE)
  
  discards = log10(c("Mismatches"=cur_summary$READS_IGNORED_MISMATCHES, "Size"=cur_summary$READS_IGNORED_SHORT))
  ## Protect against -Inf in the case where we had 0 discards
  discards = ifelse(is.finite(discards), discards, 0)
  barplot(discards, ylab="Number Discarded (log10)", xlab="Reason",
          main="Reads Discarded", col="blue", ylim=c(0, ceiling(max(discards))))

  header_txt = character()
  if (!is.na(cur_sample) && cur_sample != "") {
    header_txt = paste(header_txt, " SAMPLE=", cur_sample, sep="")
  }
  if (!is.na(cur_library) && cur_library != "") {
    header_txt = paste(header_txt, " LIBRARY=", cur_library, sep="")
  }
  if (!is.na(cur_read_group) && cur_read_group != "") {
    header_txt = paste(header_txt, " READ GROUP=", cur_read_group, sep="")
  }
  if (length(header_txt) > 0) {
    mtext(header_txt, outer=TRUE, line=1)
  }
}

dev.off()