All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.ErrorRateByReadPosition.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.bam

import java.util

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamSource
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.io.{Io, PathUtil}
import com.fulcrumgenomics.commons.util.{LazyLogging, SimpleCounter}
import com.fulcrumgenomics.fasta.SequenceDictionary
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.util.Metric.Count
import com.fulcrumgenomics.util.{Metric, ProgressLogger, Rscript, Sequences}
import com.fulcrumgenomics.vcf.{ByIntervalListVariantContextIterator, VariantMask}
import htsjdk.samtools.SAMRecord
import htsjdk.samtools.filter.{DuplicateReadFilter, FailsVendorReadQualityFilter, SamRecordFilter, SecondaryOrSupplementaryFilter}
import htsjdk.samtools.reference.ReferenceSequenceFileWalker
import htsjdk.samtools.util.{IntervalList, SamLocusIterator, SequenceUtil}
import htsjdk.variant.vcf.VCFFileReader

import scala.annotation.switch
import scala.collection.mutable
import scala.util.Failure

@clp(group=ClpGroups.SamOrBam, description =
  """
    |Calculates the error rate by read position on coordinate sorted mapped BAMs. The output file contains
    |a row per read (first of pair, second of pair and unpaired), per position in read, with the total number
    |of bases observed, the number of errors observed, the overall error rate, and the rate of each kind of
    |substitution error.
    |
    |Substitution types are collapsed based on the reference or expected base, with only six substitution
    |types being reported: `A>C`, `A>G`, `A>T`, `C>A`, `C>G` and `C>T`.  For example, `T>G` is grouped in
    |with `A>C`.
    |
    |Analysis can be restricted to a set of intervals via the `--intervals` option. Genomic positions can be
    |excluded from analysis by supplying a set of variants (either known variants in the sample or a catalog
    |of known variants such as dbSNP).  For data believed to have low error rates it is recommended to use
    |both the `--intervals` and `--variants` options to restrict analysis to only regions expected to be
    |homozygous reference in the data.
    |
    |The following are reads / bases are excluded from the analysis:
    |
    |- Unmapped reads
    |- Reads marked as failing vendor quality
    |- Reads marked as duplicates (unless `--include-duplicates` is specified)
    |- Secondary and supplemental records
    |- Soft-clipped bases in records
    |- Reads with MAPQ < `--min-mapping-quality` (default: 20)
    |- Bases with base quality < `--min-base-quality` (default: 0)
    |- Bases where either the read base or the reference base is non-ACGT
    |
    |An output text file is generated with the extension `.error_rate_by_read_position.txt`
    |
    |If R's `Rscript` utility is on the path and `ggplot2` is installed in the R distribution then a PDF
    |of error rate plots will also be generated with extension `.error_rate_by_read_position.pdf`.
  """
  )
class ErrorRateByReadPosition
( @arg(flag='i', doc="Input BAM file.") val input: PathToBam,
  @arg(flag='o', doc="Output metrics prefix. If not given, will use the input BAM basename.") val output: Option[PathPrefix] = None,
  @arg(flag='r', doc="Reference sequence fasta file.") val ref: PathToFasta,
  @arg(flag='v', doc="Optional file of variant sites to ignore.") val variants: Option[PathToVcf] = None,
  @arg(flag='l', doc="Optional list of intervals to restrict analysis to.") val intervals: Option[PathToIntervals] = None,
  @arg(flag='d', doc="Include duplicate reads, otherwise ignore.") val includeDuplicates: Boolean = false,
  @arg(flag='m', doc="The minimum mapping quality for a read to be included.") val minMappingQuality: Int = 20,
  @arg(flag='q', doc="The minimum base quality for a base to be included.") val minBaseQuality: Int = 0,
  @arg(doc="""
            |Collapse substitution types based on the reference or expected base, with only six substitution
            |types being reported: `A>C`, `A>G`, `A>T`, `C>A`, `C>G` and `C>T`.For example, `T>G` is grouped in with `A>C`.
            |Otherwise, all possible substitution types will be reported.
           """
  ) val collapse: Boolean = true
) extends FgBioTool with LazyLogging {

  private val ScriptPath = "com/fulcrumgenomics/bam/ErrorRateByReadPosition.R"

  Io.assertReadable(input)
  Io.assertReadable(ref)
  variants.foreach(Io.assertReadable)
  intervals.foreach(Io.assertReadable)
  output.foreach(out => Io.assertCanWriteFile(out))

  def execute(): Unit = {
    val in          = SamSource(input, ref=Some(ref))
    val metrics     = computeMetrics(in)
    val prefix      = output.getOrElse(PathUtil.removeExtension(input))
    val metricsPath = PathUtil.pathTo(s"${prefix}${ErrorRateByReadPositionMetric.FileExtension}")
    val plotPath    = PathUtil.pathTo(s"${prefix}${ErrorRateByReadPositionMetric.PlotExtension}")
    Metric.write(metricsPath, metrics=metrics)

    // And try plotting
    if (metrics.isEmpty) {
      logger.warning("No metrics generated (is your BAM empty?). Plots will not be generated.")
    }
    else {
      val description = plotDescription(in, input)
      Rscript.execIfAvailable(ScriptPath, metricsPath.toString, plotPath.toString, description) match {
        case Failure(e) => logger.warning(s"Generation of PDF plots failed: ${e.getMessage}")
        case _ => ()
      }
    }
  }

  /** Computes the metrics from the BAM file. */
  private[bam] def computeMetrics(in: SamSource = SamSource(input, ref=Some(ref))): Seq[ErrorRateByReadPositionMetric] = {
    import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary
    val progress = ProgressLogger(logger, verb="Processed", noun="loci", unit=50000)
    val ilist    = this.intervals.map(p => IntervalList.fromFile(p.toFile).uniqued(false))

    val refWalker     = new ReferenceSequenceFileWalker(this.ref.toFile)
    val locusIterator = buildSamLocusIterator(in, ilist).iterator()
    val variantMask   = buildVariantMask(variants, ilist, refWalker.getSequenceDictionary.fromSam)

    val counters = List.tabulate(3)(_ => mutable.Map[Int, ObsCounter]())

    locusIterator.filterNot(l => variantMask.isVariant(l.getSequenceIndex, l.getPosition)).foreach { locus =>
      val refBase     = SequenceUtil.upperCase(refWalker.get(locus.getSequenceIndex).getBases()(locus.getPosition - 1))
      val refBaseComp = Sequences.complement(refBase)

      if (SequenceUtil.isValidBase(refBase)) {
        locus.getRecordAndOffsets.iterator().filter(r => SequenceUtil.isValidBase(r.getReadBase)).foreach { rec =>

          val readBase = SequenceUtil.upperCase(rec.getReadBase)
          val readNum  = readNumber(rec.getRecord)
          val cycle    = if (rec.getRecord.getReadNegativeStrandFlag) rec.getRecord.getReadLength - rec.getOffset else rec.getOffset + 1
          val counter  = counters(readNum).getOrElseUpdate(cycle, new ObsCounter(readNum, cycle, collapse))

          val readOrderRefBase  = if (rec.getRecord.getReadNegativeStrandFlag) refBaseComp else refBase
          val readOrderReadBase = if (rec.getRecord.getReadNegativeStrandFlag) Sequences.complement(readBase) else readBase

          if (!collapse || readOrderRefBase == 'A' || readOrderRefBase == 'C') counter.count(readOrderRefBase, readOrderReadBase)
          else counter.count(Sequences.complement(readOrderRefBase), SequenceUtil.complement(readOrderReadBase))
        }
      }

      progress.record(locus.getSequenceName, locus.getPosition)
    }
    in.safelyClose()

    // Ensure that all the maps have values up to the max
    for ((counter, readNum) <- counters.filter(_.nonEmpty).zipWithIndex; cycle <- 1 to counter.keys.max) {
      counter.getOrElseUpdate(cycle, new ObsCounter(readNum, cycle, collapse))
    }

    counters.flatMap(c => c.values.map(_.toMetric)).sortBy(m => (m.read_number, m.position))
  }

  /** Generates a SamLocusIterator that will traverse over the relevant parts of the BAM. */
  private def buildSamLocusIterator(in: SamSource, intervals: Option[IntervalList]): SamLocusIterator = {
    val iterator = intervals match {
      case None => new SamLocusIterator(in.toSamReader)
      case Some(i) => new SamLocusIterator(in.toSamReader, i)
    }

    val filters = new util.ArrayList[SamRecordFilter]()
    filters.add(new SecondaryOrSupplementaryFilter)
    filters.add(new FailsVendorReadQualityFilter)
    if (!this.includeDuplicates) filters.add(new DuplicateReadFilter)
    iterator.setSamFilters(filters)

    iterator.setMappingQualityScoreCutoff(this.minMappingQuality)
    iterator.setQualityScoreCutoff(this.minBaseQuality)
    iterator.setEmitUncoveredLoci(false)
    iterator.setIncludeIndels(false)
    iterator
  }

  /** Generates a variant mask object. */
  private def buildVariantMask(variants: Option[PathToVcf], intervals: Option[IntervalList], dict: SequenceDictionary): VariantMask = {
    this.variants match {
      case None =>
        new VariantMask(Iterator.empty, dict)
      case Some(path) =>
        val reader = new VCFFileReader(path.toFile, this.intervals.isDefined)
        intervals match {
          case None => new VariantMask(reader.iterator(), dict)
          case Some(i) => new VariantMask(ByIntervalListVariantContextIterator(reader, i), dict)
        }
    }
  }

  /** Compute the read number for the record. */
  def readNumber(rec: SAMRecord): Int = if (!rec.getReadPairedFlag) 0 else if (rec.getFirstOfPairFlag) 1 else 2
}

/** Counter for substitutions for a given read number and position in that read.
  *
  * @param readNumber the read number
  * @param position the position in the read
  * @param collapse true to collapse substitution types, false otherwise
  */
private class ObsCounter(readNumber: Int, position: Int, collapse: Boolean) {
  private val Dna: Seq[Char]             = IndexedSeq('A', 'C', 'G', 'T')
  private val counts: Array[Array[Long]] = new Array[Long](4).map(_ => new Array[Long](4))

  /** Maps A/C/G/T to an index 0-3 and throws an exception for any other base. */
  @inline private def index(base: Byte): Int = (base: @switch) match {
    case 'A' => 0
    case 'C' => 1
    case 'G' => 2
    case 'T' => 3
    case _   => throw new IllegalArgumentException(s"Invalid base: $base")
  }

  @inline def count(ref: Byte, read: Byte): Unit =
    try { this.counts(index(ref))(index(read)) += 1 } catch { case _: IllegalArgumentException => () }

  @inline def countOf(ref: Char, read: Char): Long =
    try { this.counts(index(ref.toByte))(index(read.toByte)) } catch { case _: IllegalArgumentException => 0 }

  @inline private def total(ref: Char): Long    = this.counts(index(ref.toByte)).sum
  @inline private def totalRef(ref: Char): Long = countOf(ref, ref)

  @inline private def errorRate(ref: Char, read: Char): Double = {
    val totalRef = this.total(ref)
    if (totalRef == 0) 0 else countOf(ref=ref, read=read) / totalRef.toDouble
  }

  @inline private def getErrorRate(ref: Char, read: Char): Option[Double] = {
    if (!collapse || ref == 'A' || ref == 'C') Some(errorRate(ref=ref, read=read))
    else None
  }

  def toMetric: ErrorRateByReadPositionMetric = {
    // Double check that we have zero counts for G>X and T>X when we collapse substitution types
    if (collapse) {
      Seq('G', 'T').foreach { ref =>
        Dna.foreach { read =>
          require(this.countOf(ref=ref, read=read) == 0,
            s"Bug: collapse is true but found a non-zero count for $ref>$read")
        }
      }
    }

    val total  = Dna.map(ref => this.total(ref=ref)).sum
    val errors = total - Dna.map(ref => this.totalRef(ref=ref)).sum
    new ErrorRateByReadPositionMetric(
      read_number = readNumber,
      position    = position,
      bases_total = total,
      errors      = errors,
      error_rate  = if (total == 0) 0 else errors / total.toDouble,
      a_to_c_error_rate = errorRate('A', 'C'),
      a_to_g_error_rate = errorRate('A', 'G'),
      a_to_t_error_rate = errorRate('A', 'T'),
      c_to_a_error_rate = errorRate('C', 'A'),
      c_to_g_error_rate = errorRate('C', 'G'),
      c_to_t_error_rate = errorRate('C', 'T'),
      g_to_a_error_rate = getErrorRate('G', 'A'),
      g_to_c_error_rate = getErrorRate('G', 'C'),
      g_to_t_error_rate = getErrorRate('G', 'T'),
      t_to_a_error_rate = getErrorRate('T', 'A'),
      t_to_c_error_rate = getErrorRate('T', 'C'),
      t_to_g_error_rate = getErrorRate('T', 'G'),
      collapsed         = collapse
    )
  }
}

object ErrorRateByReadPositionMetric {
  val FileExtension = ".error_rate_by_read_position.txt"
  val PlotExtension = ".error_rate_by_read_position.pdf"
}

/**
  * Metrics produced by `ErrorRateByReadPosition` describing the number of base observations and
  * substitution errors at each position within each sequencing read.  Error rates are given for
  * the overall substitution error rate and also for each kind of substitution separately.
  *
  * If `collapsed` is `true`, then complementary substitutions are grouped together into the first 6 error rates.
  * e.g. `T>G` substitutions are reported as `A>C`.  Otherwise, all 12 substitution rates are reported.
  *
  * @param read_number The read number (0 for fragments, 1 for first of pair, 2 for second of pair).
  * @param position The position or cycle within the read (1-based).
  * @param bases_total The total number of bases observed at this position.
  * @param errors The total number of errors or non-reference basecalls observed at this position.
  * @param error_rate The overall error rate at position.
  * @param a_to_c_error_rate The rate of `A>C` (and `T>G` when collapsed) errors at the position.
  * @param a_to_g_error_rate The rate of `A>G` (and `T>C` when collapsed) errors at the position.
  * @param a_to_t_error_rate The rate of `A>T` (and `T>A` when collapsed) errors at the position.
  * @param c_to_a_error_rate The rate of `C>A` (and `G>T` when collapsed) errors at the position.
  * @param c_to_g_error_rate The rate of `C>G` (and `G>C` when collapsed) errors at the position.
  * @param c_to_t_error_rate The rate of `C>T` (and `G>A` when collapsed) errors at the position.
  * @param g_to_a_error_rate The rate of `G>A` errors at the position.
  * @param g_to_c_error_rate The rate of `G>C` errors at the position.
  * @param g_to_t_error_rate The rate of `G>T` errors at the position.
  * @param t_to_a_error_rate The rate of `T>A` errors at the position.
  * @param t_to_c_error_rate The rate of `T>C` errors at the position.
  * @param t_to_g_error_rate The rate of `T>T` errors at the position.
  */
case class ErrorRateByReadPositionMetric
( read_number: Int,
  position: Int,
  bases_total: Count,
  errors: Count,
  error_rate: Double,
  a_to_c_error_rate: Double,
  a_to_g_error_rate: Double,
  a_to_t_error_rate: Double,
  c_to_a_error_rate: Double,
  c_to_g_error_rate: Double,
  c_to_t_error_rate: Double,
  g_to_a_error_rate: Option[Double] = None,
  g_to_c_error_rate: Option[Double] = None,
  g_to_t_error_rate: Option[Double] = None,
  t_to_a_error_rate: Option[Double] = None,
  t_to_c_error_rate: Option[Double] = None,
  t_to_g_error_rate: Option[Double] = None,
  collapsed: Boolean = true
) extends Metric




© 2015 - 2025 Weber Informatics LLC | Privacy Policy