All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.CallOverlappingConsensusBases.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2022 Fulcrum Genomics
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef.{FgBioEnum, FilePath, PathToBam, PathToFasta, SafelyClosable}
import com.fulcrumgenomics.bam.api.{SamOrder, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.collection.ParIterator
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.commons.util.Threads.IterableThreadLocal
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, Metric, ProgressLogger}
import enumeratum.EnumEntry

import scala.collection.immutable


@clp(group = ClpGroups.SamOrBam, description=
  """
    |Consensus calls overlapping bases in read pairs.
    |
    |## Inputs and Outputs
    |
    |In order to correctly correct reads by template, the input BAM must be either `queryname` sorted or `query` grouped.
    |The sort can be done in streaming fashion with:
    |
    |```
    |samtools sort -n -u in.bam | fgbio CallOverlappingConsensusBases -i /dev/stdin ...
    |```
    |
    |The output sort order may be specified with `--sort-order`.  If not given, then the output will be in the same
    |order as input.
    |
    |The reference FASTA must be given so that any existing `NM`, `UQ` and `MD` tags can be repaired.
    |
    |## Correction
    |
    |Only mapped read pairs with overlapping bases will be eligible for correction.
    |
    |Each read base from the read and its mate that map to same position in the reference will be used to create
    |a consensus base as follows:
    |
    |1. If the base agree, then the chosen agreement strategy (`--agreement-strategy`) will be used.
    |2. If the base disagree, then the chosen disagreement strategy (`--disagreement-strategy`) will be used.
    |
    |The agreement strategies are as follows:
    |
    |* Consensus:   Call the consensus base and return a new base quality that is the sum of the two base qualities.
    |* MaxQual:     Call the consensus base and return a new base quality that is the maximum of the two base qualities.
    |* PassThrough: Leave the bases and base qualities unchanged.
    |
    |In the context of disagreement strategies, masking a base will make the base an "N" with base quality phred-value "2".
    |The disagreement strategies are as follows:
    |
    |* MaskBoth:      Mask both bases.
    |* MaskLowerQual: Mask the base with the lowest base quality, with the other base unchanged.  If the base qualities
    |                 are the same, mask both bases.
    |* Consensus:     Consensus call the base.  If the base qualities are the same, mask both bases.  Otherwise, call the
    |                 base with the highest base quality and return a new base quality that is the difference between the
    |                 highest and lowest base quality.
 |  """)
class CallOverlappingConsensusBases
(@arg(flag='i', doc="Input SAM or BAM file of aligned reads.") val input: PathToBam,
 @arg(flag='o', doc="Output SAM or BAM file.") val output: PathToBam,
 @arg(flag='m', doc="Output metrics file.") val metrics: FilePath,
 @arg(flag='r', doc="Reference sequence fasta file.") val ref: PathToFasta,
 @arg(doc="The number of threads to use while consensus calling.") val threads: Int = 1,
 @arg(flag='S', doc="The sort order of the output. If not given, output will be in the same order as input if the input.")
 val sortOrder: Option[SamOrder] = None,
 @arg(doc="The strategy to consensus call when both bases agree.  See the usage for more details")
 val agreementStrategy: AgreementStrategy = AgreementStrategy.Consensus,
 @arg(doc="The strategy to consensus call when both bases disagree.  See the usage for more details")
 val disagreementStrategy: DisagreementStrategy = DisagreementStrategy.Consensus
) extends FgBioTool with LazyLogging {
  Io.assertReadable(input)
  Io.assertReadable(ref)
  Io.assertCanWriteFile(output)

  private case class ThreadData
  (caller: OverlappingBasesConsensusCaller             = new OverlappingBasesConsensusCaller(agreementStrategy=agreementStrategy, disagreementStrategy=disagreementStrategy),
   templateMetric: CallOverlappingConsensusBasesMetric = CallOverlappingConsensusBasesMetric(kind=CountKind.Templates),
   basesMetric: CallOverlappingConsensusBasesMetric    = CallOverlappingConsensusBasesMetric(kind=CountKind.Bases)
  )

  override def execute(): Unit = {
    val source           = SamSource(input)
    val outSort          = sortOrder.flatMap { order => if (SamOrder(source.header).contains(order)) None else Some(order) }
    val writer           = Bams.nmUqMdTagRegeneratingWriter(writer=SamWriter(output, source.header.clone(), sort=outSort), ref=ref)
    val progress         = new ProgressLogger(logger)
    val templateIterator = Bams.templateIterator(source)
    val threadData       = new IterableThreadLocal(() => ThreadData())

    // Require queryname sorted or query grouped
    Bams.requireQueryGrouped(header=source.header, toolName="CallOverlappingConsensusBases")

    ParIterator(templateIterator, threads=threads)
      .map { template =>
        val threadDatum = threadData.get()
        threadDatum.synchronized {
          // update metrics
          threadDatum.templateMetric.total += 1
          threadDatum.basesMetric.total += template.primaryReads.map(_.length).sum
          // corrects
          val stats          = threadDatum.caller.call(template)
          val correctedBases = stats.r1CorrectedBases + stats.r2CorrectedBases
          if (stats.overlappingBases > 0) {
            threadDatum.templateMetric.overlapping += 1
            threadDatum.basesMetric.overlapping += stats.overlappingBases
            if (correctedBases > 0) {
              threadDatum.templateMetric.corrected += 1
              threadDatum.basesMetric.corrected += correctedBases
            }
          }
        }
        template.allReads.foreach(progress.record)
        template
      }.toAsync(ParIterator.DefaultChunkSize * 8).foreach { template =>
      writer ++= template.allReads
    }
    progress.logLast()
    source.safelyClose()
    writer.close()

    val templatesMetric = CallOverlappingConsensusBasesMetric(kind=CountKind.Templates)
    val basesMetric     = CallOverlappingConsensusBasesMetric(kind=CountKind.Bases)
    threadData.foreach { datum =>
      templatesMetric += datum.templateMetric
      basesMetric     += datum.basesMetric
    }

    Metric.write(metrics, templatesMetric, basesMetric)
  }
}

/** Collects the the number of reads or bases that were examined, had overlap, and were corrected as part of
  * the [[CallOverlappingConsensusBases]] tool.
  *
  * @param kind template if the counts are per template, bases if counts are in units of bases.
  * @param total the total number of templates (bases) examined
  * @param overlapping the total number of templates (bases) that were overlapping
  * @param corrected the total number of templates (bases) that were corrected.
  */
case class CallOverlappingConsensusBasesMetric
(
  kind: CountKind,
  var total: Long = 0,
  var overlapping: Long = 0,
  var corrected: Long = 0,
) extends Metric {
  def +=(other: CallOverlappingConsensusBasesMetric): CallOverlappingConsensusBasesMetric = {
    require(this.kind == other.kind)
    this.total += other.total
    this.overlapping += other.overlapping
    this.corrected += other.corrected
    this
  }
}

sealed trait CountKind extends EnumEntry

/** Enumeration for the type of counts in [[CallOverlappingConsensusBasesMetric]]. */
object CountKind extends FgBioEnum[CountKind] {
  case object Templates extends CountKind
  case object Bases extends CountKind

  override def values: immutable.IndexedSeq[CountKind] = findValues
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy