All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.OverlappingBasesConsensusCaller.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2022 Fulcrum Genomics
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef.FgBioEnum
import com.fulcrumgenomics.bam.api.{SamRecord, SamSource}
import com.fulcrumgenomics.commons.collection.SelfClosingIterator
import com.fulcrumgenomics.commons.util.Logger
import com.fulcrumgenomics.util.NumericTypes.PhredScore
import enumeratum.EnumEntry
import htsjdk.samtools.util.SequenceUtil


/** Consensus calls the overlapping mapped bases in read pairs.
  *
  * This will iterate through the mapped bases that overlap between the read and mate in a read pair.  If the read and
  * mate agree at a given reference position, then read and mate base will not change and the base quality returned
  * is controlled by `agreementStrategy` (see [[AgreementStrategy]]). If they disagree at a given reference position,
  * then the base and quality returned is controlled by `disagreementStrategy` (see [[DisagreementStrategy]]).
  *
  * If either read base is a no-call (e.g. N), then the read bases for each read respectively will not be changed.
  *
  * @param agreementStrategy the strategy to use when the two bases agree.  This only affects the base qualities.
  * @param disagreementStrategy the disagreement strategy to use when the two bases disagree.  This will affect both the
  *                             bases and the qualities.
  */
class OverlappingBasesConsensusCaller(agreementStrategy: AgreementStrategy = AgreementStrategy.Consensus,
                                      disagreementStrategy: DisagreementStrategy = DisagreementStrategy.Consensus) {
  import OverlappingBasesConsensusCaller.NoCall

  /** Consensus calls the overlapping bases if and only if the template is a paired end where both ends map with at
    * least one base overlapping.
    *
    * @param template the template to potentially correct.
    * @return summary statistics about how many bases were examined and modified
    */
  def call(template: Template): CorrectionStats = (template.r1, template.r2) match {
    case (Some(r1), Some(r2)) if r1.mapped && r2.mapped && r1.matesOverlap.contains(true) => call(r1, r2)
    case _ => CorrectionStats()
  }

  /** Consensus calls the overlapping bases if and only if both ends map with at least one base overlapping.
    *
    * @param r1 the first read in the pair
    * @param r2 the second read in the pair
    * @return summary statistics about how many bases were examined and modified
    */
  def call(r1: SamRecord, r2: SamRecord): CorrectionStats = {
    require(r1.mapped && r2.mapped && r1.paired && r2.paired && r1.name == r2.name && r1.matesOverlap.contains(true))
    require(r1.firstOfPair && r2.secondOfPair)

    val iter             = new ReadMateAndRefPosIterator(rec=r1, mate=r2).buffered
    var overlappingBases = 0
    var r1Corrected      = 0
    var r2Corrected      = 0

    // Walk through the iterators by reference position
    iter.foreach { item =>
      val base1 = r1.bases(item.readPos - 1)
      val base2 = r2.bases(item.matePos - 1)
      if (!SequenceUtil.isNoCall(base1) && !SequenceUtil.isNoCall(base2)) {
        // consensus call
        val (newBase1, newQual1, newBase2, newQual2) = consensusCall(
          base1 = base1,
          qual1 = r1.quals(item.readPos - 1),
          base2 = base2,
          qual2 = r2.quals(item.matePos - 1)
        )

        if (newBase1 != NoCall && newBase1 != base1) r1Corrected += 1
        if (newBase2 != NoCall && newBase2 != base2) r2Corrected += 1

        r1.bases(item.readPos - 1) = newBase1
        r1.quals(item.readPos - 1) = newQual1
        r2.bases(item.matePos - 1) = newBase2
        r2.quals(item.matePos - 1) = newQual2

        overlappingBases += 1 // only one for the template
      }
    }

    CorrectionStats(overlappingBases, r1Corrected, r2Corrected)
  }

  private def consensusCall(base1: Byte, qual1: PhredScore,
                            base2: Byte, qual2: PhredScore): (Byte, PhredScore, Byte, PhredScore) = {
    if (base1 == base2) {
      val (q1, q2) = agreementStrategy.qual(qual1, qual2)
      (base1, q1, base2, q2)
    }
    else {
      disagreementStrategy.baseAndQual(base1, qual1, base2, qual2)
    }
  }
}

object OverlappingBasesConsensusCaller {
  val NoCall: Byte = 'N'.toByte
  val NoCallQual: PhredScore = PhredScore.MinValue

  /** Returns an iterator over records in the SAM source that are in query group order, where overlapping read pairs
    * are consensus called.  The input will be re-sorted if not already query sorted or grouped.  Statistics for how
    * may bases and templates had overlaps and were modified are logged to the given logger.
    *
    * @param in the input [[SamSource]] from which to read
    * @param logger the logger, to which output statistics are written
    * @param agreementStrategy the strategy to use when the two bases agree.  This only affects the base qualities.
    * @param disagreementStrategy the disagreement strategy to use when the two bases disagree.  This will affect both the
    *                             bases and the qualities.
    */
  def iterator(in: SamSource,
               logger: Logger,
               agreementStrategy: AgreementStrategy = AgreementStrategy.Consensus,
               disagreementStrategy: DisagreementStrategy = DisagreementStrategy.Consensus): Iterator[SamRecord] = {
    val templateMetric = CallOverlappingConsensusBasesMetric(kind=CountKind.Templates)
    val basesMetric    = CallOverlappingConsensusBasesMetric(kind=CountKind.Bases)
    val caller         = new OverlappingBasesConsensusCaller(
      agreementStrategy    = agreementStrategy,
      disagreementStrategy = disagreementStrategy
    )
    val templateIterator = Bams.templateIterator(in=in).flatMap { template =>
      val stats: CorrectionStats = caller.call(template)

      // update metrics
      val basesCorrected = stats.r1CorrectedBases + stats.r2CorrectedBases
      if (basesMetric.overlapping > 0) templateMetric.overlapping += 1
      if (basesCorrected > 0) templateMetric.corrected += 1
      templateMetric.total += 1
      basesMetric.total += template.primaryReads.map(_.length).sum
      basesMetric.overlapping = stats.overlappingBases
      basesMetric.corrected = basesCorrected
      template.allReads
    }
    new SelfClosingIterator(templateIterator, closer = () => {
      val pctTemplates = if (templateMetric.overlapping == 0) 0 else 100 * templateMetric.corrected / templateMetric.overlapping.toDouble
      val pctBases     = if (basesMetric.overlapping == 0) 0 else 100 * basesMetric.corrected / basesMetric.overlapping.toDouble
      logger.info("Consensus calling overlapping read pairs statistics:")
      logger.info(f"    ${templateMetric.overlapping}%,d overlapping templates")
      logger.info(f"    ${templateMetric.corrected}%,d corrected templates (${pctTemplates}%.2f%%)")
      logger.info(f"    ${basesMetric.overlapping}%,d overlapping bases")
      logger.info(f"    ${basesMetric.corrected}%,d corrected bases (${pctBases}%.2f%%)")
    })
  }
}

/** Statistics for consensus calling overlapping bases in a read pair
  *
  * @param overlappingBases the number of bases that overlap between R1 and R2
  * @param r1CorrectedBases the number of bases modified in R1
  * @param r2CorrectedBases the number of bases modified in R2
  */
case class CorrectionStats(overlappingBases: Int = 0, r1CorrectedBases: Int = 0, r2CorrectedBases: Int = 0)


/** Trait that entries in [[AgreementStrategy]] will extend. */
sealed trait AgreementStrategy extends EnumEntry {
  def qual(qual1: PhredScore, qual2: PhredScore): (PhredScore, PhredScore)
}


/** Enum to represent the strategy to consensus call when both bases agree. */
object AgreementStrategy extends FgBioEnum[AgreementStrategy] {
  /** Call the consensus base and return a new base quality that is the sum of the two base qualities. */
  case object Consensus extends AgreementStrategy {
    def qual(qual1: PhredScore, qual2: PhredScore): (PhredScore, PhredScore) = {
      val qual = PhredScore.cap(qual1 + qual2)
      (qual, qual)
    }
  }

  /** Call the consensus base and return a new base quality that is the maximum of the two base qualities. */
  case object MaxQual extends AgreementStrategy {
    def qual(qual1: PhredScore, qual2: PhredScore): (PhredScore, PhredScore) = {
      val qual = PhredScore.cap(Math.max(qual1, qual2))
      (qual, qual)
    }
  }

  /** Leave the bases and base qualities unchanged. */
  case object PassThrough extends AgreementStrategy {
    def qual(qual1: PhredScore, qual2: PhredScore): (PhredScore, PhredScore) = (qual1, qual2)
  }

  override def values: IndexedSeq[AgreementStrategy] = super.findValues
}

/** Trait that entries in [[DisagreementStrategy]] will extend. */
sealed trait DisagreementStrategy extends EnumEntry {
  def baseAndQual(base1: Byte, qual1: PhredScore, base2: Byte, qual2: PhredScore): (Byte, PhredScore, Byte, PhredScore)
}

/** Enum to represent the strategy to consensus call when both bases disagree.  Masking a base will make the base
  * an "N" with base quality "2". */
object DisagreementStrategy extends FgBioEnum[DisagreementStrategy] {
  import OverlappingBasesConsensusCaller.{NoCall, NoCallQual}

  /** Mask both bases. */
  case object MaskBoth extends DisagreementStrategy {
    def baseAndQual(base1: Byte, qual1: PhredScore, base2: Byte, qual2: PhredScore): (Byte, PhredScore, Byte, PhredScore) = {
      (NoCall, NoCallQual, NoCall, NoCallQual)
    }
  }

  /** Mask the base with the lowest base quality, with the other base unchanged.  If the base qualities are the same,
    * mask both bases. */
  case object MaskLowerQual extends DisagreementStrategy {
    def baseAndQual(base1: Byte, qual1: PhredScore, base2: Byte, qual2: PhredScore): (Byte, PhredScore, Byte, PhredScore) = {
      //                       r1-base  r1-qual     2-base  r2-qual
      if (qual1 > qual2)      (base1,   qual1,      NoCall, NoCallQual)
      else if (qual2 > qual1) (NoCall,  NoCallQual, base2,  qual2)
      else                    (NoCall,  NoCallQual, NoCall, NoCallQual)
    }
  }

  /** Consensus call the base.  If the base qualities are the same, mask both bases.  Otherwise, call the base
    * with the highest base quality and return a new base quality that is the difference between the highest and lowest
    * base quality. */
  case object Consensus extends DisagreementStrategy {
    def baseAndQual(base1: Byte, qual1: PhredScore, base2: Byte, qual2: PhredScore): (Byte, PhredScore, Byte, PhredScore) = {
      val (base, qual) = {
        if (qual1 > qual2)      (base1, PhredScore.cap(qual1 - qual2))
        else if (qual2 > qual1) (base2, PhredScore.cap(qual2 - qual1))
        else                    (NoCall, NoCallQual)
      }
      (base, qual, base, qual)
    }
  }

  override def values: IndexedSeq[DisagreementStrategy] = super.findValues
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy