
com.fulcrumgenomics.umi.DuplexConsensusCaller.scala Maven / Gradle / Ivy
The newest version!
/*
* The MIT License
*
* Copyright (c) 2017 Fulcrum Genomics LLC
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package com.fulcrumgenomics.umi
import java.lang.Math.min
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.umi.DuplexConsensusCaller._
import com.fulcrumgenomics.umi.UmiConsensusCaller.ReadType.{ReadType, _}
import com.fulcrumgenomics.umi.UmiConsensusCaller.{SimpleRead, SourceRead}
import com.fulcrumgenomics.util.NumericTypes.PhredScore
/**
* Container for constant values and types used by the [[DuplexConsensusCaller]]
*/
object DuplexConsensusCaller {
/** Various default values for the consensus caller. */
val ErrorRatePreUmi: PhredScore = 45.toByte
val ErrorRatePostUmi: PhredScore = 40.toByte
val MinInputBaseQuality: PhredScore = 15.toByte
val NoCall: Byte = 'N'.toByte
val NoCallQual: PhredScore = PhredScore.MinValue
/** Additional filter strings used when rejecting reads. */
val FilterMinReads = "Not Enough Reads (Either Total, AB, or BA)"
val FilterFragments = "Being Fragment/Non-Paired Reads"
val FilterSsConsensus = "Only Generating One Strand Consensus"
val FilterCollision = "Potential collision between independent duplex molecules"
/**
* Stores information about a consensus read. Bases, arrays, and the two single
* strand consensus reads (when present) must all be the same length.
*
* @param bases the base calls of the consensus read
* @param quals the calculated phred-scaled quality scores of the bases
* @param errors the total number of raw read errors vs. the final consensus sequence at each base
* @param abConsensus the AB consensus read from which the duplex was constructed
* @param baConsensus the BA consensus read from which the duplex was constructed, if present. It may not be present
* if created from the AB strand reads only.
*/
case class DuplexConsensusRead(id: String,
bases: Array[Byte],
quals: Array[Byte],
errors: Array[Short],
abConsensus: VanillaConsensusRead,
baConsensus: Option[VanillaConsensusRead]) extends SimpleRead {
require(bases.length == quals.length, "Bases and qualities are not the same length.")
require(bases.length == errors.length, "Bases and errors are not the same length.")
require(abConsensus.length == bases.length, "Bases and AB consensus are not the same length.")
require(baConsensus.forall(_.length == bases.length), "Bases and BA consensus are not the same length.")
}
}
/**
* Creates duplex consensus reads from SamRecords that have been grouped by their source molecule
* but not yet by source strand.
*
* Filters incoming bases by quality before building the duplex.
*
* Output reads and bases are constructed only if there is at least one read from each source
* molecule strand. Otherwise no filtering is performed.
*
* Note that a consequence of the above is that the output reads can be shorter than _some_ of
* the input reads if the input reads are of varying length; they will be the length at which
* there is coverage from both source strands.
*
* @param readNamePrefix the prefix to apply to all consensus read names
* @param readGroupId the read group ID to apply to all created consensus reads
* @param minInputBaseQuality the minimum input base quality score to use a raw read's base
* @param trim if true, quality trim reads in addition to masking. If false just mask.
* @param errorRatePreUmi the estimated rate of errors in the DNA prior to attaching UMIs
* @param errorRatePostUmi the estimated rate of errors in the DNA post attaching UMIs
* @param minReads the minimum number of input reads to a consensus read (see [[CallDuplexConsensusReads]]).
* */
class DuplexConsensusCaller(override val readNamePrefix: String,
override val readGroupId: String = "A",
val minInputBaseQuality: PhredScore = DuplexConsensusCaller.MinInputBaseQuality,
val trim: Boolean = false,
val errorRatePreUmi: PhredScore = DuplexConsensusCaller.ErrorRatePreUmi,
val errorRatePostUmi: PhredScore = DuplexConsensusCaller.ErrorRatePostUmi,
val minReads: Seq[Int] = Seq(1),
val maxReadsPerStrand: Int = VanillaUmiConsensusCallerOptions.DefaultMaxReads
) extends UmiConsensusCaller[DuplexConsensusRead] with LazyLogging {
private val Seq(minTotalReads, minXyReads, minYxReads) = this.minReads.padTo(3, this.minReads.last)
// For depth thresholds it's required that ba <= ab <= cc
require(minXyReads <= minTotalReads, "min-reads values must be specified high to low.")
require(minYxReads <= minXyReads, "min-reads values must be specified high to low.")
private val ssCaller = new VanillaUmiConsensusCaller(readNamePrefix="x", options=new VanillaUmiConsensusCallerOptions(
errorRatePreUmi = this.errorRatePreUmi,
errorRatePostUmi = this.errorRatePostUmi,
minReads = 1,
maxReads = maxReadsPerStrand,
minInputBaseQuality = this.minInputBaseQuality,
minConsensusBaseQuality = PhredScore.MinValue,
producePerBaseTags = true
))
/**
* Returns the MI tag **with** the trailing suffix that identifies /A vs /B
*/
private def sourceMoleculeAndStrandId(rec: SamRecord): String = {
// Optimization: speed up retrieving this tag by storing it in the transient attributes
rec.transientAttrs.get[String](ConsensusTags.MolecularId) match {
case Some(mi) => mi
case None =>
rec.get[String](ConsensusTags.MolecularId) match {
case Some(mi) => mi
case None => throw new IllegalStateException(s"Read ${rec.name} is missing it's ${ConsensusTags.MolecularId} tag.")
}
}
}
/** Returns a clone of this consensus caller in a state where no previous reads were processed. I.e. all counters
* are set to zero.*/
def emptyClone(): DuplexConsensusCaller = {
new DuplexConsensusCaller(
readNamePrefix = readNamePrefix,
readGroupId = readGroupId,
minInputBaseQuality = minInputBaseQuality,
trim = trim,
errorRatePreUmi = errorRatePreUmi,
errorRatePostUmi = errorRatePostUmi,
minReads = minReads,
maxReadsPerStrand = maxReadsPerStrand
)
}
// The key in a [[SamRecord]]'s transient attributes that caches the molecular identifier. The molecular identifier is
// cached by the `sourceMoleculeId` method.
private val MolecularIdNoTrailingSuffix: String = "__" + ConsensusTags.MolecularId + "__"
/**
* Returns the MI tag **minus** the trailing suffix that identifies /A vs /B
*/
override protected[umi] def sourceMoleculeId(rec: SamRecord): String = {
// Optimization: speed up retrieving this tag by storing it in the transient attributes
rec.transientAttrs.getOrElse[String](MolecularIdNoTrailingSuffix, {
val mi = sourceMoleculeAndStrandId(rec)
val index = mi.lastIndexOf('/')
val miRoot = mi.substring(0, index)
rec.transientAttrs(MolecularIdNoTrailingSuffix) = miRoot
miRoot
})
}
/**
* Takes in all the reads for a source molecule and, if possible, generates one or more
* output consensus reads as SAM records.
*
* @param recs the full set of source SamRecords for a source molecule
* @return a seq of consensus SAM records, may be empty
*/
override protected def consensusSamRecordsFromSamRecords(recs: Seq[SamRecord]): Seq[SamRecord] = {
val (pairs, frags) = recs.partition(_.paired)
rejectRecords(frags, FilterFragments)
if (pairs.isEmpty) {
Nil
}
else {
// Group the reads by /A vs. /B and ensure that /A is the first group and /B the second
val groups = pairs.groupBy(r => sourceMoleculeAndStrandId(r)).toSeq.sortBy { case (mi, _) => mi }.map(_._2)
require(groups.length <= 2, "SamRecords supplied with more than two distinct MI values.")
val x = groups.head
val y = groups.lift(1).getOrElse(Seq.empty)
if (hasMinimumNumberOfReads(x, y)) {
val consensus = callDuplexConsensusRead(x, y)
if (consensus.forall(duplexHasMinimumNumberOfReads)) {
consensus
} else {
// Even though we had enough reads going into consensus calling, we can
// lose some in the process (e.g. to CIGAR filtering), and end up with
// consensus reads that will than fail the min-reads check
rejectRecords(groups.flatten, FilterMinReads)
Nil
}
}
else {
rejectRecords(groups.flatten, FilterMinReads)
Nil
}
}
}
/** Returns true if there are enough reads according to the minReads option. */
private def hasMinimumNumberOfReads(x: Seq[SamRecord], y: Seq[SamRecord]): Boolean = {
// Get the number of reads per strand, in decreasing (more stringent) order.
val (numXy: Int, numYx: Int) = {
val numAb = x.count(r => r.paired && r.firstOfPair)
val numBa = y.count(r => r.paired && r.firstOfPair)
if (numAb >= numBa) (numAb, numBa) else (numBa, numAb)
}
this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx
}
/** Returns true if there are enough reads composing the consensus according to the minReads option. */
private def duplexHasMinimumNumberOfReads(consensus: SamRecord): Boolean = {
val (numXy: Int, numYx: Int) = {
val numAb = consensus[Int](ConsensusTags.PerRead.AbRawReadCount)
val numBa = consensus[Int](ConsensusTags.PerRead.BaRawReadCount)
if (numAb >= numBa) (numAb, numBa) else (numBa, numAb)
}
this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx
}
private def areAllSameStrand(reads: Seq[SamRecord]): Boolean = {
if (reads.nonEmpty) {
val ss1Flag = reads.head.negativeStrand
reads.forall(_.negativeStrand == ss1Flag)
}
else {
true
}
}
// An empty sequence of [[SamRecord]]s, used in subGroupRecords to improve performance
private val NoSamRecords: Seq[SamRecord] = Seq.empty[SamRecord]
/** Split records into those that should make a single-end consensus read, first of pair consensus read,
* and second of pair consensus read, respectively. This method is overridden in [[DuplexConsensusCaller]] to
* improve performance since no fragment reads should be given to this method.
*/
override protected def subGroupRecords(records: Seq[SamRecord]): (Seq[SamRecord], Seq[SamRecord], Seq[SamRecord]) = {
// NB: the input records should not have fragments
val (firstOfPair, secondOfPair) = records.partition { r =>
require(r.paired, "Fragment reads should not be given to subGroupRecords in DuplexConsensusCaller.")
r.firstOfPair
}
(NoSamRecords, firstOfPair, secondOfPair)
}
/** Attempts to call a duplex consensus reads from the two sets of reads, one for each strand. */
private def callDuplexConsensusRead(ab: Seq[SamRecord], ba: Seq[SamRecord]): Seq[SamRecord] = {
// Fragments have no place in duplex land (and are filtered out previously anyway)!
val (_, abR1s, abR2s) = subGroupRecords(ab)
val (_, baR1s, baR2s) = subGroupRecords(ba)
// Get all the alignments to one end of the source molecule
val singleStrand1 = abR1s ++ baR2s
val singleStrand2 = abR2s ++ baR1s
// The orientation of AB and BA reads should be:
// AB R1: + AB R2: -
// BA R1: - BA R2: +
// or vice versa (AB-R1:-, AB-R2:+, AB-R1:-, AB-R2: +
// Therefore, AB-R1s and BA-R2s should be on the same strand, and the same for AB-R2s and BA-R1s
// Check for this explicitly here.
(areAllSameStrand(singleStrand1), areAllSameStrand(singleStrand2)) match {
case (false, _) =>
val ss1Mi = sourceMoleculeId(singleStrand1.head)
rejectRecords(ab ++ ba, FilterCollision)
logger.debug(s"Not all AB-R1s and BA-R2s were on the same strand for molecule with id: $ss1Mi")
Nil
case (_, false) =>
val ss2Mi = sourceMoleculeId(singleStrand2.head)
rejectRecords(ab ++ ba, FilterCollision)
logger.debug(s"Not all AB-R2s and BA-R1s were on the same strand for molecule with id: $ss2Mi")
Nil
case (true, true) =>
// Filter by common indel pattern with AB and BA together
val filteredXs = filterToMostCommonAlignment((abR1s ++ baR2s).flatMap(toSourceRead(_, this.minInputBaseQuality, this.trim)))
val filteredYs = filterToMostCommonAlignment((abR2s ++ baR1s).flatMap(toSourceRead(_, this.minInputBaseQuality, this.trim)))
// Then split then back apart for SS calling
val filteredAbR1s = filteredXs.filter(_.sam.exists(_.firstOfPair))
val filteredAbR2s = filteredYs.filter(_.sam.exists(_.secondOfPair))
val filteredBaR1s = filteredYs.filter(_.sam.exists(_.firstOfPair))
val filteredBaR2s = filteredXs.filter(_.sam.exists(_.secondOfPair))
// Call the single-stranded consensus reads
val abR1Consensus = ssCaller.consensusCall(filteredAbR1s)
val abR2Consensus = ssCaller.consensusCall(filteredAbR2s)
val baR1Consensus = ssCaller.consensusCall(filteredBaR1s)
val baR2Consensus = ssCaller.consensusCall(filteredBaR2s)
// Call the duplex reads
val duplexR1Sources = filteredAbR1s ++ filteredBaR2s
val duplexR2Sources = filteredAbR2s ++ filteredBaR1s
val duplexR1 = duplexConsensus(abR1Consensus, baR2Consensus, duplexR1Sources)
val duplexR2 = duplexConsensus(abR2Consensus, baR1Consensus, duplexR2Sources)
// Convert to SamRecords and return
(duplexR1, duplexR2) match {
case (Some(r1), Some(r2)) =>
Seq(
createSamRecord(r1, FirstOfPair, toUmiBasesForConsensusUmiCalling(duplexR1Sources, firstOfPair=true)),
createSamRecord(r2, SecondOfPair, toUmiBasesForConsensusUmiCalling(duplexR2Sources, firstOfPair=false))
)
case _ =>
// NB: some reads may have been rejected already in filterToMostCommonAlignment, so just
// reject those records that survived the initial filtering.
val remainingRecs = filteredXs ++ filteredYs
rejectRecords(remainingRecs.flatMap(_.sam), FilterSsConsensus)
Nil
}
}
}
/** Extracts the UMI bases for each source read, ignoring reads without UMI bases, and normalizes the order of UMI
* bases across the AB and BA reads.
*
* Important: the UMI bases for the source reads are not the same across R1s and R2s. For example, considering
* combining the AB R1s and BA R2s to call the R1 duplex consensus R1. If the UMI bases for the AB R1s are GAT-TAG,
* then the UMI-bases for the BA R2s are TAG-GAT, where A=TAG and B=GAT. To make them all agree (assuming no
* sequencing error), we output them in the order found in R1 (i.e. TAG-GAT). The different order of UMI-bases
* between the AB and BA reads is due to how the duplex tags are constructed and sequenced.
* */
private def toUmiBasesForConsensusUmiCalling(reads: Seq[SourceRead], firstOfPair: Boolean): Seq[String] = {
// The source read may not have the RX tag, and if so, we ignore them through
// the use of .flatMap and .get, with the latter returning an `Some(bases)` if the
// UMI bases are present, `None` otherwise.
reads.flatMap(_.sam).flatMap { rec =>
rec.get[String](ConsensusTags.UmiBases).map { umi =>
if (rec.firstOfPair == firstOfPair) umi else umi.split("-", -1).reverse.mkString("-")
}
}
}
/**
* Constructs a duplex consensus read from a pair of single strand consensus reads.
* If either of the incoming reads are undefined, the duplex read will be undefined.
*
* @param ab the single-strand consensus from one strand
* @param ba the single-strand consensus from the other strand
* @return a duplex consensus if one can be built
*/
private[umi] def duplexConsensus(ab: Option[VanillaConsensusRead],
ba: Option[VanillaConsensusRead],
sourceReads: Seq[SourceRead]): Option[DuplexConsensusRead] = {
// Calculate the length that we'll create a duplex read as "the shorter of the available reads",
// and then filter each read to make sure it has some coverage in that section of the read
val len = min(ab.map(_.length).getOrElse(Int.MaxValue), ba.map(_.length).getOrElse(Int.MaxValue))
val abX = ab.filter(_.depths.iterator.take(len).exists(_ > 0))
val baX = ba.filter(_.depths.iterator.take(len).exists(_ > 0))
(abX, baX) match {
case (Some(a), None) => Some(DuplexConsensusRead(id=a.id, a.bases, a.quals, a.errors, a, None))
case (None, Some(b)) => Some(DuplexConsensusRead(id=b.id, b.bases, b.quals, b.errors, b, None))
case (Some(a), Some(b)) =>
val id = a.id
val bases = new Array[Byte](len)
val quals = new Array[Byte](len)
val errors = new Array[Short](len)
forloop(from=0, until=len) { i =>
val aBase = a.bases(i)
val bBase = b.bases(i)
val aQual = a.quals(i)
val bQual = b.quals(i)
// Capture the raw consensus base prior to masking it to N, so that we can compute
// errors vs. the actually called base.
val (rawBase, rawQual) = {
if (aBase == bBase) (aBase, PhredScore.cap(aQual + bQual))
else if (aQual > bQual) (aBase, PhredScore.cap(aQual - bQual))
else if (bQual > aQual) (bBase, PhredScore.cap(bQual - aQual))
else (aBase, PhredScore.MinValue)
}
// Then mask it if appropriate
val (base, qual) = if (aBase == NoCall || bBase == NoCall || rawQual == PhredScore.MinValue) (NoCall, NoCallQual) else (rawBase, rawQual)
bases(i) = base
quals(i) = qual
// NB: optimized based on profiling; was previously:
// sourceReads.count(s => s.length > i && isError(s.bases(i), rawBase))
var numErrors = 0
val sourceReadsArray = sourceReads.toArray
forloop(from=0, until=sourceReadsArray.length) { j =>
val sourceRead = sourceReadsArray(j)
if (sourceRead.length > i && isError(sourceRead.bases(i), rawBase)) {
numErrors += 1
}
}
errors(i) = min(numErrors, Short.MaxValue).toShort
}
Some(DuplexConsensusRead(id=id, bases, quals, errors, a.truncate(bases.length), Some(b.truncate(bases.length))))
case _ =>
None
}
}
/** Function that returns true if the pair of bases are both valid/called bases and do not match each other. */
@inline private def isError(lhs: Byte, rhs: Byte): Boolean = lhs != NoCall && rhs != NoCall && lhs != rhs
/**
* Creates a SamRecord with a ton of additional tags annotating the duplex read.
*/
override protected def createSamRecord(read: DuplexConsensusRead, readType: ReadType, umis: Seq[String] = Seq.empty): SamRecord = {
val rec = super.createSamRecord(read, readType, umis)
// Calculate the total depths across both SS consensus reads
val totalDepths: Array[Int] = read.baConsensus match {
case Some(ba) => read.abConsensus.depths.zip(ba.depths).map(x => x._1 + x._2)
case None => read.abConsensus.depths.map(_.toInt)
}
{ import ConsensusTags.PerRead._
rec(RawReadCount) = totalDepths.max
rec(MinRawReadCount) = totalDepths.min
rec(RawReadErrorRate) = sum(read.errors) / totalDepths.sum.toFloat
rec(AbRawReadCount) = read.abConsensus.depths.max.toInt
rec(AbMinRawReadCount) = read.abConsensus.depths.min.toInt
rec(AbRawReadErrorRate) = sum(read.abConsensus.errors) / sum(read.abConsensus.depths).toFloat
read.baConsensus match {
case Some(ba) =>
rec(BaRawReadCount) = ba.depths.max.toInt
rec(BaMinRawReadCount) = ba.depths.min.toInt
rec(BaRawReadErrorRate) = sum(ba.errors) / sum(ba.depths).toFloat
case None =>
rec(BaRawReadCount) = 0
rec(BaMinRawReadCount) = 0
rec(BaRawReadErrorRate) = 0.0f
}
}
{ import ConsensusTags.PerBase._
rec(AbRawReadCount) = read.abConsensus.depths
rec(AbRawReadErrors) = read.abConsensus.errors
rec(AbConsensusBases) = read.abConsensus.baseString
rec(AbConsensusQuals) = read.abConsensus.qualString
read.baConsensus.foreach { ab =>
rec(BaRawReadCount) = ab.depths
rec(BaRawReadErrors) = ab.errors
rec(BaConsensusBases) = ab.baseString
rec(BaConsensusQuals) = ab.qualString
}
}
rec
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy