
com.fulcrumgenomics.umi.CorrectUmis.scala Maven / Gradle / Ivy
The newest version!
/*
* The MIT License
*
* Copyright (c) 2017 Fulcrum Genomics LLC
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package com.fulcrumgenomics.umi
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.collection.LeastRecentlyUsedCache
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.umi.CorrectUmis._
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util.{Metric, _}
import scala.collection.mutable
object CorrectUmis {
/**
* Holds information about the match of a UMI to an expected UMI.
* @param matched true if the match is acceptable, false otherwise
* @param umi the fixed UMI sequence that was the closest match
* @param mismatches the number of mismatches between the UMI and the reported best matching fixed UMI
* */
private[umi] case class UmiMatch(matched: Boolean, umi: String, mismatches: Int)
/**
* Metrics produced by `CorrectUmis` regarding the correction of UMI sequences to a fixed set of known UMIs.
*
* @param umi The corrected UMI sequence (or all `N`s for unmatched).
* @param total_matches The number of UMI sequences that matched/were corrected to this UMI.
* @param perfect_matches The number of UMI sequences that were perfect matches to this UMI.
* @param one_mismatch_matches The number of UMI sequences that matched with a single mismatch.
* @param two_mismatch_matches The number of UMI sequences that matched with two mismatches.
* @param other_matches The number of UMI sequences that matched with three or more mismatches.
* @param fraction_of_matches The fraction of all UMIs that matched or were corrected to this UMI.
* @param representation The `total_matches` for this UMI divided by the _mean_ `total_matches` for all UMIs.
*/
case class UmiCorrectionMetrics(umi: String,
var total_matches: Count = 0,
var perfect_matches: Count = 0,
var one_mismatch_matches: Count = 0,
var two_mismatch_matches: Count = 0,
var other_matches: Count = 0,
var fraction_of_matches: Proportion = 0,
var representation: Double = 0
) extends Metric
/**
* Finds pairs of UMIs within the given set that are within some edit distance of each other.
*
* @param umis the set of umis to check
* @param distance the maximum edit distance at which to report pairs of UMIs
* @return a Seq of three-tuples containing (umi1, umi2, edit_distance)
*/
def findUmiPairsWithinDistance(umis: Seq[String], distance: Int): Seq[(String,String,Int)] = {
umis.tails.flatMap {
case x +: ys => ys.map(y => (x, y, Sequences.countMismatches(x, y))).filter(_._3 <= distance)
case _ => Seq.empty
}.toList
}
}
@clp(group=ClpGroups.Umi, description=
"""
|Corrects UMIs stored in BAM files when a set of fixed UMIs is in use. If the set of UMIs used in
|an experiment is known and is a _subset_ of the possible randomers of the same length, it is possible
|to error-correct UMIs prior to grouping reads by UMI. This tool takes an input BAM with UMIs in a
|tag (`RX` by default) and set of known UMIs (either on the command line or in a file) and produces:
|
| 1. A new BAM with corrected UMIs in the same tag the UMIs were found in
| 2. Optionally a set of metrics about the representation of each UMI in the set
| 3. Optionally a second BAM file of reads whose UMIs could not be corrected within the specific parameters
|
|All of the fixed UMIs must be of he same length, and all UMIs in the BAM file must also have the same
|length. Multiple UMIs that are concatenated with hyphens (e.g. `AACCAGT-AGGTAGA`) are split apart,
|corrected individually and then re-assembled. A read is accepted only if all the UMIs can be corrected.
|
|Correction is controlled by two parameters that are applied per-UMI:
|
| 1. _--max-mismatches_ controls how many mismatches (no-calls are counted as mismatches) are tolerated
| between a UMI as read and a fixed UMI.
| 2. _--min-distance_ controls how many more mismatches the next best hit must have
|
|For example, with two fixed UMIs `AAAAA` and `CCCCC` and `--max-mismatches=3` and `--min-distance=2` the
|following would happen:
|
| - AAAAA would match to AAAAA
| - AAGTG would match to AAAAA with three mismatches because CCCCCC has six mismatches and 6 >= 3 + 2
| - AACCA would be rejected because it is 2 mismatches to AAAAA and 3 to CCCCCC and 3 <= 2 + 2
|
|The set of fixed UMIs may be specified on the command line using `--umis umi1 umi2 ...` or via one or
|more files of UMIs with a single sequence per line using `--umi-files umis.txt more_umis.txt`. If there
|are multiple UMIs per template, leading to hyphenated UMI tags, the values for the fixed UMIs should
|be single, non-hyphenated UMIs (e.g. if a record has `RX:Z:ACGT-GGCA`, you would use `--umis ACGT GGCA`).
|
|Records which have their UMIs corrected (i.e. the UMI is not identical to one of the expected UMIs but is
|close enough to be corrected) will by default have their original UMI stored in the `OX` tag. This can be
|disabled with the `--dont-store-original-umis` option.
|
|For a large number of input UMIs, the `--cache-size` option may used to speed up the tool. To disable
|using a cache, set the value to `0`.
""")
class CorrectUmis
( @arg(flag='i', doc="Input SAM or BAM file.") val input: PathToBam,
@arg(flag='o', doc="Output SAM or BAM file.") val output: PathToBam,
@arg(flag='r', doc="Reject BAM file to save unassigned reads.") val rejects: Option[PathToBam] = None,
@arg(flag='M', doc="Metrics file to write.") val metrics: Option[FilePath] = None,
@arg(flag='m', doc="Maximum number of mismatches between a UMI and an expected UMI.") val maxMismatches: Int,
@arg(flag='d', doc="Minimum distance (in mismatches) to next best UMI.") val minDistance: Int,
@arg(flag='u', doc="Expected UMI sequences.", minElements=0) val umis: Seq[String] = Seq.empty,
@arg(flag='U', doc="File of UMI sequences, one per line.", minElements=0) val umiFiles: Seq[FilePath] = Seq.empty,
@arg(flag='t', doc="Tag in which UMIs are stored.") val umiTag: String = ConsensusTags.UmiBases,
@arg(flag='x', doc="Don't store original UMIs upon correction.") val dontStoreOriginalUmis: Boolean = false,
@arg(doc="The number of uncorrected UMIs to cache; zero will disable the cache.") val cacheSize: Int = 100000,
@arg(doc="The minimum ratio of kept UMIs to accept. A ratio below this will cause a failure (but all files will still be written).") val minCorrected: Option[Double] = None
) extends FgBioTool with LazyLogging {
validate(umis.nonEmpty || umiFiles.nonEmpty, "At least one UMI or UMI file must be provided.")
Io.assertReadable(input)
Io.assertReadable(umiFiles)
Io.assertCanWriteFile(output)
rejects.foreach(Io.assertCanWriteFile(_))
minCorrected.foreach(m => validate(m >= 0 && m <= 1, "--min-corrected must be between 0 and 1."))
// Construct the cache
private lazy val cache = new LeastRecentlyUsedCache[String,UmiMatch](maxEntries = cacheSize)
override def execute(): Unit = {
// Construct the full set of UMI sequences to match again
val (umiSequences, umiLength) = {
val set = mutable.HashSet[String](umis:_*)
umiFiles.foreach(Io.readLines(_).map(_.trim).filter(_.nonEmpty).foreach(set.add))
validate(set.nonEmpty, s"At least one UMI sequence must be provided; none found in files ${umiFiles.mkString(", ")}")
val lengths = set.map(_.length)
validate(lengths.size == 1, s"UMIs of multiple lengths found. Lengths: ${lengths.mkString(", ")}")
(set.toArray, lengths.head)
}
// Warn if any of the UMIs are too close together
CorrectUmis.findUmiPairsWithinDistance(umiSequences.toSeq, minDistance-1).foreach { case (umi1, umi2, distance) =>
logger.warning(s"Umis $umi1 and $umi2 are $distance edits apart which is less than the min distance: $minDistance")
}
// Construct the UMI metrics objects
val unmatchedUmi = "N" * umiLength
val umiMetrics = (umiSequences ++ Seq(unmatchedUmi)).map(umi => umi -> UmiCorrectionMetrics(umi)).toMap
// Now go through and correct the UMIs in the BAM
var (totalRecords, missingUmisRecords, wrongLengthRecords, mismatchedRecords) = (0L, 0L, 0L, 0L)
val in = SamSource(input)
val out = SamWriter(output, in.header)
val rejectOut = rejects.map(r => SamWriter(r, in.header))
val progress = ProgressLogger(logger)
in.foreach { rec =>
totalRecords += 1
rec.get[String](umiTag) match {
case None | Some("") =>
if (missingUmisRecords == 0) logger.warning(s"Read (${rec.name}) detected without UMI in tag $umiTag")
missingUmisRecords += 1
rejectOut.foreach(w => w += rec)
case Some(umi: String) =>
val sequences = umi.split("-", -1)
if (sequences.exists(_.length != umiLength)) {
if (wrongLengthRecords == 0) {
logger.warning(s"Read (${rec.name}) detected with unexpected length UMI(s): ${sequences.mkString(" ")}.")
logger.warning(s"Expected UMI length: ${umiLength}")
}
wrongLengthRecords += 1
rejectOut.foreach(w => w += rec)
}
else {
// Find matches for all the UMIs
val matches = sequences.map(findBestMatch(_, umiSequences))
// Update the metrics
matches.foreach { m =>
if (m.matched) {
val metric = umiMetrics(m.umi)
metric.total_matches += 1
m.mismatches match {
case 0 => metric.perfect_matches += 1
case 1 => metric.one_mismatch_matches += 1
case 2 => metric.two_mismatch_matches += 1
case _ => metric.other_matches += 1
}
}
else {
umiMetrics(unmatchedUmi).total_matches += 1
}
}
// Output the corrected read
if (matches.forall(_.matched)) {
// Store the original UMI if enabled and there are mismatches
if (!dontStoreOriginalUmis && !matches.forall(_.mismatches == 0)) {
rec(ConsensusTags.OriginalUmiBases) = rec(this.umiTag)
}
val correctedUmi = matches.map(_.umi).mkString("-")
rec(this.umiTag) = correctedUmi
out += rec
}
else {
mismatchedRecords += 1
rejectOut.foreach(r => r += rec)
}
}
}
progress.record(rec)
}
in.safelyClose()
out.close()
rejectOut.foreach(_.close())
// Finalize the metrics
val sortedMetrics = umiMetrics.values.toSeq.sortBy(_.umi)
val totalWithUnmatched = sortedMetrics.map(_.total_matches).sum.toDouble
val meanWithoutUnmatched = sortedMetrics.filter(_.umi != unmatchedUmi).map(_.total_matches).sum / (sortedMetrics.size - 1d)
sortedMetrics.foreach { m =>
m.fraction_of_matches = m.total_matches / totalWithUnmatched
m.representation = m.total_matches / meanWithoutUnmatched
}
this.metrics.foreach(path => Metric.write(path, sortedMetrics))
// Log some summary info & warnings
val discarded = missingUmisRecords + wrongLengthRecords + mismatchedRecords
val kept = totalRecords - discarded
logger.info(f"Read ${totalRecords}%,d; kept ${kept}%,d and rejected ${discarded}%,d")
if (missingUmisRecords > 0 || wrongLengthRecords > 0) {
logger.error("###################################################################")
if (missingUmisRecords > 0) logger.error(s"# ${mismatchedRecords} were missing UMI attributes in the BAM file!")
if (wrongLengthRecords > 0) logger.error(s"# ${wrongLengthRecords} had unexpected UMIs of differing lengths in the BAM file!")
logger.error("###################################################################")
}
minCorrected.foreach { min =>
val ratioKept = 1.0 * kept / totalRecords
assert(ratioKept >= min,
f"# Final ratio of reads kept / total was ${ratioKept}%2.2f (user specified minimum was ${min}%2.2f) " +
"This could indicate a mismatch between library preparation and the provided UMI file."
)
}
}
/** Given a UMI sequence and a set of fixed UMIs, report the best match. */
private[umi] def findBestMatch(bases: String, umis: Array[String]): UmiMatch = {
val cachedResult = if (this.cacheSize > 0) cache.get(bases) else None
cachedResult match {
case Some(result) => result
case None =>
val mismatches = umis.map(umi => Sequences.countMismatches(bases, umi))
val min = mismatches.min
val matched = (min <= maxMismatches) && (mismatches.count(m => m < min + this.minDistance) == 1)
val umiMatch = UmiMatch(matched, umis(mismatches.indexOf(min)), min)
if (cacheSize > 0) cache.put(bases, umiMatch)
umiMatch
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy