
com.fulcrumgenomics.umi.ExtractUmisFromBam.scala Maven / Gradle / Ivy
The newest version!
/*
* The MIT License
*
* Copyright (c) 2016 Fulcrum Genomics
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
package com.fulcrumgenomics.umi
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamRecord, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.io.Io
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.util.{ProgressLogger, _}
import htsjdk.samtools.util._
@clp(description =
"""
|Extracts unique molecular indexes from reads in a BAM file into tags.
|
|Currently only unmapped reads are supported.
|
|Only template bases will be retained as read bases (stored in the `SEQ` field) as specified by the read structure.
|
|A read structure should be provided for each read of a template. For example, paired end reads should have two
|read structures specified. The tags to store the molecular indices will be associated with the molecular index
|segment(s) in the read structure based on the order specified. If only one molecular index tag is given, then the
|molecular indices will be concatenated and stored in that tag. Otherwise the number of molecular indices in the
|read structure should match the number of tags given. In the resulting BAM file each end of a pair will contain
|the same molecular index tags and values. Additionally, when multiple molecular indices are present the
|`--single-tag` option may be used to write all indices, concatenated, to a single tag in addition to the tags
|specified in `--molecular-index-tags`.
|
|Optionally, the read names can be annotated with the molecular indices directly. In this case, the read name
|will be formatted `+` where `` is the concatenation of read one's molecular indices.
|Similarly for ``.
|
|Mapping information will not be adjusted, as such, this tool should not be used on reads that have been mapped since
|it will lead to an BAM with inconsistent records.
|
|The read structure describes the structure of a given read as one or more read segments. A read segment describes
|a contiguous stretch of bases of the same type (ex. template bases) of some length and some offset from the start
|of the read. Read structures are made up of `` pairs much like the CIGAR string in BAM files.
|Four kinds of operators are recognized:
|
|1. `T` identifies a template read
|2. `B` identifies a sample barcode read
|3. `M` identifies a unique molecular index read
|4. `S` identifies a set of bases that should be skipped or ignored
|
|The last `` pair may be specified using a '+' sign instead of number to denote "all remaining
|bases". This is useful if, e.g., fastqs have been trimmed and contain reads of varying length.
|
|An example would be `10B3M7S100T` which describes 120 bases, with the first ten bases being a sample barcode,
|bases 11-13 being a molecular index, bases 14-20 ignored, and bases 21-120 being template bases. See
|[Read Structures](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures) for more information.
""",
group = ClpGroups.SamOrBam)
class ExtractUmisFromBam
( @arg(flag='i', doc = "Input BAM file.") val input: PathToBam,
@arg(flag='o', doc = "Output BAM file.") val output: PathToBam,
@arg(flag='r', doc = "The read structure, one per read in a template.") val readStructure: Seq[ReadStructure],
@arg(flag='t', doc = "SAM tag(s) in which to store the molecular indices.", minElements=0) val molecularIndexTags: Seq[String] = Seq.empty,
@arg(flag='s', doc = "Single tag into which to concatenate all molecular indices.") val singleTag: Option[String] = None,
@arg(flag='a', doc = "Annotate the read names with the molecular indices. See usage for more details.") val annotateReadNames: Boolean = false,
@arg(flag='c', doc = "The SAM tag with the position in read to clip adapters (e.g. `XT` as produced by Picard's `MarkIlluminaAdapters`).") val clippingAttribute: Option[String] = None
) extends FgBioTool with LazyLogging {
val progress = ProgressLogger(logger, verb="written", unit=5e6.toInt)
Io.assertReadable(input)
Io.assertCanWriteFile(output)
val (rs1, rs2) = readStructure match {
case Seq(r1, r2) => (r1.withVariableLastSegment, Some(r2.withVariableLastSegment))
case Seq(r1) => (r1.withVariableLastSegment, None)
case Seq() => invalid("No read structures given")
case _ => invalid("More than two read structures given")
}
// This can be removed once the @deprecated molecularBarcodeTags is removed
if (molecularIndexTags.isEmpty) invalid("At least one molecular-index-tag must be specified.")
// validate the read structure versus the molecular index tags
{
// create a read structure for the entire template
val rsMolecularBarcodeSegmentCount = readStructure.map(_.molecularBarcodeSegments.size).sum
// make sure each tag is of length 2
molecularIndexTags.foreach(tag => if (tag.length != 2) invalid("SAM tags must be of length two: " + tag))
// ensure we either have one tag, or we have the same # of tags as molecular indices in the read structure.
if (molecularIndexTags.size > 1 && rsMolecularBarcodeSegmentCount != molecularIndexTags.size) {
invalid("Either a single SAM tag, or the same # of SAM tags as molecular indices in the read structure, must be given.")
}
}
// Verify that if a single tag was specified that it is valid and not also contained in the per-index tags
singleTag.foreach { tag =>
if (tag.length != 2) invalid(s"All tags must be two characters: ${tag}")
if (molecularIndexTags.contains(tag)) invalid(s"$tag specified as single-tag and also per-index tag")
}
// Validate the molecular index tags against the read structures and then split them out
private val (molecularIndexTags1, molecularIndexTags2) = molecularIndexTags match {
case Seq(one) =>
val t1 = Seq.fill(rs1.molecularBarcodeSegments.length)(one)
val t2 = Seq.fill(rs2.map(_.molecularBarcodeSegments.length).getOrElse(0))(one)
(t1, t2)
case tags =>
// Validate that the # of molecular barcodes in the read structure matches the tags
val total = rs1.molecularBarcodeSegments.length + rs2.map(_.molecularBarcodeSegments.length).getOrElse(0)
validate(tags.length == total, s"Read structures contains $total molecular barcode segments, but ${tags.length} tags provided.")
(tags.take(rs1.molecularBarcodeSegments.length), tags.drop(rs1.molecularBarcodeSegments.length))
}
override def execute(): Unit = {
val in = SamSource(input)
val iterator = in.iterator.buffered
val out = SamWriter(output, in.header)
while (iterator.hasNext) {
val r1 = iterator.next()
if (r1.paired) {
// get the second read and make sure there's a read structure.
if (!iterator.hasNext) fail(s"Could not find mate for read ${r1.name}")
if (rs2.isEmpty) fail(s"Missing read structure for read two (required for paired end data).")
val r2 = iterator.next()
// verify everything is in order for paired end reads
Seq(r1, r2).foreach(r => {
if (!r.paired) fail(s"Read ${r.name} was not paired")
if (r.mapped) fail(s"Read ${r.name} was not unmapped")
})
if (!r1.firstOfPair) fail(s"Read ${r1.name} was not the first end of a pair")
if (!r2.secondOfPair) fail(s"Read ${r2.name} was not the second end of a pair")
if (!r1.name.equals(r2.name)) fail(s"Read names did not match: '${r1.name}' and '${r2.name}'")
// now do some work
val bases1 = ExtractUmisFromBam.annotateRecord(record=r1, readStructure=rs1, molecularIndexTags=molecularIndexTags1, clippingAttribute=clippingAttribute)
val bases2 = ExtractUmisFromBam.annotateRecord(record=r2, readStructure=rs2.get, molecularIndexTags=molecularIndexTags2, clippingAttribute=clippingAttribute)
if (annotateReadNames) {
// Developer Note: the delimiter must be an ascii character less than what is usually in the read names. For
// example "|" doesn't work. I am not completely sure why.
r1.name = r1.name + "+" + bases1 + bases2
r2.name = r2.name + "+" + bases1 + bases2
}
require(r1.name.equals(r2.name), s"Mismatching read name. R1: '$r1' R2: '$r2'")
// If we have duplicate tags, then concatenate them in the same order across the read pair.
val tagAndValues = (molecularIndexTags1.map { tag => (tag, r1[String](tag)) } ++ molecularIndexTags2.map { tag => (tag, r2[String](tag)) }).toList
tagAndValues.groupBy(_._1).foreach { case (tag, tuples) =>
val attr = tuples.map(_._2).mkString(ExtractUmisFromBam.UmiDelimiter)
r1(tag) = attr
r2(tag) = attr
}
// If we have a single-tag, then also output values there
singleTag.foreach { tag =>
val value = tagAndValues.map { case (t,v) => v }.mkString(ExtractUmisFromBam.UmiDelimiter)
r1(tag) = value
r2(tag) = value
}
val recs = Seq(r1, r2)
out ++= recs
recs.foreach(progress.record)
}
else {
// verify everything is in order for single end reads
if (r1.mapped) fail(s"Read ${r1.name} was not unmapped")
// now do some work
val bases1 = ExtractUmisFromBam.annotateRecord(record=r1, readStructure=rs1, molecularIndexTags=molecularIndexTags1, clippingAttribute=clippingAttribute)
if (annotateReadNames) {
// Developer Note: the delimiter must be an ascii character less than what is usually in the read names. For
// example "|" doesn't work. I am not completely sure why.
r1.name = r1.name + "+" + bases1
}
// If we have duplicate tags, then concatenate them in the same order across the read
val tagAndValues = molecularIndexTags1.map { tag => (tag, r1[String](tag)) }
tagAndValues.groupBy(_._1).foreach { case (tag, tuples) =>
val attr = tuples.map(_._2).mkString(ExtractUmisFromBam.UmiDelimiter)
r1(tag) = attr
}
out += r1
progress.record(r1)
}
}
out.close()
CloserUtil.close(iterator)
}
}
object ExtractUmisFromBam {
val UmiDelimiter = "-"
/**
* Extracts bases associated with molecular indices and adds them as SAM tags. The read's bases will only contain
* template bases as defined in the given read structure.
*/
private[umi] def annotateRecord(record: SamRecord,
readStructure: ReadStructure,
molecularIndexTags: Seq[String],
clippingAttribute: Option[String] = None): String = {
val bases = record.basesString
val qualities = record.qualsString
// get the bases associated with each segment
val readStructureBases = readStructure.extract(bases, qualities)
// get the molecular index segments
val molecularIndexBases = readStructureBases.filter(_.kind == SegmentType.MolecularBarcode).map(_.bases)
// set the index tags
// TODO: when we remove the deprecated molecularBarcodeTags option, consider whether or not we still
// need to have support for specifying a single tag via molecularIndexTags.
molecularIndexTags match {
case Seq(tag) => record(tag) = molecularIndexBases.mkString(UmiDelimiter)
case _ =>
if (molecularIndexTags.length < molecularIndexBases.length) throw new IllegalStateException("Found fewer molecular index SAM tags than molecular indices in the read structure.")
else if (molecularIndexTags.length > molecularIndexBases.length) throw new IllegalStateException("Found fewer molecular indices in the read structure than molecular index SAM tags.")
molecularIndexTags.zip(molecularIndexBases).foreach { case (tag, b) => record(tag) = b }
}
// keep only template bases and qualities in the output read
val basesAndQualities = readStructureBases.filter(_.kind == SegmentType.Template)
// update any clipping information
updateClippingInformation(record=record, clippingAttribute=clippingAttribute, readStructure=readStructure)
record.bases = basesAndQualities.map(_.bases).mkString
record.quals = basesAndQualities.map(_.quals).mkString
// return the concatenation of the molecular indices in sequencing order
molecularIndexBases.mkString
}
/**
* Update the clipping information produced by Picard's MarkIlluminaAdapters to account for any non-template bases
* that will be removed from the read.
*/
private[umi] def updateClippingInformation(record: SamRecord,
clippingAttribute: Option[String],
readStructure: ReadStructure): Unit = {
clippingAttribute.map(tag => (tag, record.get[Int](tag))) match {
case None => ()
case Some((tag, None)) => ()
case Some((tag, Some(clippingPosition))) =>
val newClippingPosition = readStructure.takeWhile(_.offset < clippingPosition).filter(_.kind == SegmentType.Template).map { t =>
if (t.length.exists(l => t.offset + l < clippingPosition)) t.length.get
else clippingPosition - t.offset
}.sum
record(tag) = newClippingPosition
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy