All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.ZipperBams.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2022 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.async.AsyncIterator
import com.fulcrumgenomics.commons.collection.BetterBufferedIterator
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.umi.ConsensusTags
import com.fulcrumgenomics.util.Io
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.{SAMFileHeader, SAMSequenceDictionary}
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor

/** Companion object that holds implementation methods for the ZipperBams command line tool. */
private[bam] object ZipperBams extends LazyLogging {
  /** Named sets of tags for reversing. */
  val TagSetsForReversing = Map(
    "Consensus" -> ConsensusTags.PerBase.TagsToReverse
  )

  /** Named sets of tags for reverse complementing. */
  val TagSetsForRevcomping = Map(
    "Consensus" -> ConsensusTags.PerBase.TagsToReverseComplement
  )

  object TagInfo {
    def apply(remove: IndexedSeq[String], reverse: IndexedSeq[String], revcomp: IndexedSeq[String]): TagInfo = {
      val r  = reverse.flatMap(t => TagSetsForReversing.getOrElse(t, Some(t)))
      val rc = revcomp.flatMap(t => TagSetsForRevcomping.getOrElse(t, Some(t)))
      new TagInfo(remove=remove, reverse=r, revcomp=rc)
    }
  }

  /** Class to hold info about all the extended tags/attrs we want to manipulate. */
  case class TagInfo private (remove: IndexedSeq[String], reverse: IndexedSeq[String], revcomp: IndexedSeq[String]) {
    val setToReverse: java.util.Set[String] = reverse.iterator.toJavaSet
    val setToRevcomp: java.util.Set[String] = revcomp.iterator.toJavaSet
    val hasRevsOrRevcomps: Boolean = reverse.nonEmpty || revcomp.nonEmpty
  }

  /** Checks the source's header declares it is queryname sorted or grouped and logs a warning if not. */
  def checkSort(source: SamSource, path: FilePath, name: String): Unit = {
    if (source.header.getSortOrder != SortOrder.queryname && source.header.getGroupOrder != GroupOrder.query) {
      logger.warning(s"${name} file ${path} does not appear to be queryname sorted or grouped per the SAM header.")
      logger.warning(s"Continuing, but your output may be incorrect.")
    }
  }

  /** Builds the header for the output BAM from the headers of the unmapped and mapped BAM.
    *
    * @param unmapped the header from the unmapped BAM
    * @param mapped the header from the mapped BAM
    * @param sort optionally, the sort order specified for the output BAM
    * @param dict the sequence dictionary of the reference genome to insert into the output header
    */
  def buildOutputHeader(unmapped: SAMFileHeader,
                        mapped: SAMFileHeader,
                        sort: Option[SamOrder],
                        dict: SAMSequenceDictionary): SAMFileHeader = {
    // Build a new header using the given sequence dictionary
    val header = new SAMFileHeader(dict)

    // Copy over comments, RGs and PGs, letting mapped override unmapped if there are conflicts
    val mappedComments       = mapped.getComments.toSet
    val mappedReadGroups     = mapped.getReadGroups.toSet
    val mappedProgramRecords = mapped.getProgramRecords.toSet
    unmapped.getComments.filterNot(mappedComments.contains).foreach(header.addComment)
    unmapped.getReadGroups.filterNot(mappedReadGroups.contains).foreach(header.addReadGroup)
    unmapped.getProgramRecords.filterNot(mappedProgramRecords.contains).foreach(header.addProgramRecord)
    mapped.getComments.foreach(header.addComment)
    mapped.getReadGroups.foreach(header.addReadGroup)
    mapped.getProgramRecords.foreach(header.addProgramRecord)

    // Set the sort and group order
    header.setSortOrder(unmapped.getSortOrder)
    header.setGroupOrder(unmapped.getGroupOrder)
    sort.foreach(s => s.applyTo(header))

    header
  }

  /** Constructs a template iterator that assumes the input is sorted or grouped, and turns it into an async iter. */
  def templateIterator(source: SamSource, bufferSize: Int): BetterBufferedIterator[Template] = {
    val recs      = source.iterator.bufferBetter
    val templates = new Iterator[Template] {
      override def hasNext: Boolean = recs.hasNext
      override def next(): Template = {
        try {
          Template(recs)
        }
        catch {
          case iae: IllegalArgumentException => throw new IllegalStateException(
            s"Error reading from ${source.toSamReader.getResourceDescription}: ${iae.getMessage}"
          )
        }
      }
    }

    if (bufferSize > 0) {
      AsyncIterator(templates, Some(bufferSize)).bufferBetter
    }
    else {
      templates.bufferBetter
    }
  }

  /** Merge information from the unmapped template into the mapped template. */
  def merge(unmapped: Template, mapped: Template, tagInfo: TagInfo): Template = {
    // Fix mate info first
    mapped.fixMateInfo()

    // Remove tags from the mapped records
    for (rec <- mapped.allReads; tag <- tagInfo.remove) rec.remove(tag)

    // Copy tags over from the unmapped record
    for (u <- unmapped.primaryReads) {
      // Get all the mapped records for the unmapped record
      val ms = if (u.unpaired || u.firstOfPair) mapped.allR1s.toSeq else mapped.allR2s.toSeq
      val hasPosReads = ms.exists(_.positiveStrand)
      val hasNegReads = ms.exists(_.negativeStrand)

      // Copy over pass/fail qc flag
      ms.foreach(_.pf = u.pf)

      // Copy over tags on any positive strand reads first
      if (hasPosReads) ms.iterator.filter(_.positiveStrand).foreach(m => copyTags(u, m))

      // Then reverse complement the unmapped read if we need to and do the negative strand reads
      if (hasNegReads) {
        if (tagInfo.hasRevsOrRevcomps) u.asSam.reverseComplement(tagInfo.setToRevcomp, tagInfo.setToReverse, !hasPosReads)
        ms.iterator.filter(_.negativeStrand).foreach(m => copyTags(u, m))
      }
    }

    mapped
  }

  /** Copies all tags from read to another, with the exception that PG is only copied if not already present on dest. */
  def copyTags(src: SamRecord, dest: SamRecord): Unit = {
    src.attributes.foreach { case (tag, value) =>
      if (tag != "PG" || !dest.contains("PG")) {
        dest(tag) = value
      }
    }
  }
}

@clp(group=ClpGroups.SamOrBam, description=
  """
    |Zips together an unmapped and mapped BAM to transfer metadata into the output BAM.
    |
    |Both the unmapped and mapped BAMs _must_ be a) queryname sorted or grouped (i.e. all records with the same
    |name are grouped together in the file), and b) have the same ordering of querynames.  If either of these are
    |violated the output is undefined!
    |
    |All tags present on the unmapped reads are transferred to the mapped reads.  The options `--tags-to-reverse`
    |and `--tags-to-revcomp` will cause tags on the unmapped reads to be reversed or reverse complemented before
    |being copied to reads mapped to the negative strand.  These options can take a mixture of two-letter tag names
    |and the names of tag sets, which will be expanded into sets of tag names.  Currently the only named tag set
    |is "Consensus" which contains all the per-base consensus tags produced by fgbio consensus callers.
    |
    |By default the mapped BAM is read from standard input (stdin) and the output BAM is written to standard
    |output (stdout). This can be changed using the `--input/-i` and `--output/-o` options.
    |
    |By default the output BAM file is emitted in the same order as the input BAMs.  This can be overridden
    |using the `--sort` option, though in practice it may be faster to do the following:
    |
    |```
    |fgbio --compression 0 ZipperBams -i mapped.bam -u unmapped.bam -r ref.fa | samtools sort -@ $(nproc)
    |```
  """)
class ZipperBams
( @arg(flag='i', doc="Mapped SAM or BAM.") val input: PathToBam = Io.StdIn,
  @arg(flag='u', doc="Unmapped SAM or BAM.") val unmapped: PathToBam,
  @arg(flag='r', doc="Path to the reference used in alignment. Must have accompanying .dict file.") val ref: PathToFasta,
  @arg(flag='o', doc="Output SAM or BAM file.") val output: PathToBam = Io.StdOut,
  @arg(doc="Tags to remove from the mapped BAM records.", minElements=0)
  val tagsToRemove: IndexedSeq[String] = IndexedSeq.empty,
  @arg(doc="Set of optional tags to reverse on reads mapped to the negative strand.", minElements=0)
  val tagsToReverse: IndexedSeq[String] = IndexedSeq.empty,
  @arg(doc="Set of optional tags to reverse complement on reads mapped to the negative strand.", minElements=0)
  val tagsToRevcomp: IndexedSeq[String] = IndexedSeq.empty,
  @arg(flag='s', doc="Sort the output BAM into the given order.") val sort: Option[SamOrder] = None,
  @arg(flag='b', doc="Buffer this many read-pairs while reading the input BAMs.") val buffer: Int = 5000
) extends FgBioTool {
  import ZipperBams._

  Io.assertReadable(ref)
  Io.assertReadable(input)
  Io.assertReadable(unmapped)
  Io.assertCanWriteFile(output)

  override def execute(): Unit = {
    val unmappedSource = SamSource(unmapped)
    val mappedSource   = SamSource(input)
    checkSort(unmappedSource, unmapped, "unmapped")
    checkSort(mappedSource,  input, "input")

    val dict    = SAMSequenceDictionaryExtractor.extractDictionary(this.ref)
    val header  = buildOutputHeader(unmappedSource.header, mappedSource.header, this.sort, dict)
    val out     = SamWriter(output, header, sort=this.sort)
    val tagInfo = TagInfo(remove=tagsToRemove, reverse=tagsToReverse, revcomp=tagsToRevcomp)

    if (tagInfo.remove.nonEmpty)  logger.info(s"Tags for removal from mapped BAM: ${tagInfo.remove.mkString(", ")}")
    if (tagInfo.reverse.nonEmpty) logger.info(s"Tags being reversed: ${tagInfo.reverse.mkString(", ")}")
    if (tagInfo.revcomp.nonEmpty) logger.info(s"Tags being reverse complemented: ${tagInfo.revcomp.mkString(", ")}")

    val unmappedIter = templateIterator(unmappedSource, bufferSize=buffer)
    val mappedIter   = templateIterator(mappedSource, bufferSize=buffer)

    unmappedIter.foreach { template =>
      if (mappedIter.hasNext && mappedIter.head.name == template.name) {
        out ++= merge(unmapped=template, mapped=mappedIter.next(), tagInfo=tagInfo).allReads
      }
      else {
        logger.debug(s"Found an unmapped read with no corresponding mapped read: ${template.name}")
        out ++= template.allReads
      }
    }

    out.close()

    // There really should be no more mapped reads!
    if (mappedIter.hasNext) {
      throw new IllegalStateException(
        """Error: processed all unmapped reads but there are mapped reads remaining to be read.
        |Please ensure the unmapped and mapped reads have the same set of read names in the same
        |order, and reads with the same name are consecutive (grouped) in each input""".stripMargin
      )
    }
    
    unmappedSource.safelyClose()
    mappedSource.safelyClose()
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy