All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.pileup.StreamingPileupBuilder.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2022 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam.pileup

import com.fulcrumgenomics.bam.api.SamOrder.Coordinate
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource}
import com.fulcrumgenomics.bam.pileup.PileupBuilder._
import com.fulcrumgenomics.bam.pileup.StreamingPileupBuilder.DefaultInitialCacheSize
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.coord.LocatableOrdering
import com.fulcrumgenomics.fasta.SequenceDictionary
import htsjdk.samtools.util.{Interval, Locatable}

import java.io.Closeable
import scala.collection.mutable
import scala.language.reflectiveCalls

/** Companion object for [[StreamingPileupBuilder]]. */
object StreamingPileupBuilder {

  /** The default initial cache size for pre-allocating an array for a pileup of reads. Set to 300x coverage by default. */
  val DefaultInitialCacheSize: Int = 300

  /** Build a streaming pileup builder from a coordinate sorted SAM source. */
  def apply(
    source: SamSource,
    minMapQ: Int                                = PileupDefaults.minMapQ,
    minBaseQ: Int                               = PileupDefaults.minBaseQ,
    mappedPairsOnly: Boolean                    = PileupDefaults.mappedPairsOnly,
    includeDuplicates: Boolean                  = PileupDefaults.includeDuplicates,
    includeSecondaryAlignments: Boolean         = PileupDefaults.includeSecondaryAlignments,
    includeSupplementalAlignments: Boolean      = PileupDefaults.includeSupplementalAlignments,
    includeMapPositionsOutsideFrInsert: Boolean = PileupDefaults.includeMapPositionsOutsideFrInsert,
    initialCacheSize: Int                       = DefaultInitialCacheSize,
  ): StreamingPileupBuilder = {
    require(SamOrder(source.header).contains(Coordinate), "SAM source must be coordinate sorted!")
    lazy val iterator = source.iterator
    new StreamingPileupBuilder(
      records                            = iterator,
      dict                               = source.dict,
      minMapQ                            = minMapQ,
      minBaseQ                           = minBaseQ,
      mappedPairsOnly                    = mappedPairsOnly,
      includeDuplicates                  = includeDuplicates,
      includeSecondaryAlignments         = includeSecondaryAlignments,
      includeSupplementalAlignments      = includeSupplementalAlignments,
      includeMapPositionsOutsideFrInsert = includeMapPositionsOutsideFrInsert,
      initialCacheSize                   = initialCacheSize,
      source                             = Some(iterator)
    )
  }

  /** Helper class to ensure pileups are locatable and can be used in coordinate comparison and ordering. */
  private implicit class LocatablePileup(pileup: Pileup[PileupEntry]) extends Locatable {
    override def getContig: String = pileup.refName
    override def getStart: Int     = pileup.pos
    override def getEnd: Int       = pileup.pos
  }
}

/** A pileup builder that builds coordinate-maintaining or advancing pileups using lazy end-to-end BAM streaming.
  *
  * For every call to , this class will advance  until SAM records are found that overlap the requested
  * locus. These records will be loaded into an internal cache and then a pileup will be built from them. The pileup
  * will be cached in the event a coordinate-maintaining call to  is made in the future. If another
  * coordinate-advancing call is made to  then any previous SAM records are evicted from the internal cache that
  * do not overlap the requested locus and  will be advanced to find all additional SAM records that overlap
  * the requested locus.
  *
  * @param records a by-name iterator of SAM records that is assumed to be coordinate-sorted.
  * @param dict the sequence dictionary associated with the records we will pileup.
  * @param minBaseQ the minimum base quality that a base must have to contribute to a pileup.
  * @param minMapQ the minimum mapping quality a record must have to contribute to a pileup.
  * @param mappedPairsOnly if true, only allow paired records with both records mapped to contribute to a pileup.
  * @param includeDuplicates if true, allow records flagged as duplicates to contribute to a pileup.
  * @param includeSecondaryAlignments if true, allow records flagged as secondary alignments to contribute to a pileup.
  * @param includeSupplementalAlignments if true, allow records flagged as supplementary alignments to contribute to a pileup.
  * @param includeMapPositionsOutsideFrInsert if true, include any record of an FR pair where the site requested is outside the insert.
  * @param initialCacheSize the initial size for the internal SAM record cache, set this to your expected pileup depth.
  * @param source an optional source to additionally close upon closing this pileup builder.
  */
class StreamingPileupBuilder private(
  records: => Iterator[SamRecord],
  override val dict: SequenceDictionary,
  override val minMapQ: Int                                = PileupDefaults.minMapQ,
  override val minBaseQ: Int                               = PileupDefaults.minBaseQ,
  override val mappedPairsOnly: Boolean                    = PileupDefaults.mappedPairsOnly,
  override val includeDuplicates: Boolean                  = PileupDefaults.includeDuplicates,
  override val includeSecondaryAlignments: Boolean         = PileupDefaults.includeSecondaryAlignments,
  override val includeSupplementalAlignments: Boolean      = PileupDefaults.includeSupplementalAlignments,
  override val includeMapPositionsOutsideFrInsert: Boolean = PileupDefaults.includeMapPositionsOutsideFrInsert,
  initialCacheSize: Int                                    = DefaultInitialCacheSize,
  source: => Option[{ def close(): Unit }]                 = None,
) extends PileupBuilder with Closeable {
  import com.fulcrumgenomics.bam.pileup.StreamingPileupBuilder.LocatablePileup

  /** Records that we've accumulated that could overlap another coordinate-advancing call to . */
  private val cache: mutable.ArrayBuffer[SamRecord] = new mutable.ArrayBuffer[SamRecord](initialCacheSize)

  /** Whether this builder is able to pileup more records from the input iterator of SAM records or not. */
  private var closed: Boolean = false

  /** A genomic ordering for any locatable that utilizes the sequence dictionary corresponding to the input records. */
  private val byCoordinate: Ordering[Locatable] = LocatableOrdering(dict)

  /** The last pileup we built over the input SAM records and is cached to save time if the locus is re-requested. */
  private var lastPileup: Option[Pileup[PileupEntry]] = None

  /** The underlying buffered stream of input SAM records which is lazily summoned. */
  private lazy val underlying: Iterator[SamRecord] = records.filter(_.mapped).bufferBetter

  /** Advance this builder to the next requested locus and add all possibly overlapping records to the cache. */
  @inline private def seekAndFillCache(refIndex: Int, pos: Int): Unit = {
    // Drop records up until the next record stands a chance of overlapping the requested locus. Then, take records up
    // until the next record stands a chance of having a start greater than the requested locus plus one. All records in
    // this query have a chance of overlapping the locus so we must then filter down to only those that have an end
    // location greater than or equal to the requested locus. Finally, these records will be added to the cache so that
    // they can be evaluated for pileup at the requested locus. We add one to  for the case when a record starts
    // with an insertion.
    val maybeOverlapping = underlying
      .dropWhile(rec => rec.refIndex < refIndex || (rec.refIndex == refIndex && rec.end < pos))
      .takeWhile(rec => rec.refIndex == refIndex && rec.start <= pos + 1)
      .filter(rec => rec.end >= pos)
    cache.addAll(maybeOverlapping)
  }

  /** Efficiently advance to the next coordinate-maintaining or coordinate-advancing locus and build a pileup there. */
  def pileup(refName: String, pos: Int): Pileup[PileupEntry] = {
    require(!closed, "Cannot advance to a new locus if the pileup builder was closed!")
    val currentLocus = new Interval(refName, pos, pos)
    val refIndex     = dict(refName).index.ensuring(_ >= 0, s"Reference name not in sequence dictionary: $refName")

    // If there is a last pileup defined and it was a locus prior to the requested locus then purge the cache of any
    // records that will not overlap the requested locus, advance the iterator to accumulate maybe-overlapping records
    // back into the cache, and finally pileup records from the requested locus using the cache. If there is a last
    // pileup defined and the requested locus is at the same locus as the last pileup, then return the cached pileup.
    // If there is a last pileup defined and the requested locus is prior to the last pileup then an exception will be
    // raised since it means we are not coordinate-maintaining or coordinate-advancing. When there is no last pileup
    // defined, advance the iterator to accumulate maybe-overlapping records into a fresh cache and then pileup records
    // from the requested locus using the cache.
    val pileup = lastPileup match {
      case Some(last) if byCoordinate.lt(last, currentLocus) =>
        lastPileup = None // Set to `None` now so we can drop the object reference ASAP for garbage collection.
        if (last.refIndex != refIndex) cache.clear() // If we've moved to a new reference sequence index, simply clear.
        else cache.filterInPlace(rec => (rec.refIndex == refIndex && rec.end >= pos) || rec.refIndex > refIndex)
        this.seekAndFillCache(refIndex = refIndex, pos = pos)
        this.build(cache, refName = refName, pos = pos)
      case Some(last) if byCoordinate.equiv(last, currentLocus) => last
      case Some(last) =>
        throw new IllegalArgumentException(
          s"Queried locus $refName:$pos is behind the previous pileup and should be " +
          s"greater than or equal to: ${last.refName}:${last.pos}"
        )
      case None =>
        this.seekAndFillCache(refIndex = refIndex, pos = pos)
        this.build(cache, refName = refName, pos = pos)
    }

    lastPileup = Some(pileup)
    pileup
  }

  /** Close this pileup builder and clear internal state so that all object references can be garbage collected. */
  override def close(): Unit = { closed = true; source.foreach(_.close()); cache.clear(); lastPileup = None }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy