All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.DownsampleAndNormalizeBam.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2023 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.DownsampleAndNormalizeBam.CoverageTracker
import com.fulcrumgenomics.bam.api._
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, ProgressLogger, Sorter}
import htsjdk.samtools.util.IntervalList.IntervalMergerIterator
import htsjdk.samtools.util.{CoordMath, Interval, IntervalList, Murmur3, OverlapDetector}

/**
  * Companion object with helper classes.
  */
private object DownsampleAndNormalizeBam extends LazyLogging {
  /**
    * Tracks coverage over a set of regions.
    */
  class CoverageTracker(regions: Seq[Interval],
                        val targetCoverage: Int,
                        val minMapQ: Int
                       ) {

    // Overlap detector for region => coverage
    private val detector = new OverlapDetector[RegionCoverage](0, 0)
    regions.foreach { region =>
      detector.addLhs(new RegionCoverage(region.getContig, region.getStart, region.getEnd), region)
    }

    /**
      * Checks to see if a template has reads that contribute coverage to any bases that are currently
      * below the coverage threshold.  If so, coverage from all acceptable reads is accumulated and the
      * method returns true.  Otherwise no coverage is accumulated and the method returns false.
      *
      * @param t a Template to accumulate coverage from if it covers any bases that are under the coverage threshold
      * @return true if the template added coverage, false otherwise
      */
    def add(t: Template): Boolean = {
      val recs = t.allReads
        .filterNot(_.secondary)
        .filterNot(_.unmapped)
        .filterNot(_.duplicate)
        .filter(_.mapq >= minMapQ)
        .toSeq

      val addsCoverage = recs.exists { rec =>
        detector.getOverlaps(rec.asSam).iterator().exists { cov =>
          Range.inclusive(math.max(rec.start, cov.start), math.min(rec.end, cov.end)).exists { pos =>
            cov(pos) < targetCoverage
          }
        }
      }

      if (addsCoverage) {
        val intervals = {
          val iter = recs
            .sortBy(r => (r.refName, r.start, r.end))
            .iterator
            .map(r => new Interval(r.refName, r.start, r.end))
          new IntervalMergerIterator(iter, true, false, false)
        }

        for (interval <- intervals; cov <- detector.getOverlaps(interval).iterator()) {
          cov.add(interval.getStart, interval.getEnd)
        }
      }

      addsCoverage
    }
  }

  /**
    * Coverage counter for a region - all coordinates in the public API are 1-based closed ended.
    */
  class RegionCoverage(val chrom: String, val start: Int, val end: Int) {
    private val offset = start
    private val counts = new Array[Int](CoordMath.getLength(start, end))

    /** Returns the coverage at the given genomic position, or -1 if the position is not between start:end. */
    def apply(i: Int): Int = {
      if (i < start || i > end) {
        throw new IndexOutOfBoundsException(s"Position $i is outside of range for region $chrom:$start-$end.")
      }

      this.counts(i - offset)
    }

    /** Adds 1 to the coverage counts for all bases between from:to inclusive. */
    def add(from: Int, to: Int): Unit = {
      forloop (from=math.max(0, from-offset), until=math.min(counts.length, to-offset+1)) { i =>
        this.counts(i) += 1
      }
    }
  }
}

@clp(
  group=ClpGroups.SamOrBam,
  description =
    """
      |Downsamples a BAM in a biased way to a uniform coverage across regions.
      |
      |Attempts to downsample a BAM such that every base in the genome (or in the target `regions` if provided)
      |is covered by at least `coverage` reads.  When computing coverage:
      |  - Reads marked as secondary, duplicate or unmapped are not used
      |  - A base can receive coverage from only one read with the same queryname (i.e. mate overlaps are not counted)
      |  - Coverage is counted if a read _spans_ a base, even if that base is deleted in the read
      |
      |Reads are first sorted into a random order (by hashing read names).  Reads are then consumed one template
      |at a time, and if any read adds coverage to base that is under the target coverage, _all_ reads (including
      |secondary, unmapped, etc.) for that template are emitted into the output.
      |
      |Given the procedure used for downsampling, it is likely the output BAM will have coverage up to 2X the requested
      |coverage at regions in the input BAM that are i) well covered and ii) are close to regions that are poorly
      |covered.
    """
)
class DownsampleAndNormalizeBam
( @arg(flag='i', doc="Input SAM or BAM file.") val input: PathToBam,
  @arg(flag='o', doc="Output SAM or BAM file.") val output: PathToBam,
  @arg(flag='c', doc="Desired minimum coverage.") val coverage: Int,
  @arg(flag='m', doc="Minimum mapping quality to count a read as covering.") val minMapQ: Int = 0,
  @arg(flag='s', doc="Random seed to use when randomizing order of reads/templates.") val seed: Int = 42,
  @arg(flag='l', doc="Optional set of regions for coverage targeting.") val regions: Option[PathToIntervals] = None,
  @arg(flag='M', doc="Maximum records to be held in memory while sorting.") val maxInMemory: Int = Bams.MaxInMemory
) extends FgBioTool {

  Io.assertReadable(input)
  Io.assertCanWriteFile(output)
  regions.foreach(Io.assertReadable)

  override def execute(): Unit = {
    val in  = SamSource(input)
    val out = SamWriter(output, in.header, sort=SamOrder(in.header))

    val targets: IndexedSeq[Interval] = regions match {
      case Some(rs) => IntervalList.fromPath(rs).uniqued().getIntervals.toIndexedSeq
      case None     => in.dict.map(s => new Interval(s.name, 1, s.length)).toIndexedSeq
    }

    val tracker = new CoverageTracker(targets, this.coverage, this.minMapQ)
    var (read, written) = (0L, 0L)

    Bams.templateRandomIterator(in, this.seed, maxInMemory).foreach { template =>
      read += 1

      if (tracker.add(template)) {
        out ++= template.allReads
        written += 1
      }

      if (read % 1000000 == 0) {
        logger.info(f"Read ${read}%,d templates and wrote out ${written}%,d")
      }
    }

    in.safelyClose()
    out.close()
    logger.info(f"Read ${read}%,d templates and wrote out ${written}%,d")
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy