All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.ClipBam.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, Metric, ProgressLogger}
import enumeratum.EnumEntry
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.SamPairUtil

import scala.collection.immutable.IndexedSeq

@clp(group = ClpGroups.SamOrBam, description=
  """
    |Clips reads from the same template. Ensures that at least N bases are clipped from any end of the read (i.e.
    |R1 5' end, R1 3' end, R2 5' end, and R2 3' end).  Optionally clips reads from the same template to eliminate overlap
    |between the reads.  This ensures that downstream processes, particularly variant calling, cannot double-count
    |evidence from the same template when both reads span a variant site in the same template.
    |
    |Clipping overlapping reads is only performed on `FR` read pairs, and is implemented by clipping approximately half
    |the overlapping bases from each read.  By default hard clipping is performed; soft-clipping may be substituted
    |using the `--soft-clip` parameter.
    |
    |Secondary alignments and supplemental alignments are not clipped, but are passed through into the
    |output.
    |
    |In order to correctly clip reads by template and update mate information, the input BAM must be either
    |`queryname` sorted or `query` grouped.  If your input BAM is not in an appropriate order the sort can be
    |done in streaming fashion with, for example:
    |
    |```
    |samtools sort -n -u in.bam | fgbio ClipBam -i /dev/stdin ...
    |```
    |
    |The output sort order may be specified with `--sort-order`.  If not given, then the output will be in the same
    |order as input.
    |
    |Any existing `NM`, `UQ` and `MD` tags are repaired, and mate-pair information updated.
    |
    |Three clipping modes are supported:
    |1. `Soft` - soft-clip the bases and qualities.
    |2. `SoftWithMask` - soft-clip and mask the bases and qualities (make bases Ns and qualities the minimum).
    |3. `Hard` - hard-clip the bases and qualities.
    |
    |The `--upgrade-clipping` parameter will convert all existing clipping in the input to the given more stringent mode:
    |from `Soft` to either `SoftWithMask` or `Hard`, and `SoftWithMask` to `Hard`. In all other cases, clipping remains
    |the same prior to applying any other clipping criteria.
  """)
class ClipBam
( @arg(flag='i', doc="Input SAM or BAM file of aligned reads in coordinate order.") val input: PathToBam,
  @arg(flag='o', doc="Output SAM or BAM file.") val output: PathToBam,
  @arg(flag='m', doc="Optional output of clipping metrics.") val metrics: Option[FilePath] = None,
  @arg(flag='r', doc="Reference sequence fasta file.") val ref: PathToFasta,
  @arg(flag='c', doc="The type of clipping to perform.") val clippingMode: ClippingMode = ClippingMode.Hard,
  @arg(flag='a', doc="Automatically clip extended attributes that are the same length as bases.") val autoClipAttributes: Boolean = false,
  @arg(flag='H', doc="Upgrade all existing clipping in the input to the given clipping mode prior to applying any other clipping criteria.") val upgradeClipping: Boolean = false,
  @arg(          doc="Require at least this number of bases to be clipped on the 5' end of R1") val readOneFivePrime: Int  = 0,
  @arg(          doc="Require at least this number of bases to be clipped on the 3' end of R1") val readOneThreePrime: Int = 0,
  @arg(          doc="Require at least this number of bases to be clipped on the 5' end of R2") val readTwoFivePrime: Int  = 0,
  @arg(          doc="Require at least this number of bases to be clipped on the 3' end of R2") val readTwoThreePrime: Int = 0,
  @arg(          doc="Clip overlapping reads.") val clipOverlappingReads: Boolean = false,
  @arg(          doc="Clip reads in FR pairs that sequence past the far end of their mate.") val clipBasesPastMate: Boolean = false,
  @arg(flag='S', doc="The sort order of the output. If not given, output will be in the same order as input if the input.")
  val sortOrder: Option[SamOrder] = None
) extends FgBioTool with LazyLogging {
  Io.assertReadable(input)
  Io.assertReadable(ref)
  Io.assertCanWriteFile(output)

  validate(upgradeClipping || clipOverlappingReads || clipBasesPastMate || Seq(readOneFivePrime, readOneThreePrime, readTwoFivePrime, readTwoThreePrime).exists(_ != 0),
    "At least one clipping option is required")

  if (clipBasesPastMate && clipOverlappingReads) {
    logger.info("Clipping overlapping reads supersedes clipping past the far end of their mate.")
  }

  private val clipper = new SamRecordClipper(mode=clippingMode, autoClipAttributes=autoClipAttributes)

  override def execute(): Unit = {
    val in     = SamSource(input)
    val header = in.header
    val out    = Bams.nmUqMdTagRegeneratingWriter(writer=SamWriter(output, header.clone(), sort=sortOrder), ref=ref)

    // Require queryname sorted or query grouped
    Bams.requireQueryGrouped(header=in.header, toolName="ClipBam")

    val metricsMap: Map[ReadType, ClippingMetrics] = this.metrics.map { _ =>
      ReadType.values.map { readType => readType -> ClippingMetrics(read_type=readType) }.toMap
    }.getOrElse(Map.empty)

    // Go through and clip reads and fix their mate information
    Bams.templateIterator(in).foreach { template =>
      if (this.upgradeClipping) template.allReads.foreach { r => this.clipper.upgradeAllClipping(r) }

      (template.r1, template.r2) match {
        case (Some(r1), Some(r2)) =>
          clipPair(r1=r1, r2=r2, r1Metric=metricsMap.get(ReadType.ReadOne), r2Metric=metricsMap.get(ReadType.ReadTwo))
          SamPairUtil.setMateInfo(r1.asSam, r2.asSam, true)
          template.r1Supplementals.foreach(s => SamPairUtil.setMateInformationOnSupplementalAlignment(s.asSam, r2.asSam, true))
          template.r2Supplementals.foreach(s => SamPairUtil.setMateInformationOnSupplementalAlignment(s.asSam, r1.asSam, true))
        case (Some(frag), None) =>
          clipFragment(frag=frag, metric=metricsMap.get(ReadType.Fragment))
        case _ => ()
      }

      out ++= template.allReads
    }
    out.close()

    this.metrics.foreach { path =>
      // Update the metrics for "All" and "Pair" read types
      import ReadType._
      metricsMap.foreach {
        case (Fragment, metric)          => metricsMap(All).add(metric)
        case (ReadOne | ReadTwo, metric) => Seq(Pair, All).foreach { r => metricsMap(r).add(metric) }
        case _                           => ()
      }
      // Write it!
      Metric.write(path, ReadType.values.map { readType => metricsMap(readType)})
    }

  }

  /** Clips a fixed amount from the reads and then clips overlapping reads.
    */
  private[bam] def clipFragment(frag: SamRecord, metric: Option[ClippingMetrics] = None): Unit = {
    val priorBasesClipped = frag.cigar.clippedBases

    // Clip the read!
    val numFivePrime  = this.clipper.clip5PrimeEndOfRead(frag, readOneFivePrime)
    val numThreePrime = this.clipper.clip3PrimeEndOfRead(frag, readOneThreePrime)

    // Update metrics
    metric.foreach { m =>
      m.update(
        rec                 = frag,
        priorBasesClipped   = priorBasesClipped,
        numFivePrime        = numFivePrime,
        numThreePrime       = numThreePrime,
        numOverlappingBases = 0,
        numExtendingBases   = 0
      )
    }
  }

  /** Clips a fixed amount from the reads and then clips overlapping reads.
    */
  private[bam] def clipPair(r1: SamRecord, r2: SamRecord, r1Metric: Option[ClippingMetrics] = None, r2Metric: Option[ClippingMetrics] = None): Unit = {
    val priorBasesClippedReadOne = r1.cigar.clippedBases
    val priorBasesClippedReadTwo = r2.cigar.clippedBases

    // Clip the read!
    val numReadOneFivePrime  = this.clipper.clip5PrimeEndOfRead(r1, readOneFivePrime)
    val numReadOneThreePrime = this.clipper.clip3PrimeEndOfRead(r1, readOneThreePrime)
    val numReadTwoFivePrime  = this.clipper.clip5PrimeEndOfRead(r2, readTwoFivePrime)
    val numReadTwoThreePrime = this.clipper.clip3PrimeEndOfRead(r2, readTwoThreePrime)

    val (numOverlappingBasesReadOne, numOverlappingBasesReadTwo) = {
      if (clipOverlappingReads && r1.isFrPair) this.clipper.clipOverlappingReads(r1, r2)
      else (0, 0)
    }

    val (numExtendingPastMateStartReadOne, numExtendingPastMateStartReadTwo) = {
      if (clipBasesPastMate && r1.isFrPair) {
        val clip1 = this.clipper.clipExtendingPastMateEnd(rec=r1, mateEnd=r2.end)
        val clip2 = this.clipper.clipExtendingPastMateEnd(rec=r2, mateEnd=r1.end)
        (clip1, clip2)
      }
      else (0, 0)
    }

    r1Metric.foreach { m =>
      m.update(
        rec                 = r1,
        priorBasesClipped   = priorBasesClippedReadOne,
        numFivePrime        = numReadOneFivePrime,
        numThreePrime       = numReadOneThreePrime,
        numOverlappingBases = numOverlappingBasesReadOne,
        numExtendingBases   = numExtendingPastMateStartReadOne
      )
    }

    r2Metric.foreach { m =>
      m.update(
        rec                 = r2,
        priorBasesClipped   = priorBasesClippedReadTwo,
        numFivePrime        = numReadTwoFivePrime,
        numThreePrime       = numReadTwoThreePrime,
        numOverlappingBases = numOverlappingBasesReadTwo,
        numExtendingBases   = numExtendingPastMateStartReadTwo
      )
    }
  }
}

sealed trait ReadType extends EnumEntry
object ReadType extends FgBioEnum[ReadType] {
  def values: IndexedSeq[ReadType] = findValues
  case object Fragment extends ReadType
  case object ReadOne extends ReadType
  case object ReadTwo extends ReadType
  case object Pair extends ReadType
  case object All extends ReadType
}


/** Metrics produced by [[ClipBam]] that detail how many reads and bases are clipped respectively.
  *
  * @param read_type The type of read (i.e. Fragment, ReadOne, ReadTwo).
  * @param reads The number of reads examined.
  * @param reads_clipped_pre The number of reads with any type of clipping prior to clipping with [[ClipBam]].
  * @param reads_clipped_post The number of reads with any type of clipping after clipping with [[ClipBam]], including reads that became unmapped.
  * @param reads_clipped_five_prime The number of reads with the 5' end clipped.
  * @param reads_clipped_three_prime The number of reads with the 3' end clipped.
  * @param reads_clipped_overlapping The number of reads clipped due to overlapping reads.
  * @param reads_clipped_extending The number of reads clipped due to a read extending past its mate.
  * @param reads_unmapped The number of reads that became unmapped due to clipping.
  * @param bases The number of aligned bases after clipping.
  * @param bases_clipped_pre The number of bases clipped prior to clipping with [[ClipBam]].
  * @param bases_clipped_post The number of bases clipped after clipping with [[ClipBam]], including bases from reads that became unmapped.
  * @param bases_clipped_five_prime The number of bases clipped on the 5' end of the read.
  * @param bases_clipped_three_prime The number of bases clipped on the 3 end of the read.
  * @param bases_clipped_overlapping The number of bases clipped due to overlapping reads.
  * @param bases_clipped_extending The number of bases clipped due to a read extending past its mate.
  */
case class ClippingMetrics
(read_type: ReadType,
 var reads: Long = 0,
 var reads_unmapped: Long = 0,
 var reads_clipped_pre: Long = 0,
 var reads_clipped_post: Long = 0,
 var reads_clipped_five_prime: Long = 0,
 var reads_clipped_three_prime: Long = 0,
 var reads_clipped_overlapping: Long = 0,
 var reads_clipped_extending: Long = 0,
 var bases: Long = 0,
 var bases_clipped_pre: Long = 0,
 var bases_clipped_post: Long = 0,
 var bases_clipped_five_prime: Long = 0,
 var bases_clipped_three_prime: Long = 0,
 var bases_clipped_overlapping: Long = 0,
 var bases_clipped_extending: Long = 0,
) extends Metric {
  def update(rec: SamRecord, priorBasesClipped: Int, numFivePrime: Int, numThreePrime: Int, numOverlappingBases: Int, numExtendingBases: Int): Unit = {
    this.reads                      += 1
    this.bases                      += rec.cigar.alignedBases
    if (priorBasesClipped > 0) {
      this.reads_clipped_pre        += 1
      this.bases_clipped_pre        += priorBasesClipped
    }
    if (numFivePrime > 0) {
      this.reads_clipped_five_prime += 1
      this.bases_clipped_five_prime += numFivePrime
    }
    if (numThreePrime > 0) {
      this.reads_clipped_three_prime += 1
      this.bases_clipped_three_prime += numThreePrime
    }
    if (numOverlappingBases > 0) {
      this.reads_clipped_overlapping += 1
      this.bases_clipped_overlapping += numOverlappingBases
    }
    if (numExtendingBases > 0) {
      this.reads_clipped_extending += 1
      this.bases_clipped_extending += numExtendingBases
    }
    val additionalClippedBases = numFivePrime + numThreePrime + numOverlappingBases + numExtendingBases
    val totalClippedBases = additionalClippedBases + priorBasesClipped
    if (totalClippedBases > 0) {
      this.reads_clipped_post        += 1
      this.bases_clipped_post        += totalClippedBases
      if (rec.unmapped && additionalClippedBases > 0) this.reads_unmapped += 1
    }
  }
  
  def add(metric: ClippingMetrics*): Unit = {
    this.reads                     += metric.sumBy(_.reads)
    this.reads_unmapped            += metric.sumBy(_.reads_unmapped)
    this.reads_clipped_pre         += metric.sumBy(_.reads_clipped_pre)
    this.reads_clipped_post        += metric.sumBy(_.reads_clipped_post)
    this.reads_clipped_five_prime  += metric.sumBy(_.reads_clipped_five_prime)
    this.reads_clipped_three_prime += metric.sumBy(_.reads_clipped_three_prime)
    this.reads_clipped_overlapping += metric.sumBy(_.reads_clipped_overlapping)
    this.reads_clipped_extending   += metric.sumBy(_.reads_clipped_extending)
    this.bases                     += metric.sumBy(_.bases)
    this.bases_clipped_pre         += metric.sumBy(_.bases_clipped_pre)
    this.bases_clipped_post        += metric.sumBy(_.bases_clipped_post)
    this.bases_clipped_five_prime  += metric.sumBy(_.bases_clipped_five_prime)
    this.bases_clipped_three_prime += metric.sumBy(_.bases_clipped_three_prime)
    this.bases_clipped_overlapping += metric.sumBy(_.bases_clipped_overlapping)
    this.bases_clipped_extending   += metric.sumBy(_.bases_clipped_extending)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy