com.fulcrumgenomics.bam.FindSwitchbackReads.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fgbio_2.13 Show documentation
fgbio
The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2018 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam

import java.lang.Math.{abs, max, min}

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.io.PathUtil
import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter}
import com.fulcrumgenomics.fasta.ReferenceSequenceIterator
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util._
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.SamPairUtil.PairOrientation
import htsjdk.samtools.reference.ReferenceSequence
import htsjdk.samtools.util.{CoordMath, SequenceUtil}
import htsjdk.samtools.{CigarOperator, SAMFileHeader, SAMReadGroupRecord}


object FindSwitchbackReads {
  /** Resource path for R script for plotting. */
  private val PlottingScript = "com/fulcrumgenomics/bam/FindSwitchbackReads.R"

  /** The name of the BAM extended attribute tag used to store switchback information. */
  val SwitchTag: String = "sb"

  /** Parent trait for the different kinds of hits we can find. */
  sealed trait SwitchHit { def code: Char }

  /**
    * Information about a switchback found by looking at soft-clipped complementary sequence within
    * a single read.
    *
    * @param offset the offset between the reference position at the last base read on the original strand and
    *               the first base read on the opposite strand.
    * @param length the length of the post-switch sequence within the read
    * @param read optionally the read in which the hit was found (generally filled in later)
    */
  case class ReadBasedHit(var offset: Int, length: Int, var read: Option[SamRecord] = None) extends SwitchHit {
    override val code: Char = 'r'
  }

  /**
    * Information about a switchback found by examining a template with a tandem (i.e. FF or RR) pair orientation.
    * The offset in this case is the distance between the end of one read and the beginning of the next.  E.g.
    *
    * 01234567890123456789
    * ----->  ----->       gap = 2
    *  <----- <-----       gap = 1
    * ----->
    *     ------>          gap = -2
    * @param gap the unsequenced gap between the the 3' end of the "earlier" read and the 5' end of the "later" read.
    */
  case class TandemBasedHit(gap: Int) extends SwitchHit {
    override val code: Char = 't'
  }

  /**
    * Summary metrics regarding switchback reads found.
    *
    * @param sample the name of the sample sequenced.
    * @param library the name of the library sequenced.
    * @param templates the total number of templates (i.e. inserts, unique read names) seen in the input.
    * @param aligned_templates the number of templates that had at least one aligned read.
    * @param switchback_templates the number of templates identified as having a switchback event.
    * @param fraction_switchbacks the fraction of all templates that appear to have switchbacks in them.
    * @param read_based_switchbacks the count of switchback_templates that were identified by looking for soft-clipped
    *                               reverse complementary sequence at the ends of reads.
    * @param mean_length the mean length of the reverse complementary sequence in the `read_based_switchbacks`.
    * @param mean_offset the mean offset of the `read_based_switchbacks`.
    * @param tandem_based_switchbacks the count of switchback_templates that were identified because they had paired
    *                                 reads mapped in FF or RR orientations within the designated gap size range.
    * @param mean_gap the mean gap size for `tandem_based_switchbacks`
    */
  case class SwitchMetric(sample: String,
                          library: String,
                          templates: Count,
                          aligned_templates: Count,
                          switchback_templates: Count,
                          fraction_switchbacks: Proportion,
                          read_based_switchbacks: Count,
                          mean_length: Double,
                          mean_offset: Double,
                          tandem_based_switchbacks: Count,
                          mean_gap: Double
                         ) extends Metric

  /**
    * Attempts to find a switchback event based on soft-clipped sequence within the primary reads of the template.
    *
    * @param refs a map of reference sequence name to [[ReferenceSequence]] object containing the sequence.
    * @param template the template to be examined.
    * @param minLength the minimum length of switchback to be detected.
    * @param maxOffset the maximum offset on the reference between the end of the pre-switch sequence and the start
    *                  of the post-switch sequence. A value of `0` will cause this method to return `None`.
    * @param maxErrorRate the maximum rate of mismatches between the switched-back sequence and the reference
    * @return `Some(ReadBasedHit)` if a hit is found else `None`.
    */
  private[bam] def findReadBasedSwitchback(refs: Map[String, ReferenceSequence],
                                           template: Template,
                                           minLength: Int,
                                           maxOffset: Int,
                                           maxErrorRate: Double): Option[ReadBasedHit] = {
    ////////////////////////////////////////////////////////////////////////////////////////////////
    // Implementation notes:
    //
    // Read based switchbacks are detected by looking for soft-clipped sequence at the 5' end of the
    // read - i.e. the start of F reads and the end of R reads.  To trigger further checks we require
    // that soft-clipping exist and be at least as long as `minLength`.
    //
    // The second part of the check is that the soft-clipped sequence matches the reverse-complement
    // of the reference, and that the match _starts_ reading on the opposite strand within `maxOffset`
    // of the 5'-most mapped base of the non-clipped sequence.  How we do this differs by strand.
    //
    // A key value we wish to determine is the `offset` - the gap between where we leave off reading
    // one strand, and where we start reading on the other strand.  This value is positive if
    // post-switch the enzyme starts outside the sequence read on the original strand and moves back
    // towards the last base sequenced on the original strand, and negative if the enzyme starts
    // reading the second strand recessed within the already-read sequence of the first strand.
    // The `find` method returns the offset within the reference window given, which is then
    // translated into the offset from the 5' alignment position in a strand dependent manner.
    ////////////////////////////////////////////////////////////////////////////////////////////////

    if (maxOffset <= 0) None else template.primaryReads.filter(_.mapped).flatMap { rec =>
      if (rec.positiveStrand && rec.cigar.head.operator == CigarOperator.S && rec.cigar.head.length >= minLength) {
        val ref         = refs(rec.refName)
        val bases       = Sequences.revcomp(rec.basesString.take(rec.cigar.head.length))
        val windowStart = max(1, rec.start - maxOffset)
        val windowEnd   = min(ref.length(), CoordMath.getEnd(rec.start, bases.length + maxOffset))
        val refBases    = new String(ref.getBases, windowStart - 1, CoordMath.getLength(windowStart, windowEnd))
        find(bases, refBases, maxErrorRate, preferLaterMatches=false)
          .map { hit => hit.read = Some(rec); hit.offset = -(hit.offset + windowStart - rec.start); hit }
      }
      else if (rec.negativeStrand && rec.cigar.last.operator == CigarOperator.S && rec.cigar.last.length >= minLength) {
        val ref         = refs(rec.refName)
        val bases       = Sequences.revcomp(rec.basesString.takeRight(rec.cigar.last.length))
        val windowStart = max(1, rec.end - bases.length - maxOffset + 1)
        val windowEnd   = min(ref.length(), rec.end + maxOffset)
        val refBases    = new String(ref.getBases, windowStart - 1, windowEnd - windowStart + 1)
        find(bases, refBases, maxErrorRate, preferLaterMatches=true)
          .map { hit => hit.read = Some(rec); hit.offset = -(rec.end - (hit.offset + windowStart + bases.length - 1)); hit }
      }
      else {
        None
      }
    }.find(_ => true)
  }

  /**
    * Attempts to find a switchback based on the orientation of the primary reads (FF or RR) and the distance between
    * the end of one read (in sequencing order) and the start of the next read (in sequencing order), i.e. the gap size.
    *
    * @param template the template to be examined.
    * @param maxGap the maximum allowable gap size
    * @return either `Some(TandemBasedHit)` if a hit can be found, else `None`
    */
  private[bam] def findTandemSwitchback(template: Template, maxGap: Int) : Option[TandemBasedHit] =
    if (maxGap <=0 ) None else template.pairOrientation match {
      case Some(PairOrientation.TANDEM) =>
        ((template.r1, template.r2): @unchecked) match {
          case (Some(r1), Some(r2)) =>
            val (earlier, later) = if (r1.start <= r2.start) (r1, r2) else (r2, r1)
            val gap = later.start - earlier.end - 1
            if (abs(gap) <= maxGap) Some(TandemBasedHit(later.start - earlier.end - 1)) else None
        }
      case _ =>
        None
  }

  /**
    * Attempts to find the first ungapped match of the query sequence within the target sequence. "First" is either
    * from the start of the target sequence, or if `preferLaterMatches` is true, from the end of the target sequence.
    *
    * @param query the query sequence to be found
    * @param target the target sequence in which the query sequence is to be located
    * @param maxErrorRate the maximum rate of mismatches allowed when matching the sequences
    * @param preferLaterMatches if true prefer matches closer to the end of targetSequence, else prefer matches closer
    *                           to the start
    * @return a [[ReadBasedHit]] if a match is found, otherwise None.
    */
  def find(query: String, target: String, maxErrorRate: Double, preferLaterMatches: Boolean): Option[ReadBasedHit] = {
    val maxMismatches = Math.floor(query.length * maxErrorRate)
    val range = {
      if (preferLaterMatches) Range.inclusive(target.length-query.length, 0, step = -1)
      else Range.inclusive(0, target.length-query.length-1, step=1)
    }

    range.iterator.flatMap { targetOffset =>
      var mismatches = 0
      forloop (from=0, until=query.length) { i =>
        if (!SequenceUtil.basesEqual(query(i).toByte, target(targetOffset+i).toByte)) mismatches += 1
      }

      if (mismatches <= maxMismatches) Some(ReadBasedHit(offset=targetOffset, length=query.length))
      else None
    }.find(_ => true)
  }

  /** Returns the read number, either 1 or 2. */
  private def readNum(rec: SamRecord): Int = if (!rec.paired || rec.firstOfPair) 1 else 2

  /** Returns the concatenation of the unique values per read group, or the default if there are no values. */
  private def summarizeRg(header: SAMFileHeader, f: SAMReadGroupRecord => String, default: String = "n/a"): String = {
    val ss = header.getReadGroups.map(f).filter(_ != null).map(_.trim).filter(_.nonEmpty).toSeq.distinct.sorted
    if (ss.nonEmpty) ss.mkString(",") else default
  }
}

@clp(group=ClpGroups.SamOrBam, description=
  """
    |Finds reads where a template switch occurred during library construction.  Some library construction methods,
    |notably ultra-low-input shotgun methods, are prone to template switching events that create molecules
    |(templates, inserts) that instead of being a linear copy of a segment of genomic DNA, instead are chimeras formed
    |by starting on one strand of DNA and then, switching to the opposite strand.  Frequently when the switch occurs
    |there may be a small offset between where the first strand was departed and the opposite strand joined.
    |
    |## Algorithm
    |
    |Templates that contain strand switch events (switch-backs) are found by this tool in two different ways:
    |
    |1. By looking at reads that contain soft-clipped bases at their 5' end that, when reverse complemented, matches the
    |   genome proximal to the 5'-most mapped base of the read.  We call these matches
    |   "read based switchbacks".  Finding read based switchbacks is based on several parameters:
    |
    |   1. `max-offset` controls how far away to search for the reverse-complemented sequence.  The default value of
    |      `35` allows matches to be found when the soft-clipped sequence matches the genome _starting_ at most 35bp
    |      from the 5' mapped position of the read, and reading in the opposite direction.
    |   2. `min-length` controls the minimum number of soft-clipped bases that must exist to trigger the search.
    |      Given that the search looks at `2 * max-offset` locations (default=70) it is important that `min-length`
    |      is set such that `4^min-length >> 2 * `max-offset` in order to avoid false positives.
    |   3. `max-error-rate` allows for some mismatches to exist between the soft-clipped sequence and the genome when matching.
    |
    |2. By identifying templates with `FF` or `RR` (aka tandem) orientation where it is surmised that the template
    |   switch occurred in the un-sequenced region of the template between R1 and R2.  We call these `tandem based
    |   switchbacks`.  This is controlled by a single parameter, `max-gap`, which causes the tool to only identify a
    |   tandem read pair as a switch-back _if_ the gap between the end of the first read and the start of the second
    |   read is `+/- max-gap`.
    |
    |By default, when a switch-back template is identified, the primary reads are made unmapped (and the original
    |alignment stored in the OA tag) and all secondary and supplementary alignments are discarded.  This can be
    |disabled with the `--dont-unmap` or `-d` option.
    |
    |All reads from a switch-back template are also tagged with an `sb` tag that describes the nature of the
    |switchback.  If the template was identified base on soft-clipped sequence within a read the format is:
    |
    |```sb:Z:r,[read|mate],{offset},{length}```
    |
    |If the template is identified due to it's tandem pair orientation then the format is:
    |
    |```sb:Z:t,{gap}```
    |
    |## Inputs and Outputs
    |
    |The tool takes as input a SAM or BAM file, and by default consumes from `stdin`.  The primary output is also a
    |SAM or BAM file, and defaults to compressed BAM on `stdout`.  This allows the tool to be run immediately after
    |an aligner in a pipe, e.g. `bwa mem ref.fa r1.fq r2.fq | fgbio -Xmx8g --ref=ref.fa | ...`.
    |
    |If the input is neither `queryname` sorted nor `queryname` grouped (i.e. all reads with the same name grouped
    |together) it will be sorted into `queryname` order by the tool before processing.
    |
    |By default the output BAM is produced in the order the reads were processed (i.e. the input ordering _or_
    |queryname sorted if sorting was required).  This can be overridden with the `--sort-order` option.
    |
    |A number of text files are also produced if the `--metrics` option is specified.  E.g. when specifying
    |`--metrics=s1.switchback` the following files are produced:
    |
    |1. `s1.switchback.summary.txt`: A table of summary metrics describing the number of reads, switchbacks, etc.
    |2. `s1.switchback.lengths.txt`: A table of the distribution of observed switchback lengths in read-based switchbacks.
    |3. `s1.switchback.offsets.txt`: A table of the distribution of observed offsets in read-based switchbacks.
    |4. `s1.switchback.gaps.txt`: A table of the distribution of gap lengths in tampl
    |5. `s1.switchback.plots.pdf`: A PDF containing plots of the distributions from 2-4.
    |
    |Note: because this tool accesses the reference genome in a random manner it pre-loads the entire reference fasta
    |into memory.  As a result the tool is best run with `-Xmx8g` to give it sufficient memory.
  """)
class FindSwitchbackReads
( @arg(flag='i', doc="Input BAM file.") val input: PathToBam = Io.StdIn,
  @arg(flag='o', doc="Output BAM file.") val output: PathToBam = Io.StdOut,
  @arg(flag='m', doc="Metrics output file.") val metrics: Option[PathPrefix] = None,
  @arg(flag='s', doc="Output sort order.") val sortOrder: Option[SamOrder] = None,
  @arg(flag='r', doc="Reference genome fasta file.") val ref: PathToFasta,
  @arg(flag='O', doc="Maximum offset between end the two segments of the read on the reference. Set to 0 to disable read-based checks.") val maxOffset: Int = 35,
  @arg(flag='g', doc="Maximum gap between R1 and R2 of tandem reads to call a template a switchback. Set to 0 to disable tandem-based checks.") val maxGap: Int = 500,
  @arg(flag='l', doc="Minimum match length of the switched back segment.") val minLength: Int = 6,
  @arg(flag='e', doc="Maximum mismatch error rate of switchback match to genome.") val maxErrorRate: Double = 0.1,
  @arg(flag='d', doc="IF true, do NOT unmap reads from switchback templates.") val dontUnmap: Boolean = false
) extends FgBioTool with LazyLogging {
  import FindSwitchbackReads._

  Io.assertReadable(input)
  Io.assertReadable(ref)
  Io.assertCanWriteFile(output)
  metrics.foreach(m => Io.assertCanWriteFile(m))

  validate(maxOffset    >= 0,   "max-offset must be >= 0")
  validate(maxGap       >= 0,   "max-gap must be >= 0")
  validate(minLength    >= 0,   "min-length must be >= 0")
  validate(maxErrorRate >= 0.0, "max-error-rate must be >= 0")

  override def execute(): Unit = {
    logger.info("Loading reference sequences.")
    val refs = ReferenceSequenceIterator(ref, stripComments=true).map(r => r.getName -> r).toMap

    val progress = ProgressLogger(logger)
    var total   = 0L
    var aligned = 0L
    val switchbackGaps    = new NumericCounter[Int]()
    val switchbackLengths = new NumericCounter[Int]()
    val switchbackOffsets = new NumericCounter[Int]()

    val in = SamSource(input)
    val iterator = Bams.templateIterator(in)
    val out = {
      val outHeader = in.header.clone()
      sortOrder match {
        case Some(so) => so.applyTo(outHeader)
        case None     =>
          if (in.header.getSortOrder != SortOrder.queryname && in.header.getGroupOrder != GroupOrder.query) {
            SamOrder.Queryname.applyTo(outHeader)
          }
      }

      SamWriter(output, header=outHeader, sort=sortOrder)
    }

    logger.info("Examining reads for switchbacks.")
    iterator.map { template =>
      total += 1
      if (template.primaryReads.forall(_.unmapped)) template else {
        aligned += 1

        val result: Option[SwitchHit] = findReadBasedSwitchback(refs, template, minLength, maxOffset, maxErrorRate)
          .orElse(findTandemSwitchback(template, maxGap))

        result.foreach {
          case hit: TandemBasedHit =>
            switchbackGaps.count(hit.gap)
            val tagValue = s"${hit.code},${hit.gap}"
            template.allReads.foreach(r => r(SwitchTag) = tagValue)
          case hit: ReadBasedHit =>
            switchbackLengths.count(hit.length)
            switchbackOffsets.count(hit.offset)
            val foundInReadNum = readNum(hit.read.getOrElse(unreachable("read should be populated!")))
            template.allReads.foreach { r =>
              val num = readNum(r)
              r(SwitchTag) = s"${hit.code},${if (num == foundInReadNum) "read" else "mate"},${hit.offset},${hit.length}"
            }
        }

        if (result.isDefined && !this.dontUnmap) template.unmapped else template
      }
    }
    .foreach { template =>
      template.allReads.foreach { rec =>
        out += rec
        progress.record(rec)
      }
    }

    in.safelyClose()
    out.close()

    // Output the metrics
    this.metrics.foreach { prefix =>
      val m = SwitchMetric(
        sample                   = summarizeRg(header=in.header, f = _.getSample),
        library                  = summarizeRg(header=in.header, f = _.getLibrary),
        templates                = total,
        aligned_templates        = aligned,
        switchback_templates     = switchbackLengths.total + switchbackGaps.total,
        fraction_switchbacks     = (switchbackLengths.total + switchbackGaps.total) / aligned.toDouble,
        read_based_switchbacks   = switchbackLengths.total,
        mean_length              = switchbackLengths.mean(),
        mean_offset              = switchbackOffsets.mean(),
        tandem_based_switchbacks = switchbackGaps.total,
        mean_gap                 = switchbackGaps.mean()
      )

      Metric.write(PathUtil.pathTo(s"${prefix}.summary.txt"), m)

      Seq(
        ("lengths", switchbackLengths, "length"),
        ("offsets", switchbackOffsets, "offset"),
        ("gaps",    switchbackGaps,    "gap_length")
      ).foreach { case (suffix, counter, header) =>
        val writer = Io.toWriter(PathUtil.pathTo(s"$prefix.$suffix.txt"))
        writer.write(s"$header\tcount\n")
        counter.foreach { case (x, count) => writer.write(s"$x\t$count\n")}
        writer.close()
      }

      Rscript.execIfAvailable(PlottingScript, input.getFileName.toString, s"$prefix.lengths.txt", s"$prefix.offsets.txt", s"$prefix.gaps.txt", s"$prefix.plots.pdf")
    }
  }
}