All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.FindTechnicalReads.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamRecord, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.util.{IlluminaAdapters, Io}
import htsjdk.samtools.util.SequenceUtil

import scala.annotation.tailrec
import scala.collection.mutable

@clp(group = ClpGroups.SamOrBam, description =
  """
     |Find reads that are from technical or synthetic sequences in a BAM file. Takes in
     |a BAM file, extracts the read pairs and fragment reads that are unmapped, and tests
     |them to see if they are likely generated from a technical sequence (e.g. adapter
     |dimer).
     |
     |The identification of reads is done by testing the first N bases (controlled by the
     |match-length parameter) of each read against all sub-sequences of length N from the
     |technical sequences.  Sub-sequences are generated from both the sequences and the
     |reverse complement of the sequences, ignoring any sub-sequences that include `N`s.
     |
     |By default the output BAM file will contain all reads that matched to a sub-sequence of the
     |technical sequences and, if the read is paired, the read's mate pair.  An option is
     |available to apply a tag to matched reads (--tag/-t), and if specified each matching
     |read will be tagged with the 0-based index of the sequence to which it matched. In
     |combination with tagging it is possible to output all reads (-a/--all-reads) which will
     |re-create the input BAM with the addition of tags on matching reads.
     |
     |The default set of sequences include a range of different Illumina adapter sequences
     |with the sample index/barcode region masked to Ns.
  """
  )
class FindTechnicalReads
( @arg(flag='i', doc="Input SAM or BAM file") val input: PathToBam,
  @arg(flag='o', doc="Output SAM or BAM file") val output: PathToBam,
  @arg(flag='m', doc="The number of bases at the start of the read to match against.") val matchLength: Int = 15,
  @arg(flag='e', doc="The maximum number of errors in the matched region.") val maxErrors: Int = 1,
  @arg(flag='s', doc="The set of technical sequences to look for.") val sequences: Seq[String] = IlluminaAdapters.all.flatMap(_.both),
  @arg(flag='a', doc="Output all reads.") val allReads: Boolean = false,
  @arg(flag='t', doc="Tag to set to indicate a read is a technical sequence.") val tag: Option[String] = None
) extends FgBioTool with LazyLogging {

  Io.assertReadable(input)
  Io.assertCanWriteFile(output)
  if (allReads && tag.isEmpty) fail("Tag option must be used when outputting all reads.")

  private val sequenceToIndex = sequences.zipWithIndex.toMap

  override def execute(): Unit = {
    // Built the appropriate input iterator
    val in = SamSource(input)
    val iter = (allReads, in.indexed) match {
      case (true, _)      => in.iterator.buffered
      case (false, true)  => in.unmapped.buffered
      case (false, false) => in.iterator.filter(isUnmappedTemplate).buffered
    }

    val out      = SamWriter(output, in.header)
    val matcher  = new Matcher(sequences=sequences, matchLength=matchLength, maxErrors=maxErrors)
    var n: Long  = 0

    while (iter.hasNext) {
      val r1 = iter.next()
      val r2 = if (iter.hasNext && iter.head.name == r1.name) Some(iter.next()) else None
      val recs = Seq(Some(r1), r2).flatten

      val isTechnical = if (isUnmappedTemplate(r1)) {
        recs.view.flatMap(r => matcher.findMatch(r.bases)).headOption match {
          case None =>
            false
          case Some(hit) =>
            val hitIndex = sequenceToIndex(hit)
            for (t <- tag; r <- recs) r(t) = hitIndex
            n += recs.size
            true
        }
      }
      else {
        false
      }

      if (isTechnical || allReads) out ++= recs
    }

    logger.info(s"Found ${n} reads that matched to technical sequences.")
    in.safelyClose()
    out.close()
  }

  /** Returns true if all reads from the template are unmapped, otherwise false. */
  private def isUnmappedTemplate(rec: SamRecord) : Boolean = rec.unmapped && (!rec.paired|| rec.mateUnmapped)
}

/** Little utility class to match reads to technical sequences. */
private[bam] class Matcher(val matchLength: Int, val maxErrors: Int, sequences: Seq[String]) {
  // Map of kmer to first sequence in which it is found
  private val kmerToSequence: mutable.LinkedHashMap[String,String] = mutable.LinkedHashMap[String,String]()
  sequences.foreach(s => {
    Seq(s.toUpperCase).flatMap(a => Seq(a, SequenceUtil.reverseComplement(a)))
      .flatMap(a => a.sliding(matchLength))
      .filterNot(_.contains('N'))
      .foreach(k => if (!kmerToSequence.contains(k)) kmerToSequence(k) = s)
  })

  private val kmers: Array[Array[Byte]] = kmerToSequence.keySet.map(_.getBytes).toArray

  /** Just tests to see if any sequence matches. */
  def matches(read: Array[Byte]): Boolean = {
    read.length >= matchLength && kmers.exists(kmer => matches(read=read, kmer=kmer, len=matchLength))
  }

  /** Identifies the first sequence with a matching kmer. */
  def findMatch(read: Array[Byte]): Option[String] = {
    if (read.length < matchLength) {
      None
    }
    else {
      kmers.find(kmer => matches(read=read, kmer=kmer, len=matchLength)) match {
        case None      => None
        case Some(arr) => kmerToSequence.get(new String(arr))
      }
    }
  }

  @tailrec
  private def matches(read: Array[Byte], kmer: Array[Byte], index: Int = 0, len: Int, errors: Int = 0): Boolean = {
    if (errors > maxErrors) false
    else if (index >= len)  true
    else matches(read, kmer, index+1, len, errors + (if(read(index) == kmer(index)) 0 else 1))
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy