com.fulcrumgenomics.umi.UmiConsensusCaller.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fgbio_2.13 Show documentation
fgbio
The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.umi

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.alignment.{Cigar, CigarElem}
import com.fulcrumgenomics.bam.{ClippingMode, SamRecordClipper}
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord}
import com.fulcrumgenomics.commons.util.{Logger, SimpleCounter}
import com.fulcrumgenomics.umi.UmiConsensusCaller._
import com.fulcrumgenomics.util.NumericTypes.PhredScore
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools._
import htsjdk.samtools.util.{Murmur3, SequenceUtil, TrimmingUtil}

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.math.min

/**
  * Contains shared types and functions used when writing UMI-driven consensus
  * callers that take in SamRecords and emit SamRecords.
  */
object UmiConsensusCaller {
  /** The type of consensus read to output. */
  object ReadType extends Enumeration {
    type ReadType = Value
    val Fragment, FirstOfPair, SecondOfPair = Value
    val ReadTypeKey: String = "__read_type__"
  }

  /** Filter reason for when there are too few reads to form a consensus. */
  val FilterInsufficientSupport = "Insufficient Support"

  /** Filter reason for when reads are rejected for having a minority CIGAR. */
  val FilterMinorityAlignment   = "Mismatching Cigars"

  /** Filter reason for when reads are rejected due to low quality. */
  val FilterLowQuality = "Low Base Quality"

  /** Filter reason for when reads are rejected due creation of orphaned consensus (i.e. R1 or R2 failed). */
  val FilterOrphan = "Orphan Consensus Created"

  /** A trait that consensus reads must implement. */
  trait SimpleRead {
    /** The ID of the molecule that generated the consensus read. */
    def id: String
    /** The bases of the consensus read. */
    def bases: Array[Byte]
    /** The quals of the consensus read. */
    def quals: Array[Byte]
    /** Gets the length of the consensus read. */
    def length: Int = bases.length
    /** Returns the consensus read a String - mostly useful for testing. */
    def baseString: String = new String(bases)
    /** Retrieves the quals as a phred+33/fastq ascii String. */
    def qualString: String = SAMUtils.phredToFastq(this.quals)
  }

  /** Stores information about a read to be fed into a consensus. */
  case class SourceRead(id: String, bases: Array[Byte], quals: Array[Byte], cigar: Cigar, sam: Option[SamRecord] = None) extends SimpleRead {
    require(bases.length == quals.length, "Bases and qualities are not the same length.")
  }

  /**
    * Attempts to construct a String that can be used as a prefix for consensus read names based
    * on the contents of the incoming SAMFileHeader.
    */
  def makePrefixFromSamHeader(header: SAMFileHeader): String = {
    val ids = header.getReadGroups.map(rg => Option(rg.getLibrary).getOrElse(rg.getReadGroupId)).toList.sorted.distinct
    // Read names have to fit into 255 bytes, and this is just the prefix
    if (ids.map(_.length+1).sum <= 200) ids.mkString("|")
    else Integer.toHexString(new Murmur3(1).hashUnencodedChars(ids.mkString("|")))
  }

  /**
    * Constructs an output header with a single read group for the a BAM.
    */
  def outputHeader(in: SAMFileHeader, readGroupId: String, sortOrder: Option[SamOrder] = None): SAMFileHeader = {
    val oldRgs = in.getReadGroups.toSeq
    def collapse(f: SAMReadGroupRecord => String): String = oldRgs.map(f).filter(_ != null).distinct match {
      case Nil => null
      case xs  => xs.mkString(",")
    }

    val rg = new SAMReadGroupRecord(readGroupId)
    rg.setDescription     (collapse(_.getDescription))
    rg.setLibrary         (collapse(_.getLibrary))
    rg.setSample          (collapse(_.getSample))
    rg.setPlatform        (collapse(x => Option(x.getPlatform).map(_.toUpperCase).orNull)) // NB: this to ensure that platforms are all upper-case; not all tools are modern or well-behaved
    rg.setPlatformUnit    (collapse(_.getPlatformUnit))
    rg.setSequencingCenter(collapse(_.getSequencingCenter))

    val outHeader = new SAMFileHeader
    outHeader.addReadGroup(rg)
    sortOrder match {
      case Some(so) => so.applyTo(outHeader)
      case None     => SamOrder.Unsorted.applyTo(outHeader); outHeader.setGroupOrder(GroupOrder.query)
    }

    outHeader.addComment(s"Read group ${rg.getId} contains consensus reads generated from ${oldRgs.size} input read groups.")
    outHeader
  }

  /**
    * Helper method to check that the input BAM is in the correct order for consensus calling. Will call
    * the warn function if the sort order looks like it's probably compatible but it's not 100% sure.
    * Will invoke the error function in cases where the sort order is definitely incompatible.
    *
    * @param header the header of the BAM file to be used for consensus calling
    * @param source a path or string representing the source of the header
    * @param warn a function to be called when any warnings are detected/emitted
    * @param error a function to be called when any errors are encountered; should probably throw an exception!
    */
  def checkSortOrder(header: SAMFileHeader, source: Any, warn: String => Unit, error: String => Unit): Unit = {
    // Check that the SAM file is sorted appropriately
    if (!SamOrder(header).contains(SamOrder.TemplateCoordinate)) {
      // Group reads used to output the header without the sub-sort, so allow this for now
      if (header.getSortOrder == SortOrder.unsorted && header.getGroupOrder == GroupOrder.query) {
        warn(s"File $source may not be sorted correctly for consensus read generation.")
      }
      else {
        error(s"File $source is not sorted correctly. Please sort with fgbio SortBam -s TemplateCoordinate.")
      }
    }
  }
}


/**
  * A trait that can be mixed in by any consensus caller that works at the read level,
  * mapping incoming SamRecords into consensus SamRecords.
  *
  * @tparam ConsensusRead Internally, the type of lightweight consensus read that is used prior to
  *           rebuilding [[com.fulcrumgenomics.bam.api.SamRecord]]s.
  */
trait UmiConsensusCaller[ConsensusRead <: SimpleRead] {
  import com.fulcrumgenomics.umi.UmiConsensusCaller.ReadType._

  // vars to track how many reads meet various fates
  private var _totalReads: Long = 0
  private val _filteredReads = new SimpleCounter[String]()
  private var _consensusReadsConstructed: Long = 0

  protected val NoCall: Byte = 'N'.toByte
  protected val NoCallQual: PhredScore = PhredScore.MinValue

  /** A consensus caller used to generate consensus UMI sequences */
  private val consensusBuilder = new SimpleConsensusCaller()

  /** Clipper utility used to _calculate_ clipping, but not do the actual clipping */
  private val clipper = new SamRecordClipper(mode=ClippingMode.Soft, autoClipAttributes=true)

  /** Returns a clone of this consensus caller in a state where no previous reads were processed.  I.e. all counters
    * are set to zero.*/
  def emptyClone(): UmiConsensusCaller[ConsensusRead]

  /** Returns the total number of input reads examined by the consensus caller so far. */
  def totalReads: Long = _totalReads

  /** Returns the total number of reads filtered for any reason. */
  def totalFiltered: Long = _filteredReads.total

  /**
    * Returns the number of raw reads filtered out due to there being insufficient reads present
    * to build the necessary set of consensus reads.
    */
  def readsFilteredInsufficientSupport: Long = this._filteredReads.countOf(FilterInsufficientSupport)

  /** Returns the number of raw reads filtered out because their alignment disagreed with the majority alignment of
    * all raw reads for the same source molecule.
    */
  def readsFilteredMinorityAlignment: Long = this._filteredReads.countOf(FilterMinorityAlignment)

  /** Returns the number of consensus reads constructed by this caller. */
  def consensusReadsConstructed: Long = _consensusReadsConstructed

  /** Records that the supplied records were rejected, and not used to build a consensus read. */
  protected def rejectRecords(recs: Iterable[SamRecord], reason: String) : Unit = this._filteredReads.count(reason, recs.size)

  /** Records that the supplied records were rejected, and not used to build a consensus read. */
  protected def rejectRecords(reason: String, rec: SamRecord*) : Unit = rejectRecords(rec, reason)

  /** A RG.ID to apply to all generated reads. */
  protected def readGroupId: String

  /** A prefix to use on all read names.  If None then a suitable prefix will be synthesized. */
  protected val readNamePrefix: String

  /**
    * Needs to be implemented to return a value from a SamRecord that represents the unit of
    * grouping, e.g. the MI tag for vanilla UMI data and the MI tag minus the /?? suffix for
    * duplex data.
    * @param rec a SamRecord
    * @return an identified for the source molecule
    */
  protected[umi] def sourceMoleculeId(rec: SamRecord): String

  /**
    * Converts from a SamRecord into a SourceRead.  During conversion the record is end-trimmed
    * to remove Ns and bases below the `minBaseQuality`.  Remaining bases that are below
    * `minBaseQuality` are then masked to Ns.  Also trims reads so that no mapped bases extend past their mate.
    *
    * @return Some(SourceRead) if there are any called bases with quality > minBaseQuality, else None
    */
  protected[umi] def toSourceRead(rec: SamRecord, minBaseQuality: PhredScore, trim: Boolean): Option[SourceRead] = {
    // Extract and possibly RC the source bases and quals from the SamRecord
    val bases = rec.bases.clone()
    val quals = rec.quals.clone()
    val cigar = if (rec.positiveStrand) rec.cigar else {
      SequenceUtil.reverseComplement(bases)
      SequenceUtil.reverse(quals, 0, quals.length)
      rec.cigar.reverse
    }

    // Quality trim the reads if requested.
    val trimToLength = if (trim) TrimmingUtil.findQualityTrimPoint(quals, minBaseQuality) else bases.length

    // Mask remaining low quality bases to Ns
    forloop (from=0, until=trimToLength) { i =>
      if (quals(i) < minBaseQuality) {
        bases(i) = NoCall
        quals(i) = NoCallQual
      }
    }

    // Get the length of the read based on trimming bases that are beyond the mate's end (FR only) and then any
    // remaining trailing Ns.  This includes both mapped bases and soft-clipped bases past the mate's end.
    val len = {
      var index = if (!rec.isFrPair) trimToLength - 1 else {
        // Get the number of mapped bases to clip that maps beyond the mate's end, including any soft-clipped bases. Use
        // that to compute where in the read to keep.
        val clipPosition = rec.length - this.clipper.numBasesExtendingPastMate(rec=rec)
        min(clipPosition, trimToLength) - 1
      }
      // Find the last non-N base of sufficient quality in the record, starting from either the
      // end of the read, or the end of the insert, whichever is shorter!
      while (index >= 0 && (bases(index) == NoCall)) index -= 1
      index + 1
    }

    len match {
      case 0 =>
        rejectRecords(Iterable(rec), FilterLowQuality)
        None
      case n if n == bases.length =>
        Some(SourceRead(sourceMoleculeId(rec), bases, quals, cigar, Some(rec)))
      case n                         =>
        val trimmedBases = new Array[Byte](n)
        val trimmedQuals = new Array[Byte](n)
        System.arraycopy(bases, 0, trimmedBases, 0, n)
        System.arraycopy(quals, 0, trimmedQuals, 0, n)
        Some(SourceRead(sourceMoleculeId(rec), trimmedBases, trimmedQuals, cigar.truncateToQueryLength(n), Some(rec)))
    }
  }

  /**
    * Takes in all the reads for a source molecule and, if possible, generates one or more
    * output consensus reads as SAM records.
    *
    * @param recs the full set of source SamRecords for a source molecule
    * @return a seq of consensus SAM records, may be empty
    */
  final def consensusReadsFromSamRecords(recs: Seq[SamRecord]): Seq[SamRecord] = {
    this._totalReads += recs.size
    val result = consensusSamRecordsFromSamRecords(recs)
    this._consensusReadsConstructed += result.size
    result
  }

  /**
    * Takes in all the reads for a source molecule and, if possible, generates one or more
    * output consensus reads as SAM records.
    *
    * @param recs the full set of source SamRecords for a source molecule
    * @return a seq of consensus SAM records, may be empty
    */
  protected def consensusSamRecordsFromSamRecords(recs: Seq[SamRecord]): Seq[SamRecord]

  /** Split records into those that should make a single-end consensus read, first of pair consensus read,
    * and second of pair consensus read, respectively.  The default method is to use the SAM flag to find
    * unpaired reads, first of pair reads, and second of pair reads.
    */
  protected def subGroupRecords(records: Seq[SamRecord]): (Seq[SamRecord], Seq[SamRecord], Seq[SamRecord]) = {
    val fragments    = records.filter { rec => !rec.paired }
    val (firstOfPair, secondOfPair) = records.filter(_.paired).partition(_.firstOfPair)
    (fragments, firstOfPair, secondOfPair)
  }

  /** Used it [[filterToMostCommonAlignment()]] to store a cigar string and a set of flags for which reads match. */
  private final case class AlignmentGroup(cigar: Cigar, flags: mutable.BitSet, var size: Int = 0) {
    /** Adds the read at `idx` to the set included. */
    @inline def add(idx: Int): Unit = {
      flags(idx) = true
      size      += 1
    }

    @inline def contains(idx: Int): Boolean = flags(idx)
  }

  /**
    * Takes in a non-empty seq of SamRecords and filters them such that the returned seq only contains
    * those reads that share the most common alignment of the read sequence to the reference.
    * If two or more different alignments share equal numbers of reads, the 'most common' will
    * be an arbitrary pick amongst those alignments, and the group of reads with that alignment will
    * be returned.
    *
    * For the purposes of this method all that is implied by "same alignment" is that any
    * insertions or deletions are at the same position and of the same length.  This is done
    * to allow for differential read length (either due to sequencing or untracked hard-clipping
    * of adapters) and for differential soft-clipping at the starts and ends of reads.
    *
    * NOTE: filtered out reads are sent to the [[rejectRecords]] method and do not need further handling
    */
  protected[umi] def filterToMostCommonAlignment(recs: Seq[SourceRead]): Seq[SourceRead] = if (recs.size < 2) recs else {
    val groups = new ArrayBuffer[AlignmentGroup]
    val sorted = recs.sortBy(r => -r.length).toIndexedSeq

    forloop (from=0, until=sorted.length) { i =>
      val simpleCigar = simplifyCigar(sorted(i).cigar)
      var found = false
      groups.foreach { g => if (simpleCigar.isPrefixOf(g.cigar)) { g.add(i); found = true } }

      if (!found) {
        val newGroup = AlignmentGroup(simpleCigar, new mutable.BitSet(sorted.size))
        newGroup.add(i)
        groups += newGroup
      }
    }

    if (groups.isEmpty) {
      Seq.empty
    }
    else {
      val bestGroup = groups.maxBy(_.size)
      val keepers = new ArrayBuffer[SourceRead](bestGroup.size)
      forloop (from=0, until=sorted.length) { i =>
        if (bestGroup.contains(i)) keepers += sorted(i)
        else sorted(i).sam.foreach(rejectRecords(FilterMinorityAlignment, _))
      }

      keepers.toIndexedSeq
    }
  }

  /** Simplifies the cigar by turning other operators into Ms if that's how we want to think of them. */
  private def simplifyCigar(cigar: Cigar) = {
    import CigarOperator._
    if (cigar.forall(e => e.operator == M || e.operator == I || e.operator == D)) {
      cigar.coalesce
    }
    else {
      val newElems = cigar.elems.map {
        case CigarElem(S,  len) => CigarElem(M, len)
        case CigarElem(EQ, len) => CigarElem(M, len)
        case CigarElem(X,  len) => CigarElem(M, len)
        case CigarElem(H,  len) => CigarElem(M, len)
        case cig => cig
      }

      Cigar(newElems).coalesce
    }
  }

  /** Creates a `SamRecord` from the called consensus base and qualities. */
  protected def createSamRecord(read: ConsensusRead, readType: ReadType, umis: Seq[String] = Seq.empty): SamRecord = {
    val rec = SamRecord(null)
    rec.name = this.readNamePrefix + ":" + read.id
    rec.unmapped = true
    readType match {
      case Fragment     => // do nothing
      case FirstOfPair  =>
        rec.paired       = true
        rec.firstOfPair  = true
        rec.mateUnmapped = true
      case SecondOfPair =>
        rec.paired       = true
        rec.secondOfPair = true
        rec.mateUnmapped = true
    }
    rec.bases = read.bases
    rec.quals = read.quals
    rec(SAMTag.RG.name()) = readGroupId
    rec(ConsensusTags.MolecularId) = read.id
    if (umis.nonEmpty) rec(ConsensusTags.UmiBases) = this.consensusBuilder.callConsensus(umis)

    rec
  }

  /** Sums a short array into an Int to avoid overflow. */
  protected def sum(ss: Array[Short]): Int = {
    var total: Int = 0
    forloop(from=0, until=ss.length) { i => total += ss(i) }
    total
  }

  /** Adds the given caller's statistics (counts) to this caller. */
  def addStatistics(caller: UmiConsensusCaller[ConsensusRead]): Unit = {
    this._totalReads += caller.totalReads
    this._consensusReadsConstructed += caller.consensusReadsConstructed
    this._filteredReads += caller._filteredReads
  }

  /**
    * Logs statistics about how many reads were seen, and how many were filtered/discarded due
    * to various filters.
    */
  def logStatistics(logger: Logger): Unit = {
    logger.info(f"Total Raw Reads Considered: $totalReads%,d.")
    this._filteredReads.foreach { case (filter, count) =>
      logger.info(f"Raw Reads Filtered Due to $filter: $count%,d (${count/totalReads.toDouble}%.4f).")
    }
    logger.info(f"Consensus reads emitted: $consensusReadsConstructed%,d.")
  }
}