All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.alignment.Alignment.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.alignment

import com.fulcrumgenomics.FgBioDef._
import htsjdk.samtools.{CigarElement, CigarOperator, SAMRecord, Cigar => HtsJdkCigar}

import scala.collection.mutable.ArrayBuffer

/**
  * Represents an element in a Cigar.
  *
  * @param operator the type of element (e.g. match, insertion, etc.)
  * @param length the length of the element in bases (must be greater than 0).
  */
case class CigarElem(operator: CigarOperator, length: Int) {
  require(length > 0, s"Cigar element must have length >= 0. Operator: $operator, Length: $length")

  /** Returns how this element should be represented in a cigar string. */
  override def toString: String = s"${length}${operator}"

  /** Returns the length of the element on the query sequence. */
  def lengthOnQuery: Int = if (operator.consumesReadBases()) length else 0

  /** Returns the length of the element on the query sequence. */
  def lengthOnTarget: Int = if (operator.consumesReferenceBases()) length else 0
}

/** Companion object for Cigar that offers alternative constructors. */
object Cigar {
  private val ZeroCharAsInt = '0'.toInt

  /** An empty Cigar object. */
  val empty = Cigar(IndexedSeq.empty)

  /** Constructs a Cigar object from a Cigar string. */
  def apply(cigar: String): Cigar = {
    require(cigar != null && cigar.nonEmpty, "Can't parse a null or empty cigar string.")
    val elements = new ArrayBuffer[CigarElem]
    val chars    = cigar.toCharArray

    var i = 0
    while (i < chars.length) {
      // Parse out the length of the operator
      val expectedNumberAtPosition = i
      var length = 0
      while (Character.isDigit(chars(i))) { length = (length * 10) + chars(i).toInt - ZeroCharAsInt; i += 1 }
      require(i > expectedNumberAtPosition, s"Malformed CIGAR string $cigar, found non-digit ${chars(i)} where number expected.")

      // Grab the operator (characterToEnum will throw an exception if the operator is invalid)
      val operator = CigarOperator.characterToEnum(chars(i))
      elements += CigarElem(operator, length)
      i += 1
    }

    Cigar(elements.toIndexedSeq)
  }

  /**
    * Constructs a Cigar from a Cigar string, allowing the string to be either `null` or
    * `*`, in which case Cigar.empty is returned.  Useful for parsing Cigars from SAM files.
    */
  def fromSam(cigar: String): Cigar = {
    if (cigar == null || cigar == SAMRecord.NO_ALIGNMENT_CIGAR) Cigar.empty else apply(cigar)
  }

  /** Constructs a Cigar objects from the HTSJDK class of the same name. */
  def apply(ciggy: HtsJdkCigar): Cigar = Cigar(ciggy.iterator().map(e => CigarElem(e.getOperator, e.getLength)).toIndexedSeq)
}

/** A data class for holding statistics about a [[Cigar]]. */
private[alignment] case class CigarStats(lengthOnQuery: Int,
                                         lengthOnTarget: Int,
                                         alignedBases: Int,
                                         clippedBases: Int,
                                         leadingSoftClippedBases: Int,
                                         trailingSoftClippedBases: Int)

/** Companion object for [[Cigar]]. */
private[alignment] object CigarStats {

  /** Build a [[CigarStats]] from a [[Cigar]]. */
  private[alignment] def apply(cigar: Cigar): CigarStats = {
    var lengthOnQuery            = 0
    var lengthOnTarget           = 0
    var alignedBases             = 0
    var clippedBases             = 0
    var leadingSoftClippedBases  = 0
    var trailingSoftClippedBases = 0
    var index                    = 0

    // leading clipping
    while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) {
      val elem = cigar.elems(index)
      lengthOnQuery  += elem.lengthOnQuery
      lengthOnTarget += elem.lengthOnTarget
      clippedBases += elem.length
      if (elem.operator == CigarOperator.S) leadingSoftClippedBases += elem.length
      index += 1
    }

    // aligned bases
    while (index < cigar.elems.length && !cigar.elems(index).operator.isClipping) {
      val elem = cigar.elems(index)
      lengthOnQuery  += elem.lengthOnQuery
      lengthOnTarget += elem.lengthOnTarget
      if (elem.operator.isAlignment) alignedBases += elem.length
      index += 1
    }

    // trailing clipping
    while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) {
      val elem = cigar.elems(index)
      lengthOnQuery  += elem.lengthOnQuery
      lengthOnTarget += elem.lengthOnTarget
      clippedBases += elem.length
      if (elem.operator == CigarOperator.S) trailingSoftClippedBases += elem.length
      index += 1
    }

    require(index == cigar.elems.length,
      s"Invalid cigar, did not consume all the elements: ${Cigar(cigar.elems.take(index))}[${Cigar(cigar.elems.drop(index))}]"
    )

    CigarStats(
      lengthOnQuery            = lengthOnQuery,
      lengthOnTarget           = lengthOnTarget,
      alignedBases             = alignedBases,
      clippedBases             = clippedBases,
      leadingSoftClippedBases  = leadingSoftClippedBases,
      trailingSoftClippedBases = trailingSoftClippedBases
    )
  }

  /** Build a [[CigarStats]] from a string representation of the cigar. */
  private[alignment] def apply(cigar: String): CigarStats = this.apply(Cigar(cigar))
}

/**
  * Object representation of a Cigar string representing an alignment between two sequences.
  * @param elems the ordered sequence of elements in the Cigar
  */
case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
  // Cache whether or not the Cigar is coalesced already (i.e. has no pair of adjacent elements with the same operator)
  private lazy val isCoalesced: Boolean = elems.length == 1 || {
    var itIs = true
    var index = 0
    while (index < elems.length-1 && itIs) {
      itIs = elems(index).operator != elems(index+1).operator
      index += 1
    }
    itIs
  }

  /** Statistics about this cigar which are computed once on first access and then cached. */
  private lazy val stats: CigarStats = CigarStats(this)

  /** Provides an iterator over the elements in the cigar. */
  override def iterator: Iterator[CigarElem] = elems.iterator

  /** Provides an iterator over the elements in the reverse order. */
  def reverseIterator: Iterator[CigarElem] = elems.reverseIterator

  /** Returns the length of the alignment on the query sequence. */
  def lengthOnQuery: Int = stats.lengthOnQuery

  /** Returns the length of the alignment on the target sequence. */
  def lengthOnTarget: Int = stats.lengthOnTarget

  /** Yields a new cigar that is truncated to the given length on the query. */
  def truncateToQueryLength(len: Int): Cigar = truncate(len, e => e.operator.consumesReadBases())

  /** Yields a new cigar that is truncated to the given length on the target. */
  def truncateToTargetLength(len: Int): Cigar = truncate(len, e => e.operator.consumesReferenceBases())

  /** Returns the number of bases that are directly aligned between the two sequences. */
  def alignedBases: Int = stats.alignedBases

  /** Returns the number of bases that are clipped between the two sequences. */
  def clippedBases: Int = stats.clippedBases

  /** Returns the number of bases that are soft-clipped at the start of the sequence  Ignores hard-clips. */
  def leadingSoftClippedBases: Int = stats.leadingSoftClippedBases

  /** Returns the number of bases that are soft-clipped at the end of this sequence.  Ignores hard-clips. */
  def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases

  /** Returns the number of bases that are hard-clipped at the start of the sequence. */
  def leadingHardClippedBases = this.headOption.map { elem =>
    if (elem.operator == CigarOperator.H) elem.length else 0
  }.getOrElse(0)

  /** Returns the number of bases that are clipped at the start of the sequence. */
  def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases

  /** Returns the number of bases that are hard-clipped at the end of the sequence. */
  def trailingHardClippedBases = this.lastOption.map { elem =>
    if (elem.operator == CigarOperator.H) elem.length else 0
  }.getOrElse(0)

  /** Returns the number of bases that are clipped at the end of the sequence. */
  def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases

  /** Truncates the cigar based on either query or target length cutoff. */
  private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = {
    var pos = 1
    val iter = iterator
    val builder = IndexedSeq.newBuilder[CigarElem]
    while (pos <= len && iter.hasNext) {
      val elem = iter.next()
      if (shouldCount(elem)) {
        val maxElemLength = len - pos + 1
        builder += (if (elem.length <= maxElemLength) elem else elem.copy(length=maxElemLength))
        pos += elem.length
      }
      else {
        builder += elem
      }
    }

    Cigar(builder.result())
  }

  /**
    * Returns true if this Cigar is a prefix of the other cigar. Prefix is defined as having the same
    * operators for the same bases through the length of the cigar.  E.g.:
    *   10M is a prefix of 10M2S
    *   10M is a prefix of 10M
    *   10M is a prefix of 20M
    *   10M is not a prefix of 2S10M
    */
  def isPrefixOf(that: Cigar): Boolean = {
    if (that.elems.size < this.elems.size) {
      false
    }
    else {
      val lastIndex = this.elems.size - 1
      var isPrefix  = true
      var i = 0
      while (isPrefix && i <= lastIndex) {
        val lhs = this.elems(i)
        val rhs = that.elems(i)
        isPrefix = lhs.operator == rhs.operator && (if (i == lastIndex) lhs.length <= rhs.length else lhs.length == rhs.length)
        i += 1
      }

      isPrefix
    }
  }

  /** Returns a new Cigar that contains the same elements in the reverse order of this cigar. */
  def reverse: Cigar = Cigar(this.elems.reverse)

  /** Coalesces adjacent operators of the same type into single operators. */
  def coalesce: Cigar = {
    if (isCoalesced) {
      this
    }
    else {
      val builder = IndexedSeq.newBuilder[CigarElem]
      val iter   = iterator.bufferBetter

      while (iter.hasNext) {
        val elem = iter.next()
        val same = iter.takeWhile(_.operator == elem.operator).foldLeft(elem)((a,b) => CigarElem(a.operator, a.length + b.length))
        builder += same
      }

      Cigar(builder.result())
    }
  }

  /** Returns the canonical Cigar string. */
  override def toString(): String = elems.mkString

  /** Converts the Cigar into an HTSJDK Cigar object. */
  def toHtsjdkCigar: htsjdk.samtools.Cigar = {
    new htsjdk.samtools.Cigar(elems.iterator.map(e => new CigarElement(e.length, e.operator)).toJavaList)
  }
}


/** Companion object for Alignment. */
object Alignment {
  /** Construct an alignment using Strings for sequences instead of byte arrays. */
  def apply(query: String, target: String, queryStart: Int, targetStart: Int, cigar: Cigar, score: Int): Alignment = {
    new Alignment(query.getBytes, target.getBytes, queryStart, targetStart, cigar, score)
  }
}

/**
  * A general class to describe the alignment between two sequences or partial ranges thereof
  * @param query the query sequence
  * @param target the target sequence
  * @param queryStart the 1-based position in the query sequence where the alignment begins
  * @param targetStart the 1-based position in the target sequence where the alignment begins
  * @param cigar a [[Cigar]] object describing the alignment of the two sequences
  * @param score the alignment score
  */
case class Alignment(query: Array[Byte],
                     target: Array[Byte],
                     queryStart: Int,
                     targetStart: Int,
                     cigar: Cigar,
                     score: Int) {

  /** One based closed coordinate of the end of the alignment on the query sequence. */
  def queryEnd: Int = queryStart + cigar.lengthOnQuery - 1

  /** One based closed coordinate of the end of the alignment on the query sequence. */
  def targetEnd: Int = targetStart + cigar.lengthOnTarget - 1

  /**
    * Generates a padded text representation of the alignment for visualization. The returned
    * sequence will consist of three lines as follows (minus the labels on the left):
    *
    * query : ACGTGAACTGACT-ACTGTATGCG
    * align : |||||  |||||| ||||||||.|
    * target: ACGTG--CTGACTGACTGTATGGG
    *
    * @param matchChar the character to use in the alignment line when the bases match
    * @param mismatchChar the character to use in the alignment line when the bases mismatch
    * @param gapChar the character to use in the alignment line when one of the sequences is gapped
    * @param padChar the character to use in the query or target sequence lines when padding is required
    * @return Three lines representing the alignment
    */
  def paddedString(matchChar: Char = '|', mismatchChar: Char = '.', gapChar: Char = ' ', padChar: Char = '-'): Seq[String] = {
    val buffers = Seq(new StringBuilder, new StringBuilder, new StringBuilder)
    val Seq(queryBuffer, alignBuffer, targetBuffer) = buffers

    var qOffset = queryStart - 1
    var tOffset = targetStart - 1
    cigar.foreach { cig =>
      val len = cig.length

      cig.operator match {
        case CigarOperator.INSERTION =>
          forloop (from=0, until=len) { _ =>
            queryBuffer.append(this.query(qOffset).toChar)
            alignBuffer.append(gapChar)
            targetBuffer.append(padChar)
            qOffset += 1
          }
        case CigarOperator.DELETION =>
          forloop (from=0, until=len) { _ =>
            queryBuffer.append(padChar)
            alignBuffer.append(gapChar)
            targetBuffer.append(this.target(tOffset).toChar)
            tOffset += 1
          }
        case op =>
          forloop (from=0, until=len) { _ =>
            val q = this.query(qOffset).toChar
            val t = this.target(tOffset).toChar
            queryBuffer.append(q)
            targetBuffer.append(t)
            op match {
              case CigarOperator.EQ => alignBuffer.append(matchChar)
              case CigarOperator.X  => alignBuffer.append(mismatchChar)
              case CigarOperator.M  => alignBuffer.append(if (q == t) matchChar else mismatchChar)
              case other            => throw new IllegalStateException(s"Unsupported cigar operator: $other")
            }

            qOffset += 1
            tOffset += 1
          }
        }
      }

    buffers.map(_.toString())
  }

  /**
    * Returns a subset of the [[Alignment]] representing the region defined by `start` and `end` on the query
    * sequence. The returned alignment will contain the entire query and target sequences, but will have
    * adjusted `queryStart` and `targetStart` positions and an updated `cigar`.  The score is set to 0.
    *
    * @param start the 1-based inclusive position of the first base on the query sequence to include
    * @param end   the 1-based inclusive position of the last base on the query sequence to include
    * @return a new [[Alignment]] with updated coordinates and cigar
    */
  def subByQuery(start: Int, end: Int): Alignment = {
    require(start >= queryStart && start <= queryEnd, "start is outside of aligned region of query sequence")
    require(end   >= queryStart && end   <= queryEnd, "end is outside of aligned region of query sequence")
    sub(start, end, this.queryStart, _.operator.consumesReadBases())
  }

  /**
    * Returns a subset of the Alignment representing the region defined by `start` and `end` on the target
    * sequence. The returned alignment will contain the entire query and target sequences, but will have
    * adjusted `queryStart` and `targetStart` positions and an updated `cigar`.  The score is set to 0.
    *
    * @param start the 1-based inclusive position of the first base on the target sequence to include
    * @param end   the 1-based inclusive position of the last base on the target sequence to include
    * @return a new [[Alignment]] with updated coordinates and cigar
    */
  def subByTarget(start: Int, end: Int): Alignment = {
    require(start >= targetStart && start <= targetEnd, "start is outside of aligned region of target sequence")
    require(end   >= targetStart && end   <= targetEnd, "end is outside of aligned region of target sequence")
    sub(start, end, this.targetStart, _.operator.consumesReferenceBases())
  }

  /**
    * Private helper method that helps generate an Alignment that is a subset of the current alignment.
    *
    * @param start the start (on either the query or target sequence) of the desired window
    * @param end the end (on either the query or target sequence) of the desired window
    * @param initialStart the start position on the relevant sequence (either target OR query)
    * @param consumes a function that returns true if the cigar element passed as a parameter consumes
    *                 bases on the sequence on which `start` and `end` are expresssed
    * @return a new [[Alignment]] with adjusted start, end and cigar, and with score=0
    */
  private def sub(start: Int, end: Int, initialStart: Int, consumes: CigarElem => Boolean): Alignment = {
    val elems = IndexedSeq.newBuilder[CigarElem]
    var (qStart, tStart, currStart) = (queryStart, targetStart, initialStart) // currStart = start of current element

    this.cigar.foreach { elem =>
      // If the current element consumes the chosen sequence, calculate the end, else set to the same as start

      val elementConsumes = consumes(elem)
      val currEnd = if (elementConsumes) currStart + elem.length - 1 else currStart-1

      if (currEnd < start) {
        // Element before the desired window, need to bump start positions
        qStart += elem.lengthOnQuery
        tStart += elem.lengthOnTarget
        if (elementConsumes) currStart += elem.length
      }
      else if (currStart > end) {
        // Don't include, beyond the range we're interested
      }
      else if (currStart >= start && currEnd <= end) {
        // Contained within the target region
        // Only add the element if it a) consumes bases or b) isn't right at the boundary (to avoid trailing gaps)
        if (elementConsumes || currStart != start) {
          elems += elem
          if (elementConsumes) currStart += elem.length
        }
      }
      else {
        // Element overlaps the desired region, may enclose, be enclosed by or straddle
        var len = elem.length // <- the length of the element to be added, after any clipping

        if (currStart < start) {
          // Element is split over the start of the desired region
          val diff = start - currStart
          len -= diff
          if (elem.operator.consumesReadBases())      qStart += diff
          if (elem.operator.consumesReferenceBases()) tStart += diff
          currStart += diff
        }

        if (currEnd > end) {
          // Element is split over the end of the desired region
          len -= (currEnd - end)
        }

        elems += CigarElem(elem.operator, len)
        currStart += len
      }
    }

    copy(queryStart=qStart, targetStart=tStart, cigar=Cigar(elems.result()), score=0)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy