All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.util.Sequences.scala Maven / Gradle / Ivy

The newest version!
/**
 * Copyright (c) 2016, Fulcrum Genomics LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package com.fulcrumgenomics.util

import com.fulcrumgenomics.FgBioDef._
import htsjdk.samtools.util.SequenceUtil

import scala.annotation.switch


/**
  * Utility methods for working with DNA or RNA sequences
  */
object Sequences {
  /** Common contig/chrom names for non-autosomal sequences in mammals. */
  val CommonNonAutosomalContigNames: Seq[String] = Seq("M", "chrM", "MT", "X", "chrX", "Y", "chrY")

  /** Array for determining what bases are compatible. Indices are all DNA/RNA bases. Contents
    * are Int masks with a bit set for each possible base.  U is treated as identical to T.
    */
  private val IupacMasks: Array[Int] = {
    val masks = new Array[Int](Byte.MaxValue)
    val (a, c, g, t) = (1, 2, 4, 8)

    def up(c: Char): Int = c.toUpper.toInt
    def down(c: Char): Int = c.toLower.toInt

    Seq(up(_), down(_)).foreach { f =>
      masks(f('A')) = a
      masks(f('C')) = c
      masks(f('G')) = g
      masks(f('T')) = t
      masks(f('U')) = t
      masks(f('M')) = a | c
      masks(f('R')) = a | g
      masks(f('W')) = a | t
      masks(f('S')) = c | g
      masks(f('Y')) = c | t
      masks(f('K')) = g | t
      masks(f('V')) = a | c | g
      masks(f('H')) = a | c | t
      masks(f('D')) = a | g | t
      masks(f('B')) = c | g | t
      masks(f('N')) = a | c | g | t
    }

    masks
  }

  /** Returns true if two bases are compatible.  Compatibility is defined as having any overlap in the
    * set of acceptable bases.  E.g.:
    *   - `compatible('T', 't') == true`
    *   - `compatible('T', 'U') == true`
    *   - `compatible('A', 'R') == true`
    *   - `compatible('M', 'S') == true`
    *   - `compatible('A', 'C') == false`
    *   - `compatible('M', 'K') == false`
    *
    * @param base1 the first base to be compared
    * @param base2 the second base to be compared
    * @return true if the bases share at least one concrete base in common, false otherwise
    */
  final def compatible(base1: Byte, base2: Byte): Boolean = base1 == base2 || (IupacMasks(base1) & IupacMasks(base2)) > 0

  /**
    * Calculates GC content for a DNA sequence as a fraction (between 0 and 1).
    *
    * @param s a DNA sequence
    * @return the fraction GC content for the given sequence, zero if the sequence is empty
    */
  def gcContent(s: String): Double = if (s.isEmpty) 0 else SequenceUtil.calculateGc(s.getBytes)

  /** Counts the number of mismatches between two sequences of the same length. */
  def countMismatches(s1: String, s2: String): Int = {
    require(s1.length == s2.length, s"Cannot count mismatches in strings of differing lengths: $s1 $s2")

    var count = 0
    forloop (from=0, until=s1.length) { i =>
      val a = Character.toUpperCase(s1.charAt(i))
      val b = Character.toUpperCase(s2.charAt(i))
      if (a != b) count += 1
    }

    count
  }

  /**
    * Returns the number of homopolymers of a given minimum length within the sequence.
    *
    * @param s a DNA or RNA sequence
    * @param minLength the minimum length of homopolymer to count (must be greater than 0)
    * @return the number of homopolymers of length >= minLength in the sequence
    */
  def homopolymers(s: String, minLength: Int): Int = {
    require(minLength > 0, "minLength must be greater than 0")
    var (homopolymers, currStreak) = (0, 1)
    forloop(1, s.length) { basePosition =>
      if (currStreak == minLength) homopolymers += 1
      if (basesEqual(s(basePosition), s(basePosition - 1))) currStreak += 1 else currStreak = 1
    }

    homopolymers
  }

  /** Class to store the zero-based offset and length of match for various properties of a sequence.  For example, see
    * [[longestHomopolymer]] and [[longestDinuc]]. */
  case class OffsetAndLength(offset: Int, length: Int)

  /**
    * Returns the offset (0-based) and length of the longest homopolymer in the sequence. In the case
    * that there are multiple homopolymers of the same length, the earliest one is returned.
    *
    * @param s a DNA or RNA sequence
    * @return the offset and length of the longest homopolymer
    */
  def longestHomopolymer(s: String) : OffsetAndLength = {
    var (bestStart, bestLength) = (0,0)
    forloop(0, s.length) { start =>
      val firstBase = s.charAt(start).toByte
      var i = start
      while (i < s.length && basesEqual(firstBase, s.charAt(i))) i += 1
      val length = i - start
      if (length > bestLength) {
        bestStart = start
        bestLength = length
      }
    }

    OffsetAndLength(bestStart, bestLength)
  }

  /**
    * Returns the offset (0-based) and length of the longest dinucleotide run in the sequence. In the case
    * that there are multiple dinucleotide runs of the same length, the earliest one is returned.
    *
    * @param s a DNA or RNA sequence
    * @return the offset and length of the longest dinucleotide sequence
    */
  def longestDinuc(s: String) : OffsetAndLength = {
    var (bestStart, bestLength) = (0,0)
    forloop(0, s.length-1) { start =>
      val (b1, b2) = (s.charAt(start).toByte, s.charAt(start+1).toByte)
      var i = start
      while (i < s.length-1 && basesEqual(b1, s.charAt(i)) && basesEqual(b2, s.charAt(i+1))) i += 2
      val length = i - start
      if (length > bestLength) {
        bestStart = start
        bestLength = length
      }
    }

    OffsetAndLength(bestStart, bestLength)
  }

  /** Reverse complements a string of bases. */
  def revcomp(s: String): String = {
    val bs = s.getBytes
    revcomp(bs)
    new String(bs)
  }

  /** Returns the sequence that is the complement of the provided sequence. */
  def complement(s: String): String = {
    val bs = s.getBytes
    complement(bs)
    new String(bs)
  }

  /** Reverse complements an array of bases in place. See [[complement()]] for how bases are complemented. */
  def revcomp(bs: Array[Byte]): Unit = {
    var (i, j) = (0, bs.length - 1)
    while (i < j) {
      val tmp = bs(i)
      bs(i) = complement(bs(j))
      bs(j) = complement(tmp)
      i += 1
      j -= 1
    }

    if (i == j) bs(i) = complement(bs(i))
  }

  /**
    * Complements the bases in the array in place. See [[complement()]] for how complementing is performed.
    *
    * @param bs an array of bases as bytes to be complemented in place
    */
  def complement(bs: Array[Byte]): Unit = forloop (from=0, until=bs.length) { i => bs(i) = complement(bs(i)) }

  /** Complements a single base. `U` and `u` are always complemented to `A` and `a` respectively.
    * IUPAC ambiguity codes are handled appropriately - they are complemented to the IUPAC code that represents
    * the set of bases that can pair with the input IUPAC code.
    *
    * Any byte that is not a valid upper- or lower-case DNA base, RNA base or IUPAC code will cause
    * an exception to be thrown.
    *
    * @param b the base to be complemented
    * @return the complement of that base
    */
  @inline final def complement(b: Byte): Byte = (b: @switch) match {
    // Discrete bases
    case 'A' => 'T'
    case 'C' => 'G'
    case 'G' => 'C'
    case 'T' => 'A'
    case 'U' => 'A'
    // IUPAC codes that represent two bases
    case 'M' => 'K'
    case 'K' => 'M'
    case 'R' => 'Y'
    case 'Y' => 'R'
    case 'W' => 'W'
    case 'S' => 'S'
    // IUPAC codes that represent three bases
    case 'B' => 'V'
    case 'V' => 'B'
    case 'H' => 'D'
    case 'D' => 'H'
    // IUPAC universal code
    case 'N' => 'N'

    // Discrete bases
    case 'a' => 't'
    case 'c' => 'g'
    case 'g' => 'c'
    case 't' => 'a'
    case 'u' => 'a'
    // IUPAC codes that represent two bases
    case 'm' => 'k'
    case 'k' => 'm'
    case 'r' => 'y'
    case 'y' => 'r'
    case 'w' => 'w'
    case 's' => 's'
    // IUPAC codes that represent three bases
    case 'b' => 'v'
    case 'v' => 'b'
    case 'h' => 'd'
    case 'd' => 'h'
    // IUPAC universal code
    case 'n' => 'n'

    case ch  => throw new IllegalArgumentException(s"Don't know how to complement '$ch'.")
  }

  /** Compares if two bases are equal ignoring case.  The bases may be IUPAC codes, but their relationships are not
    * considered. */
  @inline private def basesEqual(b1: Byte, b2: Char): Boolean = SequenceUtil.basesEqual(b1, b2.toByte)

  /** Compares if two bases are equal ignoring case.  The bases may be IUPAC codes, but their relationships are not
    * considered. */
  @inline private def basesEqual(b1: Char, b2: Char): Boolean = SequenceUtil.basesEqual(b1.toByte, b2.toByte)

  /** Returns the IUPAC code for the set of bases given. */
  def iupacCode(bases: Iterable[Byte]): Byte = {
    val mask: Int = bases.foldLeft(0) { case (code, next) => code | IupacMasks(next) }
    IupacMasks.indexOf(mask).toByte
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy