All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.umi.SimpleConsensusCaller.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.umi

import com.fulcrumgenomics.FgBioDef.forloop
import com.fulcrumgenomics.util.NumericTypes.{LogProbability, PhredScore}

import scala.collection.immutable.BitSet

/**
  * A class that can be mixed in to call sequences represented as strings of the same length.
  *
  * If a non-ACGTN bases is encountered at any position in the strings, that position is not consensus called, and the
  * character from the first sequenced is used.  All strings should have the same non-ACGTN character at this position.
  *
  * @param errorRatePreLabeling The error probability prior to adding the UMI.  By default, set to almost surely zero.
  * @param errorRatePostLabeling The error probability post to adding the UMI.  By default, set to almost surely zero
  * @param qError The adjusted error probability.  By default set to Q20.
  */
private[umi] class SimpleConsensusCaller(val errorRatePreLabeling: Byte = 90.toByte,
                                         val errorRatePostLabeling: Byte = 90.toByte,
                                         qError: PhredScore = 20.toByte
                                        ) {
  val pError: LogProbability = LogProbability.fromPhredScore(this.qError)
  val pTruth: LogProbability = LogProbability.not(this.pError)

  /** A consensus caller used to generate consensus UMI sequences */
  private val consensusBuilder = new ConsensusCaller(
    errorRatePreLabeling  = this.errorRatePreLabeling,
    errorRatePostLabeling = this.errorRatePostLabeling
  ).builder()

  private val DnaBases = Set('A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n')
  private val DnaBasesBitSet = BitSet(DnaBases.map(_.toInt).toSeq:_*)

  /** Calls a simple consensus sequences from a set of sequences all the same length. */
  def callConsensus(sequences: Seq[String]): String = {
    require(sequences.nonEmpty, "Can't call consensus on an empty set of sequences!")

    if (sequences.length == 1) sequences.head else {
      require(
        sequences.forall(_.length == sequences.head.length),
        s"Sequences must all have the same length. Found ${sequences.mkString(", ")}"
      )
      val buffer = new StringBuilder
      val firstRead  = sequences.head
      val readLength = firstRead.length
      val sequencesLength = sequences.length

      forloop (from=0, until=readLength) { i =>
        this.consensusBuilder.reset()
        var nonDna = 0
        sequences.foreach { sequence =>
          val char = sequence.charAt(i)
          if (!this.DnaBasesBitSet.contains(char.toInt)) {
            nonDna += 1
            // verify that all non-DNA bases are the same character
            require(firstRead.charAt(i) == char,
              s"Sequences must have character '${firstRead.charAt(i)}' at position $i, found '$char'")
          }
          else this.consensusBuilder.add(char.toByte, qError)
        }

        if (nonDna == 0) buffer.append(this.consensusBuilder.call()._1.toChar)
        else if (nonDna == sequencesLength) buffer.append(firstRead.charAt(i)) // NB: we have previously verified they are all the same character
        else throw new IllegalStateException(s"Sequences contained a mix of DNA and non-DNA characters at offset $i: $sequences")
      }

      buffer.toString()
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy