com.fulcrumgenomics.util.SampleBarcodeMetric.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fgbio_2.13 Show documentation
fgbio
The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.util


object SampleBarcodeMetric {
  /** Builds a metric for a sample barcode, leaving every count and fraction at its default value.
    *
    * @param barcodeName the name for the sample barcode.
    * @param libraryName the name of the library.
    * @param barcode the sample barcode bases.
    */
  def apply(barcodeName: String, libraryName: String, barcode: String): SampleBarcodeMetric = {
    new SampleBarcodeMetric(barcode_name=barcodeName, library_name=libraryName, barcode=barcode)
  }

  /** Computes values that require the summary counts across multiple barcode metrics, such as certain fractions.
    *
    * The metrics stored in `barcodeToMetrics` are updated in place:
    *  - `fraction_matches` and `ratio_this_barcode_to_best_barcode` are derived from the template counts of every
    *    entry (including the unmatched entry), and are only written when at least one template was counted.
    *  - `pf_fraction_matches` and `pf_ratio_this_barcode_to_best_barcode` are the pass-filter equivalents, and are
    *    only written when at least one pass-filter template was counted.
    *  - `pf_normalized_matches` is the number of pass-filter templates of an entry divided by the mean number of
    *    pass-filter templates per barcode, where the mean is taken over every entry '''except''' the unmatched
    *    entry.  It is only written when at least one pass-filter template was assigned to a barcode.
    *  - `frac_q20_bases` and `frac_q30_bases` are written for every entry with a non-zero number of bases.
    *
    * @param barcodeToMetrics the map in which barcode metrics per sample are stored.
    * @param noMatchBarcode the barcode for the unmatched templates.  This should be stored in `barcodeToMetrics`;
    *                       looking up a missing key in a plain `Map` throws a `NoSuchElementException`.
    */
  def finalizeMetrics(barcodeToMetrics: Map[String, SampleBarcodeMetric],
                      noMatchBarcode: String): Unit = {
    val noMatchMetric = barcodeToMetrics(noMatchBarcode)
    val allMetrics    = barcodeToMetrics.values.toVector
    // Reference comparison on purpose: case-class equality is structural, so `!=` would also drop any sample whose
    // counts happen to be identical to those of the unmatched entry.
    val assignedMetrics = allMetrics.filter(_ ne noMatchMetric)

    val totalTemplates: Long   = allMetrics.foldLeft(0L)(_ + _.templates)
    val totalPfTemplates: Long = allMetrics.foldLeft(0L)(_ + _.pf_templates)
    val totalPfAssigned: Long  = assignedMetrics.foldLeft(0L)(_ + _.pf_templates)

    if (totalTemplates > 0) {
      allMetrics.foreach { metric => metric.fraction_matches = metric.templates / totalTemplates.toDouble }
      val bestFraction = allMetrics.foldLeft(0d) { (best, metric) => math.max(best, metric.fraction_matches) }
      if (bestFraction > 0) {
        allMetrics.foreach { metric =>
          metric.ratio_this_barcode_to_best_barcode = metric.fraction_matches / bestFraction
        }
      }
    }

    if (totalPfTemplates > 0) {
      allMetrics.foreach { metric => metric.pf_fraction_matches = metric.pf_templates / totalPfTemplates.toDouble }
      val bestPfFraction = allMetrics.foldLeft(0d) { (best, metric) => math.max(best, metric.pf_fraction_matches) }
      if (bestPfFraction > 0) {
        allMetrics.foreach { metric =>
          metric.pf_ratio_this_barcode_to_best_barcode = metric.pf_fraction_matches / bestPfFraction
        }
      }
    }

    // The mean deliberately ignores the unmatched entry so that equally represented barcodes normalize to 1
    // regardless of how many templates failed to match.  A positive total implies `assignedMetrics` is non-empty,
    // so the division below cannot be by zero.
    if (totalPfAssigned > 0) {
      val meanPfPerBarcode: Double = totalPfAssigned.toDouble / assignedMetrics.size.toDouble
      allMetrics.foreach { metric => metric.pf_normalized_matches = metric.pf_templates / meanPfPerBarcode }
    }

    // Entries without any bases keep their existing fractions rather than dividing by zero.
    allMetrics.foreach { metric =>
      if (metric.total_number_of_bases > 0) {
        metric.frac_q20_bases = metric.q20_bases / metric.total_number_of_bases.toDouble
        metric.frac_q30_bases = metric.q30_bases / metric.total_number_of_bases.toDouble
      }
    }
  }
}

/**
  * Metrics for matching templates to sample barcodes primarily used in [[com.fulcrumgenomics.fastq.DemuxFastqs]].
  *
  * The number of templates will match the number of reads for an Illumina single-end sequencing run, while the number
  * of templates will be half the number of reads for an Illumina paired-end sequencing run (i.e. R1 & R2 observe the
  * same template).
  *
  * @param barcode_name the name for the sample barcode, typically the sample name from the SampleSheet.
  * @param library_name the name of the library, typically the library identifier from the SampleSheet.
  * @param barcode the sample barcode bases.  Dual index barcodes will have two sample barcode sequences delimited by a
  *                dash.
  * @param templates the total number of templates matching the given barcode.
  * @param pf_templates the total number of pass-filter templates matching the given barcode.
  * @param perfect_matches the number of templates that match perfectly the given barcode.
  * @param pf_perfect_matches the number of pass-filter templates that match perfectly the given barcode.
  * @param one_mismatch_matches the number of templates that match the given barcode with exactly one
  *                             mismatch.
  * @param pf_one_mismatch_matches the number of pass-filter templates that match the given barcode with exactly
  *                                one mismatch.
  * @param q20_bases the number of bases in a template with a quality score 20 or above
  * @param q30_bases the number of bases in a template with a quality score 30 or above
  * @param total_number_of_bases the total number of bases in the templates combined
  * @param fraction_matches the fraction of all templates that match the given barcode.
  * @param ratio_this_barcode_to_best_barcode the ratio of templates matching this barcode to templates
  *                                               matching the most prevalent barcode. For the most prevalent
  *                                               barcode this will be 1, for all others it will be less than 1 (except
  *                                               for the possible exception of when there are more unmatched templates
  *                                               than for any other barcode, in which case the value may be arbitrarily
  *                                               large).  One over the lowest number in this column gives you the
  *                                               fold-difference in representation between barcodes.
  * @param pf_fraction_matches the fraction of all pass-filter templates that match the given barcode.
  * @param pf_ratio_this_barcode_to_best_barcode the ratio of pass-filter templates matching this barcode to
  *                                                  pass-filter templates matching the most prevalent barcode. For the
  *                                                  most prevalent barcode this will be 1, for all others it will be
  *                                                  less than 1 (except for the possible exception of when there are
  *                                                  more unmatched templates than for any other barcode, in which
  *                                                  case the value may be arbitrarily large).  One over the lowest
  *                                                  number in this column gives you the fold-difference in
  *                                                  representation between barcodes.
  * @param pf_normalized_matches The "normalized" matches to each barcode. This is calculated as the number of
  *                              pass-filter templates matching this barcode over the mean of all pass-filter
  *                              templates matching any barcode (excluding unmatched). If all barcodes are
  *                              represented equally this will be 1.
  * @param frac_q20_bases the fraction of bases in a template with a quality score 20 or above
  * @param frac_q30_bases the fraction of bases in a template with a quality score 30 or above
  */
case class SampleBarcodeMetric
( // Identification of the sample barcode.
  var barcode_name: String                                     = "",
  var library_name: String                                     = "",
  var barcode: String                                          = "",
  // Raw counts, accumulated by `increment`.
  var templates: Metric.Count                                  = 0,
  var pf_templates: Metric.Count                               = 0,
  var perfect_matches: Metric.Count                            = 0,
  var pf_perfect_matches: Metric.Count                         = 0,
  var one_mismatch_matches: Metric.Count                       = 0,
  var pf_one_mismatch_matches: Metric.Count                    = 0,
  var q20_bases: Metric.Count                                  = 0,
  var q30_bases: Metric.Count                                  = 0,
  var total_number_of_bases: Metric.Count                      = 0,
  // Derived values; `increment` never touches these.
  var fraction_matches: Metric.Proportion                      = 0d,
  var ratio_this_barcode_to_best_barcode: Metric.Proportion    = 0d,
  var pf_fraction_matches: Metric.Proportion                   = 0d,
  var pf_ratio_this_barcode_to_best_barcode: Metric.Proportion = 0d,
  var pf_normalized_matches: Metric.Proportion                 = 0d,
  var frac_q20_bases: Metric.Proportion                        = 0d,
  var frac_q30_bases: Metric.Proportion                        = 0d
) extends Metric {

  /** Records a single template against this barcode, updating the raw counts in place.
    *
    * The template always counts towards `templates`; it additionally counts towards the perfect-match or
    * one-mismatch tallies when it has exactly zero or one mismatches respectively.  Each of these has a `pf_`
    * counterpart that is only incremented when `isPf` is true.  The base counts are updated unless the template
    * fails the filter and failing templates are being omitted.
    *
    * @param numMismatches number of mismatches between the template's barcode and this barcode
    * @param isPf true if the template passes QC
    * @param basesToAdd number of total bases in the record
    * @param q20Bases number of bases that have a quality score of 20 or higher
    * @param q30Bases number of bases that have a quality score of 30 or higher
    * @param omitFailing if failing reads are to be omitted from the output
    */
  def increment(numMismatches: Int,
                isPf: Boolean = true,
                basesToAdd: Int,
                q20Bases: Int,
                q30Bases: Int,
                omitFailing: Boolean): Unit = {
    // Contribution of this template to each pass-filter counter: one when it passes, otherwise nothing.
    val pfContribution = if (isPf) 1 else 0

    templates    += 1
    pf_templates += pfContribution

    numMismatches match {
      case 0 =>
        perfect_matches    += 1
        pf_perfect_matches += pfContribution
      case 1 =>
        one_mismatch_matches    += 1
        pf_one_mismatch_matches += pfContribution
      case _ => () // two or more mismatches are only reflected in the template totals
    }

    // Bases are skipped only for a failing template when failing templates are being omitted.
    val skipBases = omitFailing && !isPf
    if (!skipBases) {
      total_number_of_bases += basesToAdd
      q20_bases             += q20Bases
      q30_bases             += q30Bases
    }
  }
}