
com.fulcrumgenomics.umi.CollectDuplexSeqMetrics.scala Maven / Gradle / Ivy
The newest version!
/*
* The MIT License
*
* Copyright (c) 2017 Fulcrum Genomics LLC
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package com.fulcrumgenomics.umi
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.Bams
import com.fulcrumgenomics.bam.api.{SamRecord, SamSource}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.collection.BetterBufferedIterator
import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util._
import htsjdk.samtools.util.{Interval, IntervalList, Murmur3, OverlapDetector}
import org.apache.commons.math3.distribution.BinomialDistribution
import scala.collection.mutable
import scala.util.Failure
/**
* Companion object for CollectDuplexSeqMetrics that contains various constants and types,
* including all the various [[com.fulcrumgenomics.util.Metric]] sub-types produced by the program.
*/
object CollectDuplexSeqMetrics {
// File extensions for all the files produced
val FamilySizeMetricsExt: String = ".family_sizes.txt"
val DuplexFamilySizeMetricsExt: String = ".duplex_family_sizes.txt"
val UmiMetricsExt: String = ".umi_counts.txt"
val DuplexUmiMetricsExt: String = ".duplex_umi_counts.txt"
val YieldMetricsExt: String = ".duplex_yield_metrics.txt"
val PlotsExt: String = ".duplex_qc.pdf"
private val PlottingScript = "com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.R"
/** Contains an AB and BA count of reads. */
private case class Pair(ab: Int, ba: Int)
/**
* Metrics produced by `CollectDuplexSeqMetrics` to quantify the distribution of different kinds of read family
* sizes. Three kinds of families are described:
*
* 1. _CS_ or _Coordinate & Strand_: families of reads that are grouped together by their unclipped 5'
* genomic positions and strands just as they are in traditional PCR duplicate marking
* 2. _SS_ or _Single Strand_: single-strand families that are each subsets of a CS family create by
* also using the UMIs to partition the larger family, but not linking up families that are
* created from opposing strands of the same source molecule.
* 3. _DS_ or _Double Strand_: families that are created by combining single-strand families that are from
* opposite strands of the same source molecule. This does **not** imply that all DS families are composed
* of reads from both strands; where only one strand of a source molecule is observed a DS family is
* still counted.
*
* @param family_size The family size, i.e. the number of read pairs grouped together into a family.
* @param cs_count The count of families with `size == family_size` when grouping just by coordinates and strand information.
* @param cs_fraction The fraction of all _CS_ families where `size == family_size`.
* @param cs_fraction_gt_or_eq_size The fraction of all _CS_ families where `size >= family_size`.
* @param ss_count The count of families with `size == family_size` when also grouping by UMI to create single-strand families.
* @param ss_fraction The fraction of all _SS_ families where `size == family_size`.
* @param ss_fraction_gt_or_eq_size The fraction of all _SS_ families where `size >= family_size`.
* @param ds_count The count of families with `size == family_size`when also grouping by UMI and merging single-strand
* families from opposite strands of the same source molecule.
* @param ds_fraction The fraction of all _DS_ families where `size == family_size`.
* @param ds_fraction_gt_or_eq_size The fraction of all _DS_ families where `size >= family_size`.
*/
case class FamilySizeMetric(family_size: Int,
var cs_count: Count = 0,
var cs_fraction: Proportion = 0,
var cs_fraction_gt_or_eq_size: Proportion = 0,
var ss_count: Count = 0,
var ss_fraction: Proportion = 0,
var ss_fraction_gt_or_eq_size: Proportion = 0,
var ds_count: Count = 0,
var ds_fraction: Proportion = 0,
var ds_fraction_gt_or_eq_size: Proportion =0
) extends Metric
/**
* Metrics produced by `CollectDuplexSeqMetrics` to describe the distribution of double-stranded (duplex)
* tag families in terms of the number of reads observed on each strand.
*
* We refer to the two strands as `ab` and `ba` because we identify the two strands by observing the same pair of
* UMIs (A and B) in opposite order (A->B vs B->A). Which strand is `ab` and which is `ba` is largely arbitrary, so
* to make interpretation of the metrics simpler we use a definition here that for a given tag family
* `ab` is the sub-family with more reads and `ba` is the tag family with fewer reads.
*
* @param ab_size The number of reads in the `ab` sub-family (the larger sub-family) for this double-strand tag family.
* @param ba_size The number of reads in the `ba` sub-family (the smaller sub-family) for this double-strand tag family.
* @param count The number of families with the `ab` and `ba` single-strand families of size `ab_size` and `ba_size`.
* @param fraction The fraction of all double-stranded tag families that have `ab_size` and `ba_size`.
* @param fraction_gt_or_eq_size The fraction of all double-stranded tag families that have
* `ab reads >= ab_size` and `ba reads >= ba_size`.
*/
case class DuplexFamilySizeMetric(ab_size: Int,
ba_size: Int,
count: Count = 0,
var fraction: Proportion = 0,
var fraction_gt_or_eq_size: Proportion = 0
) extends Metric with Ordered[DuplexFamilySizeMetric] {
/** Orders by ab_size, then ba_size. */
override def compare(that: DuplexFamilySizeMetric): Int = {
var retval = this.ab_size - that.ab_size
if (retval == 0) retval = this.ba_size - that.ba_size
retval
}
}
/**
* Metrics produced by `CollectDuplexSeqMetrics` that are sampled at various levels of coverage, via random
* downsampling, during the construction of duplex metrics. The downsampling is done in such a way that the
* `fraction`s are approximate, and not exact, therefore the `fraction` field should only be interpreted as a guide
* and the `read_pairs` field used to quantify how much data was used.
*
* See `FamilySizeMetric` for detailed definitions of `CS`, `SS` and `DS` as used below.
*
* @param fraction The approximate fraction of the full dataset that was used to generate the remaining values.
* @param read_pairs The number of read pairs upon which the remaining metrics are based.
* @param cs_families The number of _CS_ (Coordinate & Strand) families present in the data.
* @param ss_families The number of _SS_ (Single-Strand by UMI) families present in the data.
* @param ds_families The number of _DS_ (Double-Strand by UMI) families present in the data.
* @param ds_duplexes The number of _DS_ families that had the minimum number of observations on both strands to be
* called duplexes (default = 1 read on each strand).
* @param ds_fraction_duplexes The fraction of _DS_ families that are duplexes (`ds_duplexes / ds_families`).
* @param ds_fraction_duplexes_ideal The fraction of _DS_ families that should be duplexes under an idealized model
* where each strand, `A` and `B`, have equal probability of being sampled, given
* the observed distribution of _DS_ family sizes.
*/
case class DuplexYieldMetric(fraction: Proportion,
read_pairs: Count,
cs_families: Count,
ss_families: Count,
ds_families: Count,
ds_duplexes: Count,
ds_fraction_duplexes: Proportion,
ds_fraction_duplexes_ideal: Proportion) extends Metric
/**
* Metrics produced by `CollectDuplexSeqMetrics` describing the set of observed UMI sequences and the
* frequency of their observations. The UMI sequences reported may have been corrected using information
* within a double-stranded tag family. For example if a tag family is comprised of three read pairs with
* UMIs `ACGT-TGGT`, `ACGT-TGGT`, and `ACGT-TGGG` then a consensus UMI of `ACGT-TGGT` will be generated,
* and three raw observations counted for each of `ACGT` and `TGGT`, and no observations counted for `TGGG`.
*
* @param umi The UMI sequence, possibly-corrected.
* @param raw_observations The number of read pairs in the input BAM that observe the UMI (after correction).
* @param raw_observations_with_errors The subset of raw-observations that underwent any correction.
* @param unique_observations The number of double-stranded tag families (i.e unique double-stranded molecules)
* that observed the UMI.
* @param fraction_raw_observations The fraction of all raw observations that the UMI accounts for.
* @param fraction_unique_observations The fraction of all unique observations that the UMI accounts for.
*/
case class UmiMetric(umi: String,
var raw_observations: Count = 0,
var raw_observations_with_errors: Count = 0,
var unique_observations: Count = 0,
var fraction_raw_observations: Proportion = 0,
var fraction_unique_observations: Proportion = 0
) extends Metric
/**
* Metrics produced by `CollectDuplexSeqMetrics` describing the set of observed duplex UMI sequences and the
* frequency of their observations. The UMI sequences reported may have been corrected using information
* within a double-stranded tag family. For example if a tag family is comprised of three read pairs with
* UMIs `ACGT-TGGT`, `ACGT-TGGT`, and `ACGT-TGGG` then a consensus UMI of `ACGT-TGGT` will be generated.
*
* UMI pairs are normalized within a tag family so that observations are always reported as if they came
* from a read pair with read 1 on the positive strand (F1R2). Another way to view this is that for FR or RF
* read pairs, the duplex UMI reported is the UMI from the positive strand read followed by the UMI from the
* negative strand read. E.g. a read pair with UMI `AAAA-GGGG` and with R1 on the negative strand and R2 on
* the positive strand, will be reported as `GGGG-AAAA`.
*
* @param umi The duplex UMI sequence, possibly-corrected.
* @param raw_observations The number of read pairs in the input BAM that observe the duplex UMI (after correction).
* @param raw_observations_with_errors The subset of raw observations that underwent any correction.
* @param unique_observations The number of double-stranded tag families (i.e unique double-stranded molecules)
* that observed the duplex UMI.
* @param fraction_raw_observations The fraction of all raw observations that the duplex UMI accounts for.
* @param fraction_unique_observations The fraction of all unique observations that the duplex UMI accounts for.
* @param fraction_unique_observations_expected The fraction of all unique observations that are expected to be
* attributed to the duplex UMI based on the `fraction_unique_observations`
* of the two individual UMIs.
*/
case class DuplexUmiMetric(umi: String,
var raw_observations: Count = 0,
var raw_observations_with_errors: Count = 0,
var unique_observations: Count = 0,
var fraction_raw_observations: Proportion = 0,
var fraction_unique_observations: Proportion = 0,
var fraction_unique_observations_expected: Proportion = 0
) extends Metric
}
@clp(group=ClpGroups.Umi, description=
"""
|Collects a suite of metrics to QC duplex sequencing data.
|
|## Inputs
|
|The input to this tool must be a BAM file that is either:
|
|1. The exact BAM output by the `GroupReadsByUmi` tool (in the sort-order it was produced in)
|2. A BAM file that has MI tags present on all reads (usually set by `GroupReadsByUmi` and has
| been sorted with `SortBam` into `TemplateCoordinate` order.
|
|Calculation of metrics may be restricted to a set of regions using the `--intervals` parameter. This
|can significantly affect results as off-target reads in duplex sequencing experiments often have very
|different properties than on-target reads due to the lack of enrichment.
|
|Several metrics are calculated related to the fraction of tag families that have duplex coverage. The
|definition of "duplex" is controlled by the `--min-ab-reads` and `--min-ba-reads` parameters. The default
|is to treat any tag family with at least one observation of each strand as a duplex, but this could be
|made more stringent, e.g. by setting `--min-ab-reads=3 --min-ba-reads=3`. If different thresholds are
|used then `--min-ab-reads` must be the higher value.
|
|## Outputs
|
|The following output files are produced:
|
|1. **
© 2015 - 2025 Weber Informatics LLC | Privacy Policy