All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.bam.EstimatePoolingFractions.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.bam

import java.lang.Math.{max, min}
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamSource
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util.{Io, Metric, Sequences}
import com.fulcrumgenomics.vcf.api.{Variant, VcfSource}
import htsjdk.samtools.util.SamLocusIterator.LocusInfo
import htsjdk.samtools.util._
import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression

import scala.collection.immutable.ArraySeq

@clp(group=ClpGroups.SamOrBam, description=
  """
    |Examines sequence data generated from a pooled sample and estimates the fraction of sequence data
    |coming from each constituent sample. Uses a VCF of known genotypes for the samples within the
    |mixture along with a BAM of sequencing data derived from the pool.  Performs a multiple regression
    |for the alternative allele fractions at each SNP locus, using as inputs the individual sample's genotypes.
    |Only SNPs that are bi-allelic within the pooled samples are used.
    |
    |Each sample's contribution of REF vs. ALT alleles at each site is derived in one of two ways: (1) if
    |the sample's genotype in the VCF has the `AF` attribute then the value from that field will be used, (2) if the
    |genotype has no `AF` attribute then the contribution is estimated based on the genotype (e.g. 0/0 will be 100%
    |ref, 0/1 will be 50% ref and 50% alt, etc.).
    |
    |Various filtering parameters can be used to control which loci are used:
    |
    |- _--intervals_ will restrict analysis to variants within the described intervals
    |- _--min-genotype-quality_ will filter out any site with any genotype with GQ < n
    |- _--min-mean-sample-coverage_ requires that the coverage of a site in the BAM be >= `min-mean-sample-coverage * n_samples`
    |- _--min-mapping-quality_ filters out reads in the BAM with MQ < n
    |- _--min-base-quality_ filters out bases in the BAM with Q < n
  """)
class EstimatePoolingFractions
(@arg(flag='v', doc="VCF of individual sample genotypes.")  val vcf: PathToVcf,
 @arg(flag='b', doc="Path to BAM file of sequencing data.") val bam: PathToBam,
 @arg(flag='o', doc="Output file to write with pooling fractions.") val output: FilePath,
 @arg(flag='l', minElements=0, doc="Zero or more set of regions to restrict analysis to.") val intervals: Seq[PathToIntervals] = Seq.empty,
 @arg(flag='s', minElements=0, doc="Optional subset of samples from VCF to use.") val samples: Seq[String] = Seq.empty,
 @arg(flag='n', doc="Non-autosomal chromosomes to avoid.") val nonAutosomes: Seq[String] = Sequences.CommonNonAutosomalContigNames,
 @arg(flag='g', doc="Minimum genotype quality. Use -1 to disable.") val minGenotypeQuality: Int = 30,
 @arg(flag='c', doc="Minimum (sequencing coverage @ SNP site / n_samples).") val minMeanSampleCoverage: Int = 6,
 @arg(flag='m', doc="Minimum mapping quality.") val minMappingQuality: Int = 20,
 @arg(flag='q', doc="Minimum base quality.") val minBaseQuality:Int = 5
) extends FgBioTool with LazyLogging {
  Io.assertReadable(vcf :: bam :: intervals.toList)

  private val Ci99Width = 2.58 // Width of a 99% confidence interval in units of std err

  /* Class to hold information about a single locus. */
  case class Locus(chrom: String, pos: Int, ref: Char, alt: Char, expectedSampleFractions: Array[Double], var observedFraction: Option[Double] = None)

  override def execute(): Unit = {
    val vcfReader   = VcfSource(vcf)
    val sampleNames = pickSamplesToUse(vcfReader)
    val intervals   = loadIntervals

    // Get the expected fractions from the VCF
    val vcfIterator = constructVcfIterator(vcfReader, intervals, ArraySeq.unsafeWrapArray(sampleNames))
    val loci = vcfIterator.filterNot(v => this.nonAutosomes.contains(v.chrom)).map { v => Locus(
      chrom = v.chrom,
      pos   = v.pos,
      ref   = v.alleles.ref.bases.charAt(0),
      alt   = v.alleles.alts.head.value.charAt(0),
      expectedSampleFractions = sampleNames.map { s =>
        val gt = v.gt(s)
        gt.get[IndexedSeq[Float]]("AF") match {
          case None      => if (gt.isHomRef) 0 else if (gt.isHet) 0.5 else 1.0  // requires bi-allelic sites and ploidy=2
          case Some(afs) => afs(0)
        }
      }
    )}.toArray
    vcfReader.safelyClose()

    logger.info(s"Loaded ${loci.length} bi-allelic SNPs from VCF.")

    val coveredLoci = fillObservedFractionAndFilter(loci, this.minMeanSampleCoverage * sampleNames.length)

    logger.info(s"Regressing on ${coveredLoci.length} of ${loci.length} that met coverage requirements.")
    val regression = new OLSMultipleLinearRegression
    regression.setNoIntercept(true) // Intercept should be at 0!
    regression.newSampleData(
      coveredLoci.map(_.observedFraction.getOrElse(unreachable("observed fraction must be defined"))),
      coveredLoci.map(_.expectedSampleFractions)
    )

    val regressionParams = regression.estimateRegressionParameters()
    val total            = regressionParams.sum
    val fractions        = regressionParams.map(_ / total)
    val stderrs          = regression.estimateRegressionParametersStandardErrors().map(_ / total)
    logger.info(s"R^2 = ${regression.calculateRSquared()}")
    logger.info(s"Sum of regression parameters = ${total}")

    val metrics = sampleNames.toSeq.zipWithIndex.map { case (sample, index) =>
      val sites      = coveredLoci.count(l => l.expectedSampleFractions(index) > 0)
      val singletons = coveredLoci.count(l => l.expectedSampleFractions(index) > 0 && l.expectedSampleFractions.sum == l.expectedSampleFractions(index))
      PoolingFractionMetric(
        sample             = sample,
        variant_sites      = sites,
        singletons         = singletons,
        estimated_fraction = fractions(index),
        standard_error     = stderrs(index),
        ci99_low           = max(0, fractions(index) - stderrs(index)*Ci99Width),
        ci99_high          = min(1, fractions(index) + stderrs(index)*Ci99Width))
    }

    Metric.write(output, metrics)

    if (regression.estimateRegressionParameters().exists(_ < 0)) {
      logger.error("#################################################################################")
      logger.error("# One or more samples is estimated to have fraction < 0. This is likely due to  #")
      logger.error("# incorrect samples being used, insufficient coverage and/or too few SNPs.      #")
      logger.error("#################################################################################")
      fail(1)
    }
  }

  /** Verify a provided sample list, or if none provided retrieve the set of samples from the VCF. */
  private def pickSamplesToUse(vcfIn: VcfSource): Array[String] = {
    if (this.samples.isEmpty) vcfIn.header.samples.toArray else {
      val samplesInVcf   = vcfIn.header.samples
      val missingSamples = samples.toSet -- samplesInVcf.toSet
      if (missingSamples.nonEmpty) fail(s"Samples not present in VCF: ${missingSamples.mkString(", ")}")
      else samples.toArray.sorted
    }
  }

  /** Loads up and merges all the interval lists provided. Returns None if no intervals were specified. */
  private def loadIntervals: Option[IntervalList] = this.intervals match {
    case head +: tail =>
      val list = IntervalList.fromFile(head.toFile)
      val dict = list.getHeader.getSequenceDictionary
      tail.foreach { f =>
        val tmp = IntervalList.fromFile(f.toFile)
        if (!SequenceUtil.areSequenceDictionariesEqual(dict, tmp.getHeader.getSequenceDictionary))
          fail(s"Sequence dictionaries differ between ${this.intervals.head} and ${f}")
        else
          list.addall(tmp.getIntervals)
      }
      Some(list.uniqued(false))
    case _ => None
  }

  /** Generates an iterator over non-filtered bi-allelic SNPs where all the required samples are genotyped. */
  def constructVcfIterator(in: VcfSource, intervals: Option[IntervalList], samples: Seq[String]): Iterator[Variant] = {
    val iterator: Iterator[Variant] = intervals match {
      case None     => in.iterator
      case Some(is) => is.flatMap(i => in.query(i.getContig, i.getStart, i.getEnd))
    }

    iterator
      .filter(v => v.filters.isEmpty || v.filters == Variant.PassingFilters)
      .filter(v => v.alleles.size == 2 && v.alleles.forall(a => a.value.length == 1))  // Just biallelic SNPs
      .filter(v => samples.map(v.gt).forall(gt => gt.isFullyCalled && (this.minGenotypeQuality <= 0 || gt.get[Int]("GQ").exists(_ >= this.minGenotypeQuality))))
      .map   (v => v.copy(genotypes=v.genotypes.filter { case (s, _) => samples.contains(s)} ))
      .filter(v => v.gts.flatMap(_.calls).distinct.size > 1)  // Not monomorphic
  }

  /** Constructs a SamLocusIterator that will visit every locus in the input. */
  def constructBamIterator(loci: Iterable[Locus]): Iterator[LocusInfo] = {
    val in = SamSource(this.bam)
    val intervals = new IntervalList(in.header)
    loci.foreach(l => intervals.add(new Interval(l.chrom, l.pos, l.pos)))
    val iterator = new SamLocusIterator(in.toSamReader, intervals.uniqued())
    iterator.setEmitUncoveredLoci(true)
    iterator.setIncludeNonPfReads(false)
    iterator.setMappingQualityScoreCutoff(this.minMappingQuality)
    iterator.setQualityScoreCutoff(this.minBaseQuality)
    javaIteratorAsScalaIterator(iterator)
  }

  /**
    * Fills in the observedFraction field for each locus that meets coverage and then returns
    * the subset of loci that met coverage.
    */
  def fillObservedFractionAndFilter(loci: Array[Locus], minCoverage: Int): Array[Locus] = {
    val locusIterator = constructBamIterator(loci)
    locusIterator.zip(loci.iterator).foreach { case (locusInfo, locus) =>
      if (locusInfo.getSequenceName != locus.chrom || locusInfo.getPosition != locus.pos) fail("VCF and BAM iterators out of sync.")

      // A gross coverage check here to avoid a lot of work; better check below
      if (locusInfo.getRecordAndOffsets.size() >= minCoverage) {
        val counts = BaseCounts(locusInfo)
        val (ref, alt) = (counts(locus.ref), counts(locus.alt))

        // Somewhat redundant with check above, but this protects against a large fraction
        // of Ns or other alleles, and also against a large proportion of overlapping reads
        if (ref + alt >= minCoverage) {
          locus.observedFraction = Some(alt / (ref + alt).toDouble)
        }
      }
    }

    loci.filter(_.observedFraction.isDefined)
  }
}

/**
  * Metrics produced by `EstimatePoolingFractions` to quantify the estimated proportion of a sample
  * mixture that is attributable to a specific sample with a known set of genotypes.
  *
  * @param sample The name of the sample within the pool being reported on.
  * @param variant_sites How many sites were examined at which the reported sample is known to be variant.
  * @param singletons How many of the variant sites were sites at which only this sample was variant.
  * @param estimated_fraction The estimated fraction of the pool that comes from this sample.
  * @param standard_error The standard error of the estimated fraction.
  * @param ci99_low  The lower bound of the 99% confidence interval for the estimated fraction.
  * @param ci99_high The upper bound of the 99% confidence interval for the estimated fraction.
  */
case class PoolingFractionMetric(sample: String,
                                 variant_sites: Count,
                                 singletons: Count,
                                 estimated_fraction: Proportion,
                                 standard_error: Double,
                                 ci99_low: Proportion,
                                 ci99_high: Proportion
                                ) extends Metric




© 2015 - 2025 Weber Informatics LLC | Privacy Policy