All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.testing.VcfBuilder.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2019 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.testing

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.testing.VcfBuilder.Gt
import com.fulcrumgenomics.vcf.api.Allele.NoCallAllele
import com.fulcrumgenomics.vcf.api.Variant.PassingFilters
import com.fulcrumgenomics.vcf.api._

import java.nio.file.Files
import scala.collection.immutable.ListMap
import scala.collection.mutable
import scala.util.Try

object VcfBuilder {
  /** The default header that is used when making a new [[VcfBuilder]]. */
  val DefaultHeader: VcfHeader = {
    val contigs = new SamBuilder().dict
      .map(s => VcfContigHeader(s.index, s.name, length=Some(s.length), assembly=s.assembly))
      .toIndexedSeq

    VcfHeader(
      contigs = contigs,
      infos   = Seq(
        VcfInfoHeader(id="AC",  count=VcfCount.OnePerAltAllele, kind=VcfFieldType.Integer, description="Alternate allele counts in genotypes."),
        VcfInfoHeader(id="DP",  count=VcfCount.Fixed(1),        kind=VcfFieldType.Integer, description="Depth across all samples.")
      ),
      formats = Seq(
        VcfFormatHeader(id="GT", count=VcfCount.Fixed(1),       kind=VcfFieldType.String, description="Genotype string."),
        VcfFormatHeader(id="AD", count=VcfCount.OnePerAllele,   kind=VcfFieldType.Integer, description="Depth per allele."),
        VcfFormatHeader(id="GQ", count=VcfCount.Fixed(1),       kind=VcfFieldType.Integer, description="Genotype quality."),
        VcfFormatHeader(id="PL", count=VcfCount.OnePerGenotype, kind=VcfFieldType.Integer, description="Phred scaled genotype likelihoods.")
      ),
      filters = Seq(
        VcfFilterHeader(id="LowQD", description="Low Quality/Depth value"),
        VcfFilterHeader(id="LowAB", description="Low/poor allele balance.")
      ),
      others  = Seq(
        VcfGeneralHeader(headerType="ALT", id="NON_REF", data=Map("Description" -> "Represents any non-reference allele."))
      ),
      samples = IndexedSeq()
    )
  }

  /**
    * Convenience case class used to build up genotypes.
    *
    * @param sample the name of the sample being genotyped
    * @param gt a genotype either in VCF format (e.g. 0/1) or using actual alleles (e.g. A/T)
    * @param dp optional depth (DP)
    * @param ad optional set of allele depths (AD)
    * @param pl optional set of phred-scale genotype likelihoods (PL)
    * @param attrs extended attributes (must be in FORMAT headers)
    */
  case class Gt(sample: String,
                gt: String,
                dp: Int = -1,
                ad: Seq[Int] = Seq.empty,
                pl: Seq[Int] = Seq.empty,
                attrs: Map[String,Any] = Map.empty
               ) {

    def allAttrs: Map[String, Any] = {
      attrs ++
        (if (dp < 0) Map.empty else Map("DP" -> dp)) ++
        (if (ad.isEmpty) Map.empty else Map("AD" -> ad.toIndexedSeq)) ++
        (if (pl.isEmpty) Map.empty else Map("DP" -> pl.toIndexedSeq))
    }
  }

  /** Constructs a VcfBuilder using the supplied header. */
  def apply(header: VcfHeader): VcfBuilder = new VcfBuilder(header)

  /** Constructs a VcfBuilder using the [[DefaultHeader]] and the set of samples provided. */
  def apply(samples: Seq[String]): VcfBuilder = {
    require(samples.distinct.size == samples.size, s"${samples.mkString(",")} contains duplicate sample names.")
   this.apply(DefaultHeader.copy(samples=samples.toIndexedSeq))
  }
}

/** Class for building VCFs for testing purposes. */
class VcfBuilder private (initialHeader: VcfHeader) extends Iterable[Variant] {
  /** Genomic location as (sequence_index, position). */
  private type Location = (Int, Int)
  private var _header: VcfHeader = initialHeader
  private val variants: mutable.Map[Location, Variant] = mutable.TreeMap()

  /** Provides access to the header that will be written to the VCF. */
  def header: VcfHeader = _header

  /** Adds one or more INFO headers to the VCF Header. */
  def withInfoHeaders(headers: VcfInfoHeader*): this.type = {
    this._header = this._header.copy(infos = this._header.infos ++ headers)
    this
  }

  /** Adds one or more FORMAT headers to the VCF Header. */
  def withFormatHeaders(headers: VcfFormatHeader*): this.type = {
    this._header = this._header.copy(formats = this._header.formats ++ headers)
    this
  }

  /** Adds one or more FILTER headers to the VCF Header. */
  def withFilterHeaders(headers: VcfFilterHeader*): this.type = {
    this._header = this._header.copy(filters = this._header.filters ++ headers)
    this
  }

  /** Adds one or more non-INFO/FORMAT/FILTER headers to the VCF Header. */
  def withOtherHeaders(headers: VcfGeneralHeader*): this.type = {
    this._header = this._header.copy(others = this._header.others++ headers)
    this
  }

  /** Adds a variant to the set being built.  The variant should contain all information required as it is not
    * possible to update a variant once added. If a variant already exists at the given position an exception
    * will be thrown.
    *
    * The genotypes are specified using instance of the [[Gt]] helper class.  The genotype strings within the [[Gt]]
    * objects may be either numeric like in a VCF (e.g. `0/1`) or use allele strings (e.g. `A/T`.)
    *
    * The variant must also be valid, e.g. not reference INFO, FILTER or FORMAT fields that are no in the header,
    * and not have alleles in genotypes that are not in the set of alleles for the variant.
    *
    * If the header contains more samples than there are genotype give, simple diploid no-call genotypes will
    * be inserted for the remaining samples.
    */
  def add(chrom: String = this.header.contigs.head.name,
          pos: Int,
          id: String = ".",
          alleles: Seq[String],
          qual: Int = -1,
          info: Map[String, Any] = Map.empty,
          filters: Iterable[String] = Set.empty,
          gts: Seq[Gt] = Seq.empty
         ): this.type = {

    require(_header.dict(chrom).index >= 0, s"Unknown chrom: $chrom")
    val key = (_header.dict(chrom).index, pos)

    // Make sure things are relatively valid
    require(!variants.contains(key), s"Variant already exists at position $chrom:$pos")
    require(alleles.nonEmpty, s"Must specify at least one allele.")
    info.keys.foreach(k => require(this._header.info.contains(k), s"No INFO header for key $k"))
    filters.filterNot(PassingFilters.contains).foreach(f => require(this._header.filter.contains(f), s"No FILTER header for key $f"))
    require(gts.map(_.sample).toSet.size == gts.size, s"Non-unique sample names in genotypes.")

    val alleleSet = AlleleSet(alleles:_*)

    val calledGenotypes = gts.map { g =>
      val phased    = g.gt.contains("|")
      val separator = if (phased) '|' else '/'
      val calls     = g.gt.split(separator).map(s => toAllele(s, alleleSet)).toIndexedSeq

      Genotype(
        alleles = alleleSet,
        sample  = g.sample,
        calls   = calls,
        phased  = phased,
        attrs   = g.allAttrs
      )
    }

    val noCalls = _header.samples.diff(calledGenotypes.map(_.sample)).map { s =>
      Genotype(alleles=alleleSet, sample=s, calls=IndexedSeq(NoCallAllele, NoCallAllele))
    }

    variants.put(key, Variant(
      chrom     = chrom,
      pos       = pos,
      id        = if (id == ".") None else Some(id),
      alleles   = alleleSet,
      qual      = if (qual < 0) None else Some(qual),
      attrs     = ListMap(info.toSeq:_*),
      filters   = filters.toSet,
      genotypes = (calledGenotypes ++ noCalls).map(gt => gt.sample -> gt).toMap
    ))

    this
  }

  /** Converts either a numeric index or an allele string into an Allele. */
  private def toAllele(s: String, alleles: AlleleSet): Allele = {
    Try { alleles(s.toInt) }
      .recover { case _ => Allele(s) }
      .get
  }

  /** Returns an iterator over the variants in chromosomal order. */
  def iterator: Iterator[Variant] = this.variants.valuesIterator

  /** Writes the contents of the record set to the provided file path. */
  def write(path: PathToVcf) : Unit = {
    val writer = VcfWriter(path, this._header)
    writer ++= iterator
    writer.close()
  }

  /** Writes the contents to a temporary file that will be deleted when the JVM exits. */
  def toTempFile(deleteOnExit: Boolean = true): PathToVcf = {
    val path = Files.createTempFile("VcfRecordSet.", ".vcf.gz")
    if (deleteOnExit) path.toFile.deleteOnExit()
    write(path)
    path
  }

  /** Creates a VcfSource over the records stored in a temporary file. */
  def toSource: VcfSource = VcfSource(toTempFile())
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy