All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.testing.VariantContextSetBuilder.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.testing

import java.nio.file.Files
import java.util.Collections

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.fasta.SequenceDictionary
import htsjdk.variant.variantcontext._
import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriterBuilder}
import htsjdk.variant.vcf.{VCFFileReader, VCFHeader, VCFHeaderLine}

import scala.collection.JavaConverters._
import scala.collection.compat._
import scala.collection.mutable.ListBuffer
import scala.reflect.ClassTag

object VariantContextSetBuilder {
  @deprecated(since="0.9.0", message="Use com.fulcrumgenomics.testing.VcfBuilder instead.")
  def apply(sampleName: String): VariantContextSetBuilder = {
    new VariantContextSetBuilder(List(sampleName))
  }
}

/**
  * A class to make artificial variant contexts and VCFs just a little bit easier to generate.
  *
  * This builder uses the default sequence dictionary from [SamBuilder] by default.
  */
@deprecated(since="0.9.0", message="Use com.fulcrumgenomics.testing.VcfBuilder instead.")
class VariantContextSetBuilder(sampleNames: Seq[String] = List("Sample")) extends Iterable[VariantContext] {

  if (sampleNames.isEmpty) throw new IllegalArgumentException("At least one sample name must be given")

  private var _header = {
    val h = new VCFHeader(Collections.emptySet[VCFHeaderLine](), sampleNames.toList.asJava)
    h.setSequenceDictionary(new SamBuilder().header.getSequenceDictionary)
    h
  }

  private val variants = new ListBuffer[VariantContext]()

  def header: VCFHeader = this._header

  lazy val dict: SequenceDictionary = {
    import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary
    this.header.getSequenceDictionary.fromSam
  }

  /** Sets the VCF header for this builder.  This should be set before adding variants. */
  def setHeader(header: VCFHeader): this.type = yieldingThis(this._header = header)

  /** Sets the sequence dictionary for this builder.  This should be set before adding variants. */
  def setSequenceDictionary(dict: SequenceDictionary): this.type = {
    import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary
    yieldingThis(this._header.setSequenceDictionary(dict.asSam))
  }

  /** Adds the header line to the header for this builder.  This should be set before adding variants. */
  def addMetaDataLine(headerLine: VCFHeaderLine): this.type = yieldingThis(this._header.addMetaDataLine(headerLine))

  /** Adds a variant to the builder.
    *
    * If a variant context already exists with the same contig, start, stop, and variant alleles, then we add this
    * genotype to that variant, otherwise we add a new variant and genotype.
    *
    * The variant alleles must always be given, as we use them to determine if this variant already exists when adding
    * a genotype for a new sample.
    *
    * If no genotype alleles are given, the genotype is a no call.
    *
    * If the sample name is not given, the first sample name in the header (in order) is used.  There must not already
    * be a genotype for this variant with the given sample name, and the sample name must be present in the header.
    *
    * Genotype attributes may be given with the `genotypeAttributes` parameter.  The value for the `GQ` and `DP`
    * attributes must have type [[Int]].  The value for the `AD` and `PL` attributes must have either type
    * [[IterableOnce]] or [[Array]], with each item having type [[Int]].  For example:
    * {{{
    *   val builder = new VariantContextSetBuilder()
    *   builder.addVariant(
    *     start=1,
    *     variantAlleles=Seq("A", "C"),
    *     genotypeAlleles=Seq("A", "C"),
    *     genotypeAttributes=Map("DP" -> 10, "AD" -> Array(6, 4))
    *   )
    * }}}
    *
    * */
  def addVariant(refIdx: Int = 0,
                 start: Long,
                 variantAlleles: List[String],
                 variantAttributes: Map[String,Any] = Map.empty,
                 genotypeAlleles: List[String] = List.empty,
                 genotypeAttributes: Map[String,Any] = Map.empty,
                 sampleName: Option[String] = None,
                 phased: Boolean = false): this.type = {
    if (!sampleName.forall { sn => this.header.getGenotypeSamples.contains(sn)}) {
      throw new IllegalArgumentException(s"Sample with name $sampleName not found in the VCF header.")
    }
    if (variantAlleles.isEmpty) {
      throw new IllegalArgumentException("No alleles given")
    }
    if (!genotypeAlleles.forall(a => a == Allele.NO_CALL_STRING || variantAlleles.contains(a))) {
      throw new IllegalArgumentException("A genotype allele not found in variant alleles")
    }
    val contig  = this.dict(refIdx).name
    val alleles = toAlleles(variantAlleles)
    val stop    = VariantContextUtils.computeEndFromAlleles(alleles.asJava, start.toInt, -1)
    // check to see if there are already genotypes for this variant
    val (ctxBuilder, prevGenotypes) = this.variants.find { ctx =>
      ctx.getContig == contig &&
        ctx.getStart == start &&
        ctx.getEnd == stop &&
        ctx.getAlleles.asScala.map(_.getBaseString).sorted.mkString == variantAlleles.sorted.mkString // only an approximation
    } match {
      case Some(ctx) =>
        // remove this variant context, and create a builder based off of it, and get the current genotypes.
        this.variants -= ctx
        (new VariantContextBuilder(ctx), ctx.getGenotypesOrderedByName.toList)
      case None      =>
        // create a new variant context builder and an empty set of genotypes
        (new VariantContextBuilder("source", contig, start, stop, alleles.asJava), List.empty[Genotype])
    }
    // get the reference allele
    val referenceAllele = ctxBuilder.getAlleles.find(_.isReference)
    // create a genotype builder.
    val name = sampleName.getOrElse(header.getSampleNamesInOrder.iterator().next())
    val genotypeBuilder = genotypeAlleles match {
      case Nil      => new GenotypeBuilder(name, Collections.singletonList(Allele.create(Allele.NO_CALL_STRING)))
      case gAlleles => new GenotypeBuilder(name, toAlleles(gAlleles, referenceAllele=referenceAllele).asJava)
    }
    genotypeBuilder.phased(phased)
    // Set the genotype attributes, with GQ/DP/AD/PL being special-cased
    // NB: no check is performed that attributes that are per-allele match the given # of alleles.
    genotypeAttributes.foreach {
      case ("GQ", v) => genotypeBuilder.GQ(anyToType[Integer]("GQ", v, "Int"))
      case ("DP", v) => genotypeBuilder.DP(anyToType[Integer]("DP", v, "Int"))
      case ("AD", v) => genotypeBuilder.AD(anyToArray[Integer]("AD", v, "Int").map(_.toInt))
      case ("PL", v) => genotypeBuilder.PL(anyToArray[Integer]("PL", v, "Int").map(_.toInt)) // NB: we don't support Doubles
      case (k,v)     => genotypeBuilder.attribute(k, v)
    }
    val genotype = genotypeBuilder.make()
    // check the sample doesn't already exists.
    prevGenotypes.find { g => g.getSampleName == genotype.getSampleName } match {
      case Some(g) => throw new IllegalArgumentException(s"Genotype already exists for the sample ${g.getSampleName}: ${g.toBriefString}")
      case None    =>
    }
    val genotypes = genotype +: prevGenotypes
    ctxBuilder.genotypes(genotypes:_*)
    ctxBuilder.attributes(variantAttributes.asJava)
    yieldingThis(this.variants.append(ctxBuilder.make()))
  }

  /** Returns `v` as type `T`.  Throws an exception if `v` is not assignable to type `T`.*/
  private def anyToType[T : ClassTag](k: String, v: Any, typeName: String): T = {
    val clazz = implicitly[ClassTag[T]].runtimeClass
    v match {
      case value if clazz.isInstance(v) => value.asInstanceOf[T]
      case _ => throw new IllegalArgumentException(s"$k attribute value must be an $typeName, found ${v.getClass.getSimpleName}")
    }
  }

  /** Returns `v` as type `Array[T]`.  Throws an exception if `v` is not assignable to `TraversableOnce[_]` or `Array[_]`,
    * or if any element in `v` is not of type `T` .*/
  private def anyToArray[T : ClassTag](k: String, v: Any, typeName: String): Array[T] = {
    val values = v match {
      case _v: IterableOnce[_] => _v.iterator.toArray[Any]
      case _v: Array[_]           => _v
      case _                      =>
        throw new IllegalArgumentException(s"$k attribute value must be an IterableOnce[$typeName], found ${v.getClass.getSimpleName}")
    }
    values.map(anyToType[T](k, _, typeName))
  }

  /** Gets an iterator over the variants. */
  override def iterator: Iterator[VariantContext] = this.variants.iterator

  /** Returns the number of variants added to this builder. */
  override def size: Int = this.variants.size

  /** Writes the contents of the record set to the provided file path. */
  def write(path: PathToVcf) : Unit = {
    val writer = new VariantContextWriterBuilder()
      .setReferenceDictionary(this.header.getSequenceDictionary)
      .setOutputFile(path.toFile)
      .setOption(Options.INDEX_ON_THE_FLY)
      .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
      .build()
    writer.writeHeader(_header)
    this.iterator.foreach(writer.add)
    writer.close()
  }

  /** Writes the contents to a temporary file that will be deleted when the JVM exits. */
  def toTempFile(deleteOnExit: Boolean = true): PathToVcf = {
    val path = Files.createTempFile("VariantContextSet.", ".vcf.gz")
    if (deleteOnExit) path.toFile.deleteOnExit()
    write(path)
    path
  }

  /** Writes the contents to a temporary file that will be deleted when the JVM exits and
    * provides a [VCFFileReader]. */
  def toVcfFileReader(deleteOnExit: Boolean = true): VCFFileReader = {
    val path = toTempFile(deleteOnExit=deleteOnExit)
    new VCFFileReader(path.toFile, true)
  }

  /** Given a list of alleles as strings, typically for a genotype, gives a list of alleles of type [Allele].  If
    * a reference allele is given, then we will use that to set if an input allele is reference, otherwise we assume
    * it's the first allele in the input list.
    */
  private def toAlleles(alleles: List[String], referenceAllele: Option[Allele] = None): List[Allele] = {
    referenceAllele match {
      case Some(r) =>
        alleles.map { allele => Allele.create(allele, r.getBaseString == allele) }
      case None    =>
        alleles.zipWithIndex.map { case (allele: String, idx: Int) => Allele.create(allele, idx == 0) }
    }
  }

  private def yieldingThis(f: => Unit): this.type = { f; this}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy