All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.vcf.FixVcfPhaseSet.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2020 Fulcrum Genomics
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.vcf

import com.fulcrumgenomics.FgBioDef.{FgBioEnum, PathToVcf, SafelyClosable}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.{LazyLogging, SimpleCounter}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, ProgressLogger}
import com.fulcrumgenomics.vcf.api._
import enumeratum.EnumEntry

import scala.collection.{immutable, mutable}


@clp(group=ClpGroups.VcfOrBcf, description=
  """
    |Adds/fixes the phase set (PS) genotype field.
    |
    |The VCF specification allows phased genotypes to be annotated with the `PS` (phase set) `FORMAT` field.  The value
    |should be a non-negative integer, corresponding to the position of the first variant in the phase set.  Some tools
    |will output a non-integer value, as well as describe this field as having non-Integer type in the VCF header.  This
    |tool will update the phase set (`PS`) `FORMAT` field to be VCF spec-compliant.
    |
    |The handling of unphased genotypes with phase-sets is controlled by the `-x` option:
    |- If `-x` is used the genotype will be converted to a phased genotype (e.g. `0/1` => `0|1`)
    |- Otherwise the phase-set (`PS`) will be removed from the genotype
    |
    |The `--keep-original` option may be used to store the original `PS` value in a new `OPS` field.  The type
    |described in the header will match the original.
    |
    |This tool cannot fix phased variants without a phase set, or phased variant sets who have different phase set
    |values.
    |
    |In some cases, VCFs (e.g. from GIAB/NIST or Platinum Genomes) have illegal header lines, for example, a `PEDIGREE`
    |header line without a `ID` key-value field.  The `-z` option can be used to remove those lines.  This option is
    |included in this tool for convenience as those example VCFs in some cases have these illegal header lines, and it
    |is convenient to fix the phase set in addition to removing those illegal header lines.
  """)
class FixVcfPhaseSet
( @arg(flag='i', doc="Input VCF.") val input: PathToVcf,
  @arg(flag='o', doc="Output VCF.") val output: PathToVcf,
  @arg(flag='k', doc="Store the original phase set in the `OPS` field.") val keepOriginal: Boolean = false,
  @arg(flag='x', doc="Set unphased genotypes with a PS FORMAT value to be phased.") val phaseGenotypesWithPhaseSet: Boolean = false,
  @arg(flag='z', doc="Remove header lines that do not contain an ID key-value, which is required in VCF.") val removeNoIdHeaderLines: Boolean = false
) extends FgBioTool with LazyLogging {

  import FixVcfPhaseSet._

  Io.assertReadable(input)
  Io.assertCanWriteFile(output)

  override def execute(): Unit = {
    // Get the original VCF header
    val inHeader = {
      val source = VcfSource(input)
      val header = source.header
      source.safelyClose()
      header
    }
    // Developer note: we modify the input header so we are able to read in the PS format field as a string.
    val reader   = VcfSource(input, headerTransformer=headerForReader)
    val writer   = VcfWriter(output, header=headerForWriter(header=inHeader))
    val progress = ProgressLogger(logger=logger, noun="variants", verb="read", unit=100000)
    val updater  = new VcfPhaseSetUpdater(header=reader.header, keepOriginal=keepOriginal, phaseGenotypesWithPhaseSet=phaseGenotypesWithPhaseSet)

    reader.foreach { variant =>
      writer.write(variant=updater.update(variant=variant))
      progress.record(variant)
    }
    progress.logLast()

    reader.safelyClose()
    writer.close()

    logger.info(f"Examined ${updater.descriptionCounter.total}%,d genotypes.")
    updater.descriptionCounter.foreach { case (description, count) =>
      logger.info(f"Wrote $count%,d genotypes that $description")
    }
  }

  /** Returns a modified [[VcfHeader]] to use when reading the input.  The input VCF header will be modified such
    * that the PS FORMAT field is read as a single fixed value of type [[String]].  This is important, for example, when
    * the header has type [[VcfFieldType.Integer]] but the records have type [[VcfFieldType.String]].
    * */
  def headerForReader(header: VcfHeader): VcfHeader = {
    val formats = header.formats.filterNot(_.id == "PS")
    val ps      = header.formats
      .find(value => value.id == "PS" && value.count == VcfCount.Fixed(1) && value.kind == VcfFieldType.String)
      .getOrElse {
        VcfFormatHeader(
          id          = "PS",
          count       = VcfCount.Fixed(1),
          kind        = VcfFieldType.String,
          description = "Phasing set (typically the position of the first variant in the set)"
        )
      }
    header.copy(formats=formats :+ ps)
  }

  /** Builds the header for the [[VcfWriter]] from the input reader [[VcfHeader]].  If `keepOriginal` is `true`, then
    * the `OPS` FORMAT header line is added to store the original phase set, which will be a single fixed count of type
    * `String`.  The output phase set will always be a single fixed count of type `Integer`. */
  def headerForWriter(header: VcfHeader):  VcfHeader = {
    // Add the original phase set header FORMAT line if we want to keep the original phase set value
    val original: Option[VcfFormatHeader] = if (!keepOriginal) None else {
      Some(VcfFormatHeader(
        id          = "OPS",
        count       = VcfCount.Fixed(1),
        kind        = VcfFieldType.String,
        description = "Original phasing set prior to being fixed with fgbio FixVcfPhaseSet"
      ))
    }

    // Build the new phase set header FORMAT line.  If the header already contains one that is valid, just keep it.
    val newPhaseSet =  header
      .formats
      .find(value => value.id == "PS" && value.count == VcfCount.Fixed(1) && value.kind == VcfFieldType.Integer)
      .getOrElse {
          VcfFormatHeader(
          id          = "PS",
          count       = VcfCount.Fixed(1),
          kind        = VcfFieldType.Integer,
          description = "Phasing set (typically the position of the first variant in the set)"
        )
      }

    // In some cases, input VCFs may have header lines (ex. PEDIGREE) with no ID, which is required
    val others = if (removeNoIdHeaderLines) header.others.filterNot(_.id == null) else header.others

    // Put it all together
    header.copy(
      formats = header.formats.filter(_.id != "PS") ++ original.toSeq :+ newPhaseSet,
      others  = others
    )
  }
}

object FixVcfPhaseSet {
  private[vcf] object VcfPhaseSetUpdater {
    /** Base traits for the result of  [[VcfPhaseSetUpdater.updateGenotype()]] */
    sealed trait Result extends EnumEntry {
      def genotype: Genotype
      def description: String
    }
    object Result extends FgBioEnum[Result] {
      case class Valid                      (genotype: Genotype) extends Result { val description: String = "were valid" }
      case class UpdatedPhaseSetValue       (genotype: Genotype) extends Result { val description: String = "had phase set value updated only" }
      case class UpdatedPhaseAndValue       (genotype: Genotype) extends Result { val description: String = "were set to phased and had phase set value updated" }
      case class UpdatedPhaseOnly           (genotype: Genotype) extends Result { val description: String = "were set to phased" }
      case class PhasedMissingPhaseSetValue (genotype: Genotype) extends Result { val description: String = "were phased but missing a phase set value" }
      case class NotPhasedWithPhaseSetValue (genotype: Genotype) extends Result { val description: String = "were not phased but had a phase set value" }
      case class NotPhasedNoPhaseSetValue   (genotype: Genotype) extends Result { val description: String = "were not phased and no phase set value" }
      override def values: immutable.IndexedSeq[Result] = findValues
    }
  }

  /** Updates the phase set for a given variant's genotype(s).
    *
    * The phase set values must have type [[String]].
    *
    * @param header the VCF header
    * @param keepOriginal true to store the original phase set value in the OPS format field, false otherwise
    * @param phaseGenotypesWithPhaseSet set unphased genotypes with a PS FORMAT value to be phased
    */
  private[vcf] class VcfPhaseSetUpdater(header: VcfHeader, keepOriginal: Boolean, phaseGenotypesWithPhaseSet: Boolean) extends LazyLogging {
    import VcfPhaseSetUpdater._
    import VcfPhaseSetUpdater.Result._

    private val phaseSetToPositionBySample: Map[String, mutable.HashMap[String, Int]] = {
      header.samples.map { sample => (sample, scala.collection.mutable.HashMap[String, Int]()) }.toMap
    }
    private var lastChrom: String = ""
    private var lastPos: Int      = 0
    private var variants: Long    = 0
    private var genotypes: Long   = 0
    val descriptionCounter: SimpleCounter[String] = new SimpleCounter()

    /** Updates the phase set of the variant's genotype(s) */
    def update(variant: Variant): Variant = {
      // Phase sets cannot span contigs!  So empty our mappings if we get to a new one
      if (variant.chrom != lastChrom) {
        this.phaseSetToPositionBySample.values.foreach(_.clear())
        this.lastChrom = variant.chrom
        this.lastPos   = variant.pos
      }
      assert(this.lastPos <= variant.pos, "Variants are out of order!")

      variants += 1
      val genotypes: Map[String, Genotype] = variant.genotypes
        .map { case (sampleName: String, genotype: Genotype) =>
          // update the genotype
          val result = updateGenotype(variant=variant, genotype=genotype)
          // log the results
          result match {
            case NotPhasedWithPhaseSetValue(_)   =>
              logger.debug(
                "Genotype had a phase set but was unphased:" +
                  f"${variant.chrom}:${variant.pos}:${variant.id.getOrElse(".")} sample=$sampleName PS=${genotype("PS")}" +
                  "; consider r-running using `-x/--phase-genotypes-with-phase-set`"
              )
            case PhasedMissingPhaseSetValue(_) =>
              logger.debug(
                f"Genotype had no phase set but was phased: ${variant.chrom}:${variant.pos}:${variant.id.getOrElse(".")}"
              )
            case _                  => () // do nothing
          }
          this.descriptionCounter.count(result.description)
          // return the (potentially) new/updated genotype
          (sampleName, result.genotype)
        }
      variant.copy(genotypes=genotypes)
    }

    /** Updates the phase set of a given genotype.  Returns the [[Result]] of updating the genotyping, describing
      * what updating was done, if any. */
    private[vcf] def updateGenotype(variant: Variant, genotype: Genotype): Result = {
      this.genotypes += 1

      (genotype.phased, genotype.get[String]("PS")) match {
        case (true, None)  => PhasedMissingPhaseSetValue(genotype) // phased but no phase set, what can we do?
        case (false, None) => NotPhasedNoPhaseSetValue(genotype)   // not phased and no phase set, everything is fine
        case (false, Some(_)) if !phaseGenotypesWithPhaseSet =>    // unphased and a phase set, but we do not want to convert it to phased
          NotPhasedWithPhaseSetValue(genotype.copy(attrs=genotype.attrs.filterNot(_._1 == "PS"))) // remove the phase set for good measure
        case (isPhased, Some(oldValue)) => // may or not be phased, but has a phase set
          // Get the new phase set
          val phaseSetToValue = this.phaseSetToPositionBySample(genotype.sample)
          val newValue        = phaseSetToValue.getOrElseUpdate(oldValue, variant.pos)
          // Build the new set of attributes
          val phaseSetAttrs = {
            if (keepOriginal) Map("PS" -> newValue, "OPS" -> oldValue)
            else Map("PS" -> newValue)
          }
          // update the genotype
          val newGenotype = genotype.copy(attrs = genotype.attrs.filterNot(_._1 == "PS") ++ phaseSetAttrs, phased=true)
          // return based on if we phased the variant and if we added/updated the phase set value
          (isPhased, oldValue == newValue.toString) match {
            case (true, true)   => Valid(newGenotype)
            case (true, false)  => UpdatedPhaseSetValue(newGenotype)
            case (false, true)  => UpdatedPhaseOnly(newGenotype)
            case (false, false) => UpdatedPhaseAndValue(newGenotype)
          }
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy