All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.vcf.HapCutToVcf.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2016 Fulcrum Genomics
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.vcf

import java.io.{Closeable, InputStream}
import java.util
import java.util.NoSuchElementException

import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.util.Io
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.commons.io.PathUtil
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt._
import htsjdk.variant.variantcontext._
import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriter, VariantContextWriterBuilder}
import htsjdk.variant.vcf._

import scala.jdk.CollectionConverters._
import scala.io.Source

@clp(
  description =
    """
      |Converts the output of `HAPCUT` (`HapCut1`/`HapCut2`) to a VCF.
      |
      |The output of `HAPCUT` does not include all input variants, but simply those variants that are in phased blocks.
      |This tool takes the original VCF and the output of `HAPCUT`, and produces a VCF containing both the variants that
      |were phased and the variants that were not phased, such that all variants in the original file are in the output.
      |
      |The original VCF provided to `HAPCUT` is assumed to contain a single sample, as `HAPCUT` only supports a single
      |sample.
      |
      |By default, all phased genotypes are annotated with the `PS` (phase set) `FORMAT` tag, which by convention is the
      |position of the first variant in the phase set (see the VCF specification).  Furthermore, this tool formats the
      |alleles of a phased genotype using the `|` separator instead of the `/` separator, where the latter indicates the
      |genotype is unphased.  If the option to output phased variants in GATK's `ReadBackedPhasing` format is used, then
      |the first variant in a phase set will have `/` instead of `|` separating its alleles.  Also, all phased variants
      |will have `PASS` set in its `FILTER` column, while unphased variants will have `NotPhased` set in their `FILTER`
      |column.  Unlike GATK's `ReadBackedPhasing`, homozygous variants will always be unphased.
      |
      |More information about the purpose and operation of GATK's Read-backed phasing, including its output format, can
      |be found in the [GATK Forum](http://gatkforums.broadinstitute.org/gatk/discussion/45/purpose-and-operation-of-read-backed-phasing)
      |
      |Additional `FORMAT` fields for phased variants are provided corresponding to per-genotype information produced by
      |`HapCut1`:
      |
      |  1. The `RC` tag gives the counts of calls supporting `allele0` and `allele1` respectively.
      |  2. The `LC` tag gives the change in likelihood if this SNP is made homozygous or removed.
      |  3. The `MCL` tag gives the maximum change in likelihood if this SNP is made homozygous or removed.
      |  4. The `RMEC` tag gives the reduction in `MEC` score if we remove this variant altogether.
      |
      |Additional `FORMAT` fields for phased variants are provided corresponding to per-genotype information produced by
      |`HapCut2`:
      |
      |  1. The `PR` tag is `1` if `HapCut2` pruned this variant, `0` otherwise.
      |  2. The `SE` tag gives the confidence (`log10`) that there is not a switch error occurring immediately before
      |     the SNV (closer to zero means that a switch error is more likely).
      |  3. The `NE` tag gives the confidence (`log10`) that the SNV is not a mismatch (single SNV) error (closer to
      |     zero means a mismatch is more likely).
      |
      |HapCut2 should not be run with `--call-homozygous` as the genotypes may be different than the input and so is not
      |currently supported.
      |
      |For more information about `HapCut1`, see the source code or paper below.
      |  source code: https://github.com/vibansal/hapcut
      |  HapCut1 paper: An efficient and accurate algorithm for the haplotype assembly problem Bioinformatics. 2008 Aug
      |    15;24(16):i153-9.
      |
      |For more information about `HapCut2`, see the [source code](https://github.com/vibansal/HapCUT2) below.
    """,
  group=ClpGroups.VcfOrBcf
)
class HapCutToVcf
( @arg(flag='v', doc="The original VCF provided to `HapCut1`/`HapCut2`.") val vcf: PathToVcf,
  @arg(flag='i', doc="The file produced by `HapCut1`/`HapCut2`.") val input: FilePath,
  @arg(flag='o', doc="The output VCF with both phased and unphased variants.") val output: PathToVcf,
  @arg(flag='r', doc="Output phased variants in GATK's `ReadBackedPhasing` format.") val gatkPhasingFormat: Boolean = false,
  @arg(          doc="Fix IUPAC codes in the original VCF to be VCF 4.3 spec-compliant (ex 'R' -> 'A').  Does not support BCF inputs.")
  val fixAmbiguousReferenceAlleles: Boolean = false
) extends FgBioTool with LazyLogging {
  import HapCutType._
  import HapCutToVcf.fixIupacBases

  Io.assertReadable(vcf)
  Io.assertReadable(input)
  Io.assertCanWriteFile(output)

  override def execute(): Unit = {
    // If fixing ambiguous (IUPAC) bases is desired, creates a new path to a VCF, otherwise returns the original input VCF.
    val inputVcf: FilePath = if (!fixAmbiguousReferenceAlleles) this.vcf else {
      val inputLines = PathUtil.extensionOf(this.vcf) match {
        case Some(".vcf") | Some(".vcf.gz") => Io.readLines(this.vcf)
        case Some(".bcf") => throw new IllegalArgumentException(s"BCF not supported when fixing ambiguity codes in: ${this.vcf}")
        case _            => throw new IllegalArgumentException(s"Could not determine file type from ${this.vcf}")
      }
      val outputLines = inputLines.map { line =>
        if (line.startsWith("#")) line else {
          val items    = line.split('\t')
          val refBases = fixIupacBases(items(3)) // no symbolic alleles allowed; the spec says no IUPAC bases, but here we are
          val altBases = items(4).split(',').map { alt =>
            if (alt.startsWith("<") && alt.endsWith(">")) alt else fixIupacBases(alt)
          }.mkString(",")
          (items.take(3) ++ Seq(refBases, altBases) ++ items.drop(5)).mkString("\t")
        }
      }
      val newVcf = Io.makeTempFile(PathUtil.basename(this.vcf.getFileName), ".vcf", dir=Some(Io.tmpDir))
      Io.writeLines(newVcf, outputLines)
      newVcf
    }

    val vcfReader = new VCFFileReader(inputVcf.toFile, false)
    val iterator  = new HapCutAndVcfMergingIterator(input, vcfReader, gatkPhasingFormat)
    val vcfWriter = makeWriter(output, vcfReader, iterator.hapCutType)

    iterator.foreach(vcfWriter.add)

    vcfReader.safelyClose()
    iterator.safelyClose()
    vcfWriter.close()
  }

  /** Creates a VCF writer, adding extra header lines if the output is the phased VCF. */
  private def makeWriter(path: PathToVcf, vcfReader: VCFFileReader, hapCutType: HapCutType): VariantContextWriter = {
    val inputHeader = vcfReader.getFileHeader
    val createIndex = PathUtil.extensionOf(path) match {
      case Some(".vcf")                   => false
      case Some(".vcf.gz") | Some(".bcf") => true
      case _                              => throw new IllegalArgumentException(s"Could not determine file type from $path")
    }
    val builder = new VariantContextWriterBuilder()
      .setOutputFile(path.toFile)
      .setReferenceDictionary(inputHeader.getSequenceDictionary)
      .setOption(Options.WRITE_FULL_FORMAT_FIELD)
      .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
    val writer: VariantContextWriter = if (createIndex) {
      builder.setOption(Options.INDEX_ON_THE_FLY).build
    }
    else {
      builder.unsetOption(Options.INDEX_ON_THE_FLY).build
    }

    // get the header lines in the input header that we wish to skip/replace with our own definitions
    val headerLinesToSkip = HeaderLines.formatHeaderKeys(hapCutType).flatMap(key => Option(inputHeader.getFormatHeaderLine(key)))
    val headerLines: util.Set[VCFHeaderLine] = new util.HashSet[VCFHeaderLine](
      inputHeader.getMetaDataInSortedOrder.filterNot(headerLinesToSkip.contains).toJavaSet
    )

    // add standard header lines
    VCFStandardHeaderLines.addStandardFormatLines(headerLines, false, Genotype.PRIMARY_KEYS)

    // add the new format header lines
    headerLines.addAll(HeaderLines.formatHeaderLines(hapCutType).asJava)

    // add the new filter header line if we are to set a filter (not phased) on the unphased variants
    if (gatkPhasingFormat) headerLines.add(HapCut1VcfHeaderLines.NotPhasedFilterHeaderLine)

    // create it
    val outHeader = new VCFHeader(headerLines, inputHeader.getSampleNamesInOrder) // create the header

    // write it and return
    writer.writeHeader(outHeader)
    writer
  }
}

object HeaderLines {
  import HapCutType._
  def formatHeaderKeys(hapCutType: HapCutType): Seq[String] = {
    hapCutType match {
      case HapCut1 => HapCut1VcfHeaderLines.formatHeaderKeys
      case HapCut2 => HapCut2VcfHeaderLines.formatHeaderKeys
      case _       => Seq.empty // empty file
    }
  }

  def formatHeaderLines(hapCutType: HapCutType): Seq[VCFHeaderLine] = {
    hapCutType match {
      case HapCut1 => HapCut1VcfHeaderLines.formatHeaderLines
      case HapCut2 => HapCut2VcfHeaderLines.formatHeaderLines
      case _       => Seq.empty // empty file
    }
  }
}

object HapCutToVcf {
  private val IupacToBase: Map[Char, Char] = Map(
    'M' -> 'A',
    'R' -> 'A',
    'W' -> 'A',
    'S' -> 'C',
    'Y' -> 'C',
    'K' -> 'G',
    'V' -> 'A',
    'H' -> 'A',
    'D' -> 'A',
    'B' -> 'C',
    'N' -> 'A'
  )

  def fixIupacBases(bases: String): String = bases.map { base =>
    val baseUpper = base.toUpper
    IupacToBase.getOrElse(baseUpper, baseUpper)
  }.mkString("")
}

trait HeaderLines {
  val PhaseSetFormatTag = "PS"
  val PhaseSetFormatDescription = "Phase set for the genotype (position of the first variant)"
  val PhaseSetFormatHeaderLine = new VCFFormatHeaderLine(PhaseSetFormatTag, 1, VCFHeaderLineType.Integer, PhaseSetFormatDescription)

  val NotPhasedFilterName        = "NotPhased"
  val NotPhasedFilterDescription = "The variant was not phased by HapCut."
  val NotPhasedFilterHeaderLine  = new VCFFilterHeaderLine(NotPhasedFilterName, NotPhasedFilterDescription)
}

object HapCut1VcfHeaderLines extends HeaderLines {
  val ReadCountFormatTag         = "RC"
  val ReadCountFormatDescription = "Counts of calls supporting allele0 and allele1 respectively"
  val ReadCountFormatHeaderLine  = new VCFFormatHeaderLine(ReadCountFormatTag, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, ReadCountFormatDescription)

  val LikelihoodChangeFormatTag         = "LC"
  val LikelihoodChangeFormatDescription = "Change in likelihood if this SNP is made homozygous or removed"
  val LikelihoodChangeFormatHeaderLine  = new VCFFormatHeaderLine(LikelihoodChangeFormatTag, 3, VCFHeaderLineType.Float, LikelihoodChangeFormatDescription)

  val MaxLikelihoodChangeFormatTag         = "MLC"
  val MaxLikelihoodChangeFormatDescription = "Maximum change in likelihood if this SNP is made homozygous or removed"
  val MaxLikelihoodChangeFormatHeaderLine  = new VCFFormatHeaderLine(MaxLikelihoodChangeFormatTag, 1, VCFHeaderLineType.Float, MaxLikelihoodChangeFormatDescription)

  val MecReductionFormatTag = "RMEC"
  val MecReductionFormatDescription = "Reduction in MEC score if we remove this variant altogether"
  val MecReductionFormatHeaderLine  = new VCFFormatHeaderLine(MecReductionFormatTag, 1, VCFHeaderLineType.Float, MecReductionFormatDescription)

  /** The VCF header FORMAT keys that will be added for HapCut1 specific-genotype information. */
  val formatHeaderKeys: Seq[String] = {
    Seq(ReadCountFormatTag, LikelihoodChangeFormatTag, MaxLikelihoodChangeFormatTag, MecReductionFormatTag, PhaseSetFormatTag)
  }

  /** The VCF header FORMAT lines that will be added for HapCut1 specific-genotype information. */
  val formatHeaderLines: Seq[VCFHeaderLine] = {
    Seq(ReadCountFormatHeaderLine, LikelihoodChangeFormatHeaderLine, MaxLikelihoodChangeFormatHeaderLine, MecReductionFormatHeaderLine, PhaseSetFormatHeaderLine)
  }
}

object HapCut2VcfHeaderLines extends HeaderLines {
  val PrunedFormatTag         = "PR"
  val PrunedFormatDescription = "1 if HapCut2 pruned this variant when using --discrete_pruning, 0 otherwise"
  val PrunedFormatHeaderLine  = new VCFFormatHeaderLine(PrunedFormatTag, 1, VCFHeaderLineType.Integer, PrunedFormatDescription)

  val SwitchErrorFormatTag         = "SE"
  val SwitchErrorFormatDescription = "The confidence (phred-scaled) that there is not a switch error occurring immediately before the SNV"
  val SwitchErrorFormatHeaderLine  = new VCFFormatHeaderLine(SwitchErrorFormatTag, 1, VCFHeaderLineType.Float, SwitchErrorFormatDescription)

  val NoErrorFormatTag         = "NE"
  val NoErrorFormatDescription = "The confidence (phred-scaled) that the SNV is not a mismatch (single SNV) error."
  val NoErrorFormatHeaderLine  = new VCFFormatHeaderLine(NoErrorFormatTag, 1, VCFHeaderLineType.Float, NoErrorFormatDescription)

  /** The VCF header FORMAT keys that will be added for HapCut2 specific-genotype information. */
  val formatHeaderKeys: Seq[String] = {
    Seq(PrunedFormatTag, SwitchErrorFormatTag, NoErrorFormatTag, PhaseSetFormatTag)
  }

  /** The VCF header FORMAT lines that will be added for HapCut2 specific-genotype information. */
  val formatHeaderLines: Seq[VCFHeaderLine] = {
    Seq(PrunedFormatHeaderLine, SwitchErrorFormatHeaderLine, NoErrorFormatHeaderLine, PhaseSetFormatHeaderLine)
  }
}


/** An iterator over all variants in the provided VCFFileReader.
  *
  * @param hapCutPath the output file with phased variants produced by HapCut.
  * @param vcfReader the VCF file with the original variants provided to HapCut.
  * @param gatkPhasingFormat true to output in GATK's ReadBackedPhasing format, false if to use the recommendations in the VCF spec.
  */
private class HapCutAndVcfMergingIterator(hapCutPath: FilePath,
                                          vcfReader: VCFFileReader,
                                          gatkPhasingFormat: Boolean)
  extends Iterator[VariantContext] with Closeable {
  import HapCutType.HapCutType

  private val sourceIterator = vcfReader.iterator.zipWithIndex.buffered
  private val hapCutReader   = HapCutReader(path=hapCutPath)
  private val sampleName     = vcfReader.getFileHeader.getSampleNamesInOrder.iterator().next()

  def hasNext: Boolean = {
    if (sourceIterator.isEmpty && hapCutReader.hasNext) throw new IllegalStateException("HapCut has more phased variants but no more variants in the input")
    sourceIterator.hasNext
  }

  /** Returns either the phased variant with added HapCut-specific genotype information (Left), or the original variant (Right). */
  def next(): VariantContext = {
    if (!hasNext) throw new NoSuchElementException("Calling next() when hasNext() is false")
    val (sourceContext, sourceOffset) = sourceIterator.next()
    if (!hapCutReader.hasNext) formatSourceContext(sourceContext)
    else {
      val HapCutOffsetAndCall(offset, callOption) = hapCutReader.next()
      if (offset != sourceOffset+1) throw new IllegalStateException("BUG: calls are out of order")
      callOption match {
        case None =>
          formatSourceContext(sourceContext)
        case Some(hapCutCall) =>
          require(hapCutCall.pos == sourceContext.getStart, s"${hapCutCall.pos} ${sourceContext.getStart}")
          require(offset == hapCutCall.offset)
          val hapCutContext = hapCutCall.toVariantContext(sampleName)
          replaceGenotypes(source=sourceContext, genotype=hapCutContext.getGenotype(0), isPhased = !gatkPhasingFormat || !hapCutCall.firstInBlock)
      }
    }
  }

  /** Returns the the HapCut type (HapCut1 or HapCut2), or Unknown if the file was empty. */
  def hapCutType: HapCutType = hapCutReader.hapCutType

  def close(): Unit = { this.hapCutReader.close() }

  /** Returns a new variant context with the phase unset (if it was set) and appropriately formatted (filtered as
    * not phased if `gatkPhasingFormat` is true).
    */
  private def formatSourceContext(sourceContext: VariantContext): VariantContext = {
    val hasPhasedGenotype = sourceContext.getGenotypes.exists(_.isPhased)
    val hasPhasingSetId   = sourceContext.getGenotypes.exists(_.hasExtendedAttribute(HapCut1VcfHeaderLines.PhaseSetFormatTag))

    if (!hasPhasedGenotype && !hasPhasingSetId && !gatkPhasingFormat) sourceContext
    else {
      val builder = new VariantContextBuilder(sourceContext)
      if (hasPhasedGenotype || hasPhasingSetId) {
        // unset the phase and remove the phasing set ID if the input has phase set
        builder.genotypes(sourceContext.getGenotypes().map { g =>
          val builder = new GenotypeBuilder(g).phased(false)
          val attrs = g.getExtendedAttributes.asScala.filterNot { case (tag, value) =>  tag == HapCut1VcfHeaderLines.PhaseSetFormatTag }
          builder.noAttributes()
          builder.attributes(attrs.asJava)
          builder.make()
        }.toJavaList)
      }
      if (gatkPhasingFormat) {
        // set the variant as filtered due to not being phased
        builder.filter(HapCut1VcfHeaderLines.NotPhasedFilterName)
      }
      builder.make()
    }
  }

  /** Replaces the original genotype with HapCut's genotype and adds any HapCut-specific genotype information. */
  private def replaceGenotypes(source: VariantContext, genotype: Genotype, isPhased: Boolean): VariantContext = {
    val builder = new VariantContextBuilder(source)
    val sourceAlleles = source.getAlleles.toSeq
    val genotypeAlleles = genotype.getAlleles.toList.map {
      allele => sourceAlleles.find(a => a.toString == allele.toString) match {
        case None =>
          throw new IllegalStateException(s"Could not find allele '$allele' in source alleles: " + sourceAlleles.map{_.toString}.mkString(", "))
        case Some(a) => a
      }
    }
    val sourceGenotype = source.getGenotype(0)
    val genotypeBuilder = new GenotypeBuilder(sourceGenotype).alleles(genotypeAlleles.asJava).phased(isPhased)
    genotypeBuilder.attributes(genotype.getExtendedAttributes)
    builder.genotypes(genotypeBuilder.make()).make()
  }
}

sealed trait BlockInfo {
  /** The 1-based variant offset from the first variant given to HapCut. */
  def offset: Int
  /** The total number of variants that this block spans, including unphased variants. */
  def len: Int
  /** The total number of variants phased. */
  def phased: Int
  /** The number of bases this block spans. */
  def span: Int
  /** The number of fragments contributing to variants in this block. */
  def fragments: Int
  /** The 1-based variant offset (from the first variant given to HapCut) of the last genotype in this block. */
  def offsetOfLastVariant: Int = this.offset + this.len - 1
  /** The phase set identifier for this block (typically the position of the first variant). Throws an exception if
    * the `phaseSetOption` was None when this was constructed. */
  def phaseSet: Int = phaseSetOption.get
  /** Updates the phase set to which this block belongs. */
  def withPhaseSet(phaseSet: Option[Int]): BlockInfo
  /** Stores the phase set, or None if it has not been set. */
  protected def phaseSetOption: Option[Int]
}

/** Haplotype block information produced by HapCut.
  *
  * @param phaseSetOption the phase set identifer, or None if not yet available.
  * @param offset the 1-based variant offset from the first variant given to HapCut.
  * @param len the total number of variants that this block spans, including unphased variants.
  * @param phased the total number of variants phased.
  * @param span the number of bases this block spans.
  * @param mecScore the minimum error correction (MEC) score.  For HAPCUT2, this will be -1.
  * @param fragments the number of fragments contributing to variants in this block.
  */
private case class HapCut1BlockInfo (protected val phaseSetOption: Option[Int] = None,
                                     offset: Int,
                                     len: Int,
                                     phased: Int,
                                     span: Int,
                                     mecScore: Float,
                                     fragments: Int) extends BlockInfo {
  def withPhaseSet(phaseSet: Option[Int]): HapCut1BlockInfo = this.copy(phaseSetOption=phaseSet)
}

/** Haplotype block information produced by HapCut2.
  *
  * @param phaseSetOption the phase set identifer, or None if not yet available.
  * @param offset the 1-based variant offset from the first variant given to HapCut.
  * @param len the total number of variants that this block spans, including unphased variants.
  * @param phased the total number of variants phased.
  * @param span the number of bases this block spans.
  * @param fragments the number of fragments contributing to variants in this block.
  */
private case class HapCut2BlockInfo (protected val phaseSetOption: Option[Int] = None,
                                     offset: Int,
                                     len: Int,
                                     phased: Int,
                                     span: Int,
                                     fragments: Int) extends BlockInfo {
  def withPhaseSet(phaseSet: Option[Int]): HapCut2BlockInfo = this.copy(phaseSetOption=phaseSet)
}

private object GenotypeInfo {
  import HapCutType._
  def apply(info: String, hapCutType: HapCutType, missingAlleles: Boolean = false, thresholdPruning: Boolean = false): GenotypeInfo = {
    hapCutType match {
      case HapCut1 => HapCut1GenotypeInfo(info=info)
      case HapCut2 => HapCut2GenotypeInfo(info=info, missingAlleles=missingAlleles, thresholdPruning=thresholdPruning)
      case _       => unreachable("Unknown HapCut type when building a GenotypeInfo.")
    }
  }
}

/** Genotype-level information specific to HapCut1 or HapCut2. */
private sealed trait GenotypeInfo {
  def addTo(builder: GenotypeBuilder): GenotypeBuilder
}

/** Genotype-level information produced by HapCut1 */
private case class HapCut1GenotypeInfo private(readCounts: List[Int],
                                               likelihoods: List[Float],
                                               delta: Float,
                                               rMEC: Float) extends GenotypeInfo {
  import HapCut1VcfHeaderLines._
  /** Adds the HapCut-specific genotype information to the given genotype builder. */
  def addTo(builder: GenotypeBuilder): GenotypeBuilder = {
    builder
      .attribute(ReadCountFormatTag, this.readCounts.asJava)
      .attribute(LikelihoodChangeFormatTag, this.likelihoods.asJava)
      .attribute(MaxLikelihoodChangeFormatTag, this.delta)
      .attribute(MecReductionFormatTag, this.rMEC)
  }
}

/** Values and methods for HapCut-specific genotype information. */
private object HapCut1GenotypeInfo {
  // A developer's notes
  //  R0,R1:L00,L01,L11:delta:rMEC
  //  "%d,%d:%0.1f,%0.1f,%0.1f:%0.1f:%0.1f"
  //  R0,R1: counts of calls supporting allele0 and allele1 respectively
  //  L00,L01,L11: change in likelihood if this SNP is made homozygous or removed
  //  delta: maximum change in likelihood if this SNP is made homozygous or removed
  //  rMEC: reduction in MEC score if we remove this variant altogether
  val HapCut1CallInfoPattern = """(\d+),(\d+):(\d+\.\d+),(\d+\.\d+),(\d+\.\d+):(\d+\.\d+):(\d+\.\d+)""".r

  /** Parses the genotype information produced by HapCut. */
  def apply(info: String): GenotypeInfo = {
    val tokens = info.split(":")
    new HapCut1GenotypeInfo(
      readCounts  = tokens(0).split(",").map(_.toInt).toList,
      likelihoods = tokens(1).split(",").map(_.toFloat).toList,
      delta       = tokens(2).toFloat,
      rMEC        = tokens(3).toFloat
    )
  }
}

/** Genotype-level information produced by HapCut2 */
private[vcf] case class HapCut2GenotypeInfo private[vcf](pruned: Option[Boolean], log10SwitchError: Option[Double], log10NoError: Option[Double]) extends GenotypeInfo {
  import HapCut2VcfHeaderLines._
  def addTo(builder: GenotypeBuilder): GenotypeBuilder = {
    builder
      .attribute(PrunedFormatTag, this.pruned.map(p => if (p) 1 else 0).orNull)
      .attribute(SwitchErrorFormatTag, this.log10SwitchError.orNull)
      .attribute(NoErrorFormatTag, this.log10NoError.orNull)
  }
}

/** Values and methods for HapCut2-specific genotype information. */
private[vcf] object HapCut2GenotypeInfo {

  private def toDouble(str: String): Option[Double] = {
    str match {
      case "."    => None
      case "-inf" => Some(Double.MinValue)
      case s      => Some(s.toDouble)
    }
  }

  /** Parses the genotype information produced by HapCut2. */
  def apply(info: String, missingAlleles: Boolean, thresholdPruning: Boolean): GenotypeInfo = {
    val tokens = info.split("\t")
    new HapCut2GenotypeInfo(
      pruned           = if ("." == tokens(0)) None else Some("1" == tokens(0) || missingAlleles || thresholdPruning),
      log10SwitchError = toDouble(tokens(1)),
      log10NoError     = toDouble(tokens(2))
    )
  }
}

/** Information about a variant phased by HapCut.
  *
  * @param block the haplotype block to which this variant is associated.
  * @param offset the 1-based offset of the variant relative to the variants given to HapCut.
  * @param hap1Allele the first allele.
  * @param hap2Allele the second allele.
  * @param contig the contig.
  * @param pos the position.
  * @param ref the reference allele.
  * @param alts the alternate alleles.
  * @param genotype the genotype.
  * @param info the HapCut-specific genotype information.
  * @param phaseSet the phase set identifier for the phase block to which this call belongs
  */
private case class HapCutCall private(block: BlockInfo,
                                      offset: Int,
                                      hap1Allele: Int, hap2Allele: Int,
                                      contig: String, pos: Int,
                                      ref: String, alts: String,
                                      genotype: String,
                                      info: GenotypeInfo,
                                      phaseSet: Int
                                     ) {
  import HapCut1VcfHeaderLines._

  /** true if this variant is the first phased variant in a phased block, false otherwise. */
  def firstInBlock: Boolean = phaseSet == pos

  /** Converts the HapCut variant representation to a VariantContext. */
  def toVariantContext(sampleName: String): VariantContext = {
    // Parse the alleles
    val refAllele = Allele.create(this.ref, true)
    val altAlleles = this.alts.split(",").map { alt =>
      Allele.create(alt, false)
    }.toSeq
    val alleles = refAllele +: altAlleles
    val allelesCollection = alleles.asJavaCollection

    // Get the genotype alleles
    val genotypeAlleles: List[Allele] = if (hap1Allele < 0) {
      require(hap2Allele < 0)
      // Assumes the genotype is before the first ":" token
      this.genotype.split(":").head.split("[/|]").map(_.toInt).map(i => alleles(i)).toList
    }
    else {
      List(alleles(hap1Allele), alleles(hap2Allele))
    }

    val genotype = this.info.addTo(new GenotypeBuilder(sampleName, genotypeAlleles.asJava))
      .phased(true)
      .attribute(PhaseSetFormatTag, phaseSet)
      .make()

    // build the context, and make sure to recompute the end
    new VariantContextBuilder(
      s"${this.offset}",
      this.contig,
      this.pos.toLong,
      this.pos.toLong,
      allelesCollection
    )
    .computeEndFromAlleles(alleles.toList.asJava, this.pos)
    .genotypes(genotype).make()
  }
}

private object HapCutCall {
  import HapCutType._
  import HapCutToVcf.fixIupacBases

  /** Parse a variant line.
    *
    * @param callLine the line to parse.
    * @param firstVariantPosition the position of the first variant in the block to which this variant belongs, or None
    *                             if this variant is the first variant in the block.
    **/
  def toHapCutCall(callLine: String, block: BlockInfo, firstVariantPosition: Option[Int], hapCutType: HapCutType): HapCutCall = {
    val tokens = callLine.split('\t')
    val info = hapCutType match {
      case HapCut1 =>
        require(9 == tokens.length, s"Did not parse 9 fields (parsed ${tokens.length}) in the HapCut line: " + callLine.trim)
        tokens(8)
      case HapCut2 =>
        require(11 <= tokens.length, s"Did not parse at least 11 fields (parsed ${tokens.length}) in the HapCut2 line: " + callLine.trim)
        tokens.drop(8).mkString("\t")
      case _       => unreachable("Unknown HapCut type in toHapCutCall.")
    }
    val offset = tokens(0).toInt
    val pos    = tokens(4).toInt
    val hap1Allele = if ("-" == tokens(1)) -1 else tokens(1).toInt
    val hap2Allele = if ("-" == tokens(2)) -1 else tokens(2).toInt
    val thresholdPruning = hapCutType == HapCut2 && (hap1Allele < 0 || hap2Allele < 0)
    val ref: String = fixIupacBases(tokens(5))
    new HapCutCall(
      block        = block,
      offset       = offset,
      hap1Allele   = hap1Allele,
      hap2Allele   = hap2Allele,
      contig       = tokens(3),
      pos          = pos,
      ref          = ref,
      alts         = tokens(6),
      genotype     = tokens(7),
      info         = GenotypeInfo(info, hapCutType=hapCutType, missingAlleles=(hap1Allele < 0 || hap2Allele < 0), thresholdPruning=thresholdPruning),
      phaseSet     = firstVariantPosition.getOrElse(pos)
    )
  }
}

object HapCutType extends Enumeration {
  type HapCutType = Value
  val HapCut1, HapCut2, Unknown = Value
}

private[vcf] case class HapCutOffsetAndCall(offset: Int, call: Option[HapCutCall] = None)

object HapCutReader {
  val HapCut1BlockLinePattern = """^BLOCK: offset: (\d+) len: (\d+) phased: (\d+) SPAN: (\d+) MECscore (\d+.\d+) fragments (\d+)$""".r
  val HapCut2BlockLinePattern = """^BLOCK: offset: (\d+) len: (\d+) phased: (\d+) SPAN: (\d+) fragments (\d+)$""".r

  def apply(stream: InputStream): HapCutReader = new HapCutReader(Source.fromInputStream(stream).getLines(), Some(stream))

  def apply(path: FilePath): HapCutReader = apply(Io.toInputStream(path))
}

/** Reads phased blocks and variants from a HapCut output file.  For each variant call given to HapCut, returns either
  * some `HapCutCall` if that variant was phased, or None if it was not phased.  HapCut does not include unphased
  * variants in its output, but does provide the 1-based offset of every phased variant relative to the variant calls
  * given to HapCut.  Therefore, we are able to determine if None should be returned (unphased, not in the HapCut file)
  * or if a `HapCutCall` should be returned (phased, in the HapCut file).  Nonetheless, variants that were not phased
  * after the last HapCut phased variant will not be returned (hasNext will be false), as HapCut doesn't provide the
  * total # of variants that were given to HapCut, so these trailing unphased variant calls must be treated specially
  * for now.
  */
private[vcf] class HapCutReader(iterator: Iterator[String],
                                private[this] val source: Option[{ def close(): Unit }] = None)
  extends Iterator[HapCutOffsetAndCall] with Closeable {
  import HapCutType._
  import scala.collection.mutable.ListBuffer

  private val lineIterator = iterator.buffered
  val hapCutType: HapCutType = {
    if (lineIterator.isEmpty) Unknown // Default to true if the file is empty
    else {
      lineIterator.head match {
        case HapCutReader.HapCut1BlockLinePattern(offset, len, phased, span, mecScore, fragments) => HapCut1
        case HapCutReader.HapCut2BlockLinePattern(offset, len, phased, span, fragments)           => HapCut2
        case blockLine => throw new IllegalStateException("Could not parse block line: " + blockLine)
      }
    }
  }
  private var nextBlockInfo: Option[BlockInfo] = this.forNextBlockInfo()
  private val calls = new CallBuffer


  /** Simple class to store calls within a range of offsets. */
  private class CallBuffer {
    private val callBuffer = new ListBuffer[HapCutOffsetAndCall]()
    private var firstOffset = 1

    /** Add a call to the buffer.  This may add empty calls with less than offset of this call relative to the call
      * with the greatest offset ever present in this buffer. */
    def add(call: HapCutCall): Unit = {
      require(firstOffset <= call.offset)
      val offsetAndCall = new HapCutOffsetAndCall(offset=call.offset, call=Some(call))
      // add empty values *prior* to this call
      while (firstOffset + this.callBuffer.length < call.offset) {
        this.callBuffer.append(new HapCutOffsetAndCall(offset=firstOffset+this.callBuffer.length, call=None))
      }
      // add the call
      if (this.callBuffer.nonEmpty && call.offset <= firstOffset + this.callBuffer.length - 1) {
        this.callBuffer(call.offset-firstOffset) = offsetAndCall
      }
      else {
        this.callBuffer.append(offsetAndCall)
      }
    }

    /** Removes the next call. */
    def removeFirst(): HapCutOffsetAndCall = {
      val call = this.callBuffer.remove(0)
      this.firstOffset = call.offset + 1
      call
    }

    def isEmpty: Boolean = this.callBuffer.isEmpty

    def nonEmpty: Boolean = this.callBuffer.nonEmpty
  }

  def hasNext: Boolean = {
    if (calls.nonEmpty) true
    else this.advance()
  }

  def next(): HapCutOffsetAndCall = {
    if (!hasNext) throw new NoSuchElementException("Calling next() when hasNext() is false")
    calls.removeFirst()
  }

  /** Closes the underlying reader; only necessary if EOF hasn't been reached. */
  def close(): Unit = this.source.foreach(_.close())

  /** Reads in the next block info. */
  private def forNextBlockInfo(): Option[BlockInfo] = {
    // find the next block
    while (this.lineIterator.hasNext && !this.lineIterator.head.startsWith("BLOCK:")) {
      this.lineIterator.next()
    }
    if (this.lineIterator.hasNext) {
      // Read the block information
      val blockLine = this.lineIterator.next().trim
      val currentBlock: BlockInfo = blockLine match {
        case HapCutReader.HapCut1BlockLinePattern(offset, len, phased, span, mecScore, fragments) =>
          if (hapCutType != HapCut1) throw new IllegalStateException(s"Found a HapCut1 block line but the file type was $hapCutType")
          new HapCut1BlockInfo(None, offset.toInt, len.toInt, phased.toInt, span.toInt, mecScore.toFloat, fragments.toInt)
        case HapCutReader.HapCut2BlockLinePattern(offset, len, phased, span, fragments) =>
          if (hapCutType != HapCut2) throw new IllegalStateException(s"Found a HapCut2 block line but the file type was $hapCutType")
          new HapCut2BlockInfo(None, offset.toInt, len.toInt, phased.toInt, span.toInt, fragments.toInt)
        case _ => throw new IllegalStateException("Could not parse block line: " + blockLine)
      }
      Some(currentBlock)
    }
    else None
  }

  /** Reads in the next set of variants for the given block. */
  private def forNextBlockCalls(block: BlockInfo): Unit = {
    // Read in the phased variants and add them to hte set of calls
    var firstVariantPosition: Option[Int] = None
    var i = 0
    while (i < block.phased) {
      if (!lineIterator.hasNext) throw new IllegalStateException("Reached the end of file when searching for a call")
      val call = HapCutCall.toHapCutCall(callLine=lineIterator.next(), block=this.nextBlockInfo.get, firstVariantPosition=firstVariantPosition, hapCutType=hapCutType)
      if (firstVariantPosition.isEmpty) firstVariantPosition = Some(call.pos)
      this.calls.add(call=call)
      i += 1
    }
  }

  /** Gets the variants for the next block, and any subsequent block that overlaps this block or others subsequently
    * read in.  Returns true if any block was read, false otherwise. */
  private def advance(): Boolean = {
    // Read in the variants for the next block
    val firstBlock = this.nextBlockInfo match {
      case None => return false
      case Some(block) =>
        forNextBlockCalls(block=block)
        block
    }
    require(this.calls.nonEmpty)

    // read in blocks until the start offset of the next block is greater than the maximum offset seen so far
    var maxOffset = firstBlock.offsetOfLastVariant
    this.nextBlockInfo = forNextBlockInfo()
    while (this.nextBlockInfo.exists(_.offset <= maxOffset)) {
      forNextBlockCalls(block=this.nextBlockInfo.get)
      maxOffset = Math.max(maxOffset, this.nextBlockInfo.get.offsetOfLastVariant)
      this.nextBlockInfo = forNextBlockInfo()
    }

    true
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy