All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.projectglow.vcf.VCFStreamWriter.scala Maven / Gradle / Ivy

/*
 * Copyright 2019 The Glow Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.projectglow.vcf

import java.io.{Closeable, OutputStream}
import java.util.{ArrayList => JArrayList, HashSet => JHashSet}

import scala.collection.JavaConverters._

import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriter, VariantContextWriterBuilder}
import htsjdk.variant.variantcontext.{Genotype, GenotypeBuilder, VariantContext, VariantContextBuilder}
import htsjdk.variant.vcf.{VCFHeader, VCFHeaderLine}

import io.projectglow.common.GlowLogging

/**
 * This internal row -> variant context stream writer maintains a header that is set exactly once. The sample IDs are
 * set by [[sampleIdInfo]] if predetermined, or inferred from the first written row otherwise.
 *
 * If missing sample IDs were used to set the header, sample IDs in all rows to be written will be replaced with those
 * from the header.
 * If all sample IDs are present when setting the header, sample IDs in written rows will not be replaced.
 * Mixed missing/present sample IDs are not permitted.
 *
 * @param stream The stream to write to
 * @param headerLineSet Header lines used to set the VCF header
 * @param sampleIdInfo Sample IDs, which may be predetermined or must be inferred
 * @param writeHeader Whether to write the header in this stream
 */
class VCFStreamWriter(
    stream: OutputStream,
    headerLineSet: Set[VCFHeaderLine],
    sampleIdInfo: SampleIdInfo,
    writeHeader: Boolean)
    extends Closeable
    with GlowLogging
    with Serializable {

  // The VCF header. Stays null until setHeader (or, for a stream with no rows, close) assigns it;
  // write and close use `header == null` as the "header not set yet" check.
  var header: VCFHeader = _
  // The header's sample IDs as a set; compared against each row's sample names when IDs are inferred.
  var headerSampleSet: JHashSet[String] = _
  // True only when the header was inferred from a row in which every sample ID was empty.
  var replaceSampleIds: Boolean = _

  private val writer: VariantContextWriter = new VariantContextWriterBuilder()
    .clearOptions()
    .setOutputStream(stream)
    .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
    .setOption(Options.WRITE_FULL_FORMAT_FIELD)
    .build

  /**
   * Builds [[header]] and [[headerSampleSet]], and decides whether sample IDs of written rows must
   * be replaced ([[replaceSampleIds]]).
   *
   * When sample IDs must be inferred, they are read from the genotypes of `vcBuilder`:
   *  - if every sample ID is non-empty, the sorted IDs are used and no replacement happens;
   *  - if every sample ID is empty (or there are no genotypes), IDs are generated by
   *    `InferSampleIds.fromNumberMissing` and replacement is enabled;
   *  - if only some are empty, `VCFWriterUtils.throwMixedSamplesFailure()` is invoked.
   *
   * When sample IDs are predetermined, their sorted form is used and no replacement happens.
   *
   * @param vcBuilder Builder of the row used to infer sample IDs (ignored if IDs are predetermined)
   */
  def setHeader(vcBuilder: VariantContextBuilder): Unit = {
    val sampleIds = sampleIdInfo match {
      case InferSampleIds =>
        val vcSamples = vcBuilder.getGenotypes.asScala.map(_.getSampleName)
        val numTotalSamples = vcSamples.length
        val numPresentSamples = vcSamples.count(!_.isEmpty)

        if (numPresentSamples > 0) {
          if (numPresentSamples < numTotalSamples) {
            VCFWriterUtils.throwMixedSamplesFailure()
          }
          replaceSampleIds = false
          vcSamples.sorted
        } else {
          replaceSampleIds = true
          InferSampleIds.fromNumberMissing(numTotalSamples)
        }
      case predetermined: SampleIds =>
        replaceSampleIds = false
        predetermined.sortedSampleIds
    }
    val javaSampleIds = sampleIds.asJava
    headerSampleSet = new JHashSet(javaSampleIds)
    header = new VCFHeader(headerLineSet.asJava, javaSampleIds)
  }

  /**
   * Replaces genotypes' missing (empty) sample IDs with those from the header, matching by position.
   *
   * Genotypes that already have a sample ID, or whose position is beyond the number of header
   * samples, are kept unchanged.
   *
   * @param vcBuilder Builder whose genotypes are rewritten
   * @return The result of setting the rewritten genotypes on `vcBuilder`
   */
  def replaceMissingSampleIds(vcBuilder: VariantContextBuilder): VariantContextBuilder = {
    val oldGts = vcBuilder.getGenotypes
    val newGts = new JArrayList[Genotype](oldGts.size)
    var i = 0
    while (i < oldGts.size) {
      val oldGt = oldGts.get(i)
      val newGt = if (oldGt.getSampleName.isEmpty && i < header.getGenotypeSamples.size) {
        new GenotypeBuilder(oldGt).name(header.getGenotypeSamples.get(i)).make
      } else {
        oldGt
      }
      newGts.add(newGt)
      i += 1
    }
    vcBuilder.genotypes(newGts)
  }

  /**
   * Checks that the genotype sample IDs are the same as those in the header; if they differ,
   * `VCFWriterUtils.throwSampleInferenceFailure()` is invoked.
   *
   * @param vcBuilder Builder whose genotype sample names are checked
   * @return `vcBuilder`, unchanged
   */
  def checkInferredSampleIds(vcBuilder: VariantContextBuilder): VariantContextBuilder = {
    if (!vcBuilder.getGenotypes.getSampleNames.equals(headerSampleSet)) {
      VCFWriterUtils.throwSampleInferenceFailure()
    }
    vcBuilder
  }

  /**
   * Writes one variant context to the stream.
   *
   * On the first call the header is built from this row (see [[setHeader]]) and either written to
   * the stream or only registered with the writer, depending on `writeHeader`. Missing sample IDs
   * are then replaced if required, and, when sample IDs are inferred, the row's sample IDs are
   * checked against the header before the row is added to the writer.
   *
   * @param vc The variant context to write
   */
  def write(vc: VariantContext): Unit = {
    val vcBuilder = VCFWriterUtils.convertVcAttributesToStrings(vc)

    if (header == null) {
      setHeader(vcBuilder)
      if (writeHeader) {
        writer.writeHeader(header)
      } else {
        writer.setHeader(header)
      }
    }

    val replacedVcBuilder = if (replaceSampleIds) {
      replaceMissingSampleIds(vcBuilder)
    } else {
      vcBuilder
    }

    val checkedVcBuilder = if (sampleIdInfo == InferSampleIds) {
      checkInferredSampleIds(replacedVcBuilder)
    } else {
      replacedVcBuilder
    }

    writer.add(checkedVcBuilder.make)
  }

  /**
   * Closes the underlying writer, first writing the header if no row was written and this stream
   * is responsible for the header.
   *
   * The writer is closed even if writing the header fails, so the output stream is not leaked.
   * Non-fatal failures while closing the writer are logged as warnings rather than rethrown.
   *
   * @throws IllegalStateException if the header must be written, no row was written, and sample
   *                               IDs were supposed to be inferred
   */
  override def close(): Unit = {
    import scala.util.control.NonFatal

    try {
      // Header must be written before closing writer, or else VCF readers will break.
      if (header == null && writeHeader) {
        sampleIdInfo match {
          case InferSampleIds =>
            throw new IllegalStateException(
              "Cannot infer header for empty partition; " +
              "we suggest calling coalesce or repartition to remove empty partitions.")
          case predetermined: SampleIds =>
            header = new VCFHeader(headerLineSet.asJava, predetermined.sortedSampleIds.asJava)
            writer.writeHeader(header)
        }
      }
    } finally {
      // Always release the writer (and its stream), including when the header could not be written.
      try {
        writer.close()
      } catch {
        // Best-effort close: only non-fatal errors are swallowed; fatal JVM errors still propagate.
        case NonFatal(e) =>
          logger.warn("Could not close writer: " + e.getMessage)
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy