All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.testing.ReferenceSetBuilder.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.testing

import java.nio.file.{Files, Path}

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.commons.io.PathUtil
import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata}
import com.fulcrumgenomics.util.Io
import htsjdk.samtools.reference.{FastaSequenceIndexCreator, ReferenceSequenceFileFactory}
import htsjdk.samtools.util.SequenceUtil
import htsjdk.samtools.{SAMFileHeader, SAMTextHeaderCodec}

import scala.collection.mutable.ListBuffer

/** Class to programatically build up a reference sequence. */
class ReferenceSetBuilder(val assembly: Option[String] = Some("testassembly"),
                          val species: Option[String] = Some("testspecies"),
                          val lineLength: Int = 80) {
  // Class to build up a single reference sequence
  class ReferenceBuilder private[ReferenceSetBuilder](val name: String, val assembly: Option[String], val species: Option[String]) {
    private[ReferenceSetBuilder] val bases = new StringBuilder

    /** Adds bases to the reference. */
    def add(s: String, times: Int=1): this.type = {
      forloop (from=0, until=times) { _ => bases.append(s) }
      this
    }
  }

  /** The sequences in order. */
  private val refs = new ListBuffer[ReferenceBuilder]()

  /** Generates a new ReferenceBuilder object in order and returns it. */
  def add(name: String, assembly: Option[String] = this.assembly, species: Option[String] = this.species): ReferenceBuilder = {
    this.refs += new ReferenceBuilder(name, assembly, species)
    this.refs.last
  }

  /** Writes the fasta out to a temp file and creates a sequence dictionary alongside it. */
  def toTempFile(deleteOnExit: Boolean = true, calculateMds5: Boolean = false): PathToFasta = {
    val path = Files.createTempFile("SamRecordSet.", ".fa")
    if (deleteOnExit) path.toFile.deleteOnExit()
    toFile(path, deleteOnExit=deleteOnExit, calculateMds5=calculateMds5)
    path
  }

  /** Writes the fasta out to a file, and creates a sequence dictionary and fasta index (FAI) file. */
  def toFile(path: Path, deleteOnExit: Boolean = true, calculateMds5: Boolean = false): Unit = {
    val out = Io.toWriter(path)

    val infos = this.refs.map { ref =>
      out.write('>')
      out.write(ref.name)
      out.newLine()
      val bases = ref.bases.toString()
      bases.grouped(lineLength).foreach { line =>
        out.write(line)
        out.newLine()
      }


      SequenceMetadata(
        name       = ref.name,
        length     = bases.length,
        assembly   = ref.assembly,
        species    = ref.species,
        md5        = if (calculateMds5) Some(SequenceUtil.calculateMD5String(bases.toUpperCase.getBytes)) else None
      )
    }
    out.close()

    val dict = SequenceDictionary(infos.toSeq:_*)
    val header = {
      import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary
      new SAMFileHeader(dict.asSam)
    }

    // Create the sequence dictionary
    val dictOut    = PathUtil.replaceExtension(path, ".dict")
    val dictWriter = Io.toWriter(PathUtil.replaceExtension(path, ".dict"))
    new SAMTextHeaderCodec().encode(dictWriter, header)
    dictWriter.close()
    if (deleteOnExit) dictOut.toFile.deleteOnExit()

    // Create the FAI file
    val faiOut = ReferenceSequenceFileFactory.getFastaIndexFileName(path)
    val fai    = FastaSequenceIndexCreator.buildFromFasta(path)
    fai.write(faiOut)
    if (deleteOnExit) faiOut.toFile.deleteOnExit()
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy