All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.fasta.UpdateFastaContigNames.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2019 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.fasta

import java.io.BufferedWriter

import com.fulcrumgenomics.FgBioDef.PathToSequenceDictionary
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, ProgressLogger}
import htsjdk.samtools.reference.{FastaSequenceIndex, ReferenceSequence, ReferenceSequenceFile, ReferenceSequenceFileFactory}

@clp(description =
  """
    |Updates the sequence names in a FASTA.
    |
    |The name of each sequence must match one of the names (including aliases) in the given sequence dictionary.  The
    |new name will be the primary (non-alias) name in the sequence dictionary.
    |
    |By default, the sort order of the contigs will be the same as the input FASTA.  Use the `--sort-by-dict` option to
    |sort by the input sequence dictionary.  Furthermore, the sequence dictionary may contain **more** contigs than the
    |input FASTA, and they wont be used.
    |
    |Use the `--skip-missing` option to skip contigs in the input FASTA that cannot be renamed (i.e. who are not present
    |in the input sequence dictionary); missing contigs will not be written to the output FASTA.  Finally, use the
    |`--default-contigs` option to specify an additional FASTA which will be queried to locate contigs not present in
    |the input FASTA but present in the sequence dictionary.
  """,
  group = ClpGroups.Fasta)
class UpdateFastaContigNames
(@arg(flag='i', doc="Input FASTA.") val input: PathToFasta,
 @arg(flag='d', doc="The path to the sequence dictionary with contig aliases.") val dict: PathToSequenceDictionary,
 @arg(flag='o', doc="Output FASTA.") val output: PathToFasta,
 @arg(flag='l', doc="Line length or sequence lines.") val lineLength: Int = 100,
 @arg(doc="Skip missing source contigs (will not be outputted).") val skipMissing: Boolean = false,
 @arg(doc="Sort the contigs based on the input sequence dictionary.") sortByDict: Boolean = false,
 @arg(doc="Add sequences from this FASTA when contigs in the sequence dictionary are missing from the input FASTA.")
 defaultContigs: Option[PathToFasta] = None
) extends FgBioTool with LazyLogging {
  import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary

  Io.assertReadable(Seq(input, dict))
  defaultContigs.foreach(Io.assertReadable)
  Io.assertCanWriteFile(output)

  /** Little class to store the sequence metadata and reference sequence to output.
    *
    * The reference sequence to output is stored as a method, so we can lazily retrieve it and not incur large memory
    * overhead.
    *
    * @param info the sequence metadata to output
    * @param sequence the reference sequence to output
    */
  private class OutputSequence(val info: SequenceMetadata, sequence: => ReferenceSequence) {
    /** Writes the output sequence */
    def write(writer: BufferedWriter, progress: ProgressLogger): Unit = {
      val ref = sequence
      writer.write(s">${info.name}\n")
      val bases = ref.getBases
      var baseCounter = 0
      forloop(from = 0, until = bases.length) { baseIdx =>
        writer.write(bases(baseIdx))
        progress.record(info.name, baseIdx + 1)
        baseCounter += 1
        if (baseCounter >= lineLength) {
          writer.newLine()
          baseCounter = 0
        }
      }
      if (baseCounter > 0) writer.newLine()
    }
  }

  override def execute(): Unit = {
    val progress = ProgressLogger(logger, noun="bases", verb="written", unit=10e7.toInt)
    val dict     = SequenceDictionary(this.dict)
    val out      = Io.toWriter(output)

    val (srcDict, srcRefFile) = getDictAndFile(fasta=this.input)
    val defaultDictAndRefFile = this.defaultContigs.map(fasta => getDictAndFile(fasta=fasta))

    // Make sure the default FASTA and the default input dict are the same, as the latter should be built from the
    // former
    defaultDictAndRefFile.foreach { case (defaultDict, _) =>
      require(dict.sameAs(defaultDict), "Input sequence dictionary mismatch and default FASTA mismatch")
    }

    // Get the contigs from the input FASTA to output.
    // Note: we do not pre-load the sequences, as to preserve memory
    val srcSequences: Seq[OutputSequence] = srcDict.iterator.flatMap { srcInfo =>
      // Check if the sequence dictionary can rename this contig, and if not, either log or fail depending on --skip-missing
      val dictInfo = dict.get(srcInfo.name) match {
        case None if skipMissing => logger.warning(s"Did not find contig ${srcInfo.name} in the sequence dictionary."); None
        case None                => throw new IllegalStateException(s"Did not find contig ${srcInfo.name} in the sequence dictionary.")
        case Some(info)          => Some(info)
      }
      dictInfo.map(info => new OutputSequence(info=info, sequence=srcRefFile.getSequence(srcInfo.name)))
    }.toIndexedSeq

    // Get any default contigs that we need to add/append to the output
    // Note: we do not pre-load the sequences, as to preserve memory
    val defaultSequences: Seq[OutputSequence] = {
      defaultDictAndRefFile match {
        case None                      => Seq.empty
        case Some((_, defaultRefFile)) =>
          // Build the list of contig names that will be renamed (i.e. outputted)
          val namesThatCanBeRenamed = srcSequences.map(_.info.name).toSet
          // Keep the contigs in the input sequence dictionary that are not going be used to rename
          val contigs = dict
            .filterNot(info => namesThatCanBeRenamed.contains(info.name))
            .map(info => new OutputSequence(info=info, sequence=defaultRefFile.getSequence(info.name)))
            .toIndexedSeq
          logger.info(s"Will add/append ${contigs.length} contigs to the output.")
          contigs
      }
    }

    // Order the contigs based on if we want to sort by the input sequence dictionary or not
    val sequences: Seq[OutputSequence] = {
      if (sortByDict) {
        logger.info("Sorting the contigs using the input sequence dictionary")
        (srcSequences ++ defaultSequences).sortBy(_.info.index)
      }
      else {
        logger.info("Sorting the contigs using the input FASTA")
        srcSequences ++ defaultSequences
      }
    }

    // Write them out
    sequences.foreach(_.write(writer=out, progress=progress))
    logger.info(s"Wrote ${sequences.length} contigs.")

    srcRefFile.safelyClose()
    defaultDictAndRefFile.foreach(_._2.safelyClose())
    out.close()
  }


  /** Gets the sequence dictionary and reference sequence file for a given FASTA */
  private def getDictAndFile(fasta: PathToFasta): (SequenceDictionary, ReferenceSequenceFile) = {
    val refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta, true, true)
    val dict    = Option(refFile.getSequenceDictionary) match {
      case Some(dict) => dict.fromSam
      case None       =>
        require(refFile.isIndexed,
          s"Reference sequence file must have a sequence dictionary or be indexed." +
          s"  Try 'picard CreateSequenceDictionary' or 'samtools faidx '. $fasta"
        )
        // Build a very basic sequence dictionary from the fasta index
        val infos = new FastaSequenceIndex(ReferenceSequenceFileFactory
          .getFastaIndexFileName(this.input))
          .map { entry =>
            SequenceMetadata(name=entry.getContig, length=entry.getSize.toInt, index=entry.getSequenceIndex)
          }.toIndexedSeq
        SequenceDictionary(infos:_*)
    }
    (dict, refFile)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy