com.fulcrumgenomics.fasta.CollectAlternateContigNames.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fgbio_2.13 Show documentation
fgbio
The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2019 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.fasta


import com.fulcrumgenomics.FgBioDef.{FgBioEnum, PathToSequenceDictionary}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.fasta.SequenceMetadata.Keys
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.util.Io
import enumeratum.EnumEntry

import scala.collection.immutable.IndexedSeq
import scala.collection.mutable.ListBuffer

/** Trait that entries in AssemblyReportColumn will extend. */
sealed trait AssemblyReportColumn extends EnumEntry {
  /** The column name in the assembly report. */
  def key: String
  /** The tag to store in the SAM header */
  def tag: String
  /** True if this columns contains a valid molecule alias, false otherwise. */
  def alias: Boolean = true
}

/** Enum to represent columns in a NCBI assembly report. */
object AssemblyReportColumn extends FgBioEnum[AssemblyReportColumn] {
  // Developer note: we only return those that are "aliases" so that sequence role and sequence length are not
  // valid options on the command line.
  def values: IndexedSeq[AssemblyReportColumn] = findValues.filter(_.alias)

  /** Allows the column to be build from the Enum name or the actual column name. */
  override def apply(str: String): AssemblyReportColumn = {
    values.find(_.key == str).getOrElse(super.apply(str))
  }

  case object SequenceName     extends AssemblyReportColumn { val key: String = "Sequence-Name"; val tag: String = "sn" }
  case object AssignedMolecule extends AssemblyReportColumn { val key: String = "Assigned-Molecule"; val tag: String = "am" }
  case object GenBankAccession extends AssemblyReportColumn { val key: String = "GenBank-Accn"; val tag: String = "ga" }
  case object RefSeqAccession  extends AssemblyReportColumn { val key: String = "RefSeq-Accn"; val tag: String = "ra" }
  case object UcscName         extends AssemblyReportColumn { val key: String = "UCSC-style-name"; val tag: String = "un" }
  case object SequenceRole     extends AssemblyReportColumn { val key: String = "Sequence-Role"; val tag: String = "sr"; override val alias: Boolean = false }
  case object SequenceLength   extends AssemblyReportColumn { val key: String = "Sequence-Length"; val tag: String = "sl"; override val alias: Boolean = false }

  /** The missing value for a column. */
  val MissingColumnValue: String = "na"

  values.foreach { value =>
    require(value.tag.toLowerCase == value.tag, s"Tag must be lowercase for $value: ${value.tag}")
  }
}

/** Trait that entries in SequenceRole will extend. */
sealed trait SequenceRole extends EnumEntry {
  def key: String
}

/** Enum to represent the various types of sequence-roles in NCBI assembly report. */
object SequenceRole extends FgBioEnum[SequenceRole] {
  def values: IndexedSeq[SequenceRole] = findValues

  /** Allows the column to be build from the Enum name or the actual sequence-role column name. */
  override def apply(str: String): SequenceRole = {
    values.find(_.key == str).getOrElse(super.apply(str))
  }

  /** Allows the column to be build from the Enum name or the actual sequence-role column name. */
  def apply(metadata: SequenceMetadata): SequenceRole = {
    val roleValue = metadata.attributes.getOrElse(AssemblyReportColumn.SequenceRole.tag,
      throw new IllegalArgumentException(s"Metadata missing tag: ${AssemblyReportColumn.SequenceRole.tag}")
    )
    SequenceRole(roleValue)
  }

  case object AltScaffold         extends SequenceRole { val key: String = "alt-scaffold" }
  case object AssembledMolecule   extends SequenceRole { val key: String = "assembled-molecule" }
  case object FixPatch            extends SequenceRole { val key: String = "fix-patch" }
  case object NovelPatch          extends SequenceRole { val key: String = "novel-patch" }
  case object UnlocalizedScaffold extends SequenceRole { val key: String = "unlocalized-scaffold" }
  case object UnplacedScaffold    extends SequenceRole { val key: String = "unplaced-scaffold" }
}


@clp(description =
  """
    |Collates the alternate contig names from an NCBI assembly report.
    |
    |The input is to be the `*.assembly_report.txt` obtained from NCBI.
    |
    |The output will be a "sequence dictionary", which is a valid SAM file, containing the version header line and one
    |line per contig.  The primary contig name (i.e. `@SQ.SN`) is specified with `--primary` option, while alternate
    |names (i.e. aliases) are specified with the `--alternates` option.
    |
    |The `Assigned-Molecule` column, if specified as an `--alternate`, will only be used for sequences with
    |`Sequence-Role` `assembled-molecule`.
    |
    |When updating an existing sequence dictionary with `--existing` the primary contig names must match.  I.e. the
    |contig name from the assembly report column specified by `--primary` must match the contig name in the existing
    |sequence dictionary (`@SQ.SN`).  All contigs in the existing sequence dictionary must be present in the assembly
    |report.  Furthermore, contigs in the assembly report not found in the sequence dictionary will be ignored.
  """,
  group = ClpGroups.Fasta)
class CollectAlternateContigNames
(@arg(flag='i', doc="Input NCBI assembly report file.") val input: FilePath,
 @arg(flag='o', doc="Output sequence dictionary file.") val output: PathToSequenceDictionary,
 @arg(flag='p', doc="The assembly report column for the primary contig name.") val primary: AssemblyReportColumn = AssemblyReportColumn.RefSeqAccession,
 @arg(flag='a', doc="The assembly report column(s) for the alternate contig name(s)", minElements=1) val alternates: Seq[AssemblyReportColumn],
 @arg(flag='s', doc="Only output sequences with the given sequence roles.  If none given, all sequences will be output.", minElements=0)
  val sequenceRoles: Seq[SequenceRole] = Seq.empty,
 @arg(flag='d', doc="Update an existing sequence dictionary file.  The primary names must match.") val existing: Option[PathToSequenceDictionary] = None,
 @arg(flag='x', doc="Allow mismatching sequence lengths when using an existing sequence dictionary file.") val allowMismatchingLengths: Boolean = false,
 @arg(doc="Skip contigs that have no alternates") val skipMissingAlternates: Boolean = true
) extends FgBioTool with LazyLogging {

  import com.fulcrumgenomics.fasta.{AssemblyReportColumn => Column}

  Io.assertReadable(input)
  existing.foreach(Io.assertReadable)
  Io.assertCanWriteFile(output)
  validate(!alternates.contains(primary), s"Primary column is in alternate column: $primary in " + alternates.mkString(", "))

  validate(Column.values.contains(primary), s"Primary column '$primary' must be one of the following: " + Column.values.mkString(", "))
  this.alternates.foreach { alternate =>
    validate(Column.values.contains(primary), s"Alternate column '$alternate' must be one of the following: " + Column.values.mkString(", "))
  }

  private case class SkipReason(name: String, role: SequenceRole, msg: String)

  override def execute(): Unit = {
    val iter = Io.readLines(input).bufferBetter

    // skip over comments until we reach the header
    iter.dropWhile { line => line.startsWith("#") && !line.startsWith(s"# ${Column.SequenceName.key}") }

    // read the header
    require(iter.hasNext, s"Missing header from $input.")
    val header = iter.next().substring(2).split('\t')

    // Collect the sequence metadatas
    val metadatas   = ListBuffer[SequenceMetadata]()
    val skipReasons = ListBuffer[SkipReason]()
    iter.foreach { line =>
      val dict   = header.zip(line.split('\t')).toMap
      val name   = dict(this.primary.key)
      val role   = SequenceRole(dict(Column.SequenceRole.key))
      val alts   = this.alternates.flatMap {
        // Developer note: the values in the Assigned-Molecule column (e.g. "1" or "chr1") make sense for molecules
        // with Sequence-Role "assembled-molecule".  But for others, e.g. "unlocalized-scaffold" and "alt-scaffold",
        // the Assigned-Molecule points to the primary/assembled molecule (e.g. chr1_gl000191_random -> 1).
        // Only use Assigned-Molecule to generate alias(es) for "assembled-molecules" in order not to generate
        // multiple records with the same alias.  Perhaps this is better represented as an alternate locus, but that is
        // not implemented here.
        case Column.AssignedMolecule if role != SequenceRole.AssembledMolecule => None
        case alt: Column =>
          dict(alt.key) match {
            case Column.MissingColumnValue =>
              logger.warning(s"Contig '$name' had a missing value for alternate in column '${alt.key}'")
              None
            case alternate => Some(alternate)
          }
      }.distinct
      if (sequenceRoles.nonEmpty && !this.sequenceRoles.contains(role)) {
        skipReasons += SkipReason(name=name, role=role, msg=s"Skipping contig name '$name' with mismatching sequencing role: $role.")
      }
      else if (name == Column.MissingColumnValue) {
        skipReasons += SkipReason(name=name, role=role, msg=s"Skipping contig as it had a missing value for column '${this.primary.key}': $line")
      }
      else if (alts.isEmpty && skipMissingAlternates) {
        skipReasons += SkipReason(name=name, role=role, msg=s"Skipping contig name '$name' with no alternates.")
      }
      else {
       val attributes = ListBuffer[(String, String)]()
        attributes += ((Column.SequenceRole.tag, dict(Column.SequenceRole.key)))
        Column.values.foreach { column =>
          attributes += ((column.tag, dict(column.key)))
        }
        val metadata = SequenceMetadata(
          name             = name,
          length           = dict(Column.SequenceLength.key).toInt,
          aliases          = alts,
          customAttributes = attributes.toMap
        )
        metadatas += metadata
      }
    }

    // Log all the reasons we skipped contigs
    skipReasons.foreach(reason => logger.warning(reason.msg))

    // Apply to an existing sequence dictionary if necessary
    val dict: SequenceDictionary = existing match {
      case None => SequenceDictionary(metadatas.toSeq:_*)
      case Some(path) =>
        val assemblyReportMetadataMap = metadatas.map { m => (m.name, m) }.toMap
        val updatedMetadatas          = SequenceDictionary(path).map { existingMetadata =>
          // Get the metadata from the assembly report
          val assemblyReportMetadata: SequenceMetadata = assemblyReportMetadataMap.getOrElse(existingMetadata.name,
            skipReasons.find(_.name == existingMetadata.name) match {
              case Some(skipReason) =>
                throw new IllegalArgumentException(s"Contig '${existingMetadata.name}' was in the assembly report but skipped: ${skipReason.msg}.")
              case None             =>
                throw new IllegalArgumentException(s"Could not find contig '${existingMetadata.name}' in the assembly report.")
            }
          )
          // append new aliases, and add new tags
          val attributes = existingMetadata.attributes ++ assemblyReportMetadata.attributes.map {
              case (Keys.Aliases, _) =>
                (Keys.Aliases, (existingMetadata.aliases ++ assemblyReportMetadata.aliases).distinct.mkString(","))
              case (key, value)                          => (key, value)
          }
          existingMetadata.copy(attributes=attributes)
        }
        SequenceDictionary(updatedMetadatas.toSeq:_*)
    }

    // Write it out!
    dict.write(output)
  }
}