All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.fasta.SequenceDictionary.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2020 Fulcrum Genomics
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.fasta

import java.io.StringWriter

import com.fulcrumgenomics.FgBioDef
import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.fasta.SequenceMetadata.Keys
import com.fulcrumgenomics.util.Io
import enumeratum.EnumEntry
import htsjdk.samtools.util.BufferedLineReader
import htsjdk.samtools.{SAMSequenceDictionary, SAMSequenceDictionaryCodec, SAMSequenceRecord, SAMTextHeaderCodec}
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor

import scala.collection.mutable
import scala.collection.mutable.ListBuffer

object SequenceMetadata {
  /** Keys for standard attributes in [[SequenceMetadata]] */
  object Keys {
    val Aliases               : String = "AN"
    val AlternateLocus        : String = "AH"
    val Assembly              : String = SAMSequenceRecord.ASSEMBLY_TAG
    val Description           : String = SAMSequenceRecord.DESCRIPTION_TAG
    val Length                : String = SAMSequenceRecord.SEQUENCE_LENGTH_TAG
    val Md5                   : String = SAMSequenceRecord.MD5_TAG
    val Name                  : String = SAMSequenceRecord.SEQUENCE_NAME_TAG
    val Species               : String = SAMSequenceRecord.SPECIES_TAG
    val Topology              : String = "TP"
    val Uri                   : String = SAMSequenceRecord.URI_TAG

    val values: Set[String] = Set(
      Aliases, AlternateLocus, Assembly, Description, Length, Md5, Name, Species, Topology, Uri
    )
  }

  /** Builds a new [[SequenceMetadata]]
    *
    * @param name name of the reference sequence (contig)
    * @param length length of the contig
    * @param index index of the contig in a set of contigs (ex. genome or assembly)
    * @param aliases list of reference sequence name aliases
    * @param alternateLocus genomic range in the genome or assembly for which this sequence is an alternate
    * @param assembly genome assembly identifier
    * @param description description of the reference sequence
    * @param md5 MD5 checksum
    * @param species species name
    * @param topology molecule topology
    * @param uri URI of the sequence
    * @param customAttributes any custom attributes (non-standard SAM attributes)
    */
  def apply(name: String,
            length: Int                            = SequenceMetadata.UnknownSequenceLength,
            index: Int                             = SequenceMetadata.UnknownSequenceIndex,
            aliases: Seq[String]                   = Seq.empty,
            alternateLocus: Option[AlternateLocus] = None,
            assembly: Option[String]               = None,
            description: Option[String]            = None,
            md5: Option[String]                    = None,
            species: Option[String]                = None,
            topology: Option[Topology]             = None,
            uri: Option[String]                    = None,
            customAttributes: Map[String, String]  = Map.empty
           ): SequenceMetadata = {
    Keys.values.find(customAttributes.contains).foreach { key =>
      throw new IllegalArgumentException(s"Attributes contains a standard key: $key")
    }
    // Build the attributes
    val buffer: ListBuffer[(String, String)] = ListBuffer()
    if (aliases.nonEmpty)       buffer += ((Keys.Aliases,        aliases.mkString(",")))
    alternateLocus.foreach(l => buffer += ((Keys.AlternateLocus, l.toString)) )
    assembly.foreach      (a => buffer += ((Keys.Assembly,       a)) )
    description.foreach   (d => buffer += ((Keys.Description,    d)) )
    md5.foreach           (m => buffer += ((Keys.Md5,            m)) )
    species.foreach       (s => buffer += ((Keys.Species,        s)) )
    topology.foreach      (t => buffer += ((Keys.Topology,       t.name)) )
    uri.foreach           (u => buffer += ((Keys.Uri,            u)) )

    SequenceMetadata(
      name       = name,
      length     = length,
      index      = index,
      attributes = buffer.toMap ++ customAttributes
    )
  }

  /** Generates a new SequenceMetadata from a SAMSequenceRecord. */
  def apply(rec: SAMSequenceRecord): SequenceMetadata = {
    val attributes: Map[String, String] = rec.getAttributes.map { entry => entry.getKey -> entry.getValue }.toMap

    SequenceMetadata(
      name       = rec.getSequenceName,
      length     = rec.getSequenceLength,
      index      = rec.getSequenceIndex,
      attributes = attributes
    )
  }

  /** The value for sequences of unknown length */
  val UnknownSequenceLength: Int = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH


  /** The value for sequences of unknown length */
  val UnknownSequenceIndex: Int = SAMSequenceRecord.UNAVAILABLE_SEQUENCE_INDEX

  /** Stores information about the coordinates of the alternate locus  */
  type AlternateLocus = GenomicRange

  object AlternateLocus {
    @inline def apply(refName: String, start: Int, end: Int): AlternateLocus = {
      GenomicRange(refName=refName, start=start, end=end)
    }
  }
}

/** Stores information about a single Sequence (ex. chromosome, contig)
  *
  * Important: when retrieving attributes using the `apply`, `get`, and `getOrElse` methods, all values will be
  * returned as `String`s.  Use the named accessors for attributes that have non-`String` types (i.e. `length`,
  * `aliases`, `alternate`, and `topology`).
  *
  * @param name the primary name of the sequence
  * @param length the length of the sequence, or zero if unknown
  * @param index the index in the sequence dictionary
  * @param attributes attributes of this sequence
  */
case class SequenceMetadata private[fasta]
(name: String,
 length: Int,
 index: Int,
 attributes: Map[String, String]) {
  import SequenceMetadata.AlternateLocus

  allNames.foreach { name => SAMSequenceRecord.validateSequenceName(name) }
  require(length >= 0, s"Length must be >= 0 for '$name'")
  require(!attributes.contains(Keys.Name), f"`${Keys.Name}` should not given in the list of attributes")
  require(!attributes.contains(Keys.Length), s"`${Keys.Length}` should not given in the list of attributes")

  @inline final def apply(key: String): String = {
    if (key == Keys.Name) this.name
    else if (key == Keys.Length) s"${this.length}"
    else this.attributes(key)
  }
  @inline final def get(key: String): Option[String] = {
    if (key == Keys.Name) Some(this.name)
    else if (key == Keys.Length) Some(s"${this.length}")
    else this.attributes.get(key)
  }
  @inline final def getOrElse(key: String, default: String): String = this.get(key).getOrElse(default)
  @inline final def contains(key: String): Boolean = {
    this.attributes.contains(key) || key == Keys.Name || key == Keys.Length
  }
  lazy val aliases: Seq[String] = this.get(Keys.Aliases).map(_.split(',').toSeq).getOrElse(Seq.empty[String])
  /** All names, including aliases */
  @inline final def allNames: Seq[String] = name +: aliases
  @inline final def isAlternate: Boolean = this.alternate.isDefined
  lazy val alternate: Option[AlternateLocus] = {
    this.get(Keys.AlternateLocus).flatMap {
      case "*"   => None
      case range =>
        val locus = FgBioDef.parseRange(range) match {
          case GenomicRange("=", start, end) => AlternateLocus(refName=this.name, start=start.toInt, end=end.toInt)
          case _locus                        => _locus
        }
        Some(locus)
    }
  }
  @inline final def md5: Option[String] = this.get(Keys.Md5)
  @inline final def md5Int: Option[BigInt] = this.md5.map(BigInt(_, 16))
  @inline final def assembly: Option[String] = this.get(Keys.Assembly)
  @inline final def uri: Option[String] = this.get(Keys.Uri)
  @inline final def species: Option[String] = this.get(Keys.Species)
  @inline final def description: Option[String] = this.get(Keys.Description)
  @inline final def topology: Option[Topology] = {
    this.get(Keys.Topology).flatMap(tp => Topology.values.find(_.name == tp))
  }

  /** Returns true if the sequences share a common reference name (including aliases), have the same length, and
    * the same MD5 if both have MD5s. */
  def sameAs(that: SequenceMetadata): Boolean = {
    if (this.length != that.length) false
    else if (this.name != that.name && !this.allNames.exists(that.allNames.contains)) false
    else {
      (this.md5Int, that.md5Int) match {
        case (Some(thisMd5Int), Some(thatMd5Int)) => thisMd5Int == thatMd5Int
        case _                                    => true
      }
    }
  }

  /** Generates a copy of this record as a SAMSequenceRecord. */
  def toSam: SAMSequenceRecord = {
    val rec = new SAMSequenceRecord(name, length)
    rec.setSequenceIndex(index)
    attributes.foreach { case (key, value) => rec.setAttribute(key, value) }
    rec
  }

  override def toString: FilenameSuffix = {
    import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceRecord
    this.asSam.getSAMString
  }
}

object SequenceDictionary {
  /** Builds a sequence dictionary from one or more sequence metadatas.  This will set the sequence index. */
  def apply(info: SequenceMetadata*): SequenceDictionary = {
    val infos = info.zipWithIndex.map { case (info, index) =>
      if (info.index != index) info.copy(index=index) else info
    }.toIndexedSeq
    SequenceDictionary(infos=infos)
  }

  /** Builds a sequence dictionary from the given path */
  def apply(path: PathToSequenceDictionary): SequenceDictionary = {
    import Converters.FromSAMSequenceDictionary
    val codec  = new SAMTextHeaderCodec()
    val reader = new BufferedLineReader(Io.toInputStream(path))
    val dict   = codec.decode(reader, path.toString).getSequenceDictionary
    reader.close()
    dict.fromSam
  }

  /** Generates a new SequenceDictionary from a SAMSequenceDictionary */
  def apply(dict: SAMSequenceDictionary): SequenceDictionary = {
    apply(dict.getSequences.toSeq.map(SequenceMetadata.apply):_*)
  }

  /** Extracts a [[SequenceDictionary]] from SAM/BAM/CRAM/FASTA/DICT files. */
  def extract(path: FilePath): SequenceDictionary = {
    import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary
    SAMSequenceDictionaryExtractor.extractDictionary(path).fromSam
  }
}

/** Contains an ordered collection of sequences. */
case class SequenceDictionary(infos: IndexedSeq[SequenceMetadata]) extends Iterable[SequenceMetadata] {

  private val mapping: Map[String, SequenceMetadata] = {
    // validates that the same name is not present twice across allNames (so includes aliases)
    val names: mutable.Set[String] = mutable.HashSet[String]()
    infos.iterator.zipWithIndex.flatMap { case (info, index) =>
      require(info.index == index, s"Infos must be given with index set correctly. See ${index}th with name: $names")
      info.allNames.map { name =>
        require(!names.contains(name), f"Found duplicate sequence name: $name")
        names.add(name)
        name -> info
      }
    }.toMap
  }

  def apply(name: String): SequenceMetadata = this.mapping(name)
  def get(name: String): Option[SequenceMetadata] = this.mapping.get(name)
  def contains(name: String): Boolean = this.mapping.contains(name)
  def apply(index: Int): SequenceMetadata = this.infos(index)
  def get(index: Int): Option[SequenceMetadata] = this.infos.lift(index)
  override def iterator: Iterator[SequenceMetadata] = this.infos.iterator
  def length: Int = this.infos.length

  /** Returns true if the sequences share a common reference name (including aliases), have the same length, and
    * the same MD5 if both have MD5s. */
  def sameAs(that: SequenceDictionary): Boolean = {
    this.length == that.length && this.zip(that).forall { case (thisInfo, thatInfo) => thisInfo.sameAs(thatInfo) }
  }

  /** Returns a copy of this sequence dictionary as a SAMSequenceDictionary */
  def toSam: SAMSequenceDictionary = {
    val recs = infos.iterator.map(_.toSam).toJavaList
    new SAMSequenceDictionary(recs)
  }

  /** Writes the sequence dictionary to the given path */
  def write(path: FilePath): Unit = {
    val writer = Io.toWriter(path)
    this.write(writer=writer)
    writer.close()
  }

  /** Writes the sequence dictionary to the given writer */
  def write(writer: java.io.Writer): Unit = {
    import Converters.ToSAMSequenceRecord
    val codec  = new SAMSequenceDictionaryCodec(writer)
    codec.encodeHeaderLine(false)
    this.foreach { metadata: SequenceMetadata =>
      codec.encodeSequenceRecord(metadata.asSam)
    }
  }

  override def toString(): String = {
    val writer = new StringWriter()
    this.write(writer=writer)
    writer.toString
  }
}


/** Contains useful converters to and from HTSJDK objects. */
object Converters {

  /**
    * Converter from a [[SequenceMetadata]] to a [[SAMSequenceRecord]]
    * @deprecated use [[SequenceMetadata.toSam]] instead.
    */
  @Deprecated
  implicit class ToSAMSequenceRecord(info: SequenceMetadata) {
    def asSam: SAMSequenceRecord = info.toSam
  }

  /** Converter from a [[SAMSequenceRecord]] to a [[SequenceMetadata]] */
  implicit class FromSAMSequenceRecord(rec: SAMSequenceRecord) {
    def fromSam: SequenceMetadata = SequenceMetadata(rec)
    def toScala: SequenceMetadata = SequenceMetadata(rec)
  }

  /** Converter from a [[SequenceDictionary]] to a [[SAMSequenceDictionary]]
    * @deprecated use [[SequenceDictionary.toSam]] instead.
    */
  @Deprecated
  implicit class ToSAMSequenceDictionary(dict: SequenceDictionary) {
    def asSam: SAMSequenceDictionary = dict.toSam
  }

  /** Converter from a [[SAMSequenceDictionary]] to a [[SequenceDictionary]] */
  implicit class FromSAMSequenceDictionary(dict: SAMSequenceDictionary) {
    require(dict != null, "The reference provided does not have a sequence dictionary (.dict)")
    def fromSam: SequenceDictionary = SequenceDictionary(dict)
    def toScala: SequenceDictionary = SequenceDictionary(dict)
  }
}

/** The base trait for all topologies. */
sealed trait Topology extends EnumEntry {
  def name: String = this.entryName.toLowerCase
}

/** An enumeration representing the various reference sequence topologies. */
object Topology extends FgBioEnum[Topology] {
  def values: scala.collection.immutable.IndexedSeq[Topology] = findValues
  /** The sequence is linear. */
  case object Linear extends Topology
  /** The sequence is circular. */
  case object Circular extends Topology
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy