All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.util.IntervalListSource.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2019 Fulcrum Genomics
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package com.fulcrumgenomics.util


import java.io.{Closeable, File, InputStream}

import com.fulcrumgenomics.FgBioDef.{PathToIntervals, yieldAndThen}
import com.fulcrumgenomics.commons.CommonsDef.BetterBufferedIteratorScalaWrapper
import com.fulcrumgenomics.commons.util.StringUtil
import com.fulcrumgenomics.fasta.SequenceDictionary
import htsjdk.samtools.util.{BufferedLineReader, Interval, IntervalList}
import htsjdk.samtools.{SAMFileHeader, SAMTextHeaderCodec}

import scala.io.Source

/** Factory methods for building an [[IntervalListSource]] from various line-oriented inputs. */
object IntervalListSource {

  /** Creates a new interval list source from a sequence of lines. */
  def apply(lines: Iterable[String]): IntervalListSource = new IntervalListSource(lines.iterator)

  /** Creates a new interval list source from an iterator of lines. */
  def apply(lines: Iterator[String]): IntervalListSource = new IntervalListSource(lines)

  /** Creates a new interval list source from an input stream.
    *
    * The stream is wrapped in a [[scala.io.Source]] which is handed to the returned instance, so that calling
    * `close()` on the returned instance closes the wrapping source (previously the wrapper was discarded and
    * `close()` had nothing to close).
    */
  def apply(stream: InputStream): IntervalListSource = apply(source=Source.fromInputStream(stream))

  /** Creates a new interval list source from a source.  The source is retained so that `close()` can close it. */
  def apply(source: Source): IntervalListSource = new IntervalListSource(source.getLines(), Some(source))

  /** Creates a new interval list source from a File. */
  def apply(file: File): IntervalListSource = apply(path=file.toPath)

  /** Creates a new interval list source from a Path, reading its lines via `Io.readLines`. */
  def apply(path: PathToIntervals): IntervalListSource = apply(Io.readLines(path))
}

/**
  * Reads intervals from any text based source of lines.  The input must start with one or more header lines (lines
  * starting with `@`) that are decoded as a SAM text header, followed by one interval per line with five
  * tab-separated fields: reference name, start, end, strand (`+` or `-`) and name.
  *
  * The header is read eagerly at construction time.  If an underlying closeable source was supplied, it is closed
  * automatically the first time [[hasNext]] reports that no more lines are available; [[close]] may also be called
  * earlier and is safe to call more than once.
  *
  * @param lines  the lines to parse, header lines first
  * @param source the optional underlying resource to close when done
  */
class IntervalListSource private(lines: Iterator[String],
                                 private[this] val source: Option[Closeable] = None)
  extends Iterator[Interval] with Closeable {

  private val iter = lines.bufferBetter

  /** The 1-based number of the next line to be consumed; used only to build error messages. */
  private var lineNumber = 1L

  /** True once the underlying source has been closed, so that it is closed at most once. */
  private var closed = false

  /** True if calling `next()` will yield another interval, false otherwise.  Closes the source once exhausted. */
  override def hasNext: Boolean = {
    val more = iter.hasNext
    if (!more) close()
    more
  }

  /** Returns the next interval if available, or throws an exception if none is available. */
  override def next(): Interval = yieldAndThen(parse(iter.next())) { lineNumber += 1 }

  /** The header decoded from the leading `@` lines of the input. */
  val header: SAMFileHeader = {
    val codec = new SAMTextHeaderCodec
    val headerLines = iter.takeWhile(_.startsWith("@")).toIndexedSeq
    require(headerLines.nonEmpty, "No header found")
    lineNumber += headerLines.length
    val lineReader = BufferedLineReader.fromString(headerLines.mkString("\n"))
    codec.decode(lineReader, "IntervalListSource")
  }

  /** The [[SequenceDictionary]] associated with the source. */
  val dict: SequenceDictionary = {
    import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary
    this.header.getSequenceDictionary.fromSam
  }

  require(this.dict.nonEmpty, "No reference sequences found in the header.")

  /** Scratch buffer for the five fields of a line; it is reused across calls to `parse`. */
  private val parseArray = Array[String]("", "", "", "", "")

  /** Closes the underlying reader; only necessary if EOF hasn't been reached.  Subsequent calls do nothing. */
  override def close(): Unit = if (!closed) {
    closed = true
    this.source.foreach(_.close())
  }

  /** Parses a coordinate field as an integer, reporting the current line number on failure.
    *
    * @param value the raw field text
    * @param label the name of the field ("start" or "end") used in the error message
    * @throws NumberFormatException if the value is not a valid integer
    */
  private def parseCoordinate(value: String, label: String): Int = {
    try value.toInt
    catch {
      case _: NumberFormatException =>
        throw new NumberFormatException(s"Could not parse $label '$value' as an integer on line number $lineNumber")
    }
  }

  /** Parses a single interval line and validates it against the sequence dictionary.
    *
    * @param line the tab-delimited line to parse
    * @return the parsed interval
    * @throws IllegalArgumentException if the line does not have five fields, the contig is not in the dictionary,
    *                                  the coordinates are out of bounds, or the strand is not `+` or `-`
    */
  private def parse(line: String): Interval = {
    val fieldCount  = StringUtil.split(line, '\t', parseArray)
    require(fieldCount == 5, s"Expected 5 fields on line $lineNumber")
    val Array(refName: String, startString: String, endString: String, strand: String, name: String) = parseArray

    val start = parseCoordinate(startString, "start")
    val end   = parseCoordinate(endString, "end")

    // NOTE(review): the lookup is treated as "not found" both when it returns null and when it throws
    // NoSuchElementException, so that the line-numbered message below is always produced -- confirm which of the
    // two SequenceDictionary.apply(name) actually does.
    val contig = try Option(dict(refName)) catch { case _: NoSuchElementException => None }

    contig match {
      case None =>
        throw new IllegalArgumentException(s"Reference contig '$refName' not found in the sequence dictionary on line number $lineNumber.")
      case Some(seq) =>
        require(1 <= start, s"Start is less than 1 on line number $lineNumber")
        require(end <= seq.length, s"End is beyond the reference contig length on line number $lineNumber")
        require(start <= end, s"Start is greater than end on line number $lineNumber")
    }

    val negative = strand match {
      case "-" => true
      case "+" => false
      case _   => throw new IllegalArgumentException(s"Unrecognized strand '$strand' on line number $lineNumber")
    }

    new Interval(refName, start, end, negative, name)
  }

  /** Reads all remaining intervals into an [[htsjdk.samtools.util.IntervalList]] built from [[dict]]. */
  def toIntervalList: IntervalList = {
    import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary
    val list = new IntervalList(dict.asSam)
    this.foreach { list.add }
    list
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy