All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fulcrumgenomics.util.Sorter.scala Maven / Gradle / Ivy

The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 Fulcrum Genomics LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.fulcrumgenomics.util

import java.io._
import java.nio.file.{Files, Path}
import java.util

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.commons.io.Writer
import com.fulcrumgenomics.commons.collection.SelfClosingIterator
import com.fulcrumgenomics.util.Sorter.{Codec, SortEntry}
import htsjdk.samtools.util.TempStreamFactory

import scala.collection.mutable.ArrayBuffer

/**
  * Companion object for [[Sorter]] that contains various types used.
  */
object Sorter {
  /**
    * Pairs a sort key with the already-serialized bytes of the object the key was derived from.
    * Entries are ordered solely by their key; the byte payload takes no part in the comparison.
    *
    * @param key the [[java.lang.Comparable]] key carrying the information needed to order the object
    * @param bytes the serialized form of the object, ready to be written to disk
    * @tparam B the key type
    */
  case class SortEntry[B <: Comparable[B]](key: B, bytes: Array[Byte]) extends Comparable[SortEntry[B]] {
    /** Orders this entry relative to `that` by comparing the two keys. */
    override def compareTo(that: SortEntry[B]): Int = {
      val ours   = key
      val theirs = that.key
      ours.compareTo(theirs)
    }
  }

  /**
    * Contract for anything that can turn an object into bytes and rebuild the object from bytes.
    *
    * @tparam A the type of object that is encoded/decoded
    */
  trait Codec[A] {
    /** Serializes `a` into a byte array. */
    def encode(a: A): Array[Byte]

    /**
      * Rebuilds an object from a region of a byte array.
      *
      * @param bs the array holding the serialized bytes; it may be longer than the record itself
      * @param start the offset into `bs` of the first byte of the record
      * @param length the number of bytes that belong to the record
      */
    def decode(bs: Array[Byte], start: Int, length: Int): A
  }
}

/**
  * An implementation of a disk-backed sorting system.  The implementation requires two things:
  *
  * 1. An implementation of Codec that can serialize and deserialize objects
  * 2. A function that creates a [[scala.Ordered]] key object for each object being sorted
  *
  * Both must be thread-safe as they may be invoked across threads without external synchronization
  */
class Sorter[A,B <: Ordered[B]](val maxObjectsInRam: Int,
                                private val codec: Codec[A],
                                private val keyfunc: A => B,
                                private val tmpDir: DirPath = Io.tmpDir) extends Iterable[A] with Writer[A] {
  require(maxObjectsInRam > 1, "Max records in RAM must be at least 2, and probably much higher!")
  /** Fixed-size buffer of entries waiting to be sorted and spilled; only the first `recordsInMemory` slots are live. */
  private val stash = new Array[SortEntry[B]](maxObjectsInRam)
  /** Paths of the temporary files written so far, each holding one sorted run of records. */
  private val files = new ArrayBuffer[Path]()
  private var recordsInMemory: Int = 0
  private val _tmpStreamFactory = new TempStreamFactory

  /**
    * An iterator that consumes data from a single tmp file of sorted data and produces
    * an iterator of sorted records.  Records are expected in the layout written by `spill()`:
    * a 4-byte length followed by that many bytes of encoded object.
    *
    * @param stream the stream to read records from; closed once it is exhausted or on [[close()]]
    * @param codec the codec used to decode each record
    */
  private class SortedIterator(stream: InputStream, codec: Codec[A]) extends Iterator[A] with Comparable[SortedIterator] with Closeable  {
    private val dataStream = new DataInputStream(stream)
    // Scratch buffer that is reused between records and only grown when a larger record is seen
    private var bytes = new Array[Byte](0)
    private var nextValue : A = _
    private var nextKey   : B = _
    private var closed: Boolean = false
    advance()

    /** Reads the next item from the stream, if one is available */
    private def advance(): Unit = {
      if (closed) {
        clear()
      }
      else {
        // A negative length is the end-of-stream sentinel
        val length = try { this.dataStream.readInt() } catch { case _: EOFException => -1 }
        if (length < 0) {
          close()
          clear()
        }
        else {
          // A zero-length record is legitimate (a codec may encode an object as no bytes) and
          // must not be mistaken for the end of the stream, or all following records are lost.
          if (this.bytes.length < length) this.bytes = new Array[Byte](length)
          // readFully blocks until the whole record has been read, whereas read() may return early
          // with fewer bytes; it throws EOFException if the stream ends mid-record.
          this.dataStream.readFully(this.bytes, 0, length)
          this.nextValue = this.codec.decode(this.bytes, 0, length)
          this.nextKey   = keyfunc(this.nextValue)
        }
      }
    }

    /** Clears the current key and value. */
    private def clear(): Unit = {
        this.nextValue = null.asInstanceOf[A]
        this.nextKey   = null.asInstanceOf[B]
    }

    /** True if a call to [[next()]] will yield a value.  */
    override def hasNext: Boolean = nextValue != null

    /** Retrieve the next value from the iterator. */
    override def next(): A = {
      if (!hasNext) throw new NoSuchElementException("next called on empty Iterator")
      val retval = this.nextValue
      advance()
      retval
    }

    /** The way the iterators are used it is guaranteed that we'll never compare an empty iterator. */
    override def compareTo(that: SortedIterator): Int = this.nextKey.compareTo(that.nextKey)

    /** Closes the underlying stream. */
    def close(): Unit = if (!closed) { this.stream.safelyClose(); closed = true }
  }


  /**
    * An iterator that merges records from [[SortedIterator]]s and maintains ordering.
    *
    * @param streams one input stream per sorted temporary file
    */
  private class MergingIterator(val streams: Seq[InputStream]) extends Iterator[A] with Closeable {
    // Priority queue ordered by each iterator's next key, so the head always holds the smallest pending record
    private val iterators = new util.PriorityQueue[SortedIterator]()
    streams.foreach { stream =>
      val iter = new SortedIterator(stream, codec)
      // Exhausted iterators are never queued since they have no key to compare on
      if (iter.hasNext) iterators.add(iter)
    }

    /** True if a call to [[next()]] will yield a value.  */
    override def hasNext: Boolean = !this.iterators.isEmpty

    /** Retrieve the next value from the iterator. */
    override def next(): A = {
      if (!hasNext) throw new NoSuchElementException("next called on empty iterator.")
      val iter = this.iterators.poll()
      val entry = iter.next()
      // Re-queue so the iterator is re-positioned by its new head key, or release it if drained
      if (iter.hasNext) this.iterators.add(iter)
      else iter.close()

      entry
    }

    /** Closes all underlying iterators/streams and frees the iterators. */
    override def close(): Unit = {
      this.iterators.foreach(i => i.safelyClose())
      this.iterators.clear()
    }
  }

  /**
    * Adds a record to the sorter.  This is an amortized constant time operation, but the individual times
    * will vary wildly as the accrued objects are written to disk once the max in memory is reached.
    *
    * @param item the object to add
    */
  override def write(item: A): Unit = {
    val key   = keyfunc(item)
    val bytes = this.codec.encode(item)
    stash(recordsInMemory) = SortEntry(key, bytes)
    recordsInMemory += 1

    if (recordsInMemory == maxObjectsInRam) spill()
  }

  /**
    * Writes out a temporary file containing all the accrued objects and releases the objects so
    * that they can be garbage collected.  Does nothing if there are no objects in memory.
    */
  private def spill(): Unit = {
    if (recordsInMemory > 0) {
      util.Arrays.parallelSort(stash, 0, recordsInMemory)
      val path = Io.makeTempFile("sorter.", ".tmp", dir=Some(this.tmpDir))
      // Register for deletion straight away so a failed write below does not orphan the file
      path.toFile.deleteOnExit()
      val out = new DataOutputStream(_tmpStreamFactory.wrapTempOutputStream(Io.toOutputStream(path), Io.bufferSize))
      try {
        forloop(from = 0, until = recordsInMemory) { i =>
          val bytes = stash(i).bytes
          out.writeInt(bytes.length)
          out.write(bytes)
          stash(i) = null
        }
      }
      finally {
        // Always release the file handle, even if a write failed part way through
        out.close()
      }
      this.files += path
      this.recordsInMemory = 0
    }
  }

  /**
    * Generates an iterator over the data accumulated so far.  May be called multiple times to iterate
    * over the data multiple times, and may even be invoked between adding more objects. Note however
    * that each call to iterator will cause any accumulated objects to be written to disk, so it should
    * not be invoked too frequently!
    *
    * @return an iterator over all records in sorted order that closes its streams via `mergingIterator.close()`
    */
  def iterator: SelfClosingIterator[A] = {
    spill()
    val streams = files.iterator.map(f => _tmpStreamFactory.wrapTempInputStream(Io.toInputStream(f), Io.bufferSize)).toSeq
    val mergingIterator = new MergingIterator(streams)
    new SelfClosingIterator(mergingIterator, () => mergingIterator.close())
  }

  /**
    * Deletes any temporary files that have been created and forgets about them, so that a later
    * call to [[iterator]] does not attempt to open files that no longer exist.
    */
  def close(): Unit = {
    this.files.foreach(Files.deleteIfExists)
    this.files.clear()
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy