dlm.model.Streaming.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bayesian_dlms_2.12 Show documentation
bayesian_dlms
The newest version!
package dlm.core.model

import java.io.File
import java.nio.file.Paths
import kantan.csv._
import kantan.csv.ops._
import scala.concurrent.Future
import akka.stream._
import akka.stream.scaladsl._
import akka.NotUsed
import akka.util.ByteString
import breeze.linalg.{DenseMatrix, DenseVector, diag}

object Streaming {

  /**
    * Write a single chain
    * @param iters the iterations of the MCMC chain
    * @param formatParameters a function to properly format the parameters
    * to write them to a file
    * @param file the file to write the parameters to
    * @param config additional configuration as CsvConfiguration
    * from the kantan CSV package
    */
  def writeChain(
      formatParameters: DlmParameters => List[Double],
      filename: String,
      config: CsvConfiguration
  )(iters: Iterator[DlmParameters]) = {
    val file = new File(filename)
    val writer = file.asCsvWriter[List[Double]](config)

    // write iters to file
    while (iters.hasNext) {
      writer.write(formatParameters(iters.next))
    }

    writer.close()
  }

  /**
    * Parse parameters from a collection containing the diagonal
    * entries of the v, w matrices
    */
  def parseDiagonalParameters(vDim: Int, wDim: Int)(
      ps: Vector[Double]): DlmParameters = {

    val v = ps.take(vDim).map(_.toDouble).toArray
    val w = ps.drop(vDim).take(wDim).map(_.toDouble).toArray
    val m0 = ps.drop(vDim + wDim).take(wDim).map(_.toDouble).toArray
    val c0 = ps.drop(vDim + 2 * wDim).take(wDim * wDim).map(_.toDouble).toArray

    DlmParameters(diag(DenseVector(v)),
                  diag(DenseVector(w)),
                  DenseVector(m0),
                  new DenseMatrix(wDim, wDim, c0))
  }

  /**
    * Read an MCMC Chain into a list of doubles
    */
  def readMcmcChain(filename: String) = {
    val mcmcChain = Paths.get(filename)
    mcmcChain.asCsvReader[Vector[Double]](rfc.withHeader)
  }

  /**
    * Calculate the column means of List of List of doubles
    */
  def colMeans(params: List[List[Double]]): List[Double] = {
    params.transpose.map(a => breeze.stats.mean(a))
  }

  def quantile[A: Ordering](xs: Seq[A], prob: Double): A = {
    val index = math.floor(xs.length * prob).toInt
    val ordered = xs.sorted
    ordered(index)
  }

  /**
    * Streaming mean
    */
  def mean =
    Flow[Double]
      .fold((0.0, 1.0)) {
        case ((avg, n), b) =>
          ((avg * n + b) / (n + 1), n + 1)
      }
      .map(_._1)

  def meanDlmFsvParameters(vDim: Int, wDim: Int, p: Int, k: Int) =
    Flow[DlmFsvParameters]
      .fold((DlmFsvParameters.empty(vDim, wDim, p, k), 1.0)) {
        case ((avg, n), b) =>
          (avg.map(_ * n).add(b).map(_ / (n + 1)), n + 1)
      }
      .map(_._1)

  def meanDlmFsvSystemParameters(vDim: Int, wDim: Int, k: Int) =
    Flow[DlmFsvParameters]
      .fold((DlmFsvSystem.emptyParams(vDim, wDim, k), 1.0)) {
        case ((avg, n), b) =>
          (avg.map(_ * n).add(b).map(_ / (n + 1)), n + 1)
      }
      .map(_._1)

  /**
    * Calculate the streaming mean of DLM parameters
    */
  def meanParameters(vDim: Int, wDim: Int) = {
    Flow[DlmParameters]
      .fold((DlmParameters.empty(vDim, wDim), 1.0))((acc, b) => {
        val (avg: DlmParameters, n: Double) = acc
        (avg.map(_ * n).add(b).map(_ / (n + 1)), n + 1)
      })
      .map(_._1)
  }

  def meanSvParameters =
    Flow[SvParameters]
      .fold((SvParameters.empty, 1.0)) { (acc, b) =>
        val (avg: SvParameters, n: Double) = acc
        (avg.map(_ * n).add(b).map(_ / (n + 1)), n + 1)
      }
      .map(_._1)

  def meanFsvParameters(p: Int, k: Int) =
    Flow[FsvParameters]
      .fold((FsvParameters.empty(p, k), 1.0)) { (acc, b) =>
        val (avg: FsvParameters, n: Double) = acc
        (avg.map(_ * n).add(b).map(_ / (n + 1)), n + 1)
      }
      .map(_._1)

  /**
    * Create an Akka stream from a Markov Chain
    */
  def streamChain[A](chain: breeze.stats.distributions.Process[A],
                     nIters: Int): Source[A, NotUsed] = {

    Source.fromIterator(() => chain.steps.take(nIters))
  }

  def writeChainSink[A](
      filename: String,
      format: A => List[Double]
  ): Sink[A, Future[IOResult]] = {

    Flow[A]
      .map(a => ByteString(format(a).mkString(", ") + "\n"))
      .toMat(FileIO.toPath(Paths.get(filename)))(Keep.right)
  }

  /**
    * Given a single MCMC, write different realisations using
    * the same initial parameters
    * @param nChains the number of parallel chains
    * @param nIters the number of iterations
    * @param filename the prefix of the filename
    * @param format a function to format each row of the CSV output
    */
  def writeParallelChain[A](
      chain: breeze.stats.distributions.Process[A],
      nChains: Int,
      nIters: Int,
      filename: String,
      format: A => List[Double])(implicit m: Materializer) = {

    Source((0 until nChains)).mapAsync(nChains) { i =>
      streamChain(chain, nIters).runWith(
        writeChainSink(s"${filename}_$i.csv", format))
    }
  }

  /**
    * Filter a Stream to select only every nth iteration
    * @param xs an iterator
    * @param n the index of the iterator to select
    */
  def thinChain[A](n: Int) = {
    Flow[A].zipWithIndex.filter { case (a, i) => i % n == 0 }.map(_._1)
  }

  def readCsv[S](file: String) = {
    FileIO
      .fromPath(Paths.get(file))
      .via(
        Framing.delimiter(ByteString("\n"),
                          maximumFrameLength = 8192,
                          allowTruncation = true))
      .map(_.utf8String)
      .map(a => a.split(",").toVector)
  }
}