org.jliszka.probabilitymonad.Distribution.scala Maven / Gradle / Ivy
The newest version!
package org.jliszka.probabilitymonad
import java.math.MathContext
import scala.annotation.tailrec
import scala.math.BigDecimal
import scala.util.Random
trait Distribution[A] {
self =>
protected def get: A
override def toString = ""
def map[B](f: A => B): Distribution[B] = new Distribution[B] {
override def get = f(self.get)
}
def flatMap[B](f: A => Distribution[B]): Distribution[B] = new Distribution[B] {
override def get = f(self.get).get
}
def filter(pred: A => Boolean): Distribution[A] = new Distribution[A] {
@tailrec
override def get = {
val s = self.get
if (pred(s)) s else this.get
}
}
def given(pred: A => Boolean): Distribution[A] = filter(pred)
def until(pred: List[A] => Boolean): Distribution[List[A]] = new Distribution[List[A]] {
override def get = {
@tailrec
def helper(sofar: List[A]): List[A] = {
if (pred(sofar)) sofar
else helper(self.get :: sofar)
}
helper(Nil)
}
}
def repeat(n: Int): Distribution[List[A]] = new Distribution[List[A]] {
override def get = List.fill(n)(self.get)
}
/**
* Using this distribution as a prior, compute the posterior distribution after running an experiment
* and observing some outcomes and not others.
*/
def posterior[B](experiment: A => Distribution[B])(observed: B => Boolean): Distribution[A] = {
case class Trial(p: A, evidence: B)
val d = for {
p <- this
e <- experiment(p)
} yield Trial(p, e)
d.filter(t => observed(t.evidence)).map(_.p)
}
/**
* Markov chains
*/
@tailrec
final def markov(n: Int)(f: A => Distribution[A]): Distribution[A] = {
if (n == 0) this
else this.flatMap(f).markov(n-1)(f)
}
def markov(pred: A => Boolean)(f: A => Distribution[A]): Distribution[A] = new Distribution[A] {
override def get = {
@tailrec
def helper(a: A): A = {
if (pred(a)) a
else helper(f(a).get)
}
helper(self.get)
}
}
private val N = 10000
def pr(pred: A => Boolean, given: A => Boolean = (a: A) => true, samples: Int = N): Double = {
1.0 * this.filter(given).sample(samples).count(pred) / samples
}
// NB: Expected value only makes sense for real-valued distributions. If you want to find the expected
// value of a die roll, for example, you have to do die.map(_.toDouble).ev.
def ev(implicit toDouble: A <:< Double): Double = Stream.fill(N)(toDouble(self.get)).sum / N
def mean(implicit toDouble: A <:< Double): Double = ev
private def square(x: Double) = x * x
private def cube(x: Double) = x * x * x
def variance(implicit toDouble: A <:< Double): Double = {
val mean = this.mean
this.map(x => {
square(toDouble(x) - mean)
}).ev
}
def stdev(implicit toDouble: A <:< Double): Double = {
math.sqrt(this.variance)
}
def skewness(implicit toDouble: A <:< Double): Double = {
val mean = this.mean
val stdev = this.stdev
this.map(x => {
cube((toDouble(x) - mean) / stdev)
}).ev
}
def kurtosis(implicit toDouble: A <:< Double): Double = {
val mean = this.mean
val variance = this.variance
this.map(x => {
square(square(toDouble(x) - mean))
}).ev / square(variance)
}
def sample(n: Int = N): List[A] = List.fill(n)(self.get)
/**
* "Freeze" a distribution by taking a sample and serving values out of that sample at random.
* Useful for when a distribution is expensive to compute and is being sampled from repeatedly.
*/
def freeze: Distribution[A] = {
Distribution.discreteUniform(sample(N*10))
}
def zip[B](d: Distribution[B]): Distribution[(A, B)] = new Distribution[(A, B)] {
override def get = (self.get, d.get)
}
def zipWith[B, C](d: Distribution[B])(f: (A, B) => C): Distribution[C] = new Distribution[C] {
override def get = f(self.get, d.get)
}
def +(d: Distribution[A])(implicit n: Numeric[A]): Distribution[A] = new Distribution[A] {
override def get = n.plus(self.get, d.get)
}
def +(x: A)(implicit n: Numeric[A]): Distribution[A] = new Distribution[A] {
override def get = n.plus(self.get, x)
}
def -(d: Distribution[A])(implicit n: Numeric[A]): Distribution[A] = new Distribution[A] {
override def get = n.minus(self.get, d.get)
}
def -(x: A)(implicit n: Numeric[A]): Distribution[A] = new Distribution[A] {
override def get = n.minus(self.get, x)
}
def *(d: Distribution[A])(implicit n: Numeric[A]): Distribution[A] = new Distribution[A] {
override def get = n.times(self.get, d.get)
}
def *(x: A)(implicit n: Numeric[A]): Distribution[A] = new Distribution[A] {
override def get = n.times(self.get, x)
}
def /(d: Distribution[A])(implicit toDouble: A <:< Double): Distribution[Double] = new Distribution[Double] {
override def get = toDouble(self.get) / toDouble(d.get)
}
def /(x: A)(implicit toDouble: A <:< Double): Distribution[Double] = new Distribution[Double] {
override def get = toDouble(self.get) / toDouble(x)
}
def hist(implicit ord: Ordering[A] = null, d: A <:< Double = null) = {
if (d == null) {
plotHist(ord)
} else {
bucketedHist(20)(ord, d)
}
}
def histData: Map[A, Double] = {
this.sample(N).groupBy(x=>x).mapValues(_.length.toDouble / N)
}
private def plotHist(implicit ord: Ordering[A] = null) {
val histogram = this.histData.toList
val sorted = if (ord == null) histogram else histogram.sortBy(_._1)(ord)
doPlot(sorted)
}
private def findBucketWidth(min: Double, max: Double, buckets: Int): (BigDecimal, BigDecimal, BigDecimal, Int) = {
// Use BigDecimal to avoid annoying rounding errors.
val widths = List(0.1, 0.2, 0.25, 0.5, 1.0, 2.0, 2.5, 5.0, 10.0).map(BigDecimal.apply)
val span = max - min
val p = (math.log(span) / math.log(10)).toInt - 1
val scale = BigDecimal(10).pow(p)
val scaledWidths = widths.map(_ * scale)
val bestWidth = scaledWidths.minBy(w => (span / w - buckets).abs)
val outerMin = (min / bestWidth).toInt * bestWidth
val outerMax = ((max / bestWidth).toInt + 1) * bestWidth
val actualBuckets = ((outerMax - outerMin) / bestWidth).toInt
(outerMin, outerMax, bestWidth, actualBuckets)
}
def bucketedHist(buckets: Int)(implicit ord: Ordering[A], toDouble: A <:< Double) {
val data = this.sample(N).toList.sorted
val min = data.head
val max = data.last
val (outerMin, outerMax, width, nbuckets) = findBucketWidth(toDouble(min), toDouble(max), buckets)
bucketedHistHelper(outerMin, outerMax, nbuckets, data, roundDown = false)(ord, toDouble)
}
def bucketedHist(min: Double, max: Double, nbuckets: Int, roundDown: Boolean = false)
(implicit ord: Ordering[A], toDouble: A <:< Double) {
val data = this.sample(N).toList.sorted.filter(a => {
val x = toDouble(a)
min <= x && x <= max
})
bucketedHistHelper(BigDecimal(min), BigDecimal(max), nbuckets, data, roundDown)(ord, toDouble)
}
private def bucketedHistHelper(min: BigDecimal, max: BigDecimal, nbuckets: Int, data: List[A], roundDown: Boolean)
(implicit ord: Ordering[A], toDouble: A <:< Double) {
val rm = if (roundDown) BigDecimal.RoundingMode.DOWN else BigDecimal.RoundingMode.HALF_UP
val width = (max - min) / nbuckets
def toBucket(a: A): BigDecimal = ((toDouble(a) - min) / width).setScale(0, rm) * width + min
val n = data.size
val bucketToProb = data
.groupBy(toBucket)
.mapValues(_.size.toDouble / n)
val bucketed = (min to max by width).map(a => a -> bucketToProb.getOrElse(a, 0.0))
doPlot(bucketed)
}
private def doPlot[B](data: Iterable[(B, Double)]) = {
val scale = 100
val maxWidth = data.map(_._1.toString.length).max
val fmt = "%"+maxWidth+"s %5.2f%% %s"
data.foreach{ case (a, p) => {
val hashes = (p * scale).toInt
println(fmt.format(a.toString, p*100, "#" * hashes))
}}
}
}
object Distribution {
private val rand = new Random()
def always[A](value: A) = new Distribution[A] {
override def get = value
}
def definitely[A](value: A) = always(value)
/**
* Discrete distributions
*/
sealed abstract class Coin
case object H extends Coin
case object T extends Coin
def coin: Distribution[Coin] = discreteUniform(List(H, T))
def biasedCoin(p: Double): Distribution[Coin] = discrete(H -> p, T -> (1-p))
def d(n: Int) = discreteUniform(1 to n)
def die = d(6)
def dice(n: Int) = die.repeat(n)
def tf(p: Double = 0.5) = discrete(true -> p, false -> (1-p))
def bernoulli(p: Double = 0.5) = discrete(1 -> p, 0 -> (1-p))
def discreteUniform[A](values: Iterable[A]): Distribution[A] = new Distribution[A] {
private val vec = Vector() ++ values
override def get = vec(rand.nextInt(vec.length))
}
def discrete[A](weightedValues: (A, Double)*): Distribution[A] = new Distribution[A] {
val len = weightedValues.size
val scale = len / weightedValues.map(_._2).sum
val scaled = weightedValues.map{ case (a, p) => (a, p * scale) }.toList
val (smaller, bigger) = scaled.partition(_._2 < 1.0)
// The alias method: http://www.keithschwarz.com/darts-dice-coins/
private def alias(smaller: List[(A, Double)], bigger: List[(A, Double)]): List[(A, Double, Option[A])] = {
smaller match {
case Nil => bigger.map{ case (a, _) => (a, 1.0, None) }
case (s, sp)::ss => {
val (b, pb)::bb = bigger
val remainder = (b, pb - (1.0 - sp))
val rest = if (remainder._2 < 0.9999) alias(remainder :: ss, bb) else alias(ss, remainder :: bb)
(s, sp, Some(b)) :: rest
}
}
}
val table = Vector() ++ alias(smaller, bigger)
private def select(p1: Double, p2: Double, table: Vector[(A, Double, Option[A])]): A = {
table((p1 * len).toInt) match {
case (a, _, None) => a
case (a, p, Some(b)) => if (p2 <= p) a else b
}
}
override def get = {
select(uniform.get, uniform.get, table)
}
}
def shuffle[A](values: List[A]): Distribution[List[A]] = new Distribution[List[A]] {
override def get = rand.shuffle(values)
}
def choose(n: Int, k: Int): Distribution[List[Boolean]] = {
if (n == 0) always(Nil)
else {
val p = k.toDouble / n
for {
a <- tf(p)
rest <- choose(n-1, if (a) k-1 else k)
} yield a :: rest
}
}
def geometric(p: Double): Distribution[Int] = {
tf(p).until(_.headOption == Some(true)).map(_.size - 1)
}
def binomial(p: Double, n: Int): Distribution[Int] = {
bernoulli(p).repeat(n).map(_.sum)
}
def negativeBinomial(p: Double, r: Int): Distribution[Int] = {
tf(p).until(_.count(_ == false) == r).map(_.size - r)
}
def poisson(lambda: Double): Distribution[Int] = {
exponential(1).until(_.sum > lambda).map(_.size - 1)
}
def zipf(s: Double, n: Int): Distribution[Int] = {
discrete((1 to n).map(k => k -> 1.0 / math.pow(k, s)): _*)
}
/**
* Continuous distributions
*/
object uniform extends Distribution[Double] {
override def get = rand.nextDouble()
}
object normal extends Distribution[Double] {
override def get = rand.nextGaussian()
}
def chi2(n: Int): Distribution[Double] = {
normal.map(x => x*x).repeat(n).map(_.sum)
}
def students_t(df: Int): Distribution[Double] = {
for {
z <- normal
v <- chi2(df)
} yield z * math.sqrt(df / v)
}
def pareto(a: Double, xm: Double = 1.0): Distribution[Double] = {
for {
x <- uniform
} yield xm * math.pow(x, -1/a)
}
def exponential(l: Double): Distribution[Double] = {
for {
x <- uniform
} yield math.log(x) / (-l)
}
def laplace(b: Double): Distribution[Double] = {
val d = exponential(1/b)
d - d
}
def F(d1: Int, d2: Int): Distribution[Double] = {
chi2(d1) / chi2(d2)
}
def lognormal: Distribution[Double] = {
for {
z <- normal
} yield math.exp(z)
}
def cauchy: Distribution[Double] = {
normal / normal
}
def weibull(l: Double, k: Double): Distribution[Double] = {
for {
y <- exponential(1)
} yield l * math.pow(y, 1/k)
}
def gamma(k: Double, theta: Double): Distribution[Double] = {
val n = k.toInt
val gammaInt = uniform.repeat(n).map(_.map(x => -math.log(x)).sum)
val gammaFrac = {
val delta = k - n
// From https://en.wikipedia.org/wiki/Gamma_distribution#Generating_gamma-distributed_random_variables
def helper(): Distribution[Double] = {
for {
u1 <- uniform
u2 <- uniform
u3 <- uniform
(zeta, eta) = {
val v0 = math.E / (math.E + delta)
if (u1 <= v0) {
val zeta = math.pow(u2, 1/delta)
val eta = u3 * math.pow(zeta, delta - 1)
(zeta, eta)
} else {
val zeta = 1 - math.log(u2)
val eta = u3 * math.exp(-zeta)
(zeta, eta)
}
}
r <- if (eta > math.pow(zeta, delta - 1) * math.exp(-zeta)) helper() else always(zeta)
} yield r
}
helper()
}
(gammaInt + gammaFrac) * theta
}
def beta(a: Double, b: Double): Distribution[Double] = {
for {
x <- gamma(a, 1)
y <- gamma(b, 1)
} yield x / (x + y)
}
def sequence[T](ds: List[Distribution[T]]): Distribution[List[T]] = new Distribution[List[T]] {
override def get = ds.map(_.get)
}
def dirichlet(alphas: List[Double]): Distribution[List[Double]] = {
sequence(alphas.map(a => gamma(a, 1))).map(ys => {
val sum = ys.sum
ys.map(_ / sum)
})
}
/**
* Tests if two probability distributions are the same using the Kolmogorov-Smirnov test.
* The distributions are unlikely to be the same (p < 0.05) if the value is greater than 1.35
* and very unlikely (p < 0.001) if the value is greater than 1.95.
*/
def ksTest[A](d1: Distribution[A], d2: Distribution[A])(implicit ord: Ordering[A]): Double = {
val n = 100000
val d1s = d1.sample(n).sorted.zipWithIndex
val d2s = d2.sample(n).sorted.zipWithIndex
val all = (d1s ++ d2s).sorted.zipWithIndex
// 2i is the expected index in the combined list and j is the actual index.
val worstOffset = all.map{ case ((x, i), j) => math.abs(2 * i - j) }.max / 2
val ksStatistic = worstOffset.toDouble / n
ksStatistic / math.sqrt(2.0 * n / (n * n))
}
/**
* Determine if a joint probability distribution is composed of 2 independent events.
* Uses the G-test: http://en.wikipedia.org/wiki/G-test
*/
def chi2test[A, B](d: Distribution[(A, B)]): Double = {
val data = d.histData
val total = data.map(_._2).sum
val rowValues = data.map(_._1._1).toSet
val colValues = data.map(_._1._2).toSet
val rowTotals = (for {
row <- rowValues
} yield row -> colValues.map(col => data.getOrElse((row, col), 0.0)).sum).toMap
val colTotals = (for {
col <- colValues
} yield col -> rowValues.map(row => data.getOrElse((row, col), 0.0)).sum).toMap
val chi2stat = (for {
row <- rowValues
col <- colValues
} yield {
val observed = data.getOrElse((row, col), 0.0)
val expected = {
val rowTotal = rowTotals.getOrElse(row, 0.0)
val colTotal = colTotals.getOrElse(col, 0.0)
rowTotal.toDouble * colTotal / total
}
observed * math.log(observed / expected)
}).sum * 2
val df = (rowValues.size - 1) * (colValues.size - 1)
chi2(df).pr(_ > chi2stat)
}
}