breeze.stats.DescriptiveStats.scala Maven / Gradle / Ivy
package breeze.stats
import breeze.linalg.operators.{ OpLT, OpLTE }
import scala.reflect.ClassTag
import util.Sorting
import breeze.util.{ quickSelect, quickSelectImpl }
/*
Copyright 2009 David Hall, Daniel Ramage
Licensed under the Apache License, Version 2.0 (the "License")
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import breeze.generic.UFunc
import breeze.linalg.{DenseMatrix, DenseVector, convert, cov}
import breeze.linalg.support.CanTraverseValues
import breeze.linalg.support.CanTraverseValues.ValuesVisitor
import breeze.macros.expand
import breeze.math.Complex
import breeze.numerics.isOdd
import spire.implicits.{cfor, cforRange}
import scala.collection.mutable
case class MeanAndVariance(mean: Double, variance: Double, count: Long) {
def stdDev: Double = math.sqrt(variance)
def +(other: MeanAndVariance): MeanAndVariance = {
val d = other.mean - this.mean
val newMean = this.mean + d * other.count / (other.count + this.count)
val m2a = this.variance * (this.count - 1)
val m2b = other.variance * (other.count - 1)
val m2x = m2a + m2b + d * d * (other.count * this.count)/ (other.count + this.count)
val newVariance = m2x / (other.count + this.count - 1)
MeanAndVariance(newMean, newVariance, this.count + other.count)
}
}
object accumulateAndCount extends UFunc {
@expand
implicit def reduce[T, @expand.args(Double, Complex, Float) Scalar](implicit iter: CanTraverseValues[T, Scalar], @expand.sequence[Scalar](0.0, Complex.zero, 0.0f) zero: Scalar): Impl[T, (Scalar,Int)] = new Impl[T, (Scalar,Int)] {
def apply(v: T): (Scalar,Int) = {
val visit = new ValuesVisitor[Scalar] {
var sum = zero
var n = 0
def visit(a: Scalar): Unit = {
sum += a
n += 1
}
def zeros(numZero: Int, zeroValue: Scalar): Unit = {
sum += (numZero * zeroValue)
n += numZero
}
}
iter.traverse(v, visit)
(visit.sum, visit.n)
}
}
}
trait DescriptiveStats {
/**
* A [[breeze.generic.UFunc]] for computing the mean of objects
*/
object mean extends UFunc {
@expand
implicit def reduce[@expand.args(Float, Double, Complex) S, T](implicit iter: CanTraverseValues[T, S],
@expand.sequence[S](0f, 0d, Complex.zero) z: S): Impl[T, S] = new Impl[T, S] {
def apply(v: T): S = {
val visit = new ValuesVisitor[S] {
var mu: S = z
var n: Long = 0
def visit(y: S): Unit = {
n += 1
val d = y - mu
mu = mu + d / n
}
def zeros(numZero: Int, zeroValue: S): Unit = {
if (numZero != 0)
mu = mu * n / (n + numZero)
n += numZero
}
}
iter.traverse(v, visit)
import visit._
mu
}
}
}
/**
* A [[breeze.generic.UFunc]] for computing the mean and variance of objects.
* This uses an efficient, numerically stable, one pass algorithm for computing both
* the mean and the variance.
*/
object meanAndVariance extends UFunc {
@expand
implicit def reduce[@expand.args(Float, Double) S, T](implicit iter: CanTraverseValues[T, S]): Impl[T, MeanAndVariance] = new Impl[T, MeanAndVariance] {
def apply(v: T): MeanAndVariance = {
val visit = new ValuesVisitor[S] {
var mu: S = 0
var s: S = 0
var n: Long = 0
def visit(y: S): Unit = {
n += 1
val d = y - mu
mu = mu + d / n
s = s + (n - 1) * d / n * d
}
def zeros(numZero: Int, zeroValue: S): Unit = {
for (i <- 0 until numZero) visit(zeroValue)
}
}
iter.traverse(v, visit)
import visit._
if (n > 1) {
MeanAndVariance(mu, s / (n - 1), n)
} else {
MeanAndVariance(mu, 0, n)
}
}
}
}
/**
* A [[breeze.generic.UFunc]] for computing the variance of objects.
* The method just calls meanAndVariance and returns the second result.
*/
object variance extends UFunc {
implicit def reduceDouble[T](implicit mv: meanAndVariance.Impl[T, MeanAndVariance]): Impl[T, Double] = new Impl[T, Double] {
def apply(v: T): Double = mv(v).variance
}
}
/**
* Computes the standard deviation by calling variance and then sqrt'ing
*/
object stddev extends UFunc {
implicit def reduceDouble[T](implicit vari: variance.Impl[T, Double]): Impl[T, Double] = new Impl[T, Double] {
def apply(v: T): Double = scala.math.sqrt(vari(v))
}
}
/**
* A [[breeze.generic.UFunc]] for computing the median of objects
*/
object median extends UFunc {
@expand
implicit def reduce[@expand.args(Int, Long, Double, Float) T]: Impl[DenseVector[T], T] =
new Impl[DenseVector[T], T] {
def apply(v: DenseVector[T]): T = {
if (isOdd(v.length)) {
quickSelect(v.toArray, (v.length - 1) / 2)
} else {
val tempArray: Array[T] = v.toArray.clone()
val secondMedianPosition = v.length / 2
//quickSelectImpl does not clone the array, allowing us to access intermediate semi-sorted results for reuse in the second calculation
(quickSelectImpl(tempArray, secondMedianPosition) +
quickSelectImpl(tempArray, secondMedianPosition - 1)) / 2
}
}
}
@expand
implicit def reduceSeq[@expand.args(Int, Long, Double, Float) T]: Impl[Seq[T], T] =
new Impl[Seq[T], T] {
def apply(v: Seq[T]): T = { median(DenseVector(v.toArray)) }
}
@expand
implicit def reduceM[@expand.args(Int, Long, Double) T]: Impl[DenseMatrix[T], Double] =
new Impl[DenseMatrix[T], Double] {
def apply(m: DenseMatrix[T]) = median(m.toDenseVector)
}
}
object covmat extends UFunc {
implicit val matrixCovariance: Impl[DenseMatrix[Double], DenseMatrix[Double]] = new Impl[DenseMatrix[Double], DenseMatrix[Double]] {
def apply(data: DenseMatrix[Double]) = cov(data)
}
implicit val sequenceCovariance: Impl[Seq[DenseVector[Double]], DenseMatrix[Double]] = new Impl[Seq[DenseVector[Double]], DenseMatrix[Double]] {
/*
* We roughly follow the two_pass_covariance algorithm from here: http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Covariance
* However, we also use Bessel's correction, in order to agree with the rest of breeze.
*/
def apply(data: Seq[DenseVector[Double]]): DenseMatrix[Double] = {
data.headOption.map(firstRow => {
val result = new DenseMatrix[Double](firstRow.size, firstRow.size)
val dataSize = firstRow.size
//First compute the mean
var mean = firstRow.copy
var numRows: Long = 1
data.tail.foreach(x => {
numRows += 1
if (mean.size != x.size) {
throw new IllegalArgumentException("Attempting to compute covariance of dataset where elements have different sizes")
}
cforRange(0 until firstRow.size)(i => {
mean(i) = mean(i) + x(i)
})
})
val numRowsD = numRows.toDouble
mean = mean / numRowsD
//Second compute the covariance
data.foreach { x =>
cforRange(0 until dataSize)(i => {
val a = x(i) - mean(i)
cforRange(0 until dataSize)(j => {
val b = x(j) - mean(j)
result(i, j) = result(i, j) + (a * b / (numRowsD - 1)) //Use
})
})
}
result
}).getOrElse(new DenseMatrix[Double](0, 0))
}
}
}
object corrcoeff extends UFunc {
implicit def matrixCorrelation[T](implicit covarianceCalculator: covmat.Impl[T, DenseMatrix[Double]]): Impl[T, DenseMatrix[Double]] = new Impl[T, DenseMatrix[Double]] {
def apply(data: T) = {
val covariance = covarianceCalculator(data)
val d = new Array[Double](covariance.rows)
cforRange(0 until covariance.rows)(i => {
d(i) = math.sqrt(covariance(i, i))
})
cforRange(0 until covariance.rows)(i => {
cforRange(0 until covariance.rows)(j => {
if (i != j) {
covariance(i, j) /= (d(i) * d(j))
} else {
covariance(i, j) = 1.0
}
})
})
covariance
}
}
}
object mode extends UFunc {
@expand
implicit def reduce[T, @expand.args(Double, Complex, Float, Int) Scalar](
implicit iter: CanTraverseValues[T, Scalar],
@expand.sequence[Scalar](Double.NaN, Complex.nan, 0.0f, 0) initialValue: Scalar): Impl[T, ModeResult[Scalar]] =
new Impl[T, ModeResult[Scalar]] {
def apply(v: T): ModeResult[Scalar] = {
val visitor = new ModeVisitor[Scalar](initialValue)
iter.traverse(v, visitor)
ModeResult(visitor.runningMode, visitor.maxFrequency)
}
}
}
private class ModeVisitor[@expand.args(Double, Complex, Float, Int) Scalar](initialValue: Scalar)
extends ValuesVisitor[Scalar] {
val frequencyCounts = mutable.Map[Scalar,Int]()
var maxFrequency = 0
var runningMode = initialValue
def visit(value: Scalar): Unit = recordOccurrences(value, 1)
def zeros(numZeros: Int, zeroValue: Scalar): Unit = recordOccurrences(zeroValue, numZeros)
private def recordOccurrences(value: Scalar, count: Int): Unit = {
frequencyCounts(value) = frequencyCounts.getOrElse(value, 0) + count
if (frequencyCounts(value) > maxFrequency) {
maxFrequency = frequencyCounts(value)
runningMode = value
}
}
}
/**
* A [[breeze.generic.UFunc]] for digitizing arrays.
*
* Each element in the bins array is assumed to be the *right* endpoint of a given bin.
* For instance, bins=[1,3,5] represents a bin from (-infty,1], (1,3], (3,5] and (5,\infty).
* The result returned is the index of the bin of the inputs.
*
* E.g., digitize([-3, 0.5, 1, 1.5, 4], [0,1,2]) = [0, 1, 1, 2, 3]
*/
object digitize extends UFunc {
implicit def arrayVersion[T, U](implicit base: Impl2[DenseVector[T], DenseVector[U], DenseVector[Int]]): Impl2[Array[T], Array[U], Array[Int]] = {
new Impl2[Array[T], Array[U], Array[Int]] {
def apply(x: Array[T], bins: Array[U]): Array[Int] = {
val vecResult: DenseVector[Int] = digitize(new DenseVector(x), new DenseVector(bins))
vecResult.data
}
}
}
implicit def fromComparison[T, U](implicit lte: OpLTE.Impl2[T, U, Boolean], ltUU: OpLT.Impl2[U, U, Boolean]): Impl2[DenseVector[T], DenseVector[U], DenseVector[Int]] = {
new Impl2[DenseVector[T], DenseVector[U], DenseVector[Int]] {
def apply(x: DenseVector[T], bins: DenseVector[U]): DenseVector[Int] = {
errorCheckBins(bins)
val result = new DenseVector[Int](x.length)
cforRange(0 until x.length) { i =>
result(i) = bins.length
var j = bins.length - 1
while (j >= 0) {
if (lte(x(i), bins(j))) {
result(i) = j
} else {
j = -1
}
j -= 1
}
}
result
}
}
}
@expand
implicit def vecVersion[@expand.args(Int, Long, Double, Float) T]: Impl2[DenseVector[T], DenseVector[Double], DenseVector[Int]] =
new Impl2[DenseVector[T], DenseVector[Double], DenseVector[Int]] {
def apply(x: DenseVector[T], bins: DenseVector[Double]): DenseVector[Int] = {
errorCheckBins(bins)
val result = new DenseVector[Int](x.length)
cforRange(0 until x.length) { i =>
result(i) = bins.length
var j = bins.length - 1
while (j >= 0) {
if (x(i) <= bins(j)) {
result(i) = j
} else {
j = -1
}
j -= 1
}
}
result
}
}
private def errorCheckBins[U](bins: DenseVector[U])(implicit lt: OpLT.Impl2[U, U, Boolean]) = {
cforRange(0 until bins.length - 1) { i =>
require( lt(bins(i), bins(i + 1)) )
}
}
}
/**
* A [[breeze.generic.UFunc]] for counting bins.
*
* If passed a traversable object full of Int's, provided
* those ints are larger than 0, it will return an
* array of the bin counts. E.g.:
* bincount(DenseVector[Int](0,1,2,3,1,3,3,3)) == DenseVector[Int](1,2,1,4)
*
* One can also call this on two dense vectors - the second will be treated
* as an array of weights. E.g.:
* val x = DenseVector[Int](0,1,2,3,1)
* val weights = DenseVector[Double](1.0,2.0,1.0,7.0,1.0)
* result is bincount(x, weights) == DenseVector[Double](1.0,3.0,1,7.0)
*/
object bincount extends UFunc {
import breeze.linalg._
@expand
implicit def vecVersion[@expand.args(Double, Complex, Float) T]: Impl2[DenseVector[Int], DenseVector[T], DenseVector[T]] =
new Impl2[DenseVector[Int], DenseVector[T], DenseVector[T]] {
def apply(x: DenseVector[Int], weights: DenseVector[T]): DenseVector[T] = {
require(min(x) >= 0)
require(x.length == weights.length)
val result = new DenseVector[T](max(x)+1)
cfor(0)(i => i < x.length, i => i+1)(i => {
result(x(i)) = result(x(i)) + weights(i)
})
result
}
}
implicit def reduce[T](implicit iter: CanTraverseValues[T, Int]): Impl[T, DenseVector[Int]] =
new Impl[T, DenseVector[Int]] {
def apply(x: T): DenseVector[Int] = {
require(min(x) >= 0)
val result = new DenseVector[Int](max(x)+1)
class BincountVisitor extends ValuesVisitor[Int] {
def visit(a: Int): Unit = { result(a) = result(a) + 1 }
def zeros(numZero: Int, zeroValue: Int) = {
result(0) = result(0) + numZero
}
}
iter.traverse(x, new BincountVisitor)
result
}
}
/**
* A [[breeze.generic.UFunc]] for counting bins.
*
* This differs from bincount in that the result it returns is a SparseVector.
* The internal implementation of this could probably be significantly sped
* up by avoiding the use of counter. Then again, in sparse situations it's
* still a lot faster than using bincount.
*
*/
object sparse extends UFunc {
import breeze.linalg._
@expand
implicit def vecVersion[@expand.args(Double, Complex, Float) T]: Impl2[DenseVector[Int], DenseVector[T], SparseVector[T]] =
new Impl2[DenseVector[Int], DenseVector[T], SparseVector[T]] {
def apply(x: DenseVector[Int], weights: DenseVector[T]): SparseVector[T] = {
require(min(x) >= 0)
require(x.length == weights.length)
val counter = Counter[Int,T]()
cfor(0)(i => i < x.length, i => i+1)(i => {
counter.update(x(i), counter(x(i)) + weights(i))
})
val builder = new VectorBuilder[T](max(x)+1)
counter.iterator.foreach(x => builder.add(x._1, x._2))
builder.toSparseVector
}
}
implicit def reduce[T](implicit iter: CanTraverseValues[T, Int]): Impl[T, SparseVector[Int]] =
new Impl[T, SparseVector[Int]] {
def apply(x: T): SparseVector[Int] = {
require(min(x) >= 0)
val counter = Counter[Int,Int]()
class BincountVisitor extends ValuesVisitor[Int] {
def visit(a: Int): Unit = { counter.update(a,counter(a) + 1) }
def zeros(numZero: Int, zeroValue: Int) = {
counter.update(zeroValue, counter(zeroValue) + numZero)
}
}
iter.traverse(x, new BincountVisitor)
val builder = new VectorBuilder[Int](max(x)+1)
counter.iterator.foreach(x => builder.add(x._1, x._2))
builder.toSparseVector
}
}
}
}
}
/**
* Provides utilities for descriptive statistics, like the mean and variance.
*/
object DescriptiveStats {
/**
* returns the estimate of the data at p * it.size, where p in [0,1]
*/
def percentile(it: TraversableOnce[Double], p: Double) = {
if (p > 1 || p < 0) throw new IllegalArgumentException("p must be in [0,1]")
val arr = it.toArray
Sorting.quickSort(arr)
// +1 so that the .5 == mean for even number of elements.
val f = (arr.length + 1) * p
val i = f.toInt
if (i == 0) arr.head
else if (i >= arr.length) arr.last
else {
arr(i - 1) + (f - i) * (arr(i) - arr(i - 1))
}
}
/**
* Returns both means and covariance between two vectors. Single pass algorithm.
*
* Note:
* Will happily compute covariance between vectors of different lengths
* by truncating the longer vector.
*
*/
def meanAndCov[T](it1: TraversableOnce[T], it2: TraversableOnce[T])(implicit frac: Fractional[T]) = {
implicit def t(it: TraversableOnce[T]) = it.toIterable //convert to an iterable for zip operation
import frac.mkNumericOps
//mu1(n-1), mu2(n-1), Cov(n-1), n-1
val (mu1, mu2, c, n) = (it1, it2).zipped.foldLeft((frac.zero, frac.zero, frac.zero, frac.zero)) {
(acc, y) => val (oldMu1, oldMu2, oldC, oldN) = acc
val newN = oldN + frac.fromInt(1)
val newMu1 = oldMu1 + ((y._1 - oldMu1) / newN)
val newMu2 = oldMu2 + ((y._2 - oldMu2) / newN)
val newC = oldC + ((y._1 - oldMu1) * (y._2 - newMu2)) //compute covariance in single pass
(newMu1, newMu2, newC, newN)
}
if (n == 1) (mu1, mu2, 0) else (mu1, mu2, c / (n - frac.fromInt(1)))
}
/**
* Returns covariance between two vectors.
*
* Note:
* Will happily compute covariance between vectors of different lengths
* by truncating the longer vector.
*
*/
def cov[T](it1: Iterable[T], it2: Iterable[T])(implicit n: Fractional[T]) = {
meanAndCov(it1, it2)._3
}
}