
// io.citrine.lolo.bags.BaggedPrediction.scala
// From lolo: a random forest-centered machine learning library in Scala.
package io.citrine.lolo.bags
import breeze.linalg.{norm, DenseMatrix, DenseVector}
import io.citrine.lolo.api.{PredictionResult, RegressionResult}
import org.slf4j.{Logger, LoggerFactory}

/**
 * Interface defining the return value of a [[BaggedModel]].
 *
 * This allows the implementation to depend on the number of simultaneous predictions,
 * which has performance implications.
 *
 * For background on the uncertainty calculation, see Wager, S.; Hastie, T.; Efron, B. "Confidence Intervals for
 * Random Forests: The Jackknife and the Infinitesimal Jackknife." Journal of Machine Learning Research 15 (2014).
 */
trait BaggedPrediction[+T] extends PredictionResult[T] {

  /** The number of input rows that have been predicted on (NOT the number of bagged models). */
  def numPredictions: Int

  /** The predictions made by each of the bagged models in the ensemble. */
  def ensemblePredictions: Seq[PredictionResult[T]]

  /**
   * Average the gradients from the models in the ensemble
   *
   * @return the gradient of each prediction as a vector of doubles
   */
  override lazy val gradient: Option[Seq[Vector[Double]]] = if (ensemblePredictions.head.gradient.isEmpty) {
    /* If the underlying model has no gradient, return None */
    None
  } else {
    val gradientsByPrediction: Seq[Seq[Vector[Double]]] = ensemblePredictions.map(_.gradient.get)
    val gradientsByInput: Seq[Seq[Vector[Double]]] = gradientsByPrediction.transpose
    Some(
      gradientsByInput.map { r => r.toVector.transpose.map(_.sum / ensemblePredictions.size) }
    )
  }
}
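
// A minimal standalone sketch (illustrative only, not part of lolo's API) of the transpose-and-average
// pattern used by `gradient` above: each model supplies one gradient vector per input row, and the
// ensemble gradient is the element-wise mean of those vectors across models.
object GradientAveragingSketch {
  // Hypothetical per-model gradients: two models, two input rows, two features per row.
  val modelA: Seq[Vector[Double]] = Seq(Vector(1.0, 2.0), Vector(0.0, 4.0))
  val modelB: Seq[Vector[Double]] = Seq(Vector(3.0, 2.0), Vector(2.0, 0.0))
  val gradientsByPrediction: Seq[Seq[Vector[Double]]] = Seq(modelA, modelB)

  // Regroup by input row, then average each partial derivative across the ensemble.
  val gradientsByInput: Seq[Seq[Vector[Double]]] = gradientsByPrediction.transpose
  val ensembleGradient: Seq[Vector[Double]] =
    gradientsByInput.map(r => r.toVector.transpose.map(_.sum / gradientsByPrediction.size))
  // ensembleGradient == Seq(Vector(2.0, 2.0), Vector(1.0, 2.0))
}
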
/** Interface for [[BaggedPrediction]]s for regression tasks with task-specific methods. */
sealed trait BaggedRegressionPrediction extends BaggedPrediction[Double] with RegressionResult

/**
 * Container with model-wise predictions at a single input point.
 *
 * Assuming a single input allows for performance optimizations and more readable code.
 * See [[MultiPointBaggedPrediction]] for a generic implementation.
 *
 * @param ensemblePredictions for each constituent model
 * @param NibIn the sample matrix as (N_models x N_training)
 * @param pointBias bias model prediction at the single prediction point
 */
case class SinglePointBaggedPrediction(
    ensemblePredictions: Seq[PredictionResult[Double]],
    NibIn: Vector[Vector[Int]],
    pointBias: Option[Double] = None,
    rescaleRatio: Double = 1.0,
    disableBootstrap: Boolean = false
) extends BaggedRegressionPrediction {

  override def numPredictions: Int = 1

  override def expected: Seq[Double] = Seq(rawExpected + pointBias.getOrElse(0.0))

  override def bias: Option[Seq[Double]] = pointBias.map(Seq(_))

  override def uncertainty(observational: Boolean): Option[Seq[Double]] = {
    if (observational) {
      stdDevObs
    } else {
      stdDevMean
    }
  }

  /**
   * The importances are computed as an average of bias-corrected jackknife-after-bootstrap
   * and infinitesimal jackknife methods
   *
   * @return training row scores of each prediction
   */
  override def importanceScores: Option[Seq[Seq[Double]]] = Some(Seq(singleScores))

  override lazy val stdDevMean: Option[Seq[Double]] = {
    if (disableBootstrap) {
      // If bootstrap is disabled, rescale is unity and treeVariance is our only option for UQ.
      // Since it's not recalibrated, it's best considered to be a confidence interval of the underlying weak learner.
      assert(rescaleRatio == 1.0)
      Some(Seq(math.sqrt(treeVariance)))
    } else {
      Some(
        Seq(math.sqrt(BaggedPrediction.rectifyEstimatedVariance(singleScores)))
      )
    }
  }

  override lazy val stdDevObs: Option[Seq[Double]] = Option.when(!disableBootstrap) {
    val std = (rescaleRatio * math.sqrt(treeVariance)) ensuring (_ >= 0.0)
    Seq(std)
  }

  private lazy val treePredictions = ensemblePredictions.map(_.expected.head).toArray

  private lazy val rawExpected = treePredictions.sum / treePredictions.length

  private lazy val treeVariance = {
    assert(treePredictions.length > 1, "Bootstrap variance undefined for fewer than 2 bootstrap samples.")
    treePredictions.map(x => math.pow(x - rawExpected, 2.0)).sum / (treePredictions.length - 1)
  }
  private lazy val singleScores: Vector[Double] = {
    // Transpose the sample matrix to (N_training x N_models), which is the orientation used below
    val nMat = NibIn.transpose
    // Compute the Bessel-uncorrected variance of the ensemble of predicted values, then multiply by (n - 1) / (n * B)
    // We later sum over the training data, introducing a factor of n and leaving us with the expected correction term
    val correction = treeVariance * (treePredictions.length - 1.0) * (nMat.size - 1) /
      (treePredictions.length * treePredictions.length * nMat.size)
    // Loop over each of the training instances, computing its contribution to the uncertainty
    val trainingContributions = nMat.indices.toVector.map { idx =>
      // Pull the vector of the number of times this instance was used to train each tree
      val vecN = nMat(idx).toArray
      // Loop over the trees, computing the covariance for the IJ estimate and the predicted value of
      // the out-of-bag trees for the J(ackknife) estimate
      // The loops are merged for performance reasons
      var cov: Double = 0.0
      var tNot: Double = 0.0
      var tNotCount: Int = 0
      vecN.indices.foreach { jdx =>
        cov = cov + (vecN(jdx) - 1) * (treePredictions(jdx) - rawExpected)
        if (vecN(jdx) == 0) {
          tNot = tNot + treePredictions(jdx)
          tNotCount = tNotCount + 1
        }
      }
      // Compute the infinitesimal jackknife estimate
      val varIJ = math.pow(cov / vecN.length, 2.0)
      if (tNotCount > 0) {
        // Compute the jackknife-after-bootstrap estimate
        val varJ = math.pow(tNot / tNotCount - rawExpected, 2.0) * (nMat.size - 1) / nMat.size
        // Average the corrected IJ and J estimates
        0.5 * (varJ + varIJ - math.E * correction)
      } else {
        // We can't compute the jackknife-after-bootstrap estimate, so just correct the IJ estimate
        varIJ - correction
      }
    }
    // The bias correction can push individual contributions below zero; rectification to keep the total
    // variance non-negative is applied downstream (see BaggedPrediction.rectifyEstimatedVariance)
    trainingContributions
  }
}
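
// Illustrative sketch (not part of lolo's API): the per-row scores above combine the infinitesimal
// jackknife (IJ, Wager et al. 2014 eq. 5) and jackknife-after-bootstrap (JaB, eq. 6) estimates, each
// with a Monte Carlo bias correction. A compact restatement of the same arithmetic on toy data:
object SinglePointJackknifeSketch {
  // Toy ensemble: B = 4 tree predictions and an n = 3 by B usage-count matrix (training rows x models).
  val treePreds: Vector[Double] = Vector(1.0, 1.2, 0.8, 1.1)
  val counts: Vector[Vector[Int]] = Vector(
    Vector(1, 0, 2, 1), // training row 0: drawn once by tree 0, never by tree 1, twice by tree 2, ...
    Vector(0, 2, 1, 1),
    Vector(2, 1, 0, 1)
  )

  val b: Int = treePreds.size
  val n: Int = counts.size
  val mean: Double = treePreds.sum / b
  val treeVar: Double = treePreds.map(x => math.pow(x - mean, 2.0)).sum / (b - 1)
  val correction: Double = treeVar * (b - 1.0) * (n - 1) / (b.toDouble * b * n)

  // One bias-corrected score per training row; their (rectified) sum estimates the variance of the mean.
  val rowScores: Vector[Double] = counts.map { vecN =>
    val cov = vecN.indices.map(j => (vecN(j) - 1) * (treePreds(j) - mean)).sum
    val varIJ = math.pow(cov / b, 2.0)
    val oob = vecN.indices.filter(vecN(_) == 0) // trees that never saw this training row
    if (oob.nonEmpty) {
      val varJ = math.pow(oob.map(treePreds(_)).sum / oob.size - mean, 2.0) * (n - 1.0) / n
      0.5 * (varJ + varIJ - math.E * correction)
    } else {
      varIJ - correction
    }
  }
}
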
/**
 * Container with model-wise predictions and logic to compute variances and training row scores.
 *
 * These calculations are implemented using matrix arithmetic to make them more performant when the number
 * of predictions is large. This obfuscates the algorithm significantly, however. To see what is being computed,
 * look at [[SinglePointBaggedPrediction]], which is clearer. These two implementations are tested for consistency.
 *
 * @param ensemblePredictions for each constituent model
 * @param NibIn the sample matrix as (N_models x N_training)
 * @param bias bias model predictions at all of the prediction points
 */
case class MultiPointBaggedPrediction(
    ensemblePredictions: Seq[PredictionResult[Double]],
    NibIn: Vector[Vector[Int]],
    override val bias: Option[Seq[Double]] = None,
    rescaleRatio: Double = 1.0,
    disableBootstrap: Boolean = false
) extends BaggedRegressionPrediction {

  override def numPredictions: Int = expectedMatrix.length

  override def expected: Seq[Double] = rawExpected.zip(biasCorrection).map(x => x._1 + x._2)

  override def uncertainty(observational: Boolean): Option[Seq[Any]] = {
    if (observational) {
      stdDevObs
    } else {
      stdDevMean
    }
  }

  override def influenceScores(actuals: Seq[Any]): Option[Seq[Seq[Double]]] = {
    Some(
      influences(
        expected.toVector,
        actuals.toVector.asInstanceOf[Vector[Double]],
        expectedMatrix,
        NibJMat,
        NibIJMat
      )
    )
  }

  override def importanceScores: Option[Seq[Seq[Double]]] = Some(scores)

  override lazy val stdDevObs: Option[Seq[Double]] = Option.when(!disableBootstrap)(varObs.map(math.sqrt))

  override lazy val stdDevMean: Option[Seq[Double]] = Option
    .when(disableBootstrap) {
      // If bootstrap is disabled, rescale is unity and treeVariance is our only option for UQ.
      // Since it's not recalibrated, it's best considered to be a confidence interval of the underlying weak learner.
      assert(rescaleRatio == 1.0)
      varObs.map(math.sqrt)
    }
    .orElse {
      val std = variance(rawExpected.toVector, expectedMatrix, NibJMat, NibIJMat).map(math.sqrt)
      Some(std)
    }

  /* Transpose the sample matrix to be (# training) x (# models) */
  private lazy val Nib: Vector[Vector[Int]] = NibIn.transpose

  /* Make a matrix of the tree-wise predictions */
  private lazy val expectedMatrix: Seq[Seq[Double]] = ensemblePredictions.map(p => p.expected).transpose

  /* The bias correction added to the raw (tree-averaged) prediction; zero if no bias model was provided. */
  private lazy val biasCorrection: Seq[Double] = bias.getOrElse(Seq.fill(expectedMatrix.length)(0))

  private lazy val rawExpected: Seq[Double] = expectedMatrix.map(ps => ps.sum / ps.size)

  private lazy val NibJMat: DenseMatrix[Double] = BaggedPrediction.getJackknifeAfterBootstrapMatrix(Nib)

  private lazy val NibIJMat: DenseMatrix[Double] = BaggedPrediction.getInfinitesimalJackknifeMatrix(Nib)

  /* This estimates the variance of the predictive distribution. */
  private lazy val varObs: Seq[Double] = expectedMatrix.zip(rawExpected).map {
    case (b, y) =>
      assert(Nib.size > 1, "Bootstrap variance undefined for fewer than 2 bootstrap samples.")
      b.map { x => rescaleRatio * rescaleRatio * math.pow(x - y, 2.0) }.sum / (b.size - 1)
  }

  /* Compute the scores one prediction at a time */
  private lazy val scores: Seq[Vector[Double]] = scores(expected.toVector, expectedMatrix, NibJMat, NibIJMat)
    // make sure the scores are non-negative after the stochastic correction
    .map(BaggedPrediction.rectifyImportanceScores)
    .map(_.map(math.sqrt))

  /**
   * Compute the variance of a prediction as the average of bias corrected IJ and J variance estimates
   *
   * @param meanPrediction over the models
   * @param modelPredictions prediction of each model
   * @param NibJ sampling matrix for the jackknife-after-bootstrap estimate
   * @param NibIJ sampling matrix for the infinitesimal jackknife estimate
   * @return the estimated variance
   */
  def variance(
      meanPrediction: Vector[Double],
      modelPredictions: Seq[Seq[Double]],
      NibJ: DenseMatrix[Double],
      NibIJ: DenseMatrix[Double]
  ): Seq[Double] = {
    scores(meanPrediction, modelPredictions, NibJ, NibIJ).map { BaggedPrediction.rectifyEstimatedVariance }
  }

  /**
   * Compute the IJ training row scores for a prediction
   *
   * @param meanPrediction over the models
   * @param modelPredictions prediction of each model
   * @param NibJ sampling matrix for the jackknife-after-bootstrap estimate
   * @param NibIJ sampling matrix for the infinitesimal jackknife estimate
   * @return the score of each training row as a vector of doubles
   */
  def scores(
      meanPrediction: Vector[Double],
      modelPredictions: Seq[Seq[Double]],
      NibJ: DenseMatrix[Double],
      NibIJ: DenseMatrix[Double]
  ): Seq[Vector[Double]] = {
    /* Stick the predictions in a breeze matrix */
    val predMat =
      new DenseMatrix[Double](modelPredictions.head.size, modelPredictions.size, modelPredictions.flatten.toArray)
    /* These operations are pulled out of the loop and extra-verbose for performance */
    val JMat = NibJ.t * predMat
    val JMat2 = JMat *:* JMat * ((Nib.size - 1.0) / Nib.size)
    val IJMat = NibIJ.t * predMat
    val IJMat2 = IJMat *:* IJMat
    val arg = IJMat2 + JMat2
    /* Avoid division in the loop, calculate (n - 1) / (n * B^2) */
    val prefactor = 1.0 / math.pow(modelPredictions.head.size, 2.0) * (Nib.size - 1) / Nib.size
    modelPredictions.indices
      .map { i =>
        /* Compute the first order bias correction for the variance estimators */
        val correction = prefactor * math.pow(norm(predMat(::, i) - meanPrediction(i)), 2)
        /* The correction is prediction dependent, so we need to operate on vectors */
        0.5 * (arg(::, i) - math.E * correction)
      }
      .map(_.toScalaVector)
  }

  /**
   * Compute the training row influences for a prediction
   *
   * @param meanPrediction over the models
   * @param actualPrediction the actual (observed) value at each prediction point
   * @param modelPredictions prediction of each model
   * @param NibJ sampling matrix for the jackknife-after-bootstrap estimate
   * @param NibIJ sampling matrix for the infinitesimal jackknife estimate
   * @return the influence of each training row as a vector of doubles
   */
  def influences(
      meanPrediction: Vector[Double],
      actualPrediction: Vector[Double],
      modelPredictions: Seq[Seq[Double]],
      NibJ: DenseMatrix[Double],
      NibIJ: DenseMatrix[Double]
  ): Seq[Vector[Double]] = {
    /* Stick the predictions in a breeze matrix */
    val predMat =
      new DenseMatrix[Double](modelPredictions.head.size, modelPredictions.size, modelPredictions.flatten.toArray)
    /* These operations are pulled out of the loop and extra-verbose for performance */
    val JMat = NibJ.t * predMat
    val IJMat = NibIJ.t * predMat
    val arg = IJMat + JMat
    /* Avoid division in the loop */
    val inverseSize = 1.0 / modelPredictions.head.size
    modelPredictions.indices
      .map { i =>
        /* Compute the first order bias correction for the variance estimators */
        val correction = inverseSize * norm(predMat(::, i) - meanPrediction(i))
        /* The correction is prediction dependent, so we need to operate on vectors */
        val influencePerRow: DenseVector[Double] =
          math.signum(actualPrediction(i) - meanPrediction(i)) * 0.5 * (arg(::, i) - math.E * correction)
        /* Impose a floor in case any of the variances are negative (hacked to work in breeze) */
        // val floor: Double = math.min(0, -min(variancePerRow))
        // val rezero: DenseVector[Double] = variancePerRow - floor
        // 0.5 * (rezero + abs(rezero)) + floor
        influencePerRow
      }
      .map(_.toScalaVector)
  }
}
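
// Illustrative sketch (not part of lolo's API): the vectorized `scores` above is the matrix form of the
// per-row loop in SinglePointBaggedPrediction. The JaB and IJ weight matrices (built by the helpers in
// the BaggedPrediction companion object below) are multiplied by the (# bags) x (# predictions) prediction
// matrix, yielding every out-of-bag difference and count/prediction covariance term in one shot.
object MultiPointMatrixSketch {
  // Toy data: n = 3 training rows, B = 4 bags, 2 prediction points.
  val nib: Vector[Vector[Int]] = Vector(
    Vector(1, 0, 2, 1),
    Vector(0, 2, 1, 1),
    Vector(2, 1, 0, 1)
  ) // (# training) x (# bags); every row has at least one out-of-bag entry

  // Tree-wise predictions for each prediction point, as in `expectedMatrix` above.
  val modelPredictions: Seq[Seq[Double]] = Seq(Seq(1.0, 1.2, 0.8, 1.1), Seq(2.0, 2.1, 1.9, 2.2))

  // (# bags) x (# predictions), column-major, exactly as `scores` builds it.
  val predMat = new DenseMatrix[Double](4, 2, modelPredictions.flatten.toArray)

  val nibJ: DenseMatrix[Double] = BaggedPrediction.getJackknifeAfterBootstrapMatrix(nib)
  val nibIJ: DenseMatrix[Double] = BaggedPrediction.getInfinitesimalJackknifeMatrix(nib)

  // Rows index training points, columns index prediction points.
  val deltaJaB: DenseMatrix[Double] = nibJ.t * predMat // out-of-bag-minus-mean terms (Wager et al. eq. 6)
  val covIJ: DenseMatrix[Double] = nibIJ.t * predMat // count/prediction covariance terms (eq. 5)
}
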
case class BaggedClassificationPrediction[T](ensemblePredictions: Seq[PredictionResult[T]])
    extends BaggedPrediction[T] {

  override def numPredictions: Int = expectedMatrix.length

  override def uncertainty(includeNoise: Boolean = true): Option[Seq[Map[T, Double]]] = Some(predictionUncertainties)

  override lazy val expected: Seq[T] = expectedMatrix.map(ps => ps.groupBy(identity).maxBy(_._2.size)._1)

  private lazy val expectedMatrix: Seq[Seq[T]] = ensemblePredictions.map(p => p.expected).transpose

  private lazy val predictionUncertainties: Seq[Map[T, Double]] =
    expectedMatrix.map(ps => ps.groupBy(identity).view.mapValues(_.size.toDouble / ps.size).toMap)
}
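
// Illustrative sketch (not part of lolo's API): classification `expected` is the plurality vote across the
// ensemble, and `uncertainty` is just the vote fraction for each label.
object ClassificationVoteSketch {
  val votes: Seq[String] = Seq("cat", "cat", "dog", "cat") // one prediction per bagged model
  val expectedLabel: String = votes.groupBy(identity).maxBy(_._2.size)._1 // "cat"
  val voteFractions: Map[String, Double] =
    votes.groupBy(identity).view.mapValues(_.size.toDouble / votes.size).toMap
  // voteFractions == Map("cat" -> 0.75, "dog" -> 0.25)
}
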
object BaggedPrediction {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  /**
   * Generate a matrix that is useful for computing (co)variance via jackknife after bootstrap (JaB).
   *
   * The central term of the JaB calculation (Wager et al. 2014, equation 6) is the difference between the
   * out-of-bag prediction on a point and the mean prediction on that point. If this is written as a single
   * sum over bags, then each point has weight -1/B when it is in-bag and weight 1/|{N_{bi} = 0}| - 1/B when
   * it is out-of-bag (B is the number of bags). This matrix encodes those weights, so when it is multiplied
   * by the (# bags) x (# predictions) prediction matrix we have a matrix of the \Delta terms from equation 6.
   *
   * @param Nib The (# training) x (# bags) matrix indicating how many times each training point is used in each bag
   */
  def getJackknifeAfterBootstrapMatrix(Nib: Vector[Vector[Int]]): DenseMatrix[Double] = {
    new DenseMatrix[Double](
      Nib.head.size,
      Nib.size,
      Nib.flatMap { v =>
        val itot = 1.0 / v.size // 1/B
        val icount = 1.0 / v.count(_ == 0) // 1/|{N_{bi}=0}|
        v.map(n => if (n == 0) icount - itot else -itot)
      }.toArray
    )
  }

  /**
   * Generate a matrix that is useful for computing (co)variance via infinitesimal jackknife (IJ).
   *
   * The central term of the IJ calculation (Wager et al. 2014, equation 5) is the covariance between the
   * number of times a training point appears in a bag and the prediction made by that bag.
   * This matrix encodes (N - \bar{N})/B (B is the number of bags), so that when it is multiplied by the
   * (# bags) x (# predictions) prediction matrix, we have a matrix of the covariance terms from equation 5.
   *
   * @param Nib The (# training) x (# bags) matrix indicating how many times each training point is used in each bag
   */
  def getInfinitesimalJackknifeMatrix(Nib: Vector[Vector[Int]]): DenseMatrix[Double] = {
    new DenseMatrix[Double](
      Nib.head.size,
      Nib.size,
      Nib.flatMap { v =>
        val itot = 1.0 / v.size // 1/B
        val vtot = v.sum.toDouble / (v.size * v.size) // \bar{N} / B
        v.map(n => n * itot - vtot)
      }.toArray
    )
  }

  /**
   * Make sure the variance is non-negative
   *
   * The Monte Carlo bias correction is itself stochastic, so let's make sure the result is positive.
   *
   * If the sum is positive, then great! We're done.
   *
   * If the sum is <= 0.0, then the actual variance is likely quite small. We know the variance should be at
   * least as large as the largest importance, since at least one training point will be important.
   * Therefore, let's just take the maximum importance, which should be a reasonable lower-bound of the variance.
   * Note that we could also sum the non-negative scores, but that could be biased upwards.
   *
   * If all of the scores are negative (which happens infrequently for very small ensembles), then we just need a scale.
   * The largest scale is the largest magnitude score, which is the absolute value of the minimum score. When this
   * happens, a larger ensemble should really be used!
   *
   * If all of the treePredictions are zero, then this will return zero.
   *
   * @param scores the Monte Carlo corrected importance scores
   * @return a non-negative estimate of the variance
   */
  def rectifyEstimatedVariance(scores: Seq[Double]): Double = {
    val rawSum = scores.sum
    lazy val maxEntry = scores.max
    if (rawSum > 0) {
      rawSum
    } else if (maxEntry > 0) {
      // The sum is non-positive but at least one score is positive, so fall back to the largest score
      logger.warn(
        "Sum of scores was negative; using the largest score as an estimate for the variance. Please consider increasing the ensemble size."
      )
      maxEntry
    } else {
      logger.warn(
        "All scores were negative; using the magnitude of the smallest score as an estimate for the variance. It is highly recommended to increase the ensemble size."
      )
      -scores.min // equivalent to math.abs(scores.min)
    }
  } ensuring (_ >= 0.0)

  /**
   * Make sure the scores are each non-negative
   *
   * The Monte Carlo bias correction is itself stochastic, so let's make sure the result is positive.
   * If a score was statistically consistent with zero, then we might subtract off the entire bias correction,
   * which results in a negative value. Therefore, we can use the magnitude of the minimum as an estimate of the
   * noise level, and simply set that as a floor.
   *
   * If all of the treePredictions are zero, then this will return a vector of zeros.
   *
   * @param scores the Monte Carlo corrected importance scores
   * @return a vector of non-negative bias corrected scores
   */
  def rectifyImportanceScores(scores: Vector[Double]): Vector[Double] = {
    // This is a lower-bound on the noise level; note that it is strictly smaller than the correction
    val floor = math.abs(scores.min)
    if (scores.min < 0.0) {
      logger.warn("Some importance scores were negative; rectifying. Please consider increasing the ensemble size.")
    }
    scores.map(math.max(floor, _))
  } ensuring (vec => vec.forall(_ >= 0.0))
}
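
// Illustrative usage sketch (not part of lolo's API): the rectification helpers above applied to toy score
// vectors (values chosen to be exactly representable in binary floating point).
object RectificationSketch {
  // The sum of the scores is positive, so it is returned directly as the variance estimate.
  val v1: Double = BaggedPrediction.rectifyEstimatedVariance(Seq(0.5, -0.125, 0.25)) // 0.625

  // The sum is negative but the largest score is positive, so that score is used (with a logged warning).
  val v2: Double = BaggedPrediction.rectifyEstimatedVariance(Seq(0.125, -0.5, -0.25)) // 0.125

  // Negative importance scores are floored at the magnitude of the most negative score.
  val rectified: Vector[Double] = BaggedPrediction.rectifyImportanceScores(Vector(0.5, -0.125, 0.25))
  // rectified == Vector(0.5, 0.125, 0.25)
}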