
// Source: io.citrine.lolo.bags.BaggerHelper.scala (Maven / Gradle / Ivy)
package io.citrine.lolo.bags
import io.citrine.lolo.stats.{MathUtils, StatsUtils}
import io.citrine.lolo.api.{Model, TrainingRow}
/**
  * Helper class to subsume shared functionality of [[RegressionBagger]] and [[MultiTaskBagger]].
  *
  * @param models collection of trained models
  * @param trainingData on which models were trained
  * @param Nib vector (over models) of vectors (over training data) of the number of repeats in each model's bag
  * @param useJackknife whether to use jackknife for uncertainty quantification
  * @param uncertaintyCalibration whether to apply empirical uncertainty calibration
  */
protected[bags] case class BaggerHelper(
    models: Seq[Model[Double]],
    trainingData: Seq[TrainingRow[Double]],
    Nib: Vector[Vector[Int]],
    useJackknife: Boolean,
    uncertaintyCalibration: Boolean
) {

  /**
    * Seq of tuples containing training inputs paired with their out-of-bag residuals and uncertainty estimates.
    * Defined as lazy so it's only computed if needed for the ratio or bias learner calculations.
    */
  lazy val oobErrors: Seq[(Vector[Any], Double, Double)] = trainingData.indices.flatMap { idx =>
    val row = trainingData(idx)
    // A model is "out of bag" for this row when its bag contains zero copies of the row.
    val oobModels = models.zip(Nib.map(_(idx))).filter(_._2 == 0).map(_._1)
    if (oobModels.size < 2 || row.label.isNaN) {
      // Need at least two OOB models to estimate a spread; a NaN label carries no residual information.
      None
    } else {
      val model = BaggedRegressionModel(oobModels, Nib.filter(_(idx) == 0))
      val predicted = model.transform(Seq(row.inputs))
      val error = predicted.expected.head - row.label
      val uncertainty = predicted.stdDevObs.get.head
      // Explicit tuple: relying on auto-tupling in Some(a, b, c) is deprecated (removed in Scala 3).
      Some((row.inputs, error, uncertainty))
    }
  }

  /** Uncertainty calibration ratio based on the OOB errors. */
  val rescaleRatio: Double = if (uncertaintyCalibration && useJackknife) {
    // Only finite labels contribute to the range that sets the zero tolerance.
    val trainingLabels = trainingData.collect { case TrainingRow(_, y, _) if !(y.isInfinite || y.isNaN) => y }
    val zeroTolerance = StatsUtils.range(trainingLabels) / 1e12
    BaggerHelper.calculateRescaleRatio(oobErrors.map { case (_, e, u) => (e, u) }, zeroTolerance = zeroTolerance)
  } else {
    1.0
  }
  assert(!rescaleRatio.isNaN && !rescaleRatio.isInfinity, s"Uncertainty calibration ratio is not real: $rescaleRatio")

  /** Data on which to train a bias learner: inputs paired with the excess of |residual| over calibrated uncertainty. */
  lazy val biasTraining: Seq[TrainingRow[Double]] = oobErrors.map {
    case (f, e, u) =>
      // Math.E is only statistically correct. It should be actualBags / Nib.transpose(i).count(_ == 0)
      // Or, better yet, filter the bags that don't include the training example
      val bias = Math.E * Math.max(Math.abs(e) - u * rescaleRatio, 0)
      TrainingRow(f, bias)
  }
}
protected[bags] object BaggerHelper {

  /**
    * Calculate the uncertainty calibration ratio, which is the 68th percentile of error/uncertainty
    * over the training points.
    *
    * A point with both error and uncertainty within the zero tolerance contributes a ratio of 1;
    * a zero-uncertainty point with nonzero error contributes infinity. If the 68th percentile
    * itself is infinite, the ratio defaults to 1.0 — not unreasonable when the number of training
    * data and bags is small, so there may be as few as 2 out-of-bag models.
    *
    * @param oobErrors sequence of tuples of (error, uncertainty) from OOB predictions for each training row
    * @param zeroTolerance tolerance for determining if the error/uncertainty are zero
    * @return uncertainty rescale ratio
    */
  def calculateRescaleRatio(oobErrors: Seq[(Double, Double)], zeroTolerance: Double = 1e-12): Double = {
    // Per-point |error| / uncertainty, with the 0/0 case pinned to 1.
    val ratios = oobErrors.map {
      case (error, uncertainty) =>
        if (MathUtils.tolerantEquals(error, 0.0, zeroTolerance) &&
            MathUtils.tolerantEquals(uncertainty, 0.0, zeroTolerance)) {
          1.0
        } else {
          math.abs(error / uncertainty)
        }
    }
    // The 68th-percentile ratio: skip the lowest 68% of the sorted ratios and take the next one.
    val skipCount = (oobErrors.size * 0.68).toInt
    val oneSigmaRatio = ratios.sorted.drop(skipCount).headOption.getOrElse(1.0)
    if (oneSigmaRatio.isPosInfinity) 1.0 else oneSigmaRatio
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy