All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.citrine.lolo.bags.MultiTaskBagger.scala Maven / Gradle / Ivy

package io.citrine.lolo.bags

import breeze.stats.distributions.Poisson
import io.citrine.lolo.api.{
import io.citrine.lolo.stats.StatsUtils
import io.citrine.random.Random
import io.citrine.lolo.stats.metrics.{ClassificationMetrics, RegressionMetrics}

import scala.collection.parallel.CollectionConverters._

/**
  * Create an ensemble of multi-task models.
  *
  * @param method                 learner to train each model in the ensemble
  * @param numBags                number of models in the ensemble
  * @param useJackknife           whether to enable jackknife uncertainty estimate
  * @param uncertaintyCalibration whether to empirically recalibrate the predicted uncertainties
  * @param biasLearner            learner to use for estimating bias
  */
case class MultiTaskBagger(
    method: MultiTaskLearner,
    numBags: Int = -1,
    useJackknife: Boolean = true,
    uncertaintyCalibration: Boolean = true,
    biasLearner: Option[Learner[Double]] = None
) extends MultiTaskLearner {

  /**
    * Train a bagged ensemble of multi-task models.
    *
    * Visible steps: validate the training data, draw per-row Poisson(1.0) counts for each bag,
    * train `method` once per bag on the reweighted rows, average the feature importances,
    * and compute a bias model and rescale ratio for each label.
    *
    * NOTE(review): several statements in this method are truncated in this copy of the file
    * (missing closing braces and missing left-hand sides of expressions). Restore them from the
    * upstream source before compiling.
    *
    * @param trainingData rows whose label is a vector with one entry per task
    * @param rng          random number generator used for the bootstrap counts and passed to the bias learner
    * @return training result holding the ensemble, the count matrix Nib, bias models and rescale ratios
    * @throws InsufficientTrainingDataException if there are fewer rows than Bagger.minimumTrainingSize
    * @throws InsufficientOutputDataException   if any label has fewer valid values than Bagger.minimumOutputCount
    */
  override def train(trainingData: Seq[TrainingRow[Vector[Any]]], rng: Random): MultiTaskBaggedTrainingResult = {
    // Dimensions are taken from the first row; every other row is compared against them below
    val numInputs = trainingData.head.inputs.length
    val numOutputs = trainingData.head.label.length
    /* Make sure the training data are the same size */
    // NOTE(review): the call that consumes this forall check is missing in this copy of the file
      trainingData.forall { row =>
        row.inputs.length == numInputs && row.label.length == numOutputs
    if (trainingData.size < Bagger.minimumTrainingSize) {
      throw InsufficientTrainingDataException(numRows = trainingData.size, numRequired = Bagger.minimumTrainingSize)
    // Each label needs enough valid (non-null, non-NaN) values on its own
    (0 until numOutputs).foreach { i =>
      val numOutputValues = trainingData.count(row => validOutput(row.label(i)))
      if (numOutputValues < Bagger.minimumOutputCount) {
        throw InsufficientOutputDataException(
          numRows = numOutputValues,
          numRequired = Bagger.minimumOutputCount,
          index = i

    // if numBags is non-positive, use one bag per training row
    val actualBags = if (numBags > 0) numBags else trainingData.size

    // Compute the number of instances of each training row in each training sample
    val randBasis = StatsUtils.breezeRandBasis(rng)
    val dist = new Poisson(1.0)(randBasis)
    // NOTE(review): the generator feeding this filter is truncated in this copy; only the acceptance test is visible.
    // A candidate set of counts is accepted when every label has at least one valid value with a positive count
    // and at least Bagger.minimumNonzeroWeightSize rows have a positive count.
    val Nib: Vector[Vector[Int]] = Iterator
      .filter { suggestedCounts =>
        val allOutputsRepresented = (0 until numOutputs).forall { i =>
 { case (row, count) => validOutput(row.label(i)) && count > 0 }
        val minNonzeroWeights = suggestedCounts.count(_ > 0) >= Bagger.minimumNonzeroWeightSize
        allOutputsRepresented && minNonzeroWeights

    val indices = Nib.indices.toVector
    // Train one model per bag; each row's weight is multiplied by the number of times it appears in that bag.
    // NOTE(review): the collection being mapped over is truncated in this copy.
    val (ensembleModels, importances) =
        .map {
          case (thisRng, i) =>
            val weightedTrainingData = Nib(i).zip(trainingData).map {
              case (count, row) => row.mapWeight(_ * count.toDouble)
            val meta = method.train(weightedTrainingData, thisRng)
            (meta.model, meta.featureImportance)

    // NOTE(review): the aggregation is truncated in this copy; the visible part divides by importances.size
    val averageImportance: Option[Vector[Double]] = importances
      .map( / importances.size))

    // Get bias model and rescale ratio for each label
    val (biasModels, rescaleRatios) = Seq
      .tabulate(numOutputs) { i =>
        val isRegression = ensembleModels.head.realLabels(i)
        if (isRegression) {
          // NOTE(review): both right-hand sides below are truncated in this copy
          val thisLabelModels =[Model[Double]])
          val thisTrainingData = => vec(i).asInstanceOf[Double]))
          val helper = BaggerHelper(thisLabelModels, thisTrainingData, Nib, useJackknife, uncertaintyCalibration)
          // A bias model is only trained when a bias learner is configured and out-of-bag errors exist
          val biasModel = biasLearner.collect {
            case learner if helper.oobErrors.nonEmpty =>
              learner.train(helper.biasTraining, rng = rng).model
          (biasModel, helper.rescaleRatio)
        } else {
          (None, 1.0) // Rescale not used for classification tasks, so just set a default of 1.0

    // NOTE(review): the constructor call that takes these named arguments is missing in this copy
      ensembleModels = ensembleModels,
      Nib = Nib,
      featureImportance = averageImportance,
      trainingData = trainingData,
      biasModels = biasModels,
      rescaleRatios = rescaleRatios

  /**
    * Flag NaNs and nulls.
    *
    * @param x label value to check; may be null
    * @return false when x is null or a NaN Double, true for every other value
    */
  private def validOutput(x: Any): Boolean = {
    // Option(...) maps null to None, so the null case is handled without an explicit null check
    Option(x) match {
      case Some(d: Double) => !d.isNaN
      case Some(_)         => true
      case None            => false
    }
  }

/**
  * The result of training a bagger on a multi-label combined model.
  *
  * @param ensembleModels    sequence of multi-models, one for each bag
  * @param featureImportance importance of input features
  * @param Nib               matrix representing number of times each training datum appears in each bag
  * @param trainingData      multi-label training data
  * @param biasModels        sequence of optional bias-correction models, one for each label
  * @param rescaleRatios     sequence of uncertainty calibration ratios for each label
  */
case class MultiTaskBaggedTrainingResult(
    ensembleModels: Seq[MultiTaskModel],
    Nib: Vector[Vector[Int]],
    trainingData: Seq[TrainingRow[Vector[Any]]],
    override val featureImportance: Option[Vector[Double]],
    biasModels: Seq[Option[Model[Double]]],
    rescaleRatios: Seq[Double]
) extends MultiTaskTrainingResult {

  // The combined bagged model, built lazily from the ensemble, the count matrix, the bias models and the rescale ratios
  override lazy val model: MultiTaskBaggedModel = MultiTaskBaggedModel(ensembleModels, Nib, biasModels, rescaleRatios)

  // Each entry is a tuple, (feature vector, seq of predicted labels, seq of actual labels).
  // The labels are of type Option[Any] because a given training datum might not have a value for every single label.
  // If the actual value for a label is None, then the corresponding prediction is recorded as None. The model could generate
  // a prediction, but that's not useful in this context, since the point is to compare predictions with ground-truth values.
  // NOTE(review): the collection being traversed and several right-hand sides are truncated in this copy of the file.
  override lazy val predictedVsActual: Option[Seq[(Vector[Any], Vector[Option[Any]], Vector[Option[Any]])]] = Some( {
      case (TrainingRow(features, labels, _), nb) =>
        // Bagged models that were not trained on this input
        val oob = == 0).map(_._1)
        // NOTE(review): the body of the empty-oob branch is missing in this copy
        if (oob.isEmpty) {
        } else {
          // "Average" the predictions on each label over the out-of-bag models
          val oobPredictions =
          // Real-valued labels use the arithmetic mean; all other labels use the most frequent prediction
          val predicted = {
            case (predictions, labelIndex) if ensembleModels.head.realLabels(labelIndex) =>
              predictions.asInstanceOf[Seq[Double]].sum / predictions.size
            case (predictions, _) => predictions.groupBy(identity).maxBy(_._2.size)._1
          // Remove predictions for which the label was not specified
          // (a label counts as unspecified when it is null or a NaN Double)
          val (optionLabels, optionPredicted) = labels
            .map {
              case (l, _) if l == null || (l.isInstanceOf[Double] && l.asInstanceOf[Double].isNaN) => (None, None)
              case (l, p)                                                                          => (Some(l), Some(p))
          Seq((features, optionPredicted, optionLabels))

  // Loss derived from predictedVsActual; only defined when predictedVsActual is defined and non-empty.
  // Real-valued labels are scored with RegressionMetrics.RMSE.
  // NOTE(review): the definitions of allInputs/allPredicted/allActual, the non-real branch and the final
  // aggregation are truncated in this copy.
  override lazy val loss: Option[Double] = predictedVsActual.collect {
    case pva if pva.nonEmpty =>
      val allInputs =
      val allPredicted: Seq[Seq[Option[Any]]] =
      val allActual: Seq[Seq[Option[Any]]] =
        .map {
          case (labelPredicted, labelActual, isReal) =>
            // Construct predicted-vs-actual for just this label, only keeping entries for which both predicted and actual are defined
            val labelPVA = allInputs.lazyZip(labelPredicted).lazyZip(labelActual).flatMap {
              case (input, Some(p), Some(a)) => Some((input, p, a))
              case _                         => None
            if (isReal) {
              RegressionMetrics.RMSE(labelPVA.asInstanceOf[Seq[(Vector[Any], Double, Double)]])
            } else {

  // Per-label models; real-valued labels are given Nib, their rescale ratio and their bias model.
  // NOTE(review): the constructor names and the non-real branch are truncated in this copy.
  override def models: Seq[Model[Any]] = {
    val realLabels: Seq[Boolean] = ensembleModels.head.realLabels {
      case (isReal: Boolean, i: Int) =>
        val thisLabelModels =
        if (isReal) {
            Nib = Nib,
            rescaleRatio = rescaleRatios(i),
            biasModel = biasModels(i)
        } else {

/**
  * Container holding a parallel sequence of models, each of which predicts on multiple labels.
  *
  * @param ensembleModels sequence of multi-models, one for each bag
  * @param Nib            matrix representing number of times each training datum appears in each bag
  * @param biasModels     sequence of optional bias-correction models, one for each label
  * @param rescaleRatios  sequence of uncertainty calibration ratios for each label
  */
case class MultiTaskBaggedModel(
    ensembleModels: Seq[MultiTaskModel],
    Nib: Vector[Vector[Int]],
    biasModels: Seq[Option[Model[Double]]],
    rescaleRatios: Seq[Double]
) extends MultiTaskModel {

  // The number of labels is read from the first model in the ensemble
  override val numLabels: Int = ensembleModels.head.numLabels

  // One bagged model per label, built lazily; real-valued labels are given Nib, their rescale ratio and their bias model.
  // NOTE(review): the constructor names, the value of thisLabelsModels and the non-real branch are truncated in this copy.
  override lazy val models: Vector[BaggedModel[Any]] = Vector.tabulate(numLabels) { i =>
    val thisLabelsModels =
    if (realLabels(i)) {
        Nib = Nib,
        rescaleRatio = rescaleRatios(i),
        biasModel = biasModels(i)
    } else {

  // Predict on a batch of inputs and wrap the result together with the real-label flags.
  // NOTE(review): the first argument of MultiTaskBaggedPrediction is truncated in this copy.
  override def transform(inputs: Seq[Vector[Any]]): MultiTaskBaggedPrediction =
    MultiTaskBaggedPrediction(, realLabels)

  // Real-label flags are read from the first model in the ensemble
  override def realLabels: Seq[Boolean] = ensembleModels.head.realLabels

/**
  * Container with model-wise predictions for each label and the machinery to compute (co)variance.
  *
  * @param labelPredictions bagged prediction results for each label
  * @param realLabels       a boolean sequence indicating which labels are real-valued
  */
case class MultiTaskBaggedPrediction(labelPredictions: Vector[BaggedPrediction[Any]], realLabels: Seq[Boolean])
    extends BaggedPrediction[Vector[Any]]
    with MultiTaskModelPredictionResult {

  // The number of predictions is read from the first label's prediction result
  override def numPredictions: Int = labelPredictions.head.numPredictions

  // NOTE(review): the body of `expected` is missing in this copy of the file
  override def expected: Seq[Vector[Any]] =

  // NOTE(review): the start of this expression is truncated in this copy; the visible part wraps each
  // transposed element in a ParallelModelsPredictionResult
  override def ensemblePredictions: Seq[PredictionResult[Vector[Any]]] =
      .map(x => ParallelModelsPredictionResult(x.transpose))

  // For each prediction, the uncertainty is a sequence of entries for each label. Missing uncertainty values are reported as NaN
  // NOTE(review): the collection being mapped over is truncated in this copy
  override def uncertainty(observational: Boolean = true): Option[Seq[Seq[Any]]] = {
    Some( { predictionResult =>
      predictionResult.uncertainty(observational) match {
        case Some(value) => value
        case None        => Seq.fill(numPredictions)(Double.NaN)

  // Correlation between the uncertainties of labels i and j, one value per prediction.
  // A real-valued label paired with itself gives 1.0 everywhere.
  // Two different real-valued labels give the observational correlation when `observational` is true.
  // Any pair that includes a non-real label gives None.
  // NOTE(review): the non-observational branch is missing in this copy (presumably uncertaintyCorrelationMean — confirm).
  override def uncertaintyCorrelation(i: Int, j: Int, observational: Boolean = true): Option[Seq[Double]] = {
    (realLabels(i), realLabels(j)) match {
      case (true, true) if i == j => Some(Seq.fill(numPredictions)(1.0))
      case (true, true) =>
        if (observational) {
          Some(uncertaintyCorrelationObservational(i, j))
        } else {
      case _: Any => None

  /**
    * The uncertainty correlation of the observational distribution is the correlation coefficient
    * calculated over the bootstrap ensemble predictions.
    */
  // Computes one correlation value per prediction from the per-bag predictions of labels i and j,
  // using StatsUtils.correlation on each pair of per-bag sequences.
  // NOTE(review): the right-hand sides of both prediction matrices and the zip/map that pairs them
  // are truncated in this copy of the file.
  private def uncertaintyCorrelationObservational(i: Int, j: Int): Seq[Double] = {
    // make (# predictions) x (# bags) prediction matrices for each label
    val baggedPredictionsI =
    val baggedPredictionsJ =
      labelPredictions(j)[Seq[Seq[Double]]] {
      case (bagsI, bagsJ) =>
        StatsUtils.correlation(bagsI, bagsJ)

  /**
    * The uncertainty correlation of the mean distribution is 0.0. In theory it should be estimated using the jackknife,
    * but in practice the jackknife performs poorly when estimating covariance, so we default to the trivial implementation for now.
    */
  private def uncertaintyCorrelationMean: Seq[Double] = {
    // Trivial estimate: every prediction is reported with zero correlation
    val uncorrelated = 0.0
    Seq.fill(numPredictions)(uncorrelated)
  }

© 2015 - 2025 Weber Informatics LLC | Privacy Policy