// io.citrine.lolo.util.LoloPyDataLoader.scala Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of lolo_2.13 Show documentation
// A random forest-centered machine learning library in Scala.
// There is a newer version: 6.6.2
package io.citrine.lolo.util

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.ByteOrder
import java.util.zip._
import io.citrine.lolo.api.{MultiTaskModelPredictionResult, PredictionResult, TrainingRow}

/**
  * Tool used to transfer data from LoloPy to the JVM
  *
  * Works by receiving a byte array and converting it to a Scala sequence
  */
object LoloPyDataLoader {

  /**
    * Decode a flat byte array of 8-byte doubles into rows of features.
    *
    * The doubles are read in the order they appear in `input` and split into
    * consecutive groups of `numAttributes` values, i.e. the data is treated as
    * row-major (C-ordered). Bytes left over after the last complete double are
    * ignored, and if the number of doubles is not a multiple of `numAttributes`
    * the final row is shorter than the others.
    *
    * @param input Raw bytes holding the feature values
    * @param numAttributes Number of features per row; must be positive
    * @param bigEndian Whether the doubles are stored big-endian (otherwise little-endian)
    * @return One vector of feature values per row
    */
  def getFeatureArray(input: Array[Byte], numAttributes: Integer, bigEndian: Boolean): Seq[Vector[Double]] = {
    val byteOrder = if (bigEndian) ByteOrder.BIG_ENDIAN else ByteOrder.LITTLE_ENDIAN
    val source = ByteBuffer.wrap(input).order(byteOrder)

    // Absolute reads: the i-th double starts at byte offset 8 * i
    val values = Vector.tabulate(input.length / 8)(i => source.getDouble(i * 8))

    // Chop the flat list of values into rows
    values.grouped(numAttributes).toVector
  }

  /**
    * Decode a byte array into a 1D sequence of doubles or integers.
    *
    * Doubles are read as 8-byte values and integers as 4-byte values. Bytes left
    * over after the last complete value are ignored.
    *
    * @param input Input byte array
    * @param getDouble Whether to read doubles (otherwise integers are read)
    * @param bigEndian Whether the values are stored big-endian (otherwise little-endian)
    * @return The decoded values, in the order they appear in `input`
    */
  def get1DArray(input: Array[Byte], getDouble: Boolean, bigEndian: Boolean): Seq[Any] = {
    val byteOrder = if (bigEndian) ByteOrder.BIG_ENDIAN else ByteOrder.LITTLE_ENDIAN
    val source = ByteBuffer.wrap(input).order(byteOrder)

    // Absolute reads: the i-th value starts at byte offset (value width) * i
    if (getDouble) Vector.tabulate(input.length / 8)(i => source.getDouble(i * 8))
    else Vector.tabulate(input.length / 4)(i => source.getInt(i * 4))
  }

  /**
    * Combine features, labels, and weights into training rows.
    *
    * The three sequences are walked in lockstep, so the i-th row is built from
    * the i-th entry of each. If the sequences differ in length, the result stops
    * at the end of the shortest one.
    *
    * @param X Feature array, one vector per row
    * @param y Label array, one label per row
    * @param w Weight array, one weight per row
    * @tparam T Type of the labels
    * @return Sequence of training row objects
    */
  def buildTrainingRows[T](X: Seq[Vector[Double]], y: Seq[T], w: Seq[Double]): Seq[TrainingRow[T]] = {
    val aligned = X.lazyZip(y).lazyZip(w)
    aligned.map((features, label, weight) => TrainingRow(features, label, weight))
  }

  /**
    * Encode the expected values of a regression prediction as bytes.
    *
    * The expected values are assumed to be doubles. The cast is unchecked, so
    * a value of another type only fails once it is written out.
    *
    * @param predictionResult Prediction result object
    * @return Byte array of doubles in native system order
    */
  def getRegressionExpected(predictionResult: PredictionResult[Any]): Array[Byte] =
    send1DArray(predictionResult.expected.asInstanceOf[Seq[Double]])

  /**
    * Encode the expected values of a multitask regression prediction as bytes.
    *
    * Every expected value is assumed to be a sequence of doubles. The nested
    * sequences are concatenated in order into a single flat array.
    *
    * @param predictionResult result of predicting on a multitask model
    * @return Byte array of doubles in native system order (the caller must then reshape the result into a 2d array)
    */
  def getMultiRegressionExpected(predictionResult: MultiTaskModelPredictionResult): Array[Byte] = {
    val nested = predictionResult.expected.asInstanceOf[Seq[Seq[Double]]]
    send1DArray(nested.flatten)
  }

  /**
    * Encode the training entry importance scores for the Python client.
    *
    * The nested scores are concatenated in order into a single flat array.
    * NOTE(review): `importanceScores` is presumably an Option, in which case
    * `.get` throws when no scores are available -- confirm against PredictionResult.
    *
    * @param predictionResult Prediction result object
    * @return Byte array of doubles in native system order
    */
  def getImportanceScores(predictionResult: PredictionResult[Any]): Array[Byte] = {
    val scores = predictionResult.importanceScores.get
    send1DArray(scores.flatten)
  }

  /**
    * Encode the uncertainties of a regression prediction as bytes.
    *
    * The uncertainties are assumed to be doubles. Calling `.get` on the
    * optional uncertainty throws if the prediction result carries none.
    *
    * @param predictionResult Prediction result object
    * @return Byte array of doubles in native system order
    */
  def getRegressionUncertainty(predictionResult: PredictionResult[Any]): Array[Byte] =
    send1DArray(predictionResult.uncertainty().get.asInstanceOf[Seq[Double]])

  /**
    * Encode the uncertainties of a multitask regression prediction as bytes.
    *
    * Every uncertainty is assumed to be a sequence of doubles. The nested
    * sequences are concatenated in order into a single flat array. Calling
    * `.get` on the uncertainty throws if the prediction result carries none.
    *
    * @param predictionResult result of predicting on a multitask model
    * @return Byte array of doubles in native system order (the caller must then reshape the result into a 2d array)
    */
  def getMultiRegressionUncertainty(predictionResult: MultiTaskModelPredictionResult): Array[Byte] = {
    val nested = predictionResult.uncertainty().get.asInstanceOf[Seq[Seq[Double]]]
    send1DArray(nested.flatten)
  }

  /**
    * Encode the correlation coefficients between the uncertainties of two outputs
    * of a multitask regression model.
    *
    * Calling this method for all (i, j) pairs yields the entries of a correlation
    * matrix; together with getMultiRegressionUncertainty these can be combined into
    * a covariance matrix. Calling `.get` on the looked-up correlation throws if
    * none is available for the requested pair.
    *
    * @param predictionResult result of predicting on a multitask model
    * @param i index of the first output
    * @param j index of the second output
    * @return Byte array of doubles in native system order
    */
  def getRegressionCorrelation(predictionResult: MultiTaskModelPredictionResult, i: Int, j: Int): Array[Byte] =
    send1DArray(predictionResult.uncertaintyCorrelation(i, j).get)

  /**
    * Convert a 1D sequence of doubles into a byte array ready to be sent.
    *
    * Each value occupies 8 bytes, and values are written in the order they
    * appear in `data`.
    *
    * @param data Data to be sent
    * @return Byte array holding every double of `data` in the system's native byte order
    */
  def send1DArray(data: Seq[Double]): Array[Byte] = {
    val out = ByteBuffer.allocate(data.length * 8).order(ByteOrder.nativeOrder())
    for (value <- data) out.putDouble(value)
    // The buffer was allocated at exactly the required size, so its backing array is the result
    out.array()
  }

  /**
    * Encode the predicted classes of a classification model as bytes.
    *
    * Note: The class labels must be integers. Each label is written as a
    * 4-byte integer, in prediction order.
    *
    * @param predictionResult Prediction result from a classifier
    * @return Bytes of an integer array of the predicted class labels, in native system order
    */
  def getClassifierExpected(predictionResult: PredictionResult[Any]): Array[Byte] = {
    val labels = predictionResult.expected.asInstanceOf[Seq[Int]]
    val out = ByteBuffer.allocate(labels.length * 4).order(ByteOrder.nativeOrder())
    for (label <- labels) out.putInt(label)
    out.array
  }

  /**
    * Encode the classification probabilities as a flat array of doubles.
    *
    * The uncertainty of each prediction is treated as a map from class label to
    * probability. For every prediction the probabilities of classes
    * 0, 1, ..., nClasses - 1 are written in that order; a class missing from the
    * map is written as 0.0.
    *
    * Note: Model must be trained with sequential nonnegative integers starting at 0 as class names.
    *
    * @param predictionResult Prediction result from a classifier
    * @param nClasses Number of classes in the problem
    * @return Bytes of a double array of probabilities in native system order, stored row-wise
    *         (i.e., first entry, then second, ...)
    */
  def getClassifierProbabilities(predictionResult: PredictionResult[Any], nClasses: Int): Array[Byte] = {
    val distributions = predictionResult.uncertainty().get.asInstanceOf[Seq[Map[Int, Double]]]

    // One probability per (prediction, class) pair, with the class index varying fastest
    val flattened = for {
      distribution <- distributions
      label <- 0 until nClasses
    } yield distribution.getOrElse(label, 0.0)

    val out = ByteBuffer.allocate(nClasses * distributions.length * 8).order(ByteOrder.nativeOrder())
    flattened.foreach(probability => out.putDouble(probability))
    out.array
  }

  /**
    * Convert a JVM object to a compressed byte array
    *
    * Used for saving a model in lolopy inside of a pickle file. The object is
    * written with Java serialization and compressed with DEFLATE; the result
    * can be read back with [[deserializeObject]].
    *
    * @param obj Object to be saved
    * @param compressLevel Compression level used to condense the serialized representation
    * @return Object as a serialized, compressed byte array
    * @throws java.io.NotSerializableException if `obj` (or something it references) is not serializable
    */
  def serializeObject(obj: Any, compressLevel: Int): Array[Byte] = {
    // Thanks to: https://stackoverflow.com/questions/39369319/convert-any-type-in-scala-to-arraybyte-and-back
    val stream = new ByteArrayOutputStream()
    val deflater = new Deflater(compressLevel)
    try {
      val oos = new ObjectOutputStream(new DeflaterOutputStream(stream, deflater))
      // Closing the object stream also closes the compressing stream, which
      // finishes the DEFLATE data; do it even if writing fails.
      try oos.writeObject(obj)
      finally oos.close()
    } finally {
      // DeflaterOutputStream only releases a deflater it created itself, so a
      // caller-supplied one must be ended explicitly to free its native memory.
      deflater.end()
    }
    stream.toByteArray
  }

  /**
    * Deserialize a JVM object from a compressed byte array
    *
    * Reverses [[serializeObject]]: the bytes are inflated and then read with
    * Java serialization.
    *
    * @param bytes Bytes to be deserialized
    * @return The object
    * @throws java.io.IOException if the bytes are not valid compressed serialization data
    * @throws java.lang.ClassNotFoundException if the class of the serialized object cannot be found
    */
  def deserializeObject(bytes: Array[Byte]): Any = {
    // NOTE(security): Java deserialization can execute code chosen by whoever
    // produced the bytes. Only call this with data from a trusted source.
    val inflated = new InflaterInputStream(new ByteArrayInputStream(bytes))
    try {
      // The ObjectInputStream constructor already reads the stream header, so
      // it has to sit inside the try for the inflater to be released on failure.
      val objectStream = new ObjectInputStream(inflated)
      objectStream.readObject()
    } finally {
      // Closing the inflating stream releases its inflater's native memory
      inflated.close()
    }
  }

  /**
    * Wrap precomputed means and uncertainties in a PredictionResult object
    *
    * The returned object reports `thisExpected` as its expected values and
    * `thisUncertainty` as its uncertainty, regardless of the `includeNoise` flag.
    *
    * @param thisExpected    Mean of the predictions of a model
    * @param thisUncertainty Uncertainty of the predictions
    * @return Prediction result object
    */
  def makeRegressionPredictionResult(
      thisExpected: Seq[Double],
      thisUncertainty: Seq[Double]
  ): PredictionResult[Double] =
    new PredictionResult[Double] {
      // The supplied means are returned as-is
      override def expected: Seq[Double] = thisExpected

      // The uncertainty is always present; `includeNoise` does not affect it
      override def uncertainty(includeNoise: Boolean = true): Option[Seq[Any]] = Some(thisUncertainty)
    }
}