package spire.example

import spire.algebra._
import spire.math._
import spire.implicits._

import scala.{ specialized => spec }
import scala.annotation.tailrec
import scala.util.Random.nextInt

import scala.reflect.ClassTag

import CrossValidation._


/**
 * An example of constructing Random Forests for both regression and
 * classification. This example shows off the utility of vector spaces (in this
 * case `CoordinateSpace`), fields, and orders to create random forests.
 */
object RandomForestExample extends App {

  // The Iris data set uses `Vector[Rational]`.
  testClassification(DataSet.Iris, RandomForestOptions())

  // The Yeast data set uses `Array[Double]`.
  testClassification(DataSet.Yeast, RandomForestOptions(
    numAxesSample = Some(2), numPointsSample = Some(200),
    numTrees = Some(200), minSplitSize = Some(3)))

  // The MPG data set uses `Array[Double]`.
  testRegression[Array[Double], Double](DataSet.MPG, RandomForestOptions(
    numPointsSample = Some(200), numTrees = Some(50)))


  def testClassification[V, @spec(Double) F, K](dataset: DataSet[V, F, K], opts: RandomForestOptions)(implicit
      order: Order[F], classTagV: ClassTag[V], classTagK: ClassTag[K], real: IsReal[F]) {

    println(s"\n${dataset.describe}\n")
    println(s"Cross-validating ${dataset.name} with random forest classification...")
    val accuracy = crossValidateClassification(dataset) { implicit space => data =>
      RandomForest.classification(data, opts)
    }
    println("... accuracy of %.2f%%\n" format (real.toDouble(accuracy) * 100))
  }

  def testRegression[V, @spec(Double) F](dataset: DataSet[V, F, F], opts: RandomForestOptions)(implicit
      order: Order[F], classTagV: ClassTag[V], classTagF: ClassTag[F], real: IsReal[F]) {

    println(s"\n${dataset.describe}\n")
    println(s"Cross-validating ${dataset.name} with random forest regression...")
    val rSquared = crossValidateRegression(dataset) { implicit space => data =>
      RandomForest.regression(data, opts)
    }
    println("... R^2 of %.3f" format real.toDouble(rSquared))
  }
}


/**
 * Random forests have a lot of knobs, so they are all stored in this class
 * for ease-of-use.
 */
case class RandomForestOptions(
  numAxesSample: Option[Int] = None,   // # of variables sampled each split.
  numPointsSample: Option[Int] = None, // # of points sampled per tree.
  numTrees: Option[Int] = None,        // # of trees created.
  minSplitSize: Option[Int] = None,    // Min. node size required for split.
  parallel: Boolean = true)            // Build trees in parallel.


/**
 * The common bits between regression and classification random forests. The
 * only real difference is how we determine the "disparity" or "error" in a
 * region of the tree. So, our outputs all belong to some type we don't really
 * care about. We then have a way of determining the error of some subset of
 * these outputs using the `Region`.
 */
trait RandomForest[V, @spec(Double) F, @spec(Double) K] {
  implicit def V: CoordinateSpace[V, F]
  implicit def F: Field[F] = V.scalar
  implicit def order: Order[F]
  implicit def vectorClassTag: ClassTag[V]

  // We need to be able to incrementally update the disparity. This is because,
  // for performance reasons, we want to do a linear sweep of some axis in a
  // region, maintaining the disparity of the region before the sweep line and
  // the region after the sweep line. We do this by updating the disparity as
  // the sweep line passes over a point, removing it from one region and adding
  // it to the other.

  protected trait RegionLike {
    def +(k: K): Region
    def -(k: K): Region
    def error: F
    def value: K
  }
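
  // For illustration only (not part of the original file): a hedged sketch of
  // how a `Region` is meant to be swept along a sorted axis. Here `out` and
  // `fullRegion` are hypothetical names for the outputs array and the region
  // containing every member:
  //
  //   var left  = Region.empty   // members before the sweep line
  //   var right = fullRegion     // members after the sweep line
  //   // As the sweep line passes member j, it moves from right to left:
  //   left  += out(j)
  //   right -= out(j)
  //   // left.error and right.error stay current in O(1) per step.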

  protected trait RegionCompanion {
    def empty: Region
  }

  protected type Region <: RegionLike
  protected def Region: RegionCompanion

  // A forest is just a bunch of trees.

  protected case class Forest(trees: List[DecisionTree[V, F, K]])

  // A version of `RandomForestOptions` that doesn't have any unknown values.

  protected case class FixedOptions(
    numAxesSample: Int,
    numPointsSample: Int,
    numTrees: Int,
    minSplitSize: Int,
    parallel: Boolean)

  /**
   * Construct a random forest.
   */
  protected def randomForest(data: Array[V], outputs: Array[K], opts: FixedOptions): Forest = {
    require(opts.numAxesSample <= V.dimensions, "Cannot sample more dimensions than exist in V.")
    require(data.length == outputs.length, "Number of dependent and independent variables must match.")

    // Selects a set of `m` predictors to use as coordinate indices. The
    // sampling is done using a variant of Knuth's shuffle.

    def predictors(): Array[Int] = {
      val indices = new Array[Int](opts.numAxesSample)
      cfor(0)(_ < indices.length, _ + 1) { i => indices(i) = i }
      cfor(V.dimensions - 1)(_ >= indices.length, _ - 1) { i =>
        val j = nextInt(i + 1)
        if (j < indices.length)
          indices(j) = i
      }
      indices
    }
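
    // For example (illustrative numbers only): with `V.dimensions = 4` and
    // `numAxesSample = 2`, `indices` starts as [0, 1]; `i` then runs over 3
    // and 2, and dimension `i` overwrites a sampled slot only when
    // `nextInt(i + 1)` falls below `indices.length`, so every axis has some
    // chance of ending up in the sample.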

    // Randomly samples `n` points with replacement from `data`. Note that our
    // sample is actually an array of indices. 

    def sample(): Array[Int] = {
      val sample = new Array[Int](opts.numPointsSample)
      cfor(0)(_ < sample.length, _ + 1) { i =>
        sample(i) = nextInt(data.length)
      }
      sample
    }

    // Convenience method to quickly create a full region from a set of
    // members.

    def region(members: Array[Int]): Region = {
      var d = Region.empty
      cfor(0)(_ < members.length, _ + 1) { i =>
        d += outputs(members(i))
      }
      d
    }

    // Grows a decision tree from a single region. The tree will keep growing
    // until we hit the minimum region size.

    def growTree(members: Array[Int]): DecisionTree[V, F, K] = {
      if (members.length < opts.minSplitSize) {
        Leaf(region(members).value)
      } else {
        val region0 = region(members)
        val vars = predictors()

        var minError = region0.error
        var minVar = -1
        var minIdx = -1

        cfor(0)(_ < vars.length, _ + 1) { i =>
          val axis = vars(i)
          var leftRegion = Region.empty
          var rightRegion = region0

          // To determine the optimal split point along an axis, we first sort
          // all the members along this axis. This lets us use a sweep-line to
          // update the left/right regions in O(1) time, so our total time to
          // check is dominated by sorting in O(n log n).

          members.qsortBy(data(_).coord(axis))

          cfor(0)(_ < (members.length - 1), _ + 1) { j =>

            // We move point j from the right region to the left and see if our
            // error is reduced.

            leftRegion += outputs(members(j))
            rightRegion -= outputs(members(j))
            val error = (leftRegion.error * (j + 1) +
                         rightRegion.error * (members.length - j - 1)) / members.length
            if (error < minError) {
              minError = error
              minVar = axis
              minIdx = j
            }
          }
        }

        // If we can never do better than our initial region, then split the
        // middle of some random axis -- we can probably do better here. It
        // would actually be nice to try splitting again with a new set of
        // predictors, but we'd need a way to bound the number of retries.

        if (minIdx < 0) {
          minVar = vars(vars.length - 1)
          minIdx = members.length / 2
        }

        // We could do this in a single linear scan, but this is an example.

        if (minVar != vars(vars.length - 1)) { // Try to avoid a sort if we can.
          members.qsortBy(data(_).coord(minVar))
        }

        // We split the region directly between the left's furthest right point
        // and the right's furthest left point.

        val boundary = (data(members(minIdx)).coord(minVar) +
                        data(members(minIdx + 1)).coord(minVar)) / 2
        val left = members take (minIdx + 1)
        val right = members drop (minIdx + 1)
        Split(minVar, boundary, growTree(left), growTree(right))
      }
    }
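
    // A worked example of the split score used above (illustrative numbers
    // only): with 10 members, after moving point j = 3 the left region holds
    // 4 points and the right holds 6. If leftRegion.error = 0.5 and
    // rightRegion.error = 0.25, the candidate split scores
    // (0.5 * 4 + 0.25 * 6) / 10 = 0.35, which replaces the best split only if
    // it beats the error seen so far.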

    // Random forests are embarrassingly parallel. Except for very small
    // datasets, there is no reason not to parallelize the algorithm.

    if (opts.parallel) {
      Forest((1 to opts.numTrees).toList.par.map({ _ =>
        growTree(sample())
      }).toList)
    } else {
      Forest(List.fill(opts.numTrees)(growTree(sample())))
    }
  }

  protected def fromForest(forest: Forest): V => K

  protected def defaultOptions(size: Int): FixedOptions

  private def fixOptions(size: Int, options: RandomForestOptions): FixedOptions = {
    val defaults = defaultOptions(size)
    FixedOptions(
      options.numAxesSample getOrElse defaults.numAxesSample,
      options.numPointsSample getOrElse defaults.numPointsSample,
      options.numTrees getOrElse defaults.numTrees,
      options.minSplitSize getOrElse defaults.minSplitSize,
      options.parallel)
  }

  def apply(data: Array[V], out: Array[K], options: RandomForestOptions) = {
    fromForest(randomForest(data, out, fixOptions(data.length, options)))
  }
}


/**
 * A `RandomForest` implementation for regression. In regression, the output
 * type is assumed to lie in the same field as the input vectors' scalars. The
 * final predicted output is the average of the individual trees' outputs (each
 * of which is just the mean of all outputs in the region the point lands in).
 */
class RandomForestRegression[V, @spec(Double) F](implicit val V: CoordinateSpace[V, F],
    val order: Order[F], val vectorClassTag: ClassTag[V]) extends RandomForest[V, F, F] {

  // Our "disparity" measure is just the squared error of the region.
  // We could be more careful here and use a "stable" incremental mean and
  // variance, like that described in [1], but this is simpler for now.
  // [1]: http://nfs-uxsup.csx.cam.ac.uk/~fanf2/hermes/doc/antiforgery/stats.pdf

  protected final class SquaredError(sum: F, sumSq: F, count: Int) extends RegionLike {
    def +(k: F) = new SquaredError(sum + k, sumSq + (k * k), count + 1)
    def -(k: F) = new SquaredError(sum - k, sumSq - (k * k), count - 1)
    def error: F = sumSq / count - (sum / count) ** 2  // Error = variance.
    def value: F = sum / count
  }
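
  // A worked example of the variance identity above (illustrative numbers
  // only): for outputs {1, 2, 3} we have sum = 6, sumSq = 14 and count = 3,
  // so error = 14/3 - (6/3)^2 = 2/3 and value = 6/3 = 2, the region's mean.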

  protected type Region = SquaredError
  object Region extends RegionCompanion {
    def empty = new SquaredError(F.zero, F.zero, 0)
  }

  protected def defaultOptions(size: Int): FixedOptions = {
    val axes = math.max(V.dimensions / 3, math.min(V.dimensions, 2))
    val sampleSize = math.max(size * 2 / 3, 1)
    FixedOptions(axes, sampleSize, size, 5, true)
  }

  protected def fromForest(forest: Forest): V => F = { v =>
    forest.trees.map(_(v)).qmean
  }
}


/**
 * A `RandomForest` implementation for classification. In this case, the
 * outputs (dependent variable) belongs to some type `K`. This type needs to be
 * a well behaved Java object as its `equals` and `hashCode` will be used to
 * determine equality of classes. This implementation uses a majority vote
 * method to determine classification. Each region in a tree is associated with
 * the most popular class in that region. Ties are broken randomly (not really).
 * Within a forest, each tree casts its vote for classification of a point and
 * the majority wins. Again, ties are broken randomly (again, not really).
 */
class RandomForestClassification[V, @spec(Double) F, K](implicit val V: CoordinateSpace[V, F],
    val order: Order[F], val vectorClassTag: ClassTag[V]) extends RandomForest[V, F, K] {

  // Our "disparity" measure here is the Gini index. It basically measures how
  // homogeneous our region is, giving regions of high variability higher
  // scores.

  protected final class GiniIndex(m: Map[K, Int]) extends RegionLike {
    def +(k: K) = new GiniIndex(m + (k -> (m.getOrElse(k, 0) + 1)))
    def -(k: K) = new GiniIndex(m + (k -> (m.getOrElse(k, 0) - 1)))
    def error: F = {
      val n = F.fromInt(m.foldLeft(0)(_ + _._2))
      m.foldLeft(F.zero) { case (idx, (k, cnt)) =>
        val p = F.fromInt(cnt) / n
        idx + p * (F.one - p)
      }
    }
    def value: K = m.maxBy(_._2)._1
  }
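
  // A worked example of the Gini impurity above (illustrative counts only):
  // with m = Map(A -> 3, B -> 1) we have n = 4, so
  // error = (3/4)(1/4) + (1/4)(3/4) = 3/8, while a pure region such as
  // Map(A -> 4) scores 0.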

  protected type Region = GiniIndex
  object Region extends RegionCompanion {
    def empty = new GiniIndex(Map.empty)
  }

  protected def defaultOptions(size: Int): FixedOptions = {
    val axes = math.max(math.sqrt(V.dimensions.toDouble).toInt, math.min(V.dimensions, 2))
    val sampleSize = math.max(size * 2 / 3, 1)
    FixedOptions(axes, sampleSize, size, 5, true)
  }

  protected def fromForest(forest: Forest): V => K = { v =>
    forest.trees.foldLeft(Map.empty[K, Int]) { (acc, classify) =>
      val k = classify(v)
      acc + (k -> (acc.getOrElse(k, 0) + 1))
    }.maxBy(_._2)._1
  }
}


object RandomForest {

  def regression[V, @spec(Double) F](data: Array[V], out: Array[F], options: RandomForestOptions)(implicit
      V: CoordinateSpace[V, F], order: Order[F], ev: ClassTag[V]): V => F = {
    val rfr = new RandomForestRegression[V, F]
    rfr(data, out, options)
  }

  def regression[V, @spec(Double) F](data: Iterable[V], out: Iterable[F],
      options: RandomForestOptions)(implicit V: CoordinateSpace[V, F], order: Order[F],
      classTagV: ClassTag[V], classTagF: ClassTag[F]): V => F = {
    regression(data.toArray, out.toArray, options)
  }

  def regression[V, @spec(Double) F](data: Iterable[(V, F)], options: RandomForestOptions)(implicit
      V: CoordinateSpace[V, F], order: Order[F],
      classTagV: ClassTag[V], classTagF: ClassTag[F]): V => F = {
    val (in, out) = data.unzip
    regression(in.toArray, out.toArray, options)
  }

  def classification[V, @spec(Double) F, K](data: Array[V], out: Array[K], options: RandomForestOptions)(implicit
      V: CoordinateSpace[V, F], order: Order[F], ev: ClassTag[V]): V => K = {
    val rfc = new RandomForestClassification[V, F, K]
    rfc(data, out, options)
  }

  def classification[V, @spec(Double) F, K](data: Iterable[V], out: Iterable[K],
      options: RandomForestOptions)(implicit V: CoordinateSpace[V, F],
      order: Order[F], classTagV: ClassTag[V], classTagK: ClassTag[K]): V => K = {
    classification(data.toArray, out.toArray, options)
  }

  def classification[V, @spec(Double) F, K](data: Iterable[(V, K)], options: RandomForestOptions)(implicit
      V: CoordinateSpace[V, F], order: Order[F],
      classTagV: ClassTag[V], classTagK: ClassTag[K]): V => K = {
    val (in, out) = data.unzip
    classification(in.toArray, out.toArray, options)
  }
}
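
// For illustration only (not part of the original file): a hedged sketch of
// calling the classification entry point on in-memory data. It assumes Spire
// exposes a `CoordinateSpace.array[Double](dimensions)` instance for
// `Array[Double]` vectors; the data and labels are made up:
//
//   implicit val space = CoordinateSpace.array[Double](2)
//   val points  = Array(Array(0.0, 0.0), Array(0.1, 0.2), Array(5.0, 5.1))
//   val labels  = Array("low", "low", "high")
//   val predict = RandomForest.classification(points, labels, RandomForestOptions())
//   predict(Array(0.05, 0.1))  // expected to be "low" on most runs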


/**
 * A simple decision tree. Each internal node is assigned an axis aligned
 * boundary which divides the space in 2 (left and right). To determine the
 * value of an input point, we simple determine which side of the boundary line
 * the input lies on, then recurse on that side. When we reach a leaf node, we
 * output its value.
 */
sealed trait DecisionTree[V, F, K] {
  def apply(v: V)(implicit V: CoordinateSpace[V, F], F: Order[F]): K = {
    @tailrec def loop(tree: DecisionTree[V, F, K]): K = tree match {
      case Split(i, boundary, left, right) =>
        if (v.coord(i) <= boundary) loop(left) else loop(right)
      case Leaf(k) =>
        k
    }

    loop(this)
  }
}

case class Split[V, F, K](variable: Int, boundary: F,
    left: DecisionTree[V, F, K], right: DecisionTree[V, F, K]) extends DecisionTree[V, F, K]

case class Leaf[V, F, K](value: K) extends DecisionTree[V, F, K]
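
// For illustration only (not part of the original file): a hedged sketch of a
// hand-built tree. A split on axis 0 with boundary 0.5 sends points whose
// first coordinate is <= 0.5 to the left leaf and everything else to the
// right:
//
//   val tree: DecisionTree[Array[Double], Double, String] =
//     Split(0, 0.5, Leaf("small"), Leaf("large"))
//   // With a CoordinateSpace[Array[Double], Double] and an Order[Double] in
//   // scope, tree(Array(0.3, 9.9)) evaluates to "small".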



