All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twosigma.flint.rdd.function.summarize.summarizer.RegressionSummarizer.scala Maven / Gradle / Ivy

The newest version!
/*
 *  Copyright 2017 TWO SIGMA OPEN SOURCE, LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.twosigma.flint.rdd.function.summarize.summarizer

import breeze.linalg.{ DenseVector, DenseMatrix }

case class RegressionRow(
  time: Long,
  x: Array[Double],
  y: Double,
  weight: Double
)

object RegressionSummarizer {
  /**
   *  Transform a [[RegressionRow]] into sqrt-weighted predictor, sqrt-weighted response, original response, and
   *  original weight.
   *
   * @param r               A [[RegressionRow]].
   * @param shouldIntercept If true, it will prepend 1.0 to the predictor.
   * @param isWeighted      If true, both of response and predictor will be multiplied by a factor of square root
   *                        of weight for the weighted regression purpose.
   * @return  sqrt-weighted x, sqrt-weighted y,  original y, original weight,
   */
  protected[summarizer] def transform(
    r: RegressionRow,
    shouldIntercept: Boolean,
    isWeighted: Boolean
  ): (Array[Double], Double, (Double, Double)) = {
    val w = if (isWeighted) r.weight else 1.0
    val sqrtW = Math.sqrt(w)
    if (shouldIntercept) {
      // Prepend the 1.0 at the beginning.
      val x = new Array[Double](r.x.length + 1)
      var i = 0
      while (i < r.x.length) {
        x(i + 1) = r.x(i) * sqrtW
        i += 1
      }
      x(0) = sqrtW
      (x, r.y * sqrtW, (r.y, w))
    } else {
      val x = new Array[Double](r.x.length)
      var i = 0
      while (i < r.x.length) {
        x(i) = r.x(i) * sqrtW
        i += 1
      }
      (x, r.y * sqrtW, (r.y, w))
    }
  }

  protected[summarizer] def transform(
    rows: Seq[RegressionRow],
    shouldIntercept: Boolean,
    isWeighted: Boolean
  ): (DenseMatrix[Double], DenseVector[Double], Array[(Double, Double)]) = {
    val (weightedX, weightedY, yw) =
      rows.map(transform(_, shouldIntercept, isWeighted)).unzip3
    (DenseMatrix(weightedX: _*), DenseVector(weightedY.toArray), yw.toArray)
  }

  /**
   * Find the value of the Gaussian log-likelihood function at the data
   * We use the statsmodels' implementation as a reference:
   * [[http://statsmodels.sourceforget.net/stable/_modules/statsmodels/regression/linear_model.html#WLS.loglike loglike]]
   */
  protected[summarizer] def computeLogLikelihood(
    count: Long,
    sumOfLogWeights: Double,
    residualSumOfSquares: Double
  ): Double = {
    val nOver2: Double = count.toDouble / 2.0
    var logLikelihood: Double = -nOver2 * math.log(residualSumOfSquares)
    logLikelihood += -nOver2 * (1.0 + math.log(math.Pi / nOver2))
    logLikelihood += 0.5 * sumOfLogWeights
    logLikelihood
  }

  /**
   * Find the Bayes information criterion of the data given the fitted model
   * [[https://en.wikipedia.org/wiki/Bayesian_information_criterion Bayesian information criterion]]
   */
  protected[summarizer] def computeBayesIC(
    beta: DenseVector[Double],
    logLikelihood: Double,
    count: Long,
    shouldIntercept: Boolean
  ): Double = {
    val k = beta.length
    -2.0 * logLikelihood + k * math.log(count.toDouble)
  }

  /**
   * Find the Akaike information criterion of the data given the fitted model
   * [[https://en.wikipedia.org/wiki/Akaike_information_criterion Akaike information criterion]]
   */
  protected[summarizer] def computeAkaikeIC(
    beta: DenseVector[Double],
    logLikelihood: Double,
    shouldIntercept: Boolean
  ): Double = {
    val k = beta.length
    -2.0 * logLikelihood + 2.0 * k
  }

  protected[summarizer] def computeResidualSumOfSquares(
    beta: DenseVector[Double],
    sumOfYSquared: Double,
    vectorOfXY: DenseVector[Double],
    matrixOfXX: DenseMatrix[Double]
  ): Double = {
    val k = beta.length
    require(matrixOfXX.rows == matrixOfXX.cols && vectorOfXY.length == k && matrixOfXX.cols == k)
    var residualSumOfSquares = sumOfYSquared
    var i = 0
    while (i < beta.length) {
      var rss = -2.0 * vectorOfXY(i)
      rss += beta(i) * matrixOfXX(i, i)
      var j = 0
      while (j < i) {
        rss += 2.0 * beta(j) * matrixOfXX(i, j)
        j = j + 1
      }
      residualSumOfSquares += rss * beta(i)
      i = i + 1
    }
    residualSumOfSquares
  }

  protected[summarizer] def computeRSquared(
    sumOfYSquared: Double,
    sumOfWeights: Double,
    sumOfY: Double,
    residualSumOfSquares: Double,
    shouldIntercept: Boolean
  ): Double = if (sumOfYSquared == 0.0 || sumOfWeights == 0.0) {
    Double.NaN
  } else {
    val meanOfY = sumOfY / sumOfWeights
    var varianceOfY = sumOfYSquared / sumOfWeights
    if (shouldIntercept) {
      varianceOfY -= meanOfY * meanOfY
    }
    (varianceOfY - residualSumOfSquares / sumOfWeights) / varianceOfY
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy