io.projectglow.sql.util.LeveneHaldane.scala Maven / Gradle / Ivy

Go to download
/*
 * Copyright 2019 The Glow Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.projectglow.sql.util

import org.apache.commons.math3.distribution.AbstractIntegerDistribution
import org.apache.commons.math3.random.RandomGenerator

import io.projectglow.common.HailUtils._

/** Implementation pulled from Hail */
// scalastyle:off
class LeveneHaldane(
    val n: Int,
    val nA: Int,
    val mode: Int,
    pRU: Stream[Double],
    pLU: Stream[Double],
    pN: Double,
    rng: RandomGenerator)
    extends AbstractIntegerDistribution(rng) {

  // The probability mass function P(nAB), computing no more than necessary
  def probability(nAB: Int): Double =
    if (nAB < 0 || nAB > nA || nAB % 2 != nA % 2)
      0.0
    else if (nAB >= mode)
      pRU((nAB - mode) / 2) / pN
    else
      pLU((mode - nAB) / 2) / pN

  // P(n0 < nAB <= n1), implemented to minimize round-off error but take advantage of the sub-geometric tails
  override def cumulativeProbability(n0: Int, n1: Int): Double =
    if (n0 >= n1 || n0 >= nA || n1 < nA % 2)
      0.0
    else if (n0 >= mode) {
      val cutoff = pRU((n0 - mode) / 2 + 1) * 1.0e-16
      pRU.slice((n0 - mode) / 2 + 1, (n1 - mode) / 2 + 1).takeWhile(_ > cutoff).sum / pN
    } else if (n1 < mode) {
      val cutoff = pLU((mode - n1 + 1) / 2) * 1.0e-16
      pLU.slice((mode - n1 + 1) / 2, (mode - n0 + 1) / 2).takeWhile(_ > cutoff).sum / pN
    } else {
      val cutoff = 1.0e-16
      (pLU.slice(1, (mode - n0 + 1) / 2).takeWhile(_ > cutoff).sum
      + pRU.slice(0, (n1 - mode) / 2 + 1).takeWhile(_ > cutoff).sum) / pN
    }
  // P(nAB <= n1)
  def cumulativeProbability(n1: Int): Double =
    cumulativeProbability(-1, n1)
  // P(nAB > n0)
  def survivalFunction(n0: Int): Double =
    cumulativeProbability(n0, nA)

  // Exact tests with the mid-p-value correction:
  // half the probability of the observed outcome nAB plus the probabilities of those "more extreme", i.e.
  // greater, lesser, or of smaller probability, respectively
  // (in the latter case weighting outcomes of equal probability by 1/2)
  def rightMidP(nAB: Int) =
    survivalFunction(nAB) + 0.5 * probability(nAB)
  def leftMidP(nAB: Int) =
    cumulativeProbability(nAB) - 0.5 * probability(nAB)
  def exactMidP(nAB: Int) = {
    val p0U = probability(nAB) * pN
    if (D_==(p0U, 0.0))
      0.0
    else {
      val cutoff = p0U * 0.5e-16
      def mpU(s: Stream[Double]): Double = {
        val (sEq, sLess) =
          s.dropWhile(D_>(_, p0U, tolerance = 1.0e-12)).span(D_==(_, p0U, tolerance = 1.0e-12))
        0.5 * sEq.sum + sLess.takeWhile(_ > cutoff).sum
      }
      (mpU(pLU.tail) + mpU(pRU)) / pN
    }
  }

  def nB: Int = 2 * n - nA
  def getNumericalMean: Double = 1.0 * nA * nB / (2 * n - 1)
  def getNumericalVariance: Double =
    1.0 * nA * nB / (2 * n - 1) * (1 + (nA - 1.0) * (nB - 1) / (2 * n - 3) - 1.0 * nA * nB / (2 * n - 1))

  def isSupportConnected: Boolean = true // interpreted as restricted to the even or odd integers,
  def getSupportUpperBound: Int = nA
  def getSupportLowerBound: Int = nA % 2
}

object LeveneHaldane {

  def apply(n: Int, nA: Int, rng: RandomGenerator): LeveneHaldane = {
    require(nA >= 0 && nA <= n)

    val nB = 2 * n - nA
    val parity = nA % 2
    val mode = ((x: Double) => 2 * math.round((x - parity) / 2) + parity)(
      (nA + 1.0) * (nB + 1) / (2 * n + 3)
    ).toInt

    def pRUfrom(nAB: Int, p: Double): Stream[Double] =
      p #:: pRUfrom(nAB + 2, p * (nA - nAB) * (nB - nAB) / ((nAB + 2.0) * (nAB + 1)))
    def pLUfrom(nAB: Int, p: Double): Stream[Double] =
      p #:: pLUfrom(nAB - 2, p * nAB * (nAB - 1) / ((nA - nAB + 2.0) * (nB - nAB + 2)))

    // Unnormalized probability mass function going right, respectively left, from the mode by 2s
    val pRU = pRUfrom(mode, 1.0)
    val pLU = pLUfrom(mode, 1.0)

    // Normalization constant
    val pN = pRU.takeWhile(_ > 1.0e-16).sum + pLU.takeWhile(_ > 1.0e-16).sum - 1.0

    new LeveneHaldane(n, nA, mode, pRU, pLU, pN, rng)
  }

  // If we ever want to sample, it seems standard to replace `null` with a
  // default Random Generator `new Well19937c()`
  def apply(n: Int, nA: Int): LeveneHaldane = LeveneHaldane(n, nA, null)
}