
com.databricks.labs.automl.exploration.tools.ShapiroWilk.scala Maven / Gradle / Ivy
package com.databricks.labs.automl.exploration.tools
/**
* Shapiro-Wilk test for normality.
* @note the algorithm below is restricted to a maximum of 5000 elements.
*/
object ShapiroWilk extends ShapiroBase {
/**
* Calculates P-value for ShapiroWilk Test and return:
* W score
* Z score
* probability p-value
* Boolean decision on whether or not to reject the null hypothesis that the data is normally distributed
* String value describing whether the data is normally distributed or not
*
* This equation and the modeled source code is ported from the Javascript implementation
* which is originally based on the FORTRAN code used to execute this test.
* https://github.com/rniwa/js-shapiro-wilk/blob/master/shapiro-wilk.js
*
* @param x Array[Double] The data to be tested
* @return Test results from the ShapiroWilk algorithm
* @throws IllegalArgumentException
*/
@throws[IllegalArgumentException]
private def ShapiroWilkW(x: Array[Double]): ShapiroScoreData = {
java.util.Arrays.sort(x)
val n = x.length
if (n < 3)
throw new IllegalArgumentException(
s"Count of elements to measure W is too small ($n) must be more than 3"
)
if (n > 5000) {
throw new IllegalArgumentException(
s"Count of elements to measure W is too large ($n) must be less than 5001"
)
}
val nn2 = n / 2
val a = new Array[Double](nn2 + 1)
/* 1-based */
/*
ALGORITHM AS R94 APPL. STATIST. (1995) vol.44, no.4, 547-551.
Calculates the Shapiro-Wilk W test and its significance level
*/
val small = 1e-19
/* polynomial coefficients */
val g = Array(-2.273, 0.459)
val c1 = Array(0.0, 0.221157, -0.147981, -2.07119, 4.434685, -2.706056)
val c2 = Array(0.0, 0.042981, -0.293762, -1.752461, 5.682633, -3.582633)
val c3 = Array(0.544, -0.39978, 0.025054, -6.714e-4)
val c4 = Array(1.3822, -0.77857, 0.062767, -0.0020322)
val c5 = Array(-1.5861, -0.31082, -0.083751, 0.0038915)
val c6 = Array(-0.4803, -0.082676, 0.0030302)
/* Local variables */
var i = 0
var j = 0
var i1 = 0
var ssassx = 0.0
var summ2 = 0.0
var ssumm2 = 0.0
var gamma = 0.0
var range = 0.0
var a1 = 0.0
var a2 = 0.0
var an = 0.0
var m = 0.0
var s = 0.0
var sa = 0.0
var xi = 0.0
var sx = 0.0
var xx = 0.0
var y = 0.0
var w1 = 0.0
var fac = 0.0
var asa = 0.0
var an25 = 0.0
var ssa = 0.0
var sax = 0.0
var rsn = 0.0
var ssx = 0.0
var xsx = 0.0
var pw = 1.0
an = n.toDouble
if (n == 3) a(1) = 0.70710678 /* = sqrt(1/2) */
else {
an25 = an + 0.25
summ2 = 0.0
i = 1
while ({
i <= nn2
}) {
a(i) = normalQuantile((i - 0.375) / an25, 0, 1) // p(X <= x),
summ2 += a(i) * a(i)
i += 1
}
summ2 *= 2.0
ssumm2 = Math.sqrt(summ2)
rsn = 1.0 / Math.sqrt(an)
a1 = poly(c1, 6, rsn) - a(1) / ssumm2
/* Normalize a[] */
if (n > 5) {
i1 = 3
a2 = -a(2) / ssumm2 + poly(c2, 6, rsn)
fac = Math.sqrt(
(summ2 - 2.0 * (a(1) * a(1)) - 2.0 * (a(2) * a(2))) / (1.0 - 2.0 * (a1 * a1) - 2.0 * (a2 * a2))
)
a(2) = a2
} else {
i1 = 2
fac = Math.sqrt((summ2 - 2.0 * (a(1) * a(1))) / (1.0 - 2.0 * (a1 * a1)))
}
a(1) = a1
i = i1
while ({
i <= nn2
}) {
a(i) /= -fac
i += 1
}
}
range = x(n - 1) - x(0)
if (range < small) {
throw new IllegalArgumentException(
s"Total Range of data is too small to calculate ShapiroWilk test (${range}) which is less than minimum of ($small)"
)
}
/* Check for correct sort order on range - scaled X */
xx = x(0) / range
sx = xx
sa = -a(1)
i = 1
j = n - 1
while ({
i < n
}) {
xi = x(i) / range
if (xx - xi > small) {
throw new IllegalArgumentException(
s"Scaled range is too small to calculate ShapiroWilk properly (${xx - xi}) is less than minimum of ($small)"
)
}
sx += xi
i += 1
if (i != j) sa += sign(i - j) * a(Math.min(i, j))
xx = xi
j -= 1
}
// Calculate W statistic
sa /= n
sx /= n
ssa = 0.0
ssx = 0.0
sax = 0.0
i = 0
j = n - 1
while ({
i < n
}) {
if (i != j) asa = sign(i - j) * a(1 + Math.min(i, j)) - sa
else asa = -sa
xsx = x(i) / range - sx
ssa += asa * asa
ssx += xsx * xsx
sax += asa * xsx
i += 1
j -= 1
}
ssassx = Math.sqrt(ssa * ssx)
w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx)
val w = 1.0 - w1
/* Calculate significance level for W */
if (n == 3) {
/* exact P value : */
val pi6 = 1.90985931710274
/* = 6/pi */
val stqr = 1.04719755119660
pw = pi6 * (Math.asin(Math.sqrt(w)) - stqr)
if (pw < 0.0) pw = 0
//return w;
return ShapiroScoreData(w, 0.0, pw)
}
y = Math.log(w1)
xx = Math.log(an)
if (n <= 11) {
gamma = poly(g, 2, an)
if (y >= gamma) {
pw = 1e-99
return ShapiroScoreData(w, 0.0, pw)
}
y = -Math.log(gamma - y)
m = poly(c3, 4, an)
s = Math.exp(poly(c4, 4, an))
} else {
m = poly(c5, 4, xx)
s = Math.exp(poly(c6, 3, xx))
}
val z = (y - m) / s
pw = gaussCdf(z)
ShapiroScoreData(w, z, pw)
}
/**
* Tests the rejection of null Hypothesis for a particular confidence level
*
* @param data the Array of data to perform a test against
* @param aLevel the alpha for calculating the probability of normalcy resulting in a test pass or failure.
* Defaulted to 5%
* @return ShapiroInternalData payload consisting of the W value, Z score, probability result, normalcy boolean,
* and String value of "Normally Distributed y/n"
* @since 0.7.0
* @author Ben Wilson, Databricks
*/
def test(data: Array[Double], aLevel: Double = 0.05): ShapiroInternalData = {
var normalcyTest = false
val swTest = ShapiroWilkW(data)
val a = aLevel
if (swTest.probability <= a || swTest.probability >= (1.0 - a))
normalcyTest = true
val normalcy = if (normalcyTest) NORMALCY_FALSE else NORMALCY_TRUE
ShapiroInternalData(
swTest.w,
swTest.z,
swTest.probability,
normalcyTest,
normalcy
)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy