
// com.databricks.labs.automl.exploration.tools.PairedTesting.scala
// Artifact: automatedml_2.11 (Maven / Gradle / Ivy) - Databricks Labs AutoML toolkit
// Listing taken from the newest published version of the artifact.
package com.databricks.labs.automl.exploration.tools
import org.apache.commons.math3.stat.correlation.{
Covariance,
KendallsCorrelation,
PearsonsCorrelation,
SpearmansCorrelation
}
import org.apache.commons.math3.stat.descriptive.SummaryStatistics
import org.apache.commons.math3.stat.inference.{TTest, TestUtils}
/**
 * Runs a set of pairwise statistical comparisons between two series of Doubles:
 * covariance, Pearson / Spearman / Kendall correlations, a paired t-test on the means and a
 * two-sample Kolmogorov-Smirnov test on the distributions.
 *
 * @param left first data series
 * @param right second data series; expected to have the same length as `left`
 * @param alpha significance level used for the t-test and Kolmogorov-Smirnov judgements
 *              (defaults to 0.05)
 */
class PairedTesting(left: Array[Double],
                    right: Array[Double],
                    alpha: Double = 0.05) {

  /**
   * Private helper method that accumulates each series into its own SummaryStatistics instance
   * for use by the t statistic calculation.
   * Asserts that both series have the same length before accumulating.
   * @return PairedSeq of SummaryStatistics instances for both data series
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  private def createPairedStatistics: PairedSeq = {
    assert(
      left.length == right.length,
      s"Length of pairs is not equal. Left: ${left.length} Right: ${right.length}"
    )
    val leftStats = new SummaryStatistics()
    val rightStats = new SummaryStatistics()
    left.foreach(leftStats.addValue(_))
    right.foreach(rightStats.addValue(_))
    PairedSeq(leftStats, rightStats)
  }

  /**
   * Method for determining the t-test values for comparing if the mean values are equivalent
   * between two sequences of Doubles.
   * @return TTestData payload, consisting of the alpha that was used, the t-stat value,
   *         significance determination, significance p-value, and a judgement of equivalency
   *         ('Y' or 'N')
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  private def computePairedTTest: TTestData = {
    val pairData = createPairedStatistics
    // NOTE(review): this statistic is computed from the two summary-statistics objects (the
    // two-sample form), whereas the significance flag and p-value below use the paired test on
    // the raw arrays. Confirm whether TestUtils.pairedT(left, right) was intended here.
    val tStatisticValue = TestUtils.t(pairData.left, pairData.right)
    val tStatisticSignificance =
      TestUtils.pairedTTest(left, right, alpha)
    val tStatisticPValue = new TTest().pairedTTest(left, right)
    // A significant result means the "means are equal" hypothesis was rejected at `alpha`,
    // so the series are judged NOT equivalent.
    val equivalencyJudgement: Char = if (tStatisticSignificance) 'N' else 'Y'
    TTestData(
      alpha = alpha,
      tStat = tStatisticValue,
      tTestSignificance = tStatisticSignificance,
      tTestPValue = tStatisticPValue,
      equivalencyJudgement = equivalencyJudgement
    )
  }

  /**
   * Equivalency tests for the distribution of data between two series.
   * The two-sample Kolmogorov-Smirnov null hypothesis is that both series are drawn from the
   * same distribution, so a p-value at or below `alpha` rejects equivalency ('N') and a larger
   * p-value leaves it standing ('Y'). This matches the direction used by the t-test judgement.
   * @return Payload of the equivalency p value, D statistic, and equivalency judgement between
   *         the two distributions
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  private def computeKolmogorovSmirnovTest: KSTestResult = {
    val ksTestPValue =
      TestUtils.kolmogorovSmirnovTest(left, right)
    val ksTestDStatistic =
      TestUtils.kolmogorovSmirnovStatistic(left, right)
    // Fixed: previously a p-value <= alpha (null hypothesis rejected) was reported as 'Y'.
    val equivalency: Char = if (ksTestPValue <= alpha) 'N' else 'Y'
    KSTestResult(ksTestPValue, ksTestDStatistic, equivalency)
  }

  /**
   * Method for calculating the covariance between two data series.
   * The final argument is Commons Math's `biasCorrected` flag; it is passed as `false`, which
   * requests the estimate without bias correction.
   * NOTE(review): earlier documentation described this value as "unbiased", which would
   * correspond to `biasCorrected = true` - confirm which behaviour is intended.
   * @return covariance score (not bias-corrected)
   * @note https://commons.apache.org/proper/commons-math/javadocs/api-3.6.1/org/apache/commons/math3/stat/correlation/Covariance.html
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  private def computeCovariance: Double =
    new Covariance().covariance(left, right, false)

  /**
   * Method for calculating Pearson's product-moment correlation coefficient for two data series
   * @return Pearson's product-moment correlation coefficient
   * @note https://commons.apache.org/proper/commons-math/javadocs/api-3.6.1/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.html
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  private def computePearsons: Double =
    new PearsonsCorrelation().correlation(left, right)

  /**
   * Method for calculating Spearman's Rank correlation for two data series, using the default
   * SpearmansCorrelation configuration
   * @return Spearman's rank correlation coefficient
   * @note https://commons.apache.org/proper/commons-math/javadocs/api-3.6.1/org/apache/commons/math3/stat/correlation/SpearmansCorrelation.html
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  private def computeSpearmans: Double =
    new SpearmansCorrelation().correlation(left, right)

  /**
   * Method for calculating Kendall's Tau-b Rank correlation for two data series
   * @return Kendall's Tau-b correlation coefficient
   * @note https://commons.apache.org/proper/commons-math/javadocs/api-3.6.1/org/apache/commons/math3/stat/correlation/KendallsCorrelation.html
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  private def computeKendallsTauRank: Double =
    new KendallsCorrelation().correlation(left, right)

  /**
   * Main execution method for getting the pair testing data for two data series.
   * Computes, in this order:
   *  - Covariance
   *  - Pearson's correlation
   *  - Spearman's correlation
   *  - Kendall's correlation
   *  - a t-test for mean equivalency
   *  - distribution equivalency testing (Kolmogorov-Smirnov) between the two series
   * @return Testing Payload of the statistical data
   * @since 0.7.0
   * @author Ben Wilson, Databricks
   */
  def execute: PairedTestResult = {
    val correlationChecks = CorrelationTestResult(
      computeCovariance,
      computePearsons,
      computeSpearmans,
      computeKendallsTauRank
    )
    PairedTestResult(
      correlationChecks,
      computePairedTTest,
      computeKolmogorovSmirnovTest
    )
  }
}
/**
 * Companion object for PairedTesting.
 * Offers one-shot entry points that build a PairedTesting instance for the supplied series and
 * immediately return the result of its `execute` method.
 */
object PairedTesting {

  /** Shared implementation behind both public overloads. */
  private def runTests(leftSeries: Array[Double],
                       rightSeries: Array[Double],
                       significance: Double): PairedTestResult = {
    val tester = new PairedTesting(leftSeries, rightSeries, significance)
    tester.execute
  }

  /**
   * Evaluates two series supplied as Seq[Double]; each is converted to an Array first.
   * @param left first data series
   * @param right second data series
   * @param alpha significance level forwarded to PairedTesting
   * @return Testing Payload of the statistical data
   */
  def evaluate(left: Seq[Double],
               right: Seq[Double],
               alpha: Double): PairedTestResult =
    runTests(left.toArray, right.toArray, alpha)

  /**
   * Evaluates two series supplied as Array[Double].
   * @param left first data series
   * @param right second data series
   * @param alpha significance level forwarded to PairedTesting
   * @return Testing Payload of the statistical data
   */
  def evaluate(left: Array[Double],
               right: Array[Double],
               alpha: Double): PairedTestResult =
    runTests(left, right, alpha)
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy