All Downloads are FREE. Search and download functionalities are using the official Maven repository.

se.alipsa.groovy.stats.Normalize.groovy Maven / Gradle / Ivy

The newest version!
package se.alipsa.groovy.stats

import se.alipsa.groovy.matrix.Matrix
import se.alipsa.groovy.matrix.Stat

import java.math.RoundingMode

import static se.alipsa.groovy.matrix.ValueConverter.convert

/**
 * Implements various ways of normalizing (scaling) data, e.g.
 * scaling the values so that they have a mean of 0 and a standard deviation of 1.
 * It implements 4 different approaches:
 * <ol>
 *   <li>logarithmic normalization</li>
 *   <li>Min-Max scaling, Zi = ( Xi - min(X) ) / ( max(X) - min(X) )</li>
 *   <li>Mean normalization, X´ = ( X - μ ) / ( max(X) - min(X) )</li>
 *   <li>Standard deviation normalization (Z score), Z = ( Xi - μ ) / σ</li>
 * </ol>
 * Every method takes an optional trailing {@code decimals} argument. When at least one value is
 * supplied, the first one is used as the number of decimals and the result is rounded with
 * {@link RoundingMode#HALF_EVEN}; any further values are ignored.
 */
class Normalize {

  /**
   * Rounds the value to {@code decimals[0]} decimals (HALF_EVEN) when a scale was supplied,
   * otherwise returns the value unchanged.
   */
  private static BigDecimal scaled(BigDecimal value, int[] decimals) {
    return decimals.length > 0 ? value.setScale(decimals[0], RoundingMode.HALF_EVEN) : value
  }

  /** Returns true when the value is NaN or infinite, i.e. cannot be represented as a BigDecimal. */
  private static boolean notFinite(double raw) {
    return Double.isNaN(raw) || Double.isInfinite(raw)
  }

  /**
   * Converts a raw floating point result to the Double handed back to the caller, applying the
   * optional rounding. Non-finite values are returned as-is since they cannot pass through BigDecimal.
   */
  private static Double finishAsDouble(double raw, int[] decimals) {
    if (notFinite(raw)) {
      return raw
    }
    return scaled(raw as BigDecimal, decimals).doubleValue()
  }

  /**
   * Converts a raw floating point result to the Float handed back to the caller, applying the
   * optional rounding. Non-finite values are returned as-is since they cannot pass through BigDecimal.
   */
  private static Float finishAsFloat(double raw, int[] decimals) {
    if (notFinite(raw)) {
      return raw as Float
    }
    return scaled(raw as BigDecimal, decimals).floatValue()
  }

  /**
   * Logarithmic transformations are used to normalize skewed distributions of continuous variables.
   * Taking the natural log of each observation in the distribution forces the observations closer
   * to the mean and will thus generate a more normal distribution.
   *
   * @param x the observed value
   * @param decimals optional number of decimals to round the result to
   * @return the natural logarithm (base e) of the x value, or null if x is null
   * @throws NumberFormatException if x is not positive, since the logarithm is then not a finite
   *         number and cannot be represented as a BigDecimal
   */
  static BigDecimal logNorm(BigDecimal x, int... decimals) {
    if (x == null) {
      return null
    }
    return scaled(Math.log(x) as BigDecimal, decimals)
  }

  /**
   * Generic variant of the logarithmic normalization for any Number.
   * The value is converted to a BigDecimal before the natural logarithm is taken.
   *
   * @param x the observed value
   * @param decimals optional number of decimals to round the result to
   * @return the natural logarithm (base e) of the x value, or null if x is null
   */
  static <T extends Number> T logNorm(T x, int... decimals) {
    if (x == null) {
      return null
    }
    def result = Math.log(x as BigDecimal) as T
    if (decimals.length > 0) {
      return (result as BigDecimal).setScale(decimals[0], RoundingMode.HALF_EVEN) as T
    }
    return result
  }

  /**
   * Logarithmic normalization of a Double.
   *
   * @param x the observed value
   * @param decimals optional number of decimals to round the result to
   * @return the natural logarithm (base e) of the x value; NaN if x is null, NaN or negative
   *         and negative infinity if x is zero (non-finite results are never rounded)
   */
  static Double logNorm(Double x, int... decimals) {
    if (x == null) {
      return Double.NaN
    }
    return finishAsDouble(Math.log(x), decimals)
  }

  /**
   * Logarithmic normalization of a Float.
   *
   * @param x the observed value
   * @param decimals optional number of decimals to round the result to
   * @return the natural logarithm (base e) of the x value; NaN if x is null, NaN or negative
   *         and negative infinity if x is zero (non-finite results are never rounded)
   */
  static Float logNorm(Float x, int... decimals) {
    if (x == null) {
      return Float.NaN
    }
    return finishAsFloat(Math.log(x), decimals)
  }

  /**
   * Applies the logarithmic normalization to every value of a Double column.
   *
   * @param column the Double column to normalize
   * @param decimals optional number of decimals to round each result to
   * @return a new list where all values are transformed with the natural logarithm (base e)
   *         of the observations
   */
  static List logNorm(Double[] column, int... decimals) {
    def vals = []
    for (Double x : column) {
      vals.add(logNorm(x, decimals))
    }
    return vals
  }

  /**
   * Applies the logarithmic normalization to every value of a Float column.
   *
   * @param column the Float column to normalize
   * @param decimals optional number of decimals to round each result to
   * @return a new list where all values are transformed with the natural logarithm (base e)
   *         of the observations
   */
  static List logNorm(Float[] column, int... decimals) {
    def vals = []
    for (Float x : column) {
      vals.add(logNorm(x, decimals))
    }
    return vals
  }

  /**
   * Applies the logarithmic normalization to every value of a BigDecimal column.
   *
   * @param column the BigDecimal column to normalize
   * @param decimals optional number of decimals to round each result to
   * @return a new list where all values are transformed with the natural logarithm (base e)
   *         of the observations
   */
  static List logNorm(BigDecimal[] column, int... decimals) {
    def vals = []
    for (BigDecimal x : column) {
      vals.add(logNorm(x, decimals))
    }
    return vals
  }

  /**
   * Applies the logarithmic normalization to every value of a list. Each element is passed to the
   * scalar logNorm overload matching its runtime type.
   *
   * @param column the list of numbers to normalize
   * @param decimals optional number of decimals to round each result to
   * @return a new list where all values are transformed with the natural logarithm (base e)
   *         of the observations
   */
  static List logNorm(List column, int... decimals) {
    def vals = []
    for (def x : column) {
      vals.add(logNorm(x, decimals))
    }
    return vals
  }

  /**
   * Applies the logarithmic normalization to a numeric column of a Matrix.
   *
   * @param table the Matrix containing the column
   * @param columnName the name of the column to normalize
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the normalized values of the column
   * @throws IllegalArgumentException if the column type is not a Number
   */
  static List logNorm(Matrix table, String columnName, int... decimals) {
    if (Number.isAssignableFrom(table.type(columnName))) {
      return logNorm(table[columnName] as List, decimals)
    } else {
      throw new IllegalArgumentException("$columnName is not a numeric column")
    }
  }

  /**
   * Ranges the data values to be between 0 and 1, the formula is:
   * Zi = ( Xi - min(X) ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between 0 and 1, or null if any of x, minX and maxX is null
   * @throws ArithmeticException if maxX equals minX (division by zero)
   */
  static BigDecimal minMaxNorm(BigDecimal x, BigDecimal minX, BigDecimal maxX, int... decimals) {
    if (x == null || minX == null || maxX == null) {
      return null
    }
    return scaled((x - minX) / (maxX - minX) as BigDecimal, decimals)
  }

  /**
   * Generic variant of the Min-Max scaling for any Number, the formula is:
   * Zi = ( Xi - min(X) ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between 0 and 1, or null if any of x, minX and maxX is null
   */
  static <T extends Number> T minMaxNorm(T x, T minX, T maxX, int... decimals) {
    if (x == null || minX == null || maxX == null) {
      return null
    }
    def result = ((x - minX) / (maxX - minX)) as T
    if (decimals.length > 0) {
      return (result as BigDecimal).setScale(decimals[0], RoundingMode.HALF_EVEN) as T
    }
    return result
  }

  /**
   * Min-Max scaling of a Double, the formula is:
   * Zi = ( Xi - min(X) ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between 0 and 1; NaN if any argument is null or NaN. A non-finite
   *         quotient (maxX equals minX) is returned as-is instead of being rounded
   */
  static Double minMaxNorm(Double x, Double minX, Double maxX, int... decimals) {
    if (x == null || Double.isNaN(x) || minX == null || Double.isNaN(minX) || maxX == null || Double.isNaN(maxX)) {
      return Double.NaN
    }
    return finishAsDouble((x - minX) / (maxX - minX), decimals)
  }

  /**
   * Min-Max scaling of a Float, the formula is:
   * Zi = ( Xi - min(X) ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between 0 and 1; NaN if any argument is null or NaN. A non-finite
   *         quotient (maxX equals minX) is returned as-is instead of being rounded
   */
  static Float minMaxNorm(Float x, Float minX, Float maxX, int... decimals) {
    if (x == null || Float.isNaN(x) || minX == null || Float.isNaN(minX) || maxX == null || Float.isNaN(maxX)) {
      return Float.NaN
    }
    return finishAsFloat((x - minX) / (maxX - minX), decimals)
  }

  /**
   * Min-Max scales every value of a Double column using the column's own min and max.
   *
   * @param column the Double column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list of scaled values between 0 and 1
   */
  static List minMaxNorm(Double[] column, int... decimals) {
    def min = column.min()
    def max = column.max()
    List vals = []
    for (def x : column) {
      vals.add(minMaxNorm(x, min, max, decimals))
    }
    return vals
  }

  /**
   * Min-Max scales every value of a Float column using the column's own min and max.
   *
   * @param column the Float column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list of scaled values between 0 and 1
   */
  static List minMaxNorm(Float[] column, int... decimals) {
    List col = column as List
    def min = col.min() as float
    def max = col.max() as float
    List vals = []
    for (def x : col) {
      vals.add(minMaxNorm(x, min, max, decimals))
    }
    return vals
  }

  /**
   * Min-Max scales every value of a BigDecimal column using the column's own min and max.
   *
   * @param column the BigDecimal column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list of scaled values between 0 and 1
   */
  static List minMaxNorm(BigDecimal[] column, int... decimals) {
    List col = column as List
    def min = col.min() as BigDecimal
    def max = col.max() as BigDecimal
    List vals = []
    for (def x : col) {
      vals.add(minMaxNorm(x, min, max, decimals))
    }
    return vals
  }

  /**
   * Min-Max scales every value of a list using the list's own min and max. For each element the
   * min and max are passed through ValueConverter.convert with the element's class before the
   * scalar overload is invoked.
   *
   * @param column the list of numbers to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list of scaled values between 0 and 1
   */
  static List minMaxNorm(List column, int... decimals) {
    def min = column.min()
    def max = column.max()
    List vals = []
    for (def x : column) {
      def type = x.getClass()
      vals.add(minMaxNorm(x, convert(min, type), convert(max, type), decimals))
    }
    return vals
  }

  /**
   * Min-Max scales a numeric column of a Matrix.
   *
   * @param table the Matrix containing the column
   * @param columnName the name of the column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list of scaled values between 0 and 1
   * @throws IllegalArgumentException if the column type is not a Number
   */
  static List minMaxNorm(Matrix table, String columnName, int... decimals) {
    if (Number.isAssignableFrom(table.type(columnName))) {
      return minMaxNorm(table[columnName] as List, decimals)
    } else {
      throw new IllegalArgumentException("$columnName is not a numeric column")
    }
  }

  /**
   * Scales the data values to be between (–1, 1), the formula is
   * X´ = ( X - μ ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between -1 and 1, or null if any of the values is null
   * @throws ArithmeticException if maxX equals minX (division by zero)
   */
  static BigDecimal meanNorm(BigDecimal x, BigDecimal sampleMean, BigDecimal minX, BigDecimal maxX, int... decimals) {
    if (x == null || sampleMean == null || minX == null || maxX == null) {
      return null
    }
    return scaled((x - sampleMean) / (maxX - minX) as BigDecimal, decimals)
  }

  /**
   * Generic variant of the mean normalization for any Number, the formula is
   * X´ = ( X - μ ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between -1 and 1, or null if any of the values is null
   */
  static <T extends Number> T meanNorm(T x, T sampleMean, T minX, T maxX, int... decimals) {
    if (x == null || sampleMean == null || minX == null || maxX == null) {
      return null
    }
    def result = ((x - sampleMean) / (maxX - minX)) as T
    if (decimals.length > 0) {
      return (result as BigDecimal).setScale(decimals[0], RoundingMode.HALF_EVEN) as T
    }
    return result
  }

  /**
   * Mean normalization of a Double, the formula is
   * X´ = ( X - μ ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between -1 and 1; NaN if any argument is null or NaN. A non-finite
   *         quotient (maxX equals minX) is returned as-is instead of being rounded
   */
  static Double meanNorm(Double x, Double sampleMean, Double minX, Double maxX, int... decimals) {
    if (x == null || Double.isNaN(x) || sampleMean == null || Double.isNaN(sampleMean)
        || minX == null || Double.isNaN(minX) || maxX == null || Double.isNaN(maxX)) {
      return Double.NaN
    }
    return finishAsDouble((x - sampleMean) / (maxX - minX), decimals)
  }

  /**
   * Mean normalization of a Float, the formula is
   * X´ = ( X - μ ) / ( max(X) - min(X) )
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param minX the lowest value in the distribution
   * @param maxX the highest value in the distribution
   * @param decimals optional number of decimals to round the result to
   * @return a scaled value between -1 and 1; NaN if any argument is null or NaN. A non-finite
   *         quotient (maxX equals minX) is returned as-is instead of being rounded
   */
  static Float meanNorm(Float x, Float sampleMean, Float minX, Float maxX, int... decimals) {
    if (x == null || Float.isNaN(x) || sampleMean == null || Float.isNaN(sampleMean)
        || minX == null || Float.isNaN(minX) || maxX == null || Float.isNaN(maxX)) {
      return Float.NaN
    }
    return finishAsFloat((x - sampleMean) / (maxX - minX), decimals)
  }

  /**
   * Mean normalizes every value of a Double column using the column's own mean, min and max.
   *
   * @param column the Double column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the normalized values
   */
  static List meanNorm(Double[] column, int... decimals) {
    List col = column as List
    def min = col.min()
    def max = col.max()
    // Coerce the mean to Double, as stdScaleNorm(Double[]) does, so that the Double overload
    // is the one selected for each element.
    def mean = Stat.mean(col) as Double
    List vals = []
    for (def x : col) {
      vals.add(meanNorm(x, mean, min, max, decimals))
    }
    return vals
  }

  /**
   * Mean normalizes every value of a Float column using the column's own mean, min and max.
   *
   * @param column the Float column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the normalized values
   */
  static List meanNorm(Float[] column, int... decimals) {
    List col = column as List
    def min = col.min() as float
    def max = col.max() as float
    def mean = Stat.mean(col) as float
    List vals = []
    for (def x : col) {
      vals.add(meanNorm(x, mean, min, max, decimals))
    }
    return vals
  }

  /**
   * Mean normalizes every value of a BigDecimal column using the column's own mean, min and max.
   *
   * @param column the BigDecimal column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the normalized values
   */
  static List meanNorm(BigDecimal[] column, int... decimals) {
    List col = column as List
    def min = col.min() as BigDecimal
    def max = col.max() as BigDecimal
    def mean = Stat.mean(col) as BigDecimal
    List vals = []
    for (def x : col) {
      vals.add(meanNorm(x, mean, min, max, decimals))
    }
    return vals
  }

  /**
   * Mean normalizes every value of a list using the list's own mean, min and max. For each element
   * the mean, min and max are passed through ValueConverter.convert with the element's class
   * before the scalar overload is invoked.
   *
   * @param column the list of numbers to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the normalized values
   */
  static List meanNorm(List column, int... decimals) {
    def min = column.min()
    def max = column.max()
    def mean = Stat.mean(column)
    List vals = []
    for (def x : column) {
      def type = x.getClass()
      vals.add(meanNorm(x, convert(mean, type), convert(min, type), convert(max, type), decimals))
    }
    return vals
  }

  /**
   * Mean normalizes a numeric column of a Matrix.
   *
   * @param table the Matrix containing the column
   * @param columnName the name of the column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the normalized values
   * @throws IllegalArgumentException if the column type is not a Number
   */
  static List meanNorm(Matrix table, String columnName, int... decimals) {
    if (Number.isAssignableFrom(table.type(columnName))) {
      return meanNorm(table[columnName] as List, decimals)
    } else {
      throw new IllegalArgumentException("$columnName is not a numeric column")
    }
  }

  /**
   * Scales the distribution of data values so that the mean of the observed values
   * will be 0 and standard deviation will be 1 (a.k.a Z-score).
   * Z = ( Xi - μ ) / σ
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param stdDeviation the standard deviation (σ)
   * @param decimals optional number of decimals to round the result to
   * @return the Z-score of x, or null if any of the values is null
   * @throws ArithmeticException if stdDeviation is zero (division by zero)
   */
  static BigDecimal stdScaleNorm(BigDecimal x, BigDecimal sampleMean, BigDecimal stdDeviation, int... decimals) {
    if (x == null || sampleMean == null || stdDeviation == null) {
      return null
    }
    return scaled((x - sampleMean) / stdDeviation as BigDecimal, decimals)
  }

  /**
   * Generic variant of the Z-score normalization for any Number.
   * Z = ( Xi - μ ) / σ
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param stdDeviation the standard deviation (σ)
   * @param decimals optional number of decimals to round the result to
   * @return the Z-score of x, or null if any of the values is null
   */
  static <T extends Number> T stdScaleNorm(T x, T sampleMean, T stdDeviation, int... decimals) {
    if (x == null || sampleMean == null || stdDeviation == null) {
      return null
    }
    def result = ((x - sampleMean) / stdDeviation) as T
    if (decimals.length > 0) {
      return (result as BigDecimal).setScale(decimals[0], RoundingMode.HALF_EVEN) as T
    }
    return result
  }

  /**
   * Z-score normalization of a Double.
   * Z = ( Xi - μ ) / σ
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param stdDeviation the standard deviation (σ)
   * @param decimals optional number of decimals to round the result to
   * @return the Z-score of x; NaN if any argument is null or NaN. A non-finite quotient
   *         (stdDeviation of zero) is returned as-is instead of being rounded
   */
  static Double stdScaleNorm(Double x, Double sampleMean, Double stdDeviation, int... decimals) {
    if (x == null || Double.isNaN(x) || sampleMean == null || Double.isNaN(sampleMean)
        || stdDeviation == null || Double.isNaN(stdDeviation)) {
      return Double.NaN
    }
    return finishAsDouble((x - sampleMean) / stdDeviation, decimals)
  }

  /**
   * Z-score normalization of a Float.
   * Z = ( Xi - μ ) / σ
   *
   * @param x the observed value
   * @param sampleMean the sample mean (μ)
   * @param stdDeviation the standard deviation (σ)
   * @param decimals optional number of decimals to round the result to
   * @return the Z-score of x; NaN if any argument is null or NaN. A non-finite quotient
   *         (stdDeviation of zero) is returned as-is instead of being rounded
   */
  static Float stdScaleNorm(Float x, Float sampleMean, Float stdDeviation, int... decimals) {
    if (x == null || Float.isNaN(x) || sampleMean == null || Float.isNaN(sampleMean)
        || stdDeviation == null || Float.isNaN(stdDeviation)) {
      return Float.NaN
    }
    return finishAsFloat((x - sampleMean) / stdDeviation, decimals)
  }

  /**
   * Z-score normalizes every value of a Double column using the column's own mean and
   * standard deviation.
   *
   * @param column the Double column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the Z-scores of the column values
   */
  static List stdScaleNorm(Double[] column, int... decimals) {
    List col = column as List
    def mean = Stat.mean(col) as Double
    def stdDev = Stat.sd(col) as Double
    List vals = []
    for (def x : column) {
      vals.add(stdScaleNorm(x, mean, stdDev, decimals))
    }
    return vals
  }

  /**
   * Z-score normalizes every value of a Float column using the column's own mean and
   * standard deviation.
   *
   * @param column the Float column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the Z-scores of the column values
   */
  static List stdScaleNorm(Float[] column, int... decimals) {
    List col = column as List
    def stdDev = Stat.sd(col) as Float
    def mean = Stat.mean(col) as Float
    List vals = []
    for (Float x : col) {
      vals.add(stdScaleNorm(x, mean, stdDev, decimals))
    }
    return vals
  }

  /**
   * Z-score normalizes every value of a BigDecimal column using the column's own mean and
   * standard deviation.
   *
   * @param column the BigDecimal column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the Z-scores of the column values
   */
  static List stdScaleNorm(BigDecimal[] column, int... decimals) {
    List col = column as List
    def stdDev = Stat.sd(col) as BigDecimal
    def mean = Stat.mean(col) as BigDecimal
    List vals = []
    for (def x : col) {
      vals.add(stdScaleNorm(x, mean, stdDev, decimals))
    }
    return vals
  }

  /**
   * Z-score normalizes every value of a list using the list's own mean and standard deviation.
   * For each element the mean and standard deviation are passed through ValueConverter.convert
   * with the element's class before the scalar overload is invoked.
   *
   * @param column the list of numbers to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the Z-scores of the list values
   */
  static List stdScaleNorm(List column, int... decimals) {
    def stdDev = Stat.sd(column)
    def mean = Stat.mean(column)
    List vals = []
    for (def x : column) {
      def type = x.getClass()
      vals.add(stdScaleNorm(x, convert(mean, type), convert(stdDev, type), decimals))
    }
    return vals
  }

  /**
   * Z-score normalizes a numeric column of a Matrix.
   *
   * @param table the Matrix containing the column
   * @param columnName the name of the column to scale
   * @param decimals optional number of decimals to round each result to
   * @return a new list with the Z-scores of the column values
   * @throws IllegalArgumentException if the column type is not a Number
   */
  static List stdScaleNorm(Matrix table, String columnName, int... decimals) {
    if (Number.isAssignableFrom(table.type(columnName))) {
      return stdScaleNorm(table[columnName] as List, decimals)
    } else {
      throw new IllegalArgumentException("$columnName is not a numeric column")
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy