scripts.algorithms.GLM-predict.dml Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A GLM-TYPE REGRESSION TO A NEW (TEST) DATASET
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read the matrix X of records (feature vectors)
# B String --- Location to read GLM regression parameters (the betas), with dimensions
# ncol(X) x k: do not add intercept
# ncol(X)+1 x k: add intercept as given by the last B-row
# if k > 1, use only B[, 1] unless it is Multinomial Logit (dfam=3)
# M String " " Location to write the matrix of predicted response means/probabilities:
# nrow(X) x 1 : for Power-type distributions (dfam=1)
# nrow(X) x 2 : for Binomial distribution (dfam=2), column 2 is "No"
# nrow(X) x k+1: for Multinomial Logit (dfam=3), col# k+1 is baseline
# Y String " " Location to read response matrix Y, with the following dimensions:
# nrow(X) x 1 : for all distributions (dfam=1 or 2 or 3)
# nrow(X) x 2 : for Binomial (dfam=2) given by (#pos, #neg) counts
# nrow(X) x k+1: for Multinomial (dfam=3) given by category counts
# O String " " Location to write the printed statistics; by default is standard output
# dfam Int 1 GLM distribution family: 1 = Power, 2 = Binomial, 3 = Multinomial Logit
# vpow Double 0.0 Power for Variance defined as (mean)^power (ignored if dfam != 1):
# 0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian
# link Int 0 Link function code: 0 = canonical (depends on distribution), 1 = Power,
# 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; ignored if Multinomial
# lpow Double 1.0 Power for Link function defined as (mean)^power (ignored if link != 1):
# -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity
# disp Double 1.0 Dispersion value, when available
# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
# ---------------------------------------------------------------------------------------------
# OUTPUT: Matrix M of predicted means/probabilities, some statistics in CSV format (see below)
# The statistics are printed one per each line, in the following CSV format:
# NAME,[COLUMN],[SCALED],VALUE
# NAME is the string identifier for the statistic, see the table below.
# COLUMN is an optional integer value that specifies the Y-column for per-column statistics;
# note that a Binomial/Multinomial one-column Y input is converted into multi-column.
# SCALED is an optional Boolean value (TRUE or FALSE) that tells us whether or not the input
# dispersion parameter (disp) scaling has been applied to this statistic.
# VALUE is the value of the statistic.
#
# NAME COLUMN SCALED MEANING
# ---------------------------------------------------------------------------------------------
# LOGLHOOD_Z + Log-Likelihood Z-score (in st.dev's from mean)
# LOGLHOOD_Z_PVAL + Log-Likelihood Z-score p-value
# PEARSON_X2 + Pearson residual X^2 statistic
# PEARSON_X2_BY_DF + Pearson X^2 divided by degrees of freedom
# PEARSON_X2_PVAL + Pearson X^2 p-value
# DEVIANCE_G2 + Deviance from saturated model G^2 statistic
# DEVIANCE_G2_BY_DF + Deviance G^2 divided by degrees of freedom
# DEVIANCE_G2_PVAL + Deviance G^2 p-value
# AVG_TOT_Y + Average of Y column for a single response value
# STDEV_TOT_Y + St.Dev. of Y column for a single response value
# AVG_RES_Y + Average of column residual, i.e. of Y - mean(Y|X)
# STDEV_RES_Y + St.Dev. of column residual, i.e. of Y - mean(Y|X)
# PRED_STDEV_RES + + Model-predicted St.Dev. of column residual
# R2 + R^2 of Y column residual with bias included
# ADJUSTED_R2 + Adjusted R^2 of Y column residual with bias included
# R2_NOBIAS + R^2 of Y column residual with bias subtracted
# ADJUSTED_R2_NOBIAS + Adjusted R^2 of Y column residual with bias subtracted
# ---------------------------------------------------------------------------------------------
#
# Example with distribution = "Poisson.log":
# hadoop jar SystemML.jar -f GLM_HOME/GLM-predict.dml -nvargs dfam=1 vpow=1.0 link=1 lpow=0.0
# disp=3.0 fmt=csv X=INPUT_DIR/X B=INPUT_DIR/B Y=INPUT_DIR/Y M=OUTPUT_DIR/M O=OUTPUT_DIR/out.csv
# Default values for input parameters:
fileX = $X;
fileB = $B;
fileM = ifdef ($M, " ");
fileY = ifdef ($Y, " ");
fileO = ifdef ($O, " ");
fmtM = ifdef ($fmt, "text");
dist_type = ifdef ($dfam, 1); # $dfam = 1;
var_power = ifdef ($vpow, 0.0); # $vpow = 0.0;
link_type = ifdef ($link, 0); # $link = 0;
link_power = ifdef ($lpow, 1.0); # $lpow = 1.0;
dispersion = ifdef ($disp, 1.0); # $disp = 1.0;
var_power = as.double (var_power);
link_power = as.double (link_power);
dispersion = as.double (dispersion);
if (dist_type == 3) {
link_type = 2;
} else if (link_type == 0) { # Canonical Link
if (dist_type == 1) {
link_type = 1;
link_power = 1.0 - var_power;
} else if (dist_type == 2) {
link_type = 2;
} }
X = read (fileX);
num_records = nrow (X);
num_features = ncol (X);
B_full = read (fileB);
if (dist_type == 3) {
beta = B_full [1 : ncol (X), ];
intercept = B_full [nrow(B_full), ];
} else {
beta = B_full [1 : ncol (X), 1];
intercept = B_full [nrow(B_full), 1];
}
if (nrow (B_full) == ncol (X)) {
intercept = 0.0 * intercept;
is_intercept = FALSE;
} else {
num_features = num_features + 1;
is_intercept = TRUE;
}
ones_rec = matrix (1, rows = num_records, cols = 1);
linear_terms = X %*% beta + ones_rec %*% intercept;
[means, vars] =
glm_means_and_vars (linear_terms, dist_type, var_power, link_type, link_power);
if (fileM != " ") {
write (means, fileM, format=fmtM);
}
if (fileY != " ")
{
Y = read (fileY);
ones_ctg = matrix (1, rows = ncol(Y), cols = 1);
# Statistics To Compute:
Z_logl = 0.0 / 0.0;
Z_logl_pValue = 0.0 / 0.0;
X2_pearson = 0.0 / 0.0;
df_pearson = -1;
G2_deviance = 0.0 / 0.0;
df_deviance = -1;
X2_pearson_pValue = 0.0 / 0.0;
G2_deviance_pValue = 0.0 / 0.0;
Z_logl_scaled = 0.0 / 0.0;
Z_logl_scaled_pValue = 0.0 / 0.0;
X2_scaled = 0.0 / 0.0;
X2_scaled_pValue = 0.0 / 0.0;
G2_scaled = 0.0 / 0.0;
G2_scaled_pValue = 0.0 / 0.0;
# set Y_counts to avoid 'Initialization of Y_counts depends on if-else execution' warning
Y_counts = matrix(0.0, rows=1, cols=1);
if (dist_type == 1 & link_type == 1) {
#
# POWER DISTRIBUTIONS (GAUSSIAN, POISSON, GAMMA, ETC.)
#
if (link_power == 0) {
is_zero_Y = (Y == 0);
lt_saturated = log (Y + is_zero_Y) - is_zero_Y / (1.0 - is_zero_Y);
} else {
lt_saturated = Y ^ link_power;
}
Y_counts = ones_rec;
X2_pearson = sum ((Y - means) ^ 2 / vars);
df_pearson = num_records - num_features;
log_l_part =
glm_partial_loglikelihood_for_power_dist_and_link (linear_terms, Y, var_power, link_power);
log_l_part_saturated =
glm_partial_loglikelihood_for_power_dist_and_link (lt_saturated, Y, var_power, link_power);
G2_deviance = 2 * sum (log_l_part_saturated) - 2 * sum (log_l_part);
df_deviance = num_records - num_features;
} else { if (dist_type >= 2) {
#
# BINOMIAL AND MULTINOMIAL DISTRIBUTIONS
#
if (ncol (Y) == 1) {
num_categories = ncol (beta) + 1;
if (min (Y) <= 0) {
# Category labels "0", "-1" etc. are converted into the baseline label
Y = Y + (- Y + num_categories) * (Y <= 0);
}
Y_size = min (num_categories, max(Y));
Y_unsized = table (seq (1, num_records, 1), Y);
Y = matrix (0, rows = num_records, cols = num_categories);
Y [, 1 : Y_size] = Y_unsized [, 1 : Y_size];
Y_counts = ones_rec;
} else {
Y_counts = rowSums (Y);
}
P = means;
zero_Y = (Y == 0);
zero_P = (P == 0);
ones_ctg = matrix (1, rows = ncol(Y), cols = 1);
logl_vec = rowSums (Y * log (P + zero_Y) );
ent1_vec = rowSums (P * log (P + zero_P) );
ent2_vec = rowSums (P * (log (P + zero_P))^2);
E_logl = sum (Y_counts * ent1_vec);
V_logl = sum (Y_counts * (ent2_vec - ent1_vec ^ 2));
Z_logl = (sum (logl_vec) - E_logl) / sqrt (V_logl);
means = means * (Y_counts %*% t(ones_ctg));
vars = vars * (Y_counts %*% t(ones_ctg));
frac_below_5 = sum (means < 5) / (nrow (means) * ncol (means));
frac_below_1 = sum (means < 1) / (nrow (means) * ncol (means));
if (frac_below_5 > 0.2 | frac_below_1 > 0) {
print ("WARNING: residual statistics are inaccurate here due to low cell means.");
}
X2_pearson = sum ((Y - means) ^ 2 / means);
df_pearson = (num_records - num_features) * (ncol(Y) - 1);
G2_deviance = 2 * sum (Y * log ((Y + zero_Y) / (means + zero_Y)));
df_deviance = (num_records - num_features) * (ncol(Y) - 1);
}}
if (Z_logl == Z_logl) {
Z_logl_absneg = - abs (Z_logl);
Z_logl_pValue = 2.0 * pnorm(target = Z_logl_absneg);
}
if (X2_pearson == X2_pearson & df_pearson > 0) {
X2_pearson_pValue = pchisq(target = X2_pearson, df = df_pearson, lower.tail=FALSE);
}
if (G2_deviance == G2_deviance & df_deviance > 0) {
G2_deviance_pValue = pchisq(target = G2_deviance, df = df_deviance, lower.tail=FALSE);
}
Z_logl_scaled = Z_logl / sqrt (dispersion);
X2_scaled = X2_pearson / dispersion;
G2_scaled = G2_deviance / dispersion;
if (Z_logl_scaled == Z_logl_scaled) {
Z_logl_scaled_absneg = - abs (Z_logl_scaled);
Z_logl_scaled_pValue = 2.0 * pnorm(target = Z_logl_scaled_absneg);
}
if (X2_scaled == X2_scaled & df_pearson > 0) {
X2_scaled_pValue = pchisq(target = X2_scaled, df = df_pearson, lower.tail=FALSE);
}
if (G2_scaled == G2_scaled & df_deviance > 0) {
G2_scaled_pValue = pchisq(target = G2_scaled, df = df_deviance, lower.tail=FALSE);
}
avg_tot_Y = colSums ( Y ) / sum (Y_counts);
avg_res_Y = colSums (Y - means) / sum (Y_counts);
ss_avg_tot_Y = colSums (( Y - Y_counts %*% avg_tot_Y) ^ 2);
ss_res_Y = colSums ((Y - means) ^ 2);
ss_avg_res_Y = colSums ((Y - means - Y_counts %*% avg_res_Y) ^ 2);
df_ss_res_Y = sum (Y_counts) - num_features;
if (is_intercept) {
df_ss_avg_res_Y = df_ss_res_Y;
} else {
df_ss_avg_res_Y = df_ss_res_Y - 1;
}
var_tot_Y = ss_avg_tot_Y / (sum (Y_counts) - 1);
if (df_ss_avg_res_Y > 0) {
var_res_Y = ss_avg_res_Y / df_ss_avg_res_Y;
} else {
var_res_Y = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
}
R2_nobias = 1 - ss_avg_res_Y / ss_avg_tot_Y;
adjust_R2_nobias = 1 - var_res_Y / var_tot_Y;
R2 = 1 - ss_res_Y / ss_avg_tot_Y;
if (df_ss_res_Y > 0) {
adjust_R2 = 1 - (ss_res_Y / df_ss_res_Y) / var_tot_Y;
} else {
adjust_R2 = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
}
predicted_avg_var_res_Y = dispersion * colSums (vars) / sum (Y_counts);
# PREPARING THE OUTPUT CSV STATISTICS FILE
str = "LOGLHOOD_Z,,FALSE," + Z_logl;
str = append (str, "LOGLHOOD_Z_PVAL,,FALSE," + Z_logl_pValue);
str = append (str, "PEARSON_X2,,FALSE," + X2_pearson);
str = append (str, "PEARSON_X2_BY_DF,,FALSE," + (X2_pearson / df_pearson));
str = append (str, "PEARSON_X2_PVAL,,FALSE," + X2_pearson_pValue);
str = append (str, "DEVIANCE_G2,,FALSE," + G2_deviance);
str = append (str, "DEVIANCE_G2_BY_DF,,FALSE," + (G2_deviance / df_deviance));
str = append (str, "DEVIANCE_G2_PVAL,,FALSE," + G2_deviance_pValue);
str = append (str, "LOGLHOOD_Z,,TRUE," + Z_logl_scaled);
str = append (str, "LOGLHOOD_Z_PVAL,,TRUE," + Z_logl_scaled_pValue);
str = append (str, "PEARSON_X2,,TRUE," + X2_scaled);
str = append (str, "PEARSON_X2_BY_DF,,TRUE," + (X2_scaled / df_pearson));
str = append (str, "PEARSON_X2_PVAL,,TRUE," + X2_scaled_pValue);
str = append (str, "DEVIANCE_G2,,TRUE," + G2_scaled);
str = append (str, "DEVIANCE_G2_BY_DF,,TRUE," + (G2_scaled / df_deviance));
str = append (str, "DEVIANCE_G2_PVAL,,TRUE," + G2_scaled_pValue);
for (i in 1:ncol(Y)) {
str = append (str, "AVG_TOT_Y," + i + ",," + as.scalar (avg_tot_Y [1, i]));
str = append (str, "STDEV_TOT_Y," + i + ",," + as.scalar (sqrt (var_tot_Y [1, i])));
str = append (str, "AVG_RES_Y," + i + ",," + as.scalar (avg_res_Y [1, i]));
str = append (str, "STDEV_RES_Y," + i + ",," + as.scalar (sqrt (var_res_Y [1, i])));
str = append (str, "PRED_STDEV_RES," + i + ",TRUE," + as.scalar (sqrt (predicted_avg_var_res_Y [1, i])));
str = append (str, "R2," + i + ",," + as.scalar (R2 [1, i]));
str = append (str, "ADJUSTED_R2," + i + ",," + as.scalar (adjust_R2 [1, i]));
str = append (str, "R2_NOBIAS," + i + ",," + as.scalar (R2_nobias [1, i]));
str = append (str, "ADJUSTED_R2_NOBIAS," + i + ",," + as.scalar (adjust_R2_nobias [1, i]));
}
if (fileO != " ") {
write (str, fileO);
} else {
print (str);
}
}
glm_means_and_vars =
function (Matrix[double] linear_terms, int dist_type, double var_power, int link_type, double link_power)
return (Matrix[double] means, Matrix[double] vars)
# NOTE: "vars" represents the variance without dispersion, i.e. the V(mu) function.
{
num_points = nrow (linear_terms);
if (dist_type == 1 & link_type == 1) {
# POWER DISTRIBUTION
if (link_power == 0) {
y_mean = exp (linear_terms);
} else if (link_power == 1.0) {
y_mean = linear_terms;
} else if (link_power == -1.0) {
y_mean = 1.0 / linear_terms;
} else {
y_mean = linear_terms ^ (1.0 / link_power);
}
if (var_power == 0) {
var_function = matrix (1.0, rows = num_points, cols = 1);
} else if (var_power == 1.0) {
var_function = y_mean;
} else {
var_function = y_mean ^ var_power;
}
means = y_mean;
vars = var_function;
} else if (dist_type == 2 & link_type >= 1 & link_type <= 5) {
# BINOMIAL/BERNOULLI DISTRIBUTION
y_prob = matrix (0.0, rows = num_points, cols = 2);
if (link_type == 1 & link_power == 0) { # Binomial.log
y_prob [, 1] = exp (linear_terms);
y_prob [, 2] = 1.0 - y_prob [, 1];
} else if (link_type == 1 & link_power != 0) { # Binomial.power_nonlog
y_prob [, 1] = linear_terms ^ (1.0 / link_power);
y_prob [, 2] = 1.0 - y_prob [, 1];
} else if (link_type == 2) { # Binomial.logit
elt = exp (linear_terms);
y_prob [, 1] = elt / (1.0 + elt);
y_prob [, 2] = 1.0 / (1.0 + elt);
} else if (link_type == 3) { # Binomial.probit
sign_lt = 2 * (linear_terms >= 0) - 1;
t_gp = 1.0 / (1.0 + abs (linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0)
erf_corr =
t_gp * ( 0.254829592
+ t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+ t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+ t_gp * (-1.453152027
+ t_gp * 1.061405429)))) * sign_lt * exp (- (linear_terms ^ 2) / 2.0);
y_prob [, 1] = (1 + sign_lt) - erf_corr;
y_prob [, 2] = (1 - sign_lt) + erf_corr;
y_prob = y_prob / 2;
} else if (link_type == 4) { # Binomial.cloglog
elt = exp (linear_terms);
is_too_small = ((10000000 + elt) == 10000000);
y_prob [, 2] = exp (- elt);
y_prob [, 1] = (1 - is_too_small) * (1.0 - y_prob [, 2]) + is_too_small * elt * (1.0 - elt / 2);
} else if (link_type == 5) { # Binomial.cauchit
atan_linear_terms = atan (linear_terms);
y_prob [, 1] = 0.5 + atan_linear_terms / 3.1415926535897932384626433832795;
y_prob [, 2] = 0.5 - atan_linear_terms / 3.1415926535897932384626433832795;
}
means = y_prob;
ones_ctg = matrix (1, rows = 2, cols = 1);
vars = means * (means %*% (1 - diag (ones_ctg)));
} else if (dist_type == 3) {
# MULTINOMIAL LOGIT DISTRIBUTION
elt = exp (linear_terms);
ones_pts = matrix (1, rows = num_points, cols = 1);
elt = cbind (elt, ones_pts);
ones_ctg = matrix (1, rows = ncol (elt), cols = 1);
means = elt / (rowSums (elt) %*% t(ones_ctg));
vars = means * (means %*% (1 - diag (ones_ctg)));
} else {
means = matrix (0.0, rows = num_points, cols = 1);
vars = matrix (0.0, rows = num_points, cols = 1);
} }
glm_partial_loglikelihood_for_power_dist_and_link = # Assumes: dist_type == 1 & link_type == 1
function (Matrix[double] linear_terms, Matrix[double] Y, double var_power, double link_power)
return (Matrix[double] log_l_part)
{
num_records = nrow (Y);
if (var_power == 1.0) { # Poisson
if (link_power == 0) { # Poisson.log
is_natural_parameter_log_zero = (linear_terms == -1.0/0.0);
natural_parameters = replace (target = linear_terms, pattern = -1.0/0.0, replacement = 0);
b_cumulant = exp (linear_terms);
} else { # Poisson.power_nonlog
is_natural_parameter_log_zero = (linear_terms == 0);
natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
}
is_minus_infinity = (Y > 0) * is_natural_parameter_log_zero;
log_l_part = Y * natural_parameters - b_cumulant - is_minus_infinity / (1 - is_minus_infinity);
} else {
if (var_power == 2.0 & link_power == 0) { # Gamma.log
natural_parameters = - exp (- linear_terms);
b_cumulant = linear_terms;
} else if (var_power == 2.0) { # Gamma.power_nonlog
natural_parameters = - linear_terms ^ (- 1.0 / link_power);
b_cumulant = log (linear_terms) / link_power;
} else if (link_power == 0) { # PowerDist.log
natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
} else { # PowerDist.power_nonlog
power_np = (1.0 - var_power) / link_power;
natural_parameters = (linear_terms ^ power_np) / (1.0 - var_power);
power_cu = (2.0 - var_power) / link_power;
b_cumulant = (linear_terms ^ power_cu) / (2.0 - var_power);
}
log_l_part = Y * natural_parameters - b_cumulant;
} }