scripts/algorithms/StepGLM.dml
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGORITHM USING AIC
# EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read the matrix X of feature vectors
# Y String --- Location to read response matrix Y with 1 column
# B String --- Location to store estimated regression parameters (the betas)
# S String --- Location to write the selected features ordered as computed by the algorithm
# O String " " Location to write the printed statistics; the default is standard output
# link Int 2 Link function code: 1 = log, 2 = logit, 3 = probit, 4 = cloglog
# yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0
# icpt Int 0 Intercept presence, X columns shifting and rescaling:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# tol Double 0.000001 Tolerance (epsilon)
# disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data
# moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations
# mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum
# thr Double 0.01 Threshold to stop the algorithm: if the decrease in AIC falls below thr,
# no further features are checked and the algorithm stops
# fmt String "text" The betas matrix output format, such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT: Matrix beta, whose size depends on icpt:
# icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2
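# Note (consistent with the code below, which emits cbind (ssX_beta, beta)): with icpt=2 the
# first output column maps the estimates back to the original X scale, while the second
# column holds the estimates for the shifted/rescaled X.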
#
# In addition, the last run of GLM provides some statistics in CSV format, one comma-separated
# name-value pair per line, as follows:
#
# NAME MEANING
# -------------------------------------------------------------------------------------------
# TERMINATION_CODE A positive integer indicating success/failure as follows:
# 1 = Converged successfully; 2 = Maximum number of iterations reached;
# 3 = Input (X, Y) out of range; 4 = Distribution/link is not supported
# BETA_MIN Smallest beta value (regression coefficient), excluding the intercept
# BETA_MIN_INDEX Column index for the smallest beta value
# BETA_MAX Largest beta value (regression coefficient), excluding the intercept
# BETA_MAX_INDEX Column index for the largest beta value
# INTERCEPT Intercept value, or NaN if there is no intercept (if icpt=0)
# DISPERSION Dispersion used to scale deviance, provided as "disp" input parameter
# or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0
# DISPERSION_EST Dispersion estimated from the dataset
# DEVIANCE_UNSCALED Deviance from the saturated model, assuming dispersion == 1.0
# DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value
# -------------------------------------------------------------------------------------------
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f StepGLM.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
# S=OUTPUT_DIR_S/selected O=OUTPUT_DIR/stats link=2 yneg=-1.0 icpt=2 tol=0.00000001
# disp=1.0 moi=100 mii=10 thr=0.01 fmt=csv
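#
# ALTERNATIVE INVOCATION VIA SPARK (jar name and paths are illustrative; adjust to your deployment):
# spark-submit SystemML.jar -f StepGLM.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
# S=OUTPUT_DIR_S/selected O=OUTPUT_DIR/stats link=2 icpt=2 thr=0.01 fmt=csv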
#
# THE StepGLM SCRIPT CURRENTLY SUPPORTS THE BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY:
# - LOG
# - LOGIT
# - PROBIT
# - CLOGLOG
fileX = $X;
fileY = $Y;
fileB = $B;
intercept_status = ifdef ($icpt, 0);
thr = ifdef ($thr, 0.01);
bernoulli_No_label = ifdef ($yneg, 0.0); # $yneg = 0.0;
distribution_type = 2;
bernoulli_No_label = as.double (bernoulli_No_label);
# Currently only the forward selection strategy is supported: start from one feature and
# iteratively add features as long as AIC improves
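# (Pass 1 below scores each candidate single-feature model in a parfor; each subsequent pass
# appends one not-yet-selected feature at a time to the best model found so far.)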
dir = "forward";
print("BEGIN STEPWISE GLM SCRIPT");
print ("Reading X and Y...");
X_orig = read (fileX);
Y = read (fileY);
if (distribution_type == 2 & ncol(Y) == 1) {
is_Y_negative = (Y == bernoulli_No_label);
Y = cbind (1 - is_Y_negative, is_Y_negative);
count_Y_negative = sum (is_Y_negative);
if (count_Y_negative == 0) {
stop ("StepGLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label");
}
if (count_Y_negative == nrow(Y)) {
stop ("StepGLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label");
}
}
num_records = nrow (X_orig);
num_features = ncol (X_orig);
# BEGIN STEPWISE GENERALIZED LINEAR MODELS
if (dir == "forward") {
continue = TRUE;
columns_fixed = matrix (0, rows = 1, cols = num_features);
columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
# X_global stores the best model found at each step
X_global = matrix (0, rows = num_records, cols = 1);
if (intercept_status == 0) {
# Compute AIC of an empty model with no features and no intercept (all linear predictors are zero)
[AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, " ");
} else {
# Compute AIC of an empty model with only an intercept (all linear predictors are constant)
all_ones = matrix (1, rows = num_records, cols = 1);
[AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, " ");
}
print ("Best AIC without any features: " + AIC_best);
# First pass to examine single features
AICs = matrix (AIC_best, rows = 1, cols = num_features);
parfor (i in 1:num_features) {
[AIC_1] = glm (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered, " ");
AICs[1,i] = AIC_1;
}
# Determine the best AIC
column_best = 0;
for (k in 1:num_features) {
AIC_cur = as.scalar (AICs[1,k]);
if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
column_best = k;
AIC_best = as.scalar(AICs[1,k]);
}
}
if (column_best == 0) {
print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
if (intercept_status == 0) {
# Compute AIC of an empty model with no features and no intercept (all linear predictors are zero)
[AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, fileB);
} else {
# Compute AIC of an empty model with only an intercept (all linear predictors are constant)
###all_ones = matrix (1, rows = num_records, cols = 1);
[AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, fileB);
}
} # when column_best == 0, glm () above writes the empty model to fileB and terminates the script via stop ("")
print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1,column_best] = 1;
columns_fixed_ordered[1,1] = column_best;
X_global = X_orig[,column_best];
while (continue) {
# Subsequent passes over the features
parfor (i in 1:num_features) {
if (as.scalar(columns_fixed[1,i]) == 0) {
# Construct the feature matrix
X = cbind (X_global, X_orig[,i]);
[AIC_2] = glm (X, Y, intercept_status, num_features, columns_fixed_ordered, " ");
AICs[1,i] = AIC_2;
}
}
# Determine the best AIC
for (k in 1:num_features) {
AIC_cur = as.scalar (AICs[1,k]);
if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
column_best = k;
AIC_best = as.scalar(AICs[1,k]);
}
}
# Append the best feature (i.e., column) found in this pass to X_global
if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1,column_best] = 1;
columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best));
X_global = cbind (X_global, X_orig[,column_best]);
if (ncol(columns_fixed_ordered) == num_features) { # all features examined
continue = FALSE;
}
} else {
continue = FALSE;
}
}
# run GLM with selected set of features
print ("Running GLM with selected features...");
[AIC] = glm (X_global, Y, intercept_status, num_features, columns_fixed_ordered, fileB);
} else {
stop ("Currently only forward selection strategy is supported!");
}
################### UDFS USED IN THIS SCRIPT ##################
glm = function (Matrix[Double] X, Matrix[Double] Y, Int intercept_status, Double num_features_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
# distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only the Bernoulli distribution family is supported!
distribution_type = 2; # $dfam = 2;
variance_as_power_of_the_mean = 0.0; # $vpow = 0.0;
# link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit;
# currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported!
link_type = ifdef ($link, 2); # $link = 2;
link_as_power_of_the_mean = 0.0; # $lpow = 0.0;
dispersion = ifdef ($disp, 0.0); # $disp = 0.0;
eps = ifdef ($tol, 0.000001); # $tol = 0.000001;
max_iteration_IRLS = ifdef ($moi, 200); # $moi = 200;
max_iteration_CG = ifdef ($mii, 0); # $mii = 0;
variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean);
link_as_power_of_the_mean = as.double (link_as_power_of_the_mean);
dispersion = as.double (dispersion);
eps = as.double (eps);
# Default values for output statistics:
regularization = 0.0;
termination_code = 0.0;
min_beta = 0.0 / 0.0; # 0.0 / 0.0 evaluates to NaN, used to mark "not yet computed"
i_min_beta = 0.0 / 0.0;
max_beta = 0.0 / 0.0;
i_max_beta = 0.0 / 0.0;
intercept_value = 0.0 / 0.0;
estimated_dispersion = 0.0 / 0.0;
deviance_nodisp = 0.0 / 0.0;
deviance = 0.0 / 0.0;
# (dispersion is not reset to NaN here, since it must keep the value read from $disp above)
##### INITIALIZE THE PARAMETERS #####
num_records = nrow (X);
num_features = ncol (X);
zeros_r = matrix (0, rows = num_records, cols = 1);
ones_r = 1 + zeros_r;
# Introduce the intercept, shift and rescale the columns of X if needed
if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
X = cbind (X, ones_r);
num_features = ncol (X);
}
scale_lambda = matrix (1, rows = num_features, cols = 1);
if (intercept_status == 1 | intercept_status == 2) {
scale_lambda [num_features, 1] = 0;
}
if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
# Important assumption: X [, num_features] = ones_r
avg_X_cols = t(colSums(X)) / num_records;
var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
is_unsafe = (var_X_cols <= 0);
scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
scale_X [num_features, 1] = 1;
shift_X = - avg_X_cols * scale_X;
shift_X [num_features, 1] = 0;
rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
} else {
scale_X = matrix (1, rows = num_features, cols = 1);
shift_X = matrix (0, rows = num_features, cols = 1);
rowSums_X_sq = rowSums (X ^ 2);
}
# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
# The transform is then associatively applied to the other side of the expression,
# and is rewritten via "scale_X" and "shift_X" as follows:
#
# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
# ssX_A = diag (scale_X) %*% A;
# ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
#
# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
# tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];
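# A minimal sanity sketch of the rewrite, kept as a comment (the *_chk names are illustrative
# and not part of the algorithm); it builds the dense transform T explicitly and compares:
# T_chk = diag (scale_X);
# T_chk [num_features, ] = T_chk [num_features, ] + t(shift_X);
# A_chk = matrix (1, rows = num_features, cols = 3);
# direct_chk = T_chk %*% A_chk;
# rewrite_chk = diag (scale_X) %*% A_chk;
# rewrite_chk [num_features, ] = rewrite_chk [num_features, ] + t(shift_X) %*% A_chk;
# # sum (abs (direct_chk - rewrite_chk)) should be 0 up to roundoff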
# Initialize other input-dependent parameters
lambda = scale_lambda * regularization;
if (max_iteration_CG == 0) {
max_iteration_CG = num_features;
}
# Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const]
if (link_type == 0) {
if (distribution_type == 1) {
link_type = 1;
link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
} else {
if (distribution_type == 2) {
link_type = 2;
}
}
}
# For power distributions and/or links, we use two constants,
# "variance as power of the mean" and "link_as_power_of_the_mean",
# to specify the variance and the link as arbitrary powers of the
# mean. However, the variance-powers of 1.0 (Poisson family) and
# 2.0 (Gamma family) have to be treated as special cases, because
# these values integrate into logarithms. The link-power of 0.0
# is also special as it represents the logarithm link.
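# For reference (standard exponential-family GLM theory, not specific to this script): with
# natural parameter theta and cumulant b(theta), the per-record log-likelihood is
# log f(y; theta) = y * theta - b(theta) + const,
# which is what glm_log_likelihood_part () accumulates via sum (Y * natural_parameters - b_cumulant).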
num_response_columns = ncol (Y);
is_supported = 0;
if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION
is_supported = 1;
}
if (num_response_columns == 1 & distribution_type == 2) {
print ("Error: Bernoulli response matrix has not been converted into two-column format.");
}
if (is_supported == 1) {
##### INITIALIZE THE BETAS #####
[beta, saturated_log_l, isNaN] =
glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);
# print(" --- saturated logLik " + saturated_log_l);
if (isNaN == 0) {
##### START OF THE MAIN PART #####
sum_X_sq = sum (rowSums_X_sq);
trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
### max_trust_delta = trust_delta * 10000.0;
log_l = 0.0;
deviance_nodisp = 0.0;
new_deviance_nodisp = 0.0;
isNaN_log_l = 2;
newbeta = beta;
g = matrix (0.0, rows = num_features, cols = 1);
g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
accept_new_beta = 1;
reached_trust_boundary = 0;
neg_log_l_change_predicted = 0.0;
i_IRLS = 0;
# print ("BEGIN IRLS ITERATIONS...");
ssX_newbeta = diag (scale_X) %*% newbeta;
ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
all_linear_terms = X %*% ssX_newbeta;
[new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
(all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
if (isNaN_new_log_l == 0) {
new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
}
while (termination_code == 0) {
accept_new_beta = 1;
if (i_IRLS > 0) {
if (isNaN_log_l == 0) {
accept_new_beta = 0;
}
# Decide whether to accept a new iteration point and update the trust region
# See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
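# Here rho compares the actual change in the negative log-likelihood to the change predicted
# by the local quadratic model (both negative for a good step), so rho near 1 means the
# quadratic model is trustworthy and the trust region may grow.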
rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
if (rho < 0.25 | isNaN_new_log_l == 1) {
trust_delta = 0.25 * trust_delta;
}
if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
trust_delta = 2 * trust_delta;
### if (trust_delta > max_trust_delta) {
### trust_delta = max_trust_delta;
### }
}
if (rho > 0.1 & isNaN_new_log_l == 0) {
accept_new_beta = 1;
}
}
if (accept_new_beta == 1) {
beta = newbeta; log_l = new_log_l; deviance_nodisp = new_deviance_nodisp; isNaN_log_l = isNaN_new_log_l;
[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
# We introduced these variables to avoid roundoff errors:
# g_Y = y_residual / (y_var * link_grad);
# w = 1.0 / (y_var * link_grad * link_grad);
gXY = - t(X) %*% g_Y;
g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
}
[z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] =
get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);
newbeta = beta + z;
ssX_newbeta = diag (scale_X) %*% newbeta;
ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
all_linear_terms = X %*% ssX_newbeta;
[new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
(all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
if (isNaN_new_log_l == 0) {
new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
}
log_l_change = new_log_l - log_l; # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps
if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 &
(2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) {
termination_code = 1;
}
rho = - log_l_change / neg_log_l_change_predicted;
z_norm = sqrt (sum (z * z));
i_IRLS = i_IRLS + 1;
if (i_IRLS == max_iteration_IRLS) {
termination_code = 2;
}
}
beta = newbeta;
log_l = new_log_l;
deviance_nodisp = new_deviance_nodisp;
#---------------------------- last part
if (termination_code != 1) {
print ("One of the runs of GLM did not converged in " + i_IRLS + " steps!");
}
##### COMPUTE AIC #####
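# AIC = 2 * (number of estimated coefficients) - 2 * log-likelihood; the sum (X) != 0 guard
# below skips the 2k penalty for the all-zero placeholder "empty model" used as the baseline.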
if (distribution_type == 2 & link_type >= 1 & link_type <= 4) {
AIC = -2 * log_l;
if (sum (X) != 0) {
AIC = AIC + 2 * num_features;
}
} else {
stop ("Currently only the Bernoulli distribution family the following link functions are supported: log, logit, probit, and cloglog!");
}
if (fileB != " ") {
fileO = ifdef ($O, " ");
fileS = $S;
fmt = ifdef ($fmt, "text");
# Output which features give the best AIC and are used in the final GLM
write (Selected, fileS, format=fmt);
ssX_beta = diag (scale_X) %*% beta;
ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
if (intercept_status == 2) {
beta_out = cbind (ssX_beta, beta);
} else {
beta_out = ssX_beta;
}
if (intercept_status == 0 & num_features == 1) {
p = sum (X == 1);
if (p == num_records) {
beta_out = beta_out[1,];
}
}
if (intercept_status == 1 | intercept_status == 2) {
intercept_value = as.scalar (beta_out [num_features, 1]);
beta_noicept = beta_out [1 : (num_features - 1), 1];
} else {
beta_noicept = beta_out [1 : num_features, 1];
}
min_beta = min (beta_noicept);
max_beta = max (beta_noicept);
tmp_i_min_beta = rowIndexMin (t(beta_noicept));
i_min_beta = as.scalar (tmp_i_min_beta [1, 1]);
tmp_i_max_beta = rowIndexMax (t(beta_noicept));
i_max_beta = as.scalar (tmp_i_max_beta [1, 1]);
##### OVER-DISPERSION PART #####
all_linear_terms = X %*% ssX_beta;
[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
pearson_residual_sq = g_Y ^ 2 / w;
pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0);
# pearson_residual_sq = (y_residual ^ 2) / y_var;
if (num_records > num_features) {
estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
}
if (dispersion <= 0) {
dispersion = estimated_dispersion;
}
deviance = deviance_nodisp / dispersion;
##### END OF THE MAIN PART #####
str = "BETA_MIN," + min_beta;
str = append (str, "BETA_MIN_INDEX," + i_min_beta);
str = append (str, "BETA_MAX," + max_beta);
str = append (str, "BETA_MAX_INDEX," + i_max_beta);
str = append (str, "INTERCEPT," + intercept_value);
str = append (str, "DISPERSION," + dispersion);
str = append (str, "DISPERSION_EST," + estimated_dispersion);
str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp);
str = append (str, "DEVIANCE_SCALED," + deviance);
if (fileO != " ") {
write (str, fileO);
}
else {
print (str);
}
# Prepare the output matrix
print ("Writing the output matrix...");
if (intercept_status == 0 & num_features == 1) {
if (p == num_records) {
beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1);
beta_out_tmp[num_features_orig + 1,] = beta_out;
beta_out = beta_out_tmp;
write (beta_out, fileB, format=fmt);
stop ("");
} else if (sum (X) == 0){
beta_out = matrix (0, rows = num_features_orig, cols = 1);
write (beta_out, fileB, format=fmt);
stop ("");
}
}
no_selected = ncol (Selected);
max_selected = max (Selected);
last = max_selected + 1;
if (intercept_status != 0) {
Selected_ext = cbind (Selected, as.matrix (last));
P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext));
if (intercept_status == 2) {
P1_ssX_beta = P1 * ssX_beta;
P2_ssX_beta = colSums (P1_ssX_beta);
P1_beta = P1 * beta;
P2_beta = colSums (P1_beta);
if (max_selected < num_features_orig) {
P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1];
P2_ssX_beta[1, max_selected + 1] = 0;
P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1];
P2_beta[1, max_selected + 1] = 0;
}
beta_out = cbind (t(P2_ssX_beta), t(P2_beta));
} else {
P1_beta = P1 * beta;
P2_beta = colSums (P1_beta);
if (max_selected < num_features_orig) {
P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1];
P2_beta[1, max_selected + 1] = 0;
}
beta_out = t(P2_beta);
}
} else {
P1 = table (seq (1, no_selected), t(Selected));
P1_beta = P1 * beta;
P2_beta = colSums (P1_beta);
if (max_selected < num_features_orig) {
P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
}
beta_out = t(P2_beta);
}
write (beta_out, fileB, format=fmt);
}
} else {
stop ("Input matrices X and/or Y are out of range!");
}
} else {
stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean
+ ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together.");
}
}
glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
return (Matrix[double] beta, double saturated_log_l, int isNaN)
{
saturated_log_l = 0.0;
isNaN = 0;
y_corr = Y [, 1];
if (dist_type == 2) {
n_corr = rowSums (Y);
is_n_zero = (n_corr == 0);
y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;
}
linear_terms = y_corr;
if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
if (link_power == 0) {
if (sum (y_corr < 0) == 0) {
is_zero_y_corr = (y_corr == 0);
linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
} else { isNaN = 1; }
} else { if (link_power == 1.0) {
linear_terms = y_corr;
} else { if (link_power == -1.0) {
linear_terms = 1.0 / y_corr;
} else { if (link_power == 0.5) {
if (sum (y_corr < 0) == 0) {
linear_terms = sqrt (y_corr);
} else { isNaN = 1; }
} else { if (link_power > 0) {
if (sum (y_corr < 0) == 0) {
is_zero_y_corr = (y_corr == 0);
linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
} else { isNaN = 1; }
} else {
if (sum (y_corr <= 0) == 0) {
linear_terms = y_corr ^ link_power;
} else { isNaN = 1; }
}}}}}
}
if (dist_type == 2 & link_type >= 1 & link_type <= 5)
{ # BINOMIAL/BERNOULLI DISTRIBUTION
if (link_type == 1 & link_power == 0) { # Binomial.log
if (sum (y_corr < 0) == 0) {
is_zero_y_corr = (y_corr == 0);
linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
} else { isNaN = 1; }
} else { if (link_type == 1 & link_power > 0) { # Binomial.power_nonlog pos
if (sum (y_corr < 0) == 0) {
is_zero_y_corr = (y_corr == 0);
linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
} else { isNaN = 1; }
} else { if (link_type == 1) { # Binomial.power_nonlog neg
if (sum (y_corr <= 0) == 0) {
linear_terms = y_corr ^ link_power;
} else { isNaN = 1; }
} else {
is_zero_y_corr = (y_corr <= 0);
is_one_y_corr = (y_corr >= 1.0);
y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
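# Note: for boundary responses (y_corr at 0 or 1) the terms is_*_y_corr / (1 - is_*_y_corr)
# below deliberately evaluate to x/0 = +/- infinity, so those records get saturated linear terms.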
if (link_type == 2) { # Binomial.logit
linear_terms = log (y_corr / (1.0 - y_corr))
+ is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
} else { if (link_type == 3) { # Binomial.probit
y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5);
t = sqrt (- 2.0 * log (y_below_half));
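# Rational approximation of the inverse Gaussian CDF from Abramowitz & Stegun,
# Sec. 26.2.23 (absolute error below ~4.5e-4)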
approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5))
+ is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
} else { if (link_type == 4) { # Binomial.cloglog
linear_terms = log (- log (1.0 - y_corr))
- log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
+ is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
} else { if (link_type == 5) { # Binomial.cauchit
linear_terms = tan ((y_corr - 0.5) * 3.1415926535897932384626433832795)
+ is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
}} }}}}}
}
if (isNaN == 0) {
[saturated_log_l, isNaN] =
glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
}
if ((dist_type == 1 & link_type == 1 & link_power == 0) |
(dist_type == 2 & link_type >= 2))
{
desired_eta = 0.0;
} else { if (link_type == 1 & link_power == 0) {
desired_eta = log (0.5);
} else { if (link_type == 1) {
desired_eta = 0.5 ^ link_power;
} else {
desired_eta = 0.5;
}}}
beta = matrix (0.0, rows = ncol(X), cols = 1);
if (desired_eta != 0) {
if (icept_status == 1 | icept_status == 2) {
beta [nrow(beta), 1] = desired_eta;
} else {
# We want: avg (X %*% ssX_transform %*% beta) = desired_eta
# Note that "ssX_transform" is trivial here, hence ignored
beta = straightenX (X, 0.000001, max_iter_CG);
beta = beta * desired_eta;
} } }
glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
int dist_type, double var_power, int link_type, double link_power)
return (Matrix[double] g_Y, Matrix[double] w)
# ORIGINALLY we returned more meaningful vectors, namely:
# Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted
# Matrix[double] link_gradient : derivative of the link function
# Matrix[double] var_function : variance without dispersion, i.e. the V(mu) function
# BUT, this caused roundoff errors, so we had to compute "directly useful" vectors
# and skip over the "meaningful intermediaries". Now we output these two variables:
# g_Y = y_residual / (var_function * link_gradient);
# w = 1.0 / (var_function * link_gradient ^ 2);
{
num_records = nrow (linear_terms);
zeros_r = matrix (0.0, rows = num_records, cols = 1);
ones_r = 1 + zeros_r;
g_Y = zeros_r;
w = zeros_r;
# Some constants
one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
ones_2 = matrix (1.0, rows = 1, cols = 2);
p_one_m_one = ones_2;
p_one_m_one [1, 2] = -1.0;
m_one_p_one = ones_2;
m_one_p_one [1, 1] = -1.0;
zero_one = ones_2;
zero_one [1, 1] = 0.0;
one_zero = ones_2;
one_zero [1, 2] = 0.0;
flip_pos = matrix (0, rows = 2, cols = 2);
flip_neg = flip_pos;
flip_pos [1, 2] = 1;
flip_pos [2, 1] = 1;
flip_neg [1, 2] = -1;
flip_neg [2, 1] = 1;
if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
y_mean = zeros_r;
if (link_power == 0) {
y_mean = exp (linear_terms);
y_mean_pow = y_mean ^ (1 - var_power);
w = y_mean_pow * y_mean;
g_Y = y_mean_pow * (Y - y_mean);
} else { if (link_power == 1.0) {
y_mean = linear_terms;
w = y_mean ^ (- var_power);
g_Y = w * (Y - y_mean);
} else {
y_mean = linear_terms ^ (1.0 / link_power);
c1 = (1 - var_power) / link_power - 1;
c2 = (2 - var_power) / link_power - 2;
g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
w = (linear_terms ^ c2) / (link_power ^ 2);
} }}
if (dist_type == 2 & link_type >= 1 & link_type <= 5)
{ # BINOMIAL/BERNOULLI DISTRIBUTION
if (link_type == 1) { # BINOMIAL.POWER LINKS
if (link_power == 0) { # Binomial.log
vec1 = 1 / (exp (- linear_terms) - 1);
g_Y = Y [, 1] - Y [, 2] * vec1;
w = rowSums (Y) * vec1;
} else { # Binomial.nonlog
vec1 = zeros_r;
if (link_power == 0.5) {
vec1 = 1 / (1 - linear_terms ^ 2);
} else { if (sum (linear_terms < 0) == 0) {
vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
} else {isNaN = 1;}}
# We want a "zero-protected" version of
# vec2 = Y [, 1] / linear_terms;
is_y_0 = (Y [, 1] == 0);
vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
w = rowSums (Y) * vec1 / link_power ^ 2;
}
} else {
is_LT_pos_infinite = (linear_terms == 1.0/0.0);
is_LT_neg_infinite = (linear_terms == -1.0/0.0);
is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
finite_linear_terms = replace (target = linear_terms, pattern = 1.0/0.0, replacement = 0);
finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0);
if (link_type == 2) { # Binomial.logit
Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual;
w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance;
} else { if (link_type == 3) { # Binomial.probit
is_lt_pos = (linear_terms >= 0);
t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0)
pt_gp = t_gp * ( 0.254829592
+ t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+ t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+ t_gp * (-1.453152027
+ t_gp * 1.061405429))));
the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
g_Y = one_over_sqrt_two_pi * vec2 / vec1;
} else { if (link_type == 4) { # Binomial.cloglog
the_exp = exp (linear_terms);
the_exp_exp = exp (- the_exp);
is_too_small = ((10000000 + the_exp) == 10000000);
the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
} else { if (link_type == 5) { # Binomial.cauchit
Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
link_gradient_normalized = (1 + linear_terms ^ 2) * 3.1415926535897932384626433832795;
g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
}}}}
}
}
}
glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y,
int dist_type, double var_power, int link_type, double link_power)
return (double log_l, int isNaN)
{
isNaN = 0;
log_l = 0.0;
num_records = nrow (Y);
zeros_r = matrix (0.0, rows = num_records, cols = 1);
if (dist_type == 1 & link_type == 1)
{ # POWER DISTRIBUTION
b_cumulant = zeros_r;
natural_parameters = zeros_r;
is_natural_parameter_log_zero = zeros_r;
if (var_power == 1.0 & link_power == 0) { # Poisson.log
b_cumulant = exp (linear_terms);
is_natural_parameter_log_zero = (linear_terms == -1.0/0.0);
natural_parameters = replace (target = linear_terms, pattern = -1.0/0.0, replacement = 0);
} else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id
if (sum (linear_terms < 0) == 0) {
b_cumulant = linear_terms;
is_natural_parameter_log_zero = (linear_terms == 0);
natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
} else {isNaN = 1;}
} else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt
if (sum (linear_terms < 0) == 0) {
b_cumulant = linear_terms ^ 2;
is_natural_parameter_log_zero = (linear_terms == 0);
natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
} else {isNaN = 1;}
} else { if (var_power == 1.0 & link_power > 0) { # Poisson.power_nonlog, pos
if (sum (linear_terms < 0) == 0) {
is_natural_parameter_log_zero = (linear_terms == 0);
b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
} else {isNaN = 1;}
} else { if (var_power == 1.0) { # Poisson.power_nonlog, neg
if (sum (linear_terms <= 0) == 0) {
b_cumulant = linear_terms ^ (1.0 / link_power);
natural_parameters = log (linear_terms) / link_power;
} else {isNaN = 1;}
} else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
if (sum (linear_terms <= 0) == 0) {
b_cumulant = - log (linear_terms);
natural_parameters = - linear_terms;
} else {isNaN = 1;}
} else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id
if (sum (linear_terms <= 0) == 0) {
b_cumulant = log (linear_terms);
natural_parameters = - 1.0 / linear_terms;
} else {isNaN = 1;}
} else { if (var_power == 2.0 & link_power == 0) { # Gamma.log
b_cumulant = linear_terms;
natural_parameters = - exp (- linear_terms);
} else { if (var_power == 2.0) { # Gamma.power_nonlog
if (sum (linear_terms <= 0) == 0) {
b_cumulant = log (linear_terms) / link_power;
natural_parameters = - linear_terms ^ (- 1.0 / link_power);
} else {isNaN = 1;}
} else { if (link_power == 0) { # PowerDist.log
natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
} else { # PowerDist.power_nonlog
if (-2 * link_power == 1.0 - var_power) {
natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
} else { if (-1 * link_power == 1.0 - var_power) {
natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
} else { if ( link_power == 1.0 - var_power) {
natural_parameters = linear_terms / (1.0 - var_power);
} else { if ( 2 * link_power == 1.0 - var_power) {
natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
} else {
if (sum (linear_terms <= 0) == 0) {
power = (1.0 - var_power) / link_power;
natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
} else {isNaN = 1;}
}}}}
if (-2 * link_power == 2.0 - var_power) {
b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
} else { if (-1 * link_power == 2.0 - var_power) {
b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
} else { if ( link_power == 2.0 - var_power) {
b_cumulant = linear_terms / (2.0 - var_power);
} else { if ( 2 * link_power == 2.0 - var_power) {
b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
} else {
if (sum (linear_terms <= 0) == 0) {
power = (2.0 - var_power) / link_power;
b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
} else {isNaN = 1;}
}}}}
}}}}} }}}}}
if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) {
log_l = -1.0 / 0.0;
isNaN = 1;
}
if (isNaN == 0)
{
log_l = sum (Y * natural_parameters - b_cumulant);
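# Guard against NaN (only NaN satisfies x != x) and +/- infinity (only infinities
# satisfy both x == x + 1.0 and x == x * 2.0)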
if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
log_l = -1.0 / 0.0;
isNaN = 1;
} } }
if (dist_type == 2 & link_type >= 1 & link_type <= 5)
{ # BINOMIAL/BERNOULLI DISTRIBUTION
[Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);
if (isNaN == 0) {
does_prob_contradict = (Y_prob <= 0);
if (sum (does_prob_contradict * abs (Y)) == 0) {
log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
isNaN = 1;
}
} else {
log_l = -1.0 / 0.0;
isNaN = 1;
} } }
if (isNaN == 1) {
log_l = - 1.0 / 0.0;
}
}
binomial_probability_two_column =
function (Matrix[double] linear_terms, int link_type, double link_power)
return (Matrix[double] Y_prob, int isNaN)
{
isNaN = 0;
num_records = nrow (linear_terms);
# Define some auxiliary matrices
ones_2 = matrix (1.0, rows = 1, cols = 2);
p_one_m_one = ones_2;
p_one_m_one [1, 2] = -1.0;
m_one_p_one = ones_2;
m_one_p_one [1, 1] = -1.0;
zero_one = ones_2;
zero_one [1, 1] = 0.0;
one_zero = ones_2;
one_zero [1, 2] = 0.0;
zeros_r = matrix (0.0, rows = num_records, cols = 1);
ones_r = 1.0 + zeros_r;
# Begin the function body
Y_prob = zeros_r %*% ones_2;
if (link_type == 1) { # Binomial.power
if (link_power == 0) { # Binomial.log
Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;
} else { if (link_power == 0.5) { # Binomial.sqrt
Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;
} else { # Binomial.power_nonlog
if (sum (linear_terms < 0) == 0) {
Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;
} else {isNaN = 1;}
}}
} else { # Binomial.non_power
is_LT_pos_infinite = (linear_terms == 1.0/0.0);
is_LT_neg_infinite = (linear_terms == -1.0/0.0);
is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
finite_linear_terms = replace (target = linear_terms, pattern = 1.0/0.0, replacement = 0);
finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0);
if (link_type == 2) { # Binomial.logit
Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
} else { if (link_type == 3) { # Binomial.probit
lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one;
t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0)
pt_gp = t_gp * ( 0.254829592
+ t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+ t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+ t_gp * (-1.453152027
+ t_gp * 1.061405429))));
the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
} else { if (link_type == 4) { # Binomial.cloglog
the_exp = exp (finite_linear_terms);
the_exp_exp = exp (- the_exp);
is_too_small = ((10000000 + the_exp) == 10000000);
Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
Y_prob [, 2] = the_exp_exp;
} else { if (link_type == 5) { # Binomial.cauchit
Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
} else {
isNaN = 1;
}}}}
Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
} }
# THE CG-STEIHAUG PROCEDURE SCRIPT
# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize
# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z
# under constraint: ||z|| <= trust_delta.
# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform
# is given separately because sparse "X" may become dense after applying the transform.
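# The loop below exits in one of three ways: negative curvature is detected (pq_CG <= 0) or the
# step would leave the trust region, both of which truncate the step at ||z|| = trust_delta; or
# the residual drops below eps_CG (or the iteration cap is hit), in which case z stays interior.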
#
get_CG_Steihaug_point =
function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w,
Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG)
return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary)
{
trust_delta_sq = trust_delta ^ 2;
size_CG = nrow (g);
z = matrix (0.0, rows = size_CG, cols = 1);
neg_log_l_change = 0.0;
reached_trust_boundary = 0;
g_reg = g + lambda * beta;
r_CG = g_reg;
p_CG = -r_CG;
rr_CG = sum(r_CG * r_CG);
eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
converged_CG = 0;
if (rr_CG < eps_CG) {
converged_CG = 1;
}
max_iteration_CG = max_iter_CG;
if (max_iteration_CG <= 0) {
max_iteration_CG = size_CG;
}
i_CG = 0;
while (converged_CG == 0)
{
i_CG = i_CG + 1;
ssX_p_CG = diag (scale_X) %*% p_CG;
ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
pq_CG = sum (p_CG * q_CG);
if (pq_CG <= 0) {
pp_CG = sum (p_CG * p_CG);
if (pp_CG > 0) {
[z, neg_log_l_change] =
get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
reached_trust_boundary = 1;
} else {
neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
}
converged_CG = 1;
}
if (converged_CG == 0) {
alpha_CG = rr_CG / pq_CG;
new_z = z + alpha_CG * p_CG;
if (sum(new_z * new_z) >= trust_delta_sq) {
pp_CG = sum (p_CG * p_CG);
[z, neg_log_l_change] =
get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
reached_trust_boundary = 1;
converged_CG = 1;
}
if (converged_CG == 0) {
z = new_z;
old_rr_CG = rr_CG;
r_CG = r_CG + alpha_CG * q_CG;
rr_CG = sum(r_CG * r_CG);
if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
reached_trust_boundary = 0;
converged_CG = 1;
}
if (converged_CG == 0) {
p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
} } } } }
# An auxiliary function used twice inside the CG-STEIHAUG loop:
get_trust_boundary_point =
function (Matrix[double] g, Matrix[double] z, Matrix[double] p,
Matrix[double] q, Matrix[double] r, double pp, double pq,
double trust_delta_sq)
return (Matrix[double] new_z, double f_change)
{
zz = sum (z * z); pz = sum (p * z);
sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
tau_1 = (- pz + sq_root_d) / pp;
tau_2 = (- pz - sq_root_d) / pp;
zq = sum (z * q); gp = sum (g * p);
f_extra = 0.5 * sum (z * (r + g));
f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1;
f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2;
ind1 = as.integer(f_change_1 < f_change_2);
ind2 = as.integer(f_change_1 >= f_change_2);
new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p);
f_change = ind1 * f_change_1 + ind2 * f_change_2;
}
# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1
# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X).
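# Below: plain conjugate gradient on the ridge-regularized normal equations
# (t(X) %*% X + lambda_LS * I) %*% z_LS = t(X) %*% 1, followed by rescaling z_LS
# so that sum (X %*% w) = nrow (X).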
straightenX =
function (Matrix[double] X, double eps, int max_iter_CG)
return (Matrix[double] w)
{
w_X = t(colSums(X));
lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
eps_LS = eps * nrow(X);
# BEGIN LEAST SQUARES
r_LS = - w_X;
z_LS = matrix (0.0, rows = ncol(X), cols = 1);
p_LS = - r_LS;
norm_r2_LS = sum (r_LS ^ 2);
i_LS = 0;
while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS)
{
q_LS = t(X) %*% X %*% p_LS;
q_LS = q_LS + lambda_LS * p_LS;
alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
z_LS = z_LS + alpha_LS * p_LS;
old_norm_r2_LS = norm_r2_LS;
r_LS = r_LS + alpha_LS * q_LS;
norm_r2_LS = sum (r_LS ^ 2);
p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
i_LS = i_LS + 1;
}
# END LEAST SQUARES
w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
}