scripts.algorithms.Cox.dml Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
# The Breslow method is used for handling ties and the regression parameters
# are computed using trust region newton method with conjugate gradient
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read the input matrix X containing the survival data containing the following information
# - 1: timestamps
# - 2: whether an event occurred (1) or data is censored (0)
# - 3: feature vectors
# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row)
# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model
# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing
# the start and end indices of the factors in X
# - R[,1]: start indices
# - R[,2]: end indices
# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X;
# in this case the start and end indices corresponding to the baseline level need to be the same;
# if R is not provided by default all variables are considered to be continuous
# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted
# Cox model (the betas), their standard errors, confidence intervals, and P-values
# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including
# no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare;
# by default is standard output
# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model;
# by default is standard output
# COV String --- Location to store the variance-covariance matrix of the betas
# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X
# XO String --- Location to store sorted input matrix by the timestamps
# MF String --- Location to store column indices of X excluding the baseline factors if available
# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas
# tol Double 0.000001 Tolerance ("epsilon")
# moi Int 100 Max. number of outer (Newton) iterations
# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max
# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
# M[,1]: betas
# M[,2]: exp(betas)
# M[,3]: standard error of betas
# M[,4]: Z
# M[,5]: P-value
# M[,6]: lower 100*(1-alpha)% confidence interval of betas
# M[,7]: upper 100*(1-alpha)% confidence interval of betas
#
# Two log files containing a summary of some statistics of the fitted model:
# 1- File S with the following format
# - line 1: no. of observations
# - line 2: no. of events
# - line 3: log-likelihood
# - line 4: AIC
# - line 5: Rsquare (Cox & Snell)
# - line 6: max possible Rsquare
# 2- File T with the following format
# - line 1: Likelihood ratio test statistic, degree of freedom, P-value
# - line 2: Wald test statistic, degree of freedom, P-value
# - line 3: Score (log-rank) test statistic, degree of freedom, P-value
#
# Additionally, the following matrices are stored (needed for prediction)
# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
# 2- Matrix XO which is matrix X with sorted timestamps
# 3- Variance-covariance matrix of the betas COV
# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
# -------------------------------------------------------------------------------------------
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R
# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT
# XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv
fileX = $X;
fileTE = $TE;
fileRT = $RT;
fileMF = $MF;
fileM = $M;
fileXO = $XO;
fileCOV = $COV;
# Default values of some parameters
fileF = ifdef ($F, " "); # $F=" "
fileR = ifdef ($R, " "); # $R=" "
fileS = ifdef ($S, " "); # $S=" "
fileT = ifdef ($T, " "); # $T=" "
fmtO = ifdef ($fmt, "text"); # $fmt="text"
alpha = ifdef ($alpha, 0.05); # $alpha=0.05
tol = ifdef ($tol, 0.000001); # $tol=0.000001;
maxiter = ifdef ($moi, 100); # $moi=100;
maxinneriter = ifdef ($mii, 0); # $mii=0;
X_orig = read (fileX);
TE = read (fileTE);
if (fileF != " ") {
F = read (fileF);
}
######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET
if (fileR != " ") { # factors available
R = read (fileR);
if (ncol (R) != 2) {
stop ("Matrix R has wrong dimensions!");
}
print ("REMOVING BASLINE LEVEL OF EACH FACTOR...");
# identify baseline columns to be removed from X_orig
col_sum = colSums (X_orig);
col_seq = t (seq(1, ncol (X_orig)));
parfor (i in 1:nrow (R), check = 0) {
start_ind = as.scalar (R[i,1]);
end_ind = as.scalar (R[i,2]);
baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
col_seq[,baseline_ind] = 0;
}
ones = matrix (1, rows = nrow (F), cols = 1);
F_filter = table (ones, F, 1, ncol (X_orig));
F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
TE_F = t(append (t (TE), F_filter));
} else if (fileF != " ") { # all features scale
TE_F = t(append (t (TE), t(F)));
} else { # no features available
TE_F = TE;
}
write (TE_F, fileMF, format = fmtO);
X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));
######## RECODING TIMESTAMPS PRESERVING THE ORDER
print ("RECODING TIMESTAMPS...");
N = nrow (X_orig);
X_orig = order (target = X_orig, by = 1);
Idx = matrix (1, rows = N, cols = 1);
num_timestamps = 1;
if (N == 1) {
RT = matrix (1, rows = 1, cols = 1);
} else {
Idx[2:N,1] = ppred (X_orig[1:(N - 1),1], X_orig[2:N,1], "!=");
num_timestamps = sum (Idx);
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
RT = B %*% seq(1, ncol(B));
} else { # there is only one group
RT = matrix (1, rows = N, cols = 1);
}
}
E = X_orig[,2];
print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");
######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
# b: the regression betas
# o: loss function value
# g: loss function gradient
# H: loss function Hessian
# sb: shift of b in one iteration
# so: shift of o in one iteration
# r: CG residual = H %*% sb + g
# d: CG direction vector
# Hd: = H %*% d
# c: scalar coefficient in CG
# delta: trust region size
# tol: tolerance value
# i: outer (Newton) iteration count
# j: inner (CG) iteration count
# computing initial coefficients b (all initialized to 0)
if (ncol (X_orig) > 2) {
X = X_orig[,3:ncol(X_orig)];
D = ncol (X);
zeros_D = matrix (0, rows = D, cols = 1);
b = zeros_D;
}
d_r = aggregate (target = E, groups = RT, fn = "sum");
e_r = aggregate (target = RT, groups = RT, fn = "count");
# computing initial loss function value o
num_distinct = nrow (d_r); # no. of distinct timestamps
e_r_rev_agg = cumsum (rev(e_r));
d_r_rev = rev(d_r);
o = sum (d_r_rev * log (e_r_rev_agg));
o_init = o;
if (ncol (X_orig) < 3) {
loglik = -o;
S_str = "no. of records " + N + " loglik " + loglik;
if (fileS != " ") {
write (S_str, fileS, format = fmtO);
} else {
print (S_str);
}
stop ("No features are selected!");
}
# computing initial gradient g
# part 1 g0_1
g0_1 = - t (colSums (X * E)); # g_1
# part 2 g0_2
X_rev_agg = cumsum (rev(X));
select = table (seq (1, num_distinct), e_r_rev_agg);
X_agg = select %*% X_rev_agg;
g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
#
g0 = g0_1 + g0_2;
g = g0;
# initialization for trust region Newton method
delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
initial_g2 = sum (g ^ 2);
exit_g2 = initial_g2 * tol ^ 2;
maxiter = 100;
maxinneriter = min (D, 100);
i = 0;
sum_g2 = sum (g ^ 2);
while (sum_g2 > exit_g2 & i < maxiter) {
i = i + 1;
sb = zeros_D;
r = g;
r2 = sum (r ^ 2);
exit_r2 = 0.01 * r2;
d = - r;
trust_bound_reached = FALSE;
j = 0;
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
j = j + 1;
# computing Hessian times d (Hd)
# part 1 Hd_1
Xd = X %*% d;
X_Xd_exp_Xb = X * (Xd * exp_Xb);
X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
# part 2 Hd_2
Xd_exp_Xb = Xd * exp_Xb;
Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
Hd_2 = Hd_2_num / (D_r_rev ^ 2);
Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));
c = r2 / sum (d * Hd);
[c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
sb = sb + c * d;
r = r + c * Hd;
r2_new = sum (r ^ 2);
d = - r + (r2_new / r2) * d;
r2 = r2_new;
}
# computing loss change in 1 iteration (so)
# part 1 so_1
so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
# part 2 so_2
exp_Xbsb = exp (X %*% (b + sb));
exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
#
so = so_1 + so_2;
so = so - o;
delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
if (so < 0) {
b = b + sb;
o = o + so;
# compute new gradient g
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
g = g0_1 + g_2;
sum_g2 = sum (g ^ 2);
}
}
if (sum_g2 > exit_g2 & i >= maxiter) {
print ("Trust region Newton method did not converge!");
}
print ("COMPUTING HESSIAN...");
H0 = matrix (0, rows = D, cols = D);
H = matrix (0, rows = D, cols = D);
X_exp_Xb_rev_2 = rev(X_exp_Xb);
X_rev_2 = rev(X);
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
parfor (i in 1:D, check = 0) {
Xi = X[,i];
Xi_rev = rev(Xi);
## ----------Start calculating H0--------------
# part 1 H0_1
Xi_X = X_rev_2[,i:D] * Xi_rev;
Xi_X_rev_agg = cumsum (Xi_X);
Xi_X_rev_agg = select %*% Xi_X_rev_agg;
H0_1 = Xi_X_rev_agg / e_r_rev_agg;
# part 2 H0_2
Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
Xi_agg_rev_agg = cumsum (rev(Xi_agg));
H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
H0_2 = H0_2_num / (e_r_rev_agg ^ 2);
H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
#-----------End calculating H0--------------------
## ----------Start calculating H--------------
# part 1 H_1
Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;
# part 2 H_2
Xi_exp_Xb = exp_Xb * Xi;
Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");
Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
H_2 = H_2_num / (D_r_rev ^ 2);
H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
#-----------End calculating H--------------------
}
H = H + t(H) - diag( diag (H));
H0 = H0 + t(H0) - diag( diag (H0));
# compute standard error for betas
H_inv = inv (H);
se_b = sqrt (diag (H_inv));
# compute exp(b), Z, Pr[>|Z|]
exp_b = exp (b);
Z = b / se_b;
P = matrix (0, rows = D, cols = 1);
parfor (i in 1:D) {
P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal"));
}
# compute confidence intervals for b
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
CI_l = b - se_b * z_alpha_2;
CI_r = b - se_b + z_alpha_2;
######## SOME STATISTICS AND TESTS
# no. of records
S_str = "no. of records " + N;
# no.of events
S_str = append (S_str, "no. of events " + sum (E));
# log-likelihood
loglik = -o;
S_str = append (S_str, "loglik " + loglik + " ");
# AIC = -2 * loglik + 2 * D
AIC = -2 * loglik + 2 * D;
S_str = append (S_str, "AIC " + AIC + " ");
# Wald test
wald_t = as.scalar (t(b) %*% H %*% b);
wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " ";
# Likelihood ratio test
lratio_t = 2 * o_init - 2 * o;
lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " ");
H0_inv = inv (H0);
score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " ");
# Rsquare (Cox & Snell)
Rsquare = 1 - exp (-lratio_t / N);
Rsquare_max = 1 - exp (-2 * o_init / N);
S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " ");
S_str = append (S_str, "max possible Rsquare: " + Rsquare_max);
M = matrix (0, rows = D, cols = 7);
M[,1] = b;
M[,2] = exp_b;
M[,3] = se_b;
M[,4] = Z;
M[,5] = P;
M[,6] = CI_l;
M[,7] = CI_r;
write (M, fileM, format = fmtO);
if (fileS != " ") {
write (S_str, fileS, format = fmtO);
} else {
print (S_str);
}
if (fileT != " ") {
write (T_str, fileT, format = fmtO);
} else {
print (T_str);
}
# needed for prediction
write (RT, fileRT, format = fmtO);
write (H_inv, fileCOV, format = fmtO);
write (X_orig, fileXO, format = fmtO);
####### UDFS FOR TRUST REGION NEWTON METHOD
ensure_trust_bound =
function (double x, double a, double b, double c)
return (double x_new, boolean is_violated)
{
if (a * x^2 + b * x + c > 0)
{
is_violated = TRUE;
rad = sqrt (b ^ 2 - 4 * a * c);
if (b >= 0) {
x_new = - (2 * c) / (b + rad);
} else {
x_new = - (b - rad) / (2 * a);
}
} else {
is_violated = FALSE;
x_new = x;
}
}
update_trust_bound =
function (double delta,
double sb_distance,
double so_exact,
double so_linear_approx,
double so_quadratic_approx)
return (double delta)
{
sigma1 = 0.25;
sigma2 = 0.5;
sigma3 = 4.0;
if (so_exact <= so_linear_approx) {
alpha = sigma3;
} else {
alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
}
rho = so_exact / so_quadratic_approx;
if (rho < 0.0001) {
delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
} else { if (rho < 0.25) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
} else { if (rho < 0.75) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
} else {
delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
}}}
}