#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#  
# THIS SCRIPT FITS A COX PROPORTIONAL HAZARDS REGRESSION MODEL.
# The Breslow method is used for handling ties, and the regression parameters
# are computed using a trust-region Newton method with conjugate gradient.
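#
# For reference, using the notation of the code below, the quantity minimized is the negative
# log of the Breslow partial likelihood
#     -l(b) = - sum_r [ t(b) %*% s_r - d_r * log( sum_{j in R(t_r)} exp (t(b) %*% x_j) ) ]
# where the sum runs over the distinct event times t_r, d_r is the number of events at t_r,
# s_r is the sum of the feature vectors of the records with an event at t_r, and R(t_r) is
# the risk set, i.e. all records with timestamp >= t_r.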
# 
# INPUT   PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME    TYPE     DEFAULT      MEANING
# ---------------------------------------------------------------------------------------------
# X       String   ---          Location to read the input matrix X containing the survival data, with the following information:
#                                - 1: timestamps
#                                - 2: whether an event occurred (1) or the record is censored (0)
#                                - 3: feature vectors
# TE      String   ---          Column indices of X as a column vector containing the timestamp (first row) and event information (second row)
# F       String   " "          Column indices of X as a column vector which are to be used for fitting the Cox model
# R       String   " "          If factors (categorical variables) are available in the input matrix X, location to read matrix R containing
#                               the start and end indices of the factors in X
#                                - R[,1]: start indices
#                                - R[,2]: end indices
#                               Alternatively, the user can specify the indices of the baseline level of each factor which needs to be removed from X;
#                               in this case the start and end indices corresponding to the baseline level need to be the same;
#                               if R is not provided, by default all variables are considered to be continuous
# M       String   ---          Location to store the results of the Cox regression analysis, including the estimated regression parameters of the
#                               fitted Cox model (the betas), their standard errors, confidence intervals, and P-values
# S       String   " "          Location to store a summary of some statistics of the fitted Cox proportional hazards model, including
#                               no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare;
#                               by default printed to standard output
# T       String   " "          Location to store the results of the Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model;
#                               by default printed to standard output
# COV     String   ---          Location to store the variance-covariance matrix of the betas
# RT      String   ---          Location to store matrix RT containing the order-preserving recoded timestamps from X
# XO      String   ---          Location to store the input matrix X sorted by the timestamps
# MF      String   ---          Location to store the column indices of X excluding the baseline factors if available
# alpha   Double   0.05         Parameter to compute a 100*(1-alpha)% confidence interval for the betas
# tol     Double   0.000001     Tolerance ("epsilon")
# moi     Int      100          Max. number of outer (Newton) iterations
# mii     Int      0            Max. number of inner (conjugate gradient) iterations, 0 = no max
# fmt     String   "text"       Matrix output format, usually "text" or "csv" (for matrices only)
# ---------------------------------------------------------------------------------------------
# OUTPUT: 
# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
#	M[,1]: betas	
#	M[,2]: exp(betas)
#	M[,3]: standard error of betas
#	M[,4]: Z 
#	M[,5]: P-value
#	M[,6]: lower 100*(1-alpha)% confidence interval of betas
#	M[,7]: upper 100*(1-alpha)% confidence interval of betas
#
# Two matrices containing a summary of some statistics of the fitted model:
# 1- File S with the following format 
#	- row 1: no. of observations
#	- row 2: no. of events
#   - row 3: log-likelihood 
#	- row 4: AIC
#	- row 5: Rsquare (Cox & Snell)
#	- row 6: max possible Rsquare
# 2- File T with the following format
#	- row 1: Likelihood ratio test statistic, degree of freedom, P-value
#	- row 2: Wald test statistic, degree of freedom, P-value
#	- row 3: Score (log-rank) test statistic, degree of freedom, P-value
# 
# Additionally, the following matrices are stored (needed for prediction)
# 1- A column matrix RT that contains the order-preserving recoded timestamps from X 
# 2- Matrix XO which is matrix X with sorted timestamps  
# 3- Variance-covariance matrix of the betas COV
# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
# -------------------------------------------------------------------------------------------
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INPUT_DIR/F R=INPUT_DIR/R
#                                     M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT
#                                     XO=OUTPUT_DIR/XO MF=OUTPUT_DIR/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv
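#
# Depending on the SystemML setup, the same -f/-nvargs arguments can also be passed via
# spark-submit, for example (jar name and directories are placeholders):
# spark-submit SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE M=OUTPUT_DIR/M
#                                     COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT XO=OUTPUT_DIR/XO MF=OUTPUT_DIR/MF fmt=csv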

fileX = $X;
fileTE = $TE;
fileRT = $RT;
fileMF = $MF;
fileM = $M;
fileXO = $XO;
fileCOV = $COV;

# Default values of some parameters
fileF = ifdef ($F, " ");			 # $F=" "
fileR = ifdef ($R, " ");         	 # $R=" " 
fileS = ifdef ($S, " ");         	 # $S=" " 
fileT = ifdef ($T, " ");         	 # $T=" " 
fmtO = ifdef ($fmt, "text");         # $fmt="text" 
alpha = ifdef ($alpha, 0.05);        # $alpha=0.05
tol = ifdef ($tol, 0.000001);   	 # $tol=0.000001;
maxiter = ifdef ($moi, 100);    	 # $moi=100;
maxinneriter = ifdef ($mii, 0);      # $mii=0;

X_orig = read (fileX);

TE = read (fileTE);
if (fileF != " ") {
	F = read (fileF);
}

######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET

if (fileR != " ") { # factors available
	R = read (fileR);
	if (ncol (R) != 2) {
		stop ("Matrix R has wrong dimensions!");
	}
	print ("REMOVING BASLINE LEVEL OF EACH FACTOR...");	
	# identify baseline columns to be removed from X_orig
	col_sum = colSums (X_orig);
	col_seq = t (seq(1, ncol (X_orig)));
	parfor (i in 1:nrow (R), check = 0) {
		start_ind = as.scalar (R[i,1]);
		end_ind = as.scalar (R[i,2]);
		baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
		col_seq[,baseline_ind] = 0;
	}
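	# F_filter is a 1 x ncol(X_orig) indicator of the user-selected feature columns; multiplying
	# it by col_seq (whose entries are zeroed at the baseline columns above) and applying
	# removeEmpty leaves the indices of the feature columns that are kept for fitting.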
	ones = matrix (1, rows = nrow (F), cols = 1);
	F_filter = table (ones, F, 1, ncol (X_orig));
	F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
	TE_F = t(cbind (t (TE), F_filter));
} else if (fileF != " ") { # all selected features are continuous (scale)
	TE_F = t(cbind (t (TE), t(F)));
} else { # no features available 
	TE_F = TE;
}

write (TE_F, fileMF, format = fmtO);

# project X_orig onto the selected columns (timestamp, event, and the kept feature columns, in the order given by TE_F)
X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));

######## RECODING TIMESTAMPS PRESERVING THE ORDER
print ("RECODING TIMESTAMPS...");

N = nrow (X_orig);
X_orig = order (target = X_orig, by = 1);
Idx = matrix (1, rows = N, cols = 1);
num_timestamps = 1;
if (N == 1) {
	RT = matrix (1, rows = 1, cols = 1);
} else {
	Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]);
	num_timestamps = sum (Idx);
	A = removeEmpty (target = diag (Idx), margin = "cols");
	if (ncol (A) > 1) {
		A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
		B = cumsum (A);
		RT = B %*% seq(1, ncol(B));
	} else { # there is only one group
		RT = matrix (1, rows = N, cols = 1);
	}
}
E = X_orig[,2]; 

print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");

######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
#  b: the regression betas
#  o: loss function value
#  g: loss function gradient
#  H: loss function Hessian
# sb: shift of b in one iteration
# so: shift of o in one iteration
#  r: CG residual = H %*% sb + g
#  d: CG direction vector
# Hd: = H %*% d
#  c: scalar coefficient in CG
# delta: trust region size
# tol: tolerance value
#  i: outer (Newton) iteration count
#  j: inner (CG) iteration count

# computing initial coefficients b (all initialized to 0)
if (ncol (X_orig) > 2) {
	X = X_orig[,3:ncol(X_orig)];
	D = ncol (X);
	zeros_D = matrix (0, rows = D, cols = 1);
	b = zeros_D; 
}
d_r = aggregate (target = E, groups = RT, fn = "sum"); 
e_r = aggregate (target = RT, groups = RT, fn = "count");
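# d_r: number of events at each distinct (recoded) timestamp
# e_r: number of records at each distinct timestamp; its reverse cumulative sum (e_r_rev_agg)
#      equals the size of the risk set at each distinct timestamp, in reverse time order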

# computing initial loss function value o
num_distinct = nrow (d_r); # no. of distinct timestamps
e_r_rev_agg = cumsum (rev(e_r));
d_r_rev = rev(d_r);
o = sum (d_r_rev * log (e_r_rev_agg));
o_init = o;
if (ncol (X_orig) < 3) {
	loglik = -o;
	S_str = "no. of records " + N + " loglik " + loglik;
	if (fileS != " ") {
	  S = matrix(0, 6, 1);
	  S[1, 1] = N;
	  S[2, 1] = sum (E); # number of events
	  S[3, 1] = loglik;
	  S[4, 1] = -1; # AIC
	  S[5, 1] = -1; # Rsquare
	  S[6, 1] = -1; #Rsquare_max
		write (S, fileS, format = fmtO);
	} else {
		print (S_str);
	}
	stop ("No features are selected!");
}

# computing initial gradient g 
# part 1 g0_1
g0_1 = - t (colSums (X * E)); # g_1
# part 2 g0_2
X_rev_agg = cumsum (rev(X));
select = table (seq (1, num_distinct), e_r_rev_agg);
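# select picks, for each distinct timestamp (in reverse time order), the row of the record-level
# reverse cumulative sums that corresponds to its full risk set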
X_agg = select %*% X_rev_agg;
g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
#
g0 = g0_1 + g0_2;
g = g0; 

# initialization for trust region Newton method
delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2))); 
initial_g2 = sum (g ^ 2);
exit_g2 = initial_g2 * tol ^ 2;
# mii = 0 means no limit on the inner (CG) iterations beyond the problem dimension
if (maxinneriter == 0) {
	maxinneriter = D;
}
i = 0;
sum_g2 = sum (g ^ 2);
while (sum_g2 > exit_g2 & i < maxiter) {  
	i = i + 1;
    sb = zeros_D;  
    r = g;  
    r2 = sum (r ^ 2);  
    exit_r2 = 0.01 * r2;
    d = - r;  
    trust_bound_reached = FALSE;  
    j = 0;

	# precompute, for the current b, exp(X %*% b) and its risk-set aggregates; D_r_rev holds the
	# per-timestamp denominators sum_{j in R(t_r)} exp(t(b) %*% x_j), in reverse time order
	exp_Xb = exp (X %*% b);
	exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
	D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
	X_exp_Xb = X * exp_Xb;
	X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
	X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;

    while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) { 
        j = j + 1;
		# computing Hessian times d (Hd)
		# part 1 Hd_1
		Xd = X %*% d;
		X_Xd_exp_Xb = X * (Xd * exp_Xb);
		X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
		X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
		
		Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
		# part 2 Hd_2
		
		Xd_exp_Xb = Xd * exp_Xb;
		Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
		Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
		
		Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
		Hd_2 = Hd_2_num / (D_r_rev ^ 2);
		
		Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));

		c = r2 / sum (d * Hd);
        [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
        sb = sb + c * d;  
        r = r + c * Hd;  
        r2_new = sum (r ^ 2);
        d = - r + (r2_new / r2) * d;  
        r2 = r2_new;
    }

    # computing loss change in 1 iteration (so)
	# part 1 so_1
	so_1 = - as.scalar (colSums (X * E) %*% (b + sb)); 
	# part 2 so_2
	exp_Xbsb = exp (X %*% (b + sb));
	exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
	so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
	#
	so = so_1 + so_2;
	so = so - o; 
	
	delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
    if (so < 0) {
        b = b + sb;  
        o = o + so;  
		# compute new gradient g
		exp_Xb = exp (X %*% b);
		exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
		X_exp_Xb = X * exp_Xb;
		X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
		X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
		
		D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
		g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
		g = g0_1 + g_2;
		sum_g2 = sum (g ^ 2);
    }
}

if (sum_g2 > exit_g2 & i >= maxiter) {
	print ("Trust region Newton method did not converge!");
}


print ("COMPUTING HESSIAN...");

H0 = matrix (0, rows = D, cols = D);
H = matrix (0, rows = D, cols = D);
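# H  is the Hessian of the negative log partial likelihood (observed information) at the fitted b,
#    used below for the standard errors, confidence intervals, and the Wald test;
# H0 is the same matrix evaluated at b = 0, used together with g0 for the Score (log-rank) test.
# Both are filled one row of the upper triangle at a time in the parfor and symmetrized afterwards.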

X_exp_Xb_rev_2 = rev(X_exp_Xb);
X_rev_2 = rev(X);

X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; 

parfor (i in 1:D, check = 0) {
	Xi = X[,i];
	Xi_rev = rev(Xi);

	## ----------Start calculating H0--------------	
	# part 1 H0_1
	Xi_X = X_rev_2[,i:D] * Xi_rev;
	Xi_X_rev_agg = cumsum (Xi_X);
	Xi_X_rev_agg = select %*% Xi_X_rev_agg;
	H0_1 = Xi_X_rev_agg / e_r_rev_agg;

	# part 2 H0_2
	Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
	Xi_agg_rev_agg = cumsum (rev(Xi_agg));	
	H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
	H0_2 = H0_2_num / (e_r_rev_agg ^ 2);

	H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
	#-----------End calculating H0--------------------
		
	## ----------Start calculating H--------------
	# part 1 H_1
	Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
	Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
	Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
	H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;

	# part 2 H_2
	Xi_exp_Xb = exp_Xb * Xi;
	Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum"); 
	
	Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
	H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
	H_2 = H_2_num / (D_r_rev ^ 2);
	H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
	#-----------End calculating H--------------------
}
H = H + t(H) - diag( diag (H));
H0 = H0 + t(H0) - diag( diag (H0));


# compute standard error for betas
H_inv = inv (H);
se_b = sqrt (diag (H_inv));
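# H_inv is the variance-covariance matrix of the estimated betas (written to COV below);
# its diagonal contains the squared standard errors.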

# compute exp(b), Z, Pr[>|Z|]
exp_b = exp (b);
Z = b / se_b;
P = matrix (0, rows = D, cols = 1);
parfor (i in 1:D) {
	P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal"));
}

# compute confidence intervals for b
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
CI_l = b - se_b * z_alpha_2;
CI_r = b + se_b * z_alpha_2;

######## SOME STATISTICS AND TESTS
S = matrix(0, 6, 1);
T = matrix(0, 3, 3);

# no. of records
S_str = "no. of records " + N;
S[1, 1] = N;

# no.of events
S_str = append (S_str, "no. of events " + sum (E));
S[2, 1] = sum (E);

# log-likelihood
loglik = -o;
S_str = append (S_str, "loglik " + loglik + " ");
S[3, 1] = loglik;

# AIC = -2 * loglik + 2 * D
AIC = -2 * loglik + 2 * D;
S_str = append (S_str, "AIC " + AIC + " ");
S[4, 1] = AIC;

# Wald test
wald_t = as.scalar (t(b) %*% H %*% b);
wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " ";
T[1, 1] = wald_t;
T[1, 2] = D;
T[1, 3] = wald_p;

# Likelihood ratio test
lratio_t = 2 * o_init - 2 * o;
lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " ");

T[2, 1] = lratio_t;
T[2, 2] = D;
T[2, 3] = lratio_p;

H0_inv = inv (H0);
score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " ");
T[3, 1] = score_t;
T[3, 2] = D;
T[3, 3] = score_p;

# Rsquare (Cox & Snell)
Rsquare = 1 - exp (-lratio_t / N);  
Rsquare_max = 1 - exp (-2 * o_init / N);
S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " ");
S[5, 1] = Rsquare;
S_str = append (S_str, "max possible Rsquare: " + Rsquare_max);
S[6, 1] = Rsquare_max;

M = matrix (0, rows = D, cols = 7);
M[,1] = b;
M[,2] = exp_b;
M[,3] = se_b;
M[,4] = Z;
M[,5] = P;
M[,6] = CI_l;
M[,7] = CI_r;

write (M, fileM, format = fmtO);
if (fileS != " ") {
	write (S, fileS, format = fmtO);
} else {
	print (S_str);
}
if (fileT != " ") {
	write (T, fileT, format = fmtO);
} else {
	print (T_str);
}
# needed for prediction
write (RT, fileRT, format = fmtO);
write (H_inv, fileCOV, format = fmtO);
write (X_orig, fileXO, format = fmtO);


####### UDFS FOR TRUST REGION NEWTON METHOD

# ensure_trust_bound: given the current CG step length x and the quadratic a*x^2 + b*x + c
# (with a = ||d||^2, b = 2 * t(sb) %*% d, c = ||sb||^2 - delta^2 at the call site), checks whether
# taking the full step would leave the trust region; if so, the step length is shortened to the
# trust-region boundary using the numerically stable root of the quadratic.
ensure_trust_bound =
    function (double x, double a, double b, double c)
    return (double x_new, boolean is_violated)
{
    if (a * x^2 + b * x + c > 0)
    {
        is_violated = TRUE;
        rad = sqrt (b ^ 2 - 4 * a * c);
        if (b >= 0) {
            x_new = - (2 * c) / (b + rad);
        } else {
            x_new = - (b - rad) / (2 * a);
        }
    } else {
        is_violated = FALSE;
        x_new = x;
    }
}

# update_trust_bound: standard trust-region radius update; the actual loss change (so_exact) is
# compared with its linear and quadratic model predictions, and delta is shrunk or enlarged
# depending on the agreement ratio rho = so_exact / so_quadratic_approx.
update_trust_bound =
    function (double delta,
              double sb_distance,
              double so_exact,
              double so_linear_approx,
              double so_quadratic_approx)
    return   (double delta)
{
    sigma1 = 0.25;
    sigma2 = 0.5;
    sigma3 = 4.0;

    if (so_exact <= so_linear_approx) {
       alpha = sigma3;
    } else {
       alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
    }

    rho = so_exact / so_quadratic_approx;
    if (rho < 0.0001) {
        delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
    } else { if (rho < 0.25) {
        delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
    } else { if (rho < 0.75) {
        delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
    } else {
        delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
    }}} 
}



