scripts.algorithms.KM.dml Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#  
# THIS SCRIPT ANALIZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES 
#
# INPUT   PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME    TYPE     DEFAULT      MEANING
# ---------------------------------------------------------------------------------------------
# X       String   ---          Location to read the input matrix X containing the survival data: 
#								timestamps, whether event occurred (1) or data is censored (0), and a number of factors (categorical features) 
#								for grouping and/or stratifying 
# TE	  String   ---          Column indices of X which contain timestamps (first entry) and event information (second entry) 
# GI	  String   ---          Column indices of X corresponding to the factors to be used for grouping
# SI	  String   ---          Column indices of X corresponding to the factors to be used for stratifying				
# O       String   ---          Location to write the matrix containing the results of the Kaplan-Meier analysis; see below for the description
# M       String   ---          Location to write Matrix M containing the following statistic: total number of events, median and its confidence intervals; 
#								if survival data for multiple groups and strata are provided each row of M contains the above statistics per group and stratum
# T 	  String   " "			If survival data from multiple groups available and ttype=log-rank or wilcoxon, 
#								location to write the matrix containing result of the (stratified) test for comparing multiple groups
# alpha   Double   0.05         Parameter to compute 100*(1-alpha)% confidence intervals for the survivor function and its median 
# etype   String   "greenwood"  Parameter to specify the error type according to "greenwood" (the default) or "peto"
# ctype   String   "log"        Parameter to modify the confidence interval; "plain" keeps the lower and upper bound of 
#								the confidence interval unmodified,	"log" (the default) corresponds to logistic transformation and 
#								"log-log" corresponds to the complementary log-log transformation 
# ttype   String   "none"   	If survival data for multiple groups is available specifies which test to perform for comparing 
#								survival data across multiple groups: "none" (the default) "log-rank" or "wilcoxon" test   
# fmt     String   "text"       The output format of results of the Kaplan-Meier analysis, such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT: 
# 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data: 
#	each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data with the following schema
# 	1. col: timestamp
# 	2. col: no. at risk
# 	3. col: no. of events
# 	4. col: Kaplan-Meier estimate of survivor function surv
# 	5. col: standard error of surv
# 	6. col: lower 100*(1-alpha)% confidence interval for surv
# 	7. col: upper 100*(1-alpha)% confidence interval for surv
# 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number of factors used for grouping 
#	,i.e., ncol(GI) and l denotes the number of factors used for stratifying, i.e., ncol(SI))
#	M[,1:k]: unique combination of values in the k factors used for grouping 
#	M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
#	M[,k+l+1]: total number of records
#	M[,k+l+2]: total number of events
#	M[,k+l+3]: median of surv
#	M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv 
#	M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
#	If the number of groups and strata is equal to 1, M will have 4 columns with 
#	M[,1]: total number of events
#	M[,2]: median of surv
#	M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv 
#	M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
# 3- If survival data from multiple groups available and ttype=log-rank or wilcoxon, a 1 x 4 matrix T and an g x 5 matrix T_GROUPS_OE with
#	T_GROUPS_OE[,1] = no. of events	
#	T_GROUPS_OE[,2] = observed value (O)
#	T_GROUPS_OE[,3] = expected value (E)
#	T_GROUPS_OE[,4] = (O-E)^2/E
#	T_GROUPS_OE[,5] = (O-E)^2/V 	 
#	T[1,1] = no. of groups
#	T[1,2] = degree of freedom for Chi-squared distributed test statistic
#	T[1,3] = test statistic 
#	T[1,4] = P-value
# -------------------------------------------------------------------------------------------
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f KM.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE GI=INPUT_DIR/GI SI=INPUT_DIR/SI O=OUTPUT_DIR/O 
#											M=OUTPUT_DIR/M T=OUTPUT_DIR/T alpha=0.05 etype=greenwood ctype=log fmt=csv

fileX = $X;
fileTE = $TE;
fileGI = ifdef ($GI, " ");
fileSI = ifdef ($SI, " ");
fileO = $O;
fileM = $M;

# Default values of some parameters
fileT = ifdef ($T, " ");                  # $T=" "

fileG = ifdef ($G, " ");                 # $G=" "
fileS = ifdef ($S, " ");                 # $S=" "
fmtO = ifdef ($fmt, "text");             # $fmt="text"
alpha = ifdef ($alpha, 0.05);            # $alpha=0.05
err_type  = ifdef ($etype, "greenwood"); # $etype="greenwood"
conf_type = ifdef ($ctype, "log");       # $ctype="log"
test_type = ifdef ($ttype, "none");      # $ttype="none"

X = read (fileX);
TE = read (fileTE);
if (fileGI != " ") {
	GI = read (fileGI);
} else {
    GI = matrix (0, rows = 1, cols = 1);
}

if (fileSI != " "){
	SI = read (fileSI);
} else {
    SI = matrix (0, rows = 1, cols = 1);
}

TE = t(TE);
GI = t(GI);
SI = t(SI);
		
# check arguments for validity
if (err_type != "greenwood" & err_type != "peto") { 
	stop (err_type + " is not a valid error type!");
}

if (conf_type != "plain" & conf_type != "log" & conf_type != "log-log") { 
	stop (conf_type + " is not a valid confidence type!");
}

if (test_type != "log-rank" & test_type != "wilcoxon" & test_type != "none") {
	stop (test_type + " is not a valid test type!");
}

n_group_cols = ncol (GI);
n_stratum_cols = ncol (SI);

# check GI and SI for validity
GI_1_1 = as.scalar (GI[1,1]);
SI_1_1 = as.scalar (SI[1,1]);	
if (n_group_cols == 1) {
	if (GI_1_1 == 0) { # no factors for grouping
		n_group_cols = 0;
	}
} else if (GI_1_1 == 0) {
	stop ("Matrix GI contains zero entries!");
}
if (n_stratum_cols == 1) {
	if (SI_1_1 == 0) { # no factors for stratifying
		n_stratum_cols = 0;
	}
} else if (SI_1_1 == 0) {
	stop ("Matrix SI contains zero entries!");
}

if (2 + n_group_cols + n_stratum_cols > ncol (X)) {
	stop ("X has an incorrect number of columns!");
}

# reorder cols of X 
if (GI_1_1 == 0 & SI_1_1 == 0) {
	Is = TE;
} else if (GI_1_1 == 0) {
	Is = cbind (TE, SI);
} else if (SI_1_1 == 0) {
	Is = cbind (TE, GI);
} else {
	Is = cbind (TE, GI, SI);
}
X = X %*% table (Is, seq (1, 2 + n_group_cols + n_stratum_cols), ncol (X), 2 + n_group_cols + n_stratum_cols);	

num_records = nrow (X);
num_groups = 1;
num_strata = 1;

### compute group id for each record
print ("Perform grouping...");
if (n_group_cols > 0) {
	for (g in 1:n_group_cols) { # sort columns corresponding to groups sequentially
		X = order (target = X, by = 2 + g);
	}
	XG = X[,3:(3 + n_group_cols - 1)];
	Idx = matrix (1, rows = num_records, cols = 1);
	Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),3:(2 + n_group_cols)] != X[2:num_records,3:(2 + n_group_cols)]);
	num_groups = sum (Idx);

	XG = replace (target = XG, pattern = 0, replacement = "Infinity");
	XG = XG * Idx;
	XG = replace (target = XG, pattern = "NaN", replacement = 0);	
	G_cols = removeEmpty (target = XG, margin = "rows"); 
	G_cols = replace (target = G_cols, pattern = "Infinity", replacement = 0);	

	A = removeEmpty (target = diag (Idx), margin = "cols");
	if (ncol (A) > 1) {
		A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
		B = cumsum (A);
		Gi = B %*% seq(1, ncol(B)); # group ids
	} else { # there is only one group
		Gi = matrix (1, rows = num_records, cols = 1);
	}
	if (n_stratum_cols > 0) {
		X = cbind (X[,1:2], Gi, X[,(3 + g):ncol (X)]);
	} else { # no strata
		X = cbind (X[,1:2],Gi);
	}
}

### compute stratum id for each record
print ("Perform stratifying...");
if (n_stratum_cols > 0) {
	s_offset = 2;
	if (n_group_cols > 0) {
		s_offset = 3;
	}
	for (s in 1:n_stratum_cols) { # sort columns corresponding to strata sequentially
		X = order (target = X, by = s_offset + s);		
	}
	XS = X[,(s_offset + 1):(s_offset + n_stratum_cols)];
	Idx = matrix (1, rows = num_records, cols = 1);
	Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),(s_offset + 1):(s_offset + n_stratum_cols)] != X[2:num_records,(s_offset + 1):(s_offset + n_stratum_cols)]);
	num_strata = sum (Idx);

	XS = replace (target = XS, pattern = 0, replacement = "Infinity");
	XS = XS * Idx;
	XS = replace (target = XS, pattern = "NaN", replacement = 0);	
	S_cols = removeEmpty (target = XS, margin = "rows"); 
	S_cols = replace (target = S_cols, pattern = "Infinity", replacement = 0);	

	SB = removeEmpty (target = seq (1,num_records), margin = "rows", select = Idx); # indices of stratum boundaries 
	A = removeEmpty (target = diag (Idx), margin = "cols");
	if (ncol (A) > 1) {
		A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
		B = cumsum (A);
		Si = B %*% seq(1, ncol(B)); # stratum ids
	} else { # there is only one stratum
		Si = matrix (1, rows = num_records, cols = 1);
	}
	X = cbind (X[,1:3],Si);
}

if (n_group_cols == 0 & n_stratum_cols == 0) {
	X = cbind (X, matrix (1, rows = num_records, cols = 2));
	SB = matrix (1, rows = 1, cols = 1);	
} else if (n_group_cols == 0) {	
	X = cbind (X[,1:2], matrix (1, rows = num_records, cols = 1), X[,3]);
} else if (n_stratum_cols == 0) {
	X = cbind (X, matrix (1, rows = num_records, cols = 1));
	SB = matrix (1, rows = 1, cols = 1);
}

######## BEGIN KAPLAN-MEIER ANALYSIS
print ("BEGIN KAPLAN-MEIER SURVIVAL FIT SCRIPT");

KM = matrix (0, rows = num_records, cols = num_groups * num_strata * 7);
KM_cols_select = matrix (1, rows = num_groups * num_strata * 7, cols = 1);
GSI = matrix (0, rows = num_groups * num_strata, cols = 2);
a = 1/0;
M = matrix (a, rows = num_groups * num_strata, cols = 5);
M_cols = seq (1, num_groups * num_strata);
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");

if (num_groups > 1 & test_type != "none") { 
	str = "";
	TEST = matrix (0, rows = num_groups, cols = 5);
	TEST_GROUPS_OE = matrix (0, rows = 1, cols = 4);
	U = matrix (0, rows = num_groups, cols = num_strata);
	U_OE = matrix (0, rows = num_groups, cols = num_strata);
	OBS = matrix (0, rows = num_groups, cols = num_strata);
	EXP = matrix (0, rows = num_groups, cols = num_strata);
	V_sum_total = matrix (0, rows = num_groups-1, cols = (num_groups-1) * num_strata);
	n_event_all_global = matrix(1, rows=num_groups, cols=num_strata);
} else if (num_groups == 1 & test_type != "none") {
	stop ("Data contains only one group or no groups, at least two groups are required for test!");
}

parfor (s in 1:num_strata, check = 0) {
	
	start_ind = as.scalar (SB[s,]);
	end_ind = num_records;
	if (s != num_strata) {
		end_ind = as.scalar (SB[s + 1,]) - 1;
	} 
	
	######## RECODING TIMESTAMPS PRESERVING THE ORDER
	
	X_cur = X[start_ind:end_ind,];
	range = end_ind - start_ind + 1;
	X_cur = order (target = X_cur, by = 1);
	Idx1 = matrix (1, rows = range, cols = 1);
	
	num_timestamps = 1;
	if (range == 1) {
		RT = matrix (1, rows = 1, cols = 1);
	} else {
		Idx1[2:range,1] = (X_cur[1:(range - 1),1] != X_cur[2:range,1]);
		num_timestamps = sum (Idx1);
		A1 = removeEmpty (target = diag (Idx1), margin = "cols");
		if (ncol (A1) > 1) {
			A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
			B1 = cumsum (A1);
			RT = B1 %*% seq(1, ncol(B1)); 	
		} else { # there is only one group
			RT = matrix (1, rows = range, cols = 1);
		}
	}
	
	T = X_cur[,1];
	E = X_cur[,2];
	G = X_cur[,3];
	S = X_cur[,4];
	
	n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum 
	n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. both censored and uncensored of events per stratum 
	Idx1 = cumsum (n_event_all_stratum); 
	time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum 
	time_stratum_has_zero = sum (time_stratum == 0) > 0;
	if (time_stratum_has_zero) {
		time_stratum = 	replace (target = time_stratum, pattern = 0, replacement = "Infinity");
	}
	n_time_all1 = nrow (n_event_stratum);  # no. of distinct timestamps both censored and uncensored per stratum
	n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1); 
	if (n_time_all1 > 1) {
		n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),]; 
	}
	n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum

	if (num_groups > 1 & test_type != "none") {	# needed for log-rank or wilcoxon test	
		n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
	}

	parfor (g in 1:num_groups, check = 0) {
	
		group_ind = (G == g);
		KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
		M_offset = (s - 1) * num_groups + g;
		if (sum (group_ind) != 0) { # group g is present in the stratum s

			GSI_offset = (s - 1) * num_groups + g; 
			GSI[GSI_offset,1] = g;
			GSI[GSI_offset,2] = s;		
			E_cur = E * group_ind;

			######## COMPUTE NO. AT RISK AND NO.OF EVENTS FOR EACH TIMESTAMP
			
			n_event = aggregate (target = E_cur, groups = RT, fn = "sum"); # no. of uncensored events per stratum per group
			n_event_all = aggregate (target = group_ind, groups = RT, fn = "sum"); # no. of both censored and uncensored events per stratum per group
			Idx1 = cumsum (n_event_all); 
			event_occurred = (n_event > 0);
			if (time_stratum_has_zero) {
				time = replace (target = time_stratum * event_occurred, pattern = "NaN", replacement = 0);
				time = removeEmpty (target = time, margin = "rows");
				time = replace (target = time, pattern = "Infinity", replacement = 0);
			} else {
				time = removeEmpty (target = time_stratum * event_occurred, margin = "rows");
			}
			n_time_all2 = nrow (n_event);  # no. of distinct timestamps both censored and uncensored per stratum per group
			n_event_all_agg = matrix (0, rows = n_time_all2, cols = 1); 
			if (n_time_all2 > 1) {
				n_event_all_agg[2:n_time_all2,] = Idx1[1:(n_time_all2 - 1),]; 
			}
				
			n_risk = sum (group_ind) - n_event_all_agg; # no. at risk per stratum per group
			
			if (num_groups > 1 & test_type != "none") {
				n_risk_n_event_stratum[,(g - 1) * 2 + 1] = n_risk;
				n_risk_n_event_stratum[,(g - 1) * 2 + 2] = n_event;					
			}
			
			# Extract only rows corresponding to events, i.e., for which n_event is nonzero 
			Idx1 = (n_event != 0);
			KM_1 = matrix (0, rows = n_time_all2, cols = 2);
			KM_1[,1] = n_risk;
			KM_1[,2] = n_event;
			KM_1 = removeEmpty (target = KM_1, margin = "rows", select = Idx1);
			n_risk = KM_1[,1];
			n_event = KM_1[,2];
			n_time = nrow (time);
			
			######## ESTIMATE SERVIVOR FUNCTION SURV, ITS STANDARD ERROR SE_SURV, AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL	
			surv = cumprod ((n_risk - n_event) / n_risk);
			tmp = n_event / (n_risk * (n_risk - n_event));
			se_surv = sqrt (cumsum (tmp)) * surv; 
			if (err_type == "peto") {
				se_surv = (surv * sqrt(1 - surv) / sqrt(n_risk));		
			}
		
			if (conf_type == "plain") { 
				# True survivor function is in [surv +- z_alpha_2 * se_surv], 
				# values less than 0 are replaced by 0, values larger than 1are replaced by 1!
				CI_l = max (surv - (z_alpha_2 * se_surv), 0);  
				CI_r = min (surv + (z_alpha_2 * se_surv), 1); 
			} else if (conf_type == "log") {
				# True survivor function is in [surv * exp(+- z_alpha_2 * se_surv / surv)]
				CI_l = max (surv * exp (- z_alpha_2 * se_surv / surv), 0); 
				CI_r = min (surv * exp ( z_alpha_2 * se_surv / surv), 1); 
			} else { # conf_type == "log-log"
				# True survivor function is in [surv ^ exp(+- z_alpha_2 * se(log(-log(surv))))]
				CI_l = max (surv ^ exp (- z_alpha_2 * se_surv / log(surv)), 0); 
				CI_r = min (surv ^ exp ( z_alpha_2 * se_surv / log(surv)), 1);  
			}	 
			#
			if (as.scalar (n_risk[n_time,]) == as.scalar (n_event[n_time,])) {
				CI_l[n_time,] = 0/0;
				CI_r[n_time,] = 0/0;
			}	
		
			n_event_sum = sum (n_event);
			n_event_sum_all = sum(n_event_all);
			if (n_event_sum > 0) {
				# KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
				KM[1:n_time,KM_offset + 1] = time;
				KM[1:n_time,KM_offset + 2] = n_risk;			
				KM[1:n_time,KM_offset + 3] = n_event;
				KM[1:n_time,KM_offset + 4] = surv;
				KM[1:n_time,KM_offset + 5] = se_surv;
				KM[1:n_time,KM_offset + 6] = CI_l;
				KM[1:n_time,KM_offset + 7] = CI_r;				
			}			
						
			######## ESTIMATE MEDIAN OF SERVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
		
			p_5 = (surv <= 0.5); 
			pn_5 = sum (p_5);
			#M_offset = (s - 1) * num_groups + g;
			# if the estimated survivor function is larger than 0.5 for all timestamps median does not exist! 
			p_5_exists = (pn_5 != 0);
			M[M_offset,2] = n_event_sum;
			M[M_offset,1] = n_event_sum_all; 
			if (p_5_exists) {
				if ( as.scalar (surv[n_time - pn_5 + 1,1]) == 0.5 ) { # if the estimated survivor function is exactly equal to 0.5
					if (pn_5 > 1) {
						t_5 = as.scalar ((time[n_time - pn_5 + 1,1] + time[n_time - pn_5 + 2,1])/2);
					} else {
						t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
					}
				} else {
					t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
				}
		
				l_ind = (CI_l <= 0.5);
				r_ind = (CI_r <= 0.5);
				l_ind_sum = sum (l_ind);
				r_ind_sum = sum (r_ind);
				l_min_ind = as.scalar (rowIndexMin (t(l_ind)));
				r_min_ind = as.scalar (rowIndexMin (t(r_ind)));		
				if (l_min_ind == n_time) {
					if (l_ind_sum > 0) {
						if (as.scalar (l_ind[n_time,1]) == 0) { # NA at last position
							M[M_offset,4] = time[n_time - l_ind_sum,1];
						} else {
							M[M_offset,4] = time[1,1];
						}
					}
				} else {
					M[M_offset,4] = time[l_min_ind + 1,1];
				}
				#
				if (r_min_ind == n_time) {
					if (r_ind_sum > 0) {
						if (as.scalar (r_ind[n_time,1]) == 0) { # NA at last position
							M[M_offset,5] = time[n_time - r_ind_sum,1];
						} else {
							M[M_offset,5] = time[1,1];
						}
					}
				} else {
					M[M_offset,5] = time[r_min_ind + 1,1];
				}
				M[M_offset,3] = t_5;
				if (test_type != "none"){
					n_event_all_global[g,s] = n_event_sum_all; 
				}
			}
		} else {
			print ("group " + g + " is not present in the stratum " + s);
			KM_cols_select[(KM_offset + 1):(KM_offset + 7),1] = matrix (0, rows = 7, cols = 1);
			M_cols[M_offset,1] = 0;
		}		
	}
	
			
	######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST
		
	if (num_groups > 1 & test_type != "none") {

		V = matrix (0, rows = num_groups-1, cols = num_groups-1);
		parfor (g in 0:(num_groups-1), check = 0) {
		
			n_risk = n_risk_n_event_stratum[,g * 2 + 1];			
			n_event = n_risk_n_event_stratum[,g * 2 + 2];
		
			if (test_type == "log-rank") {
				O = n_event;
				E = n_risk * n_event_stratum / n_risk_stratum;		
			} else { ### test_type == "wilcoxon"
				O = n_risk_stratum * n_event / range;
				E = n_risk * n_event_stratum / range;
			}			
			U[(g + 1),s] = sum (O - E);
			U_OE[g + 1, s] = (sum (O - E)*sum (O - E))/sum(E);
			OBS[g + 1, s] = sum(O);
			EXP[g + 1, s] = sum(E);
		}
		
		# parfor (i1 in 0:(num_groups - 2), check = 0) {
		for (i1 in 0:(num_groups - 2), check = 0) {
		
			n_risk = n_risk_n_event_stratum[,1 + i1 * 2]; 
			n_event = n_risk_n_event_stratum[,2 + i1 * 2]; 
			for (i2 in 0:(num_groups - 2)) {

				n_risk_i2j = n_risk_n_event_stratum[,1 + i2 * 2]; 
				I_i1i2 = 0;
				if (i1 == i2) { 
					I_i1i2 = 1;
				}
				if (test_type == "log-rank") {
					V1 = n_risk * n_event_stratum * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
					V1 = replace (target = V1, pattern = "NaN", replacement = 0);
					V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
					V[(i1 + 1),(i2 + 1)] = sum (V1 * V2);
				} else { ### test_type == "wilcoxon"
					V1 = (n_risk_stratum ^ 2) * (n_risk * n_event_stratum) * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
					V1 = replace (target = V1, pattern = "NaN", replacement = 0);
					V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
					V[(i1 + 1),(i2 + 1)] = sum (V1 * V2) / (range ^ 2);
				}
			}
		}
		V_start_ind = (s - 1) * (num_groups - 1) + 1;
		V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)] = V;
	}
}

if (num_groups > 1 & test_type != "none") {
	V_sum = matrix (0, rows = num_groups-1, cols = num_groups-1);
	for (s in 1:num_strata) {
		V_start_ind = (s - 1) * (num_groups - 1) + 1;
	    V_sum_total_part = V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)];
		V_sum = V_sum + V_sum_total_part;
	}
		
	U_sum = rowSums (U);

	test_st = as.scalar (t(U_sum[1:(num_groups-1),1]) %*% inv(V_sum) %*% U_sum[1:(num_groups-1),1]);
	p_val = 1 - cdf (target = test_st, dist = "chisq", df = num_groups-1 );
	if (test_type != "none") {
		U_OE_sum = rowSums(U_OE);
		V_OE =rowSums((U*U) /sum(V_sum));
		TEST_GROUPS_OE[1,1] = num_groups;
		TEST_GROUPS_OE[1,2] = num_groups - 1;
		TEST_GROUPS_OE[1,3] = test_st;
		TEST_GROUPS_OE[1,4] = p_val;
		TEST[,1] = rowSums(n_event_all_global);
		TEST[,2] = rowSums(OBS);
		TEST[,3] = rowSums(EXP);
		TEST[,4] = rowSums(U_OE_sum);
		TEST[,5] = rowSums(V_OE);
		str = append (str, test_type + " test for " + num_groups + " groups: Chi-squared = " + test_st + " on " + (num_groups - 1) + " df, p = " + p_val + " ");	
	} 
}

GSI = removeEmpty (target = GSI, margin = "rows");
if (n_group_cols > 0) {
	# making a copy of unique groups before adding new rows depending on strata
	G_cols_original = G_cols;

	GSI_1 = GSI[,1];
	tab_G = table (seq (1, nrow (GSI_1)), GSI_1, nrow (GSI_1), nrow (G_cols));
	G_cols = tab_G %*% G_cols;
}

if (n_stratum_cols > 0) {
	GSI_2 = GSI[,2];
	tab_S = table (seq (1, nrow (GSI_2)), GSI_2, nrow (GSI_2), nrow (S_cols));
	S_cols = tab_S %*% S_cols;
}

# pull out non-empty rows from M 
M_cols = removeEmpty (target = M_cols, margin = "rows");
tab_M = table (seq (1, nrow (M_cols)), M_cols, nrow (M_cols), nrow (M));
M = tab_M %*% M;
M = replace (target = M, pattern = "Infinity", replacement = "NaN");

# pull out non-empty rows from TEST
if (n_group_cols > 0 & n_stratum_cols > 0) {
	M = cbind (G_cols, S_cols, M);
	if (test_type != "none") {
		TEST = cbind (G_cols_original, TEST);
	}
} else if (n_group_cols > 0) {
	M = cbind (G_cols, M);
	if (test_type != "none") {	
		TEST = cbind (G_cols_original, TEST);
	}
} else if (n_stratum_cols > 0) {
	M = cbind (S_cols, M);
}

# pull out non-empty columns from KM
KM = t (cbind (t (KM), KM_cols_select) * KM_cols_select);
KM = removeEmpty (target = KM, margin = "cols");
KM = removeEmpty (target = KM, margin = "rows");
KM = KM[1:(nrow (KM) - 1),];

# write output matrices
write (M, fileM, format=fmtO);
write (KM, fileO, format=fmtO);

if (test_type != "none") {
	if (num_groups > 1 & fileT != " ") { 
		write (TEST, fileT, format=fmtO);
		write (TEST_GROUPS_OE, fileT+".groups.oe", format=fmtO);
	} else {
		print (str);
	}	
}