scripts.algorithms.KM.dml Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT ANALIZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read the input matrix X containing the survival data:
# timestamps, whether event occurred (1) or data is censored (0), and a number of factors (categorical features)
# for grouping and/or stratifying
# TE String --- Column indices of X which contain timestamps (first entry) and event information (second entry)
# GI String --- Column indices of X corresponding to the factors to be used for grouping
# SI String --- Column indices of X corresponding to the factors to be used for stratifying
# O String --- Location to write the matrix containing the results of the Kaplan-Meier analysis; see below for the description
# M String --- Location to write Matrix M containing the following statistic: total number of events, median and its confidence intervals;
# if survival data for multiple groups and strata are provided each row of M contains the above statistics per group and stratum
# T String " " If survival data from multiple groups available and ttype=log-rank or wilcoxon,
# location to write the matrix containing result of the (stratified) test for comparing multiple groups
# alpha Double 0.05 Parameter to compute 100*(1-alpha)% confidence intervals for the survivor function and its median
# etype String "greenwood" Parameter to specify the error type according to "greenwood" (the default) or "peto"
# ctype String "log" Parameter to modify the confidence interval; "plain" keeps the lower and upper bound of
# the confidence interval unmodified, "log" (the default) corresponds to logistic transformation and
# "log-log" corresponds to the complementary log-log transformation
# ttype String "none" If survival data for multiple groups is available specifies which test to perform for comparing
# survival data across multiple groups: "none" (the default) "log-rank" or "wilcoxon" test
# fmt String "text" The output format of results of the Kaplan-Meier analysis, such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data:
# each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data with the following schema
# 1. col: timestamp
# 2. col: no. at risk
# 3. col: no. of events
# 4. col: Kaplan-Meier estimate of survivor function surv
# 5. col: standard error of surv
# 6. col: lower 100*(1-alpha)% confidence interval for surv
# 7. col: upper 100*(1-alpha)% confidence interval for surv
# 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number of factors used for grouping
# ,i.e., ncol(GI) and l denotes the number of factors used for stratifying, i.e., ncol(SI))
# M[,1:k]: unique combination of values in the k factors used for grouping
# M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
# M[,k+l+1]: total number of records
# M[,k+l+2]: total number of events
# M[,k+l+3]: median of surv
# M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv
# M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
# If the number of groups and strata is equal to 1, M will have 4 columns with
# M[,1]: total number of events
# M[,2]: median of surv
# M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv
# M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
# 3- If survival data from multiple groups available and ttype=log-rank or wilcoxon, a 1 x 4 matrix T and an g x 5 matrix T_GROUPS_OE with
# T_GROUPS_OE[,1] = no. of events
# T_GROUPS_OE[,2] = observed value (O)
# T_GROUPS_OE[,3] = expected value (E)
# T_GROUPS_OE[,4] = (O-E)^2/E
# T_GROUPS_OE[,5] = (O-E)^2/V
# T[1,1] = no. of groups
# T[1,2] = degree of freedom for Chi-squared distributed test statistic
# T[1,3] = test statistic
# T[1,4] = P-value
# -------------------------------------------------------------------------------------------
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f KM.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE GI=INPUT_DIR/GI SI=INPUT_DIR/SI O=OUTPUT_DIR/O
# M=OUTPUT_DIR/M T=OUTPUT_DIR/T alpha=0.05 etype=greenwood ctype=log fmt=csv
fileX = $X;
fileTE = $TE;
fileGI = ifdef ($GI, " ");
fileSI = ifdef ($SI, " ");
fileO = $O;
fileM = $M;
# Default values of some parameters
fileT = ifdef ($T, " "); # $T=" "
fileG = ifdef ($G, " "); # $G=" "
fileS = ifdef ($S, " "); # $S=" "
fmtO = ifdef ($fmt, "text"); # $fmt="text"
alpha = ifdef ($alpha, 0.05); # $alpha=0.05
err_type = ifdef ($etype, "greenwood"); # $etype="greenwood"
conf_type = ifdef ($ctype, "log"); # $ctype="log"
test_type = ifdef ($ttype, "none"); # $ttype="none"
X = read (fileX);
TE = read (fileTE);
if (fileGI != " ") {
GI = read (fileGI);
} else {
GI = matrix (0, rows = 1, cols = 1);
}
if (fileSI != " "){
SI = read (fileSI);
} else {
SI = matrix (0, rows = 1, cols = 1);
}
TE = t(TE);
GI = t(GI);
SI = t(SI);
# check arguments for validity
if (err_type != "greenwood" & err_type != "peto") {
stop (err_type + " is not a valid error type!");
}
if (conf_type != "plain" & conf_type != "log" & conf_type != "log-log") {
stop (conf_type + " is not a valid confidence type!");
}
if (test_type != "log-rank" & test_type != "wilcoxon" & test_type != "none") {
stop (test_type + " is not a valid test type!");
}
n_group_cols = ncol (GI);
n_stratum_cols = ncol (SI);
# check GI and SI for validity
GI_1_1 = as.scalar (GI[1,1]);
SI_1_1 = as.scalar (SI[1,1]);
if (n_group_cols == 1) {
if (GI_1_1 == 0) { # no factors for grouping
n_group_cols = 0;
}
} else if (GI_1_1 == 0) {
stop ("Matrix GI contains zero entries!");
}
if (n_stratum_cols == 1) {
if (SI_1_1 == 0) { # no factors for stratifying
n_stratum_cols = 0;
}
} else if (SI_1_1 == 0) {
stop ("Matrix SI contains zero entries!");
}
if (2 + n_group_cols + n_stratum_cols > ncol (X)) {
stop ("X has an incorrect number of columns!");
}
# reorder cols of X
if (GI_1_1 == 0 & SI_1_1 == 0) {
Is = TE;
} else if (GI_1_1 == 0) {
Is = cbind (TE, SI);
} else if (SI_1_1 == 0) {
Is = cbind (TE, GI);
} else {
Is = cbind (TE, GI, SI);
}
X = X %*% table (Is, seq (1, 2 + n_group_cols + n_stratum_cols), ncol (X), 2 + n_group_cols + n_stratum_cols);
num_records = nrow (X);
num_groups = 1;
num_strata = 1;
### compute group id for each record
print ("Perform grouping...");
if (n_group_cols > 0) {
for (g in 1:n_group_cols) { # sort columns corresponding to groups sequentially
X = order (target = X, by = 2 + g);
}
XG = X[,3:(3 + n_group_cols - 1)];
Idx = matrix (1, rows = num_records, cols = 1);
Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),3:(2 + n_group_cols)] != X[2:num_records,3:(2 + n_group_cols)]);
num_groups = sum (Idx);
XG = replace (target = XG, pattern = 0, replacement = "Infinity");
XG = XG * Idx;
XG = replace (target = XG, pattern = "NaN", replacement = 0);
G_cols = removeEmpty (target = XG, margin = "rows");
G_cols = replace (target = G_cols, pattern = "Infinity", replacement = 0);
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
Gi = B %*% seq(1, ncol(B)); # group ids
} else { # there is only one group
Gi = matrix (1, rows = num_records, cols = 1);
}
if (n_stratum_cols > 0) {
X = cbind (X[,1:2], Gi, X[,(3 + g):ncol (X)]);
} else { # no strata
X = cbind (X[,1:2],Gi);
}
}
### compute stratum id for each record
print ("Perform stratifying...");
if (n_stratum_cols > 0) {
s_offset = 2;
if (n_group_cols > 0) {
s_offset = 3;
}
for (s in 1:n_stratum_cols) { # sort columns corresponding to strata sequentially
X = order (target = X, by = s_offset + s);
}
XS = X[,(s_offset + 1):(s_offset + n_stratum_cols)];
Idx = matrix (1, rows = num_records, cols = 1);
Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),(s_offset + 1):(s_offset + n_stratum_cols)] != X[2:num_records,(s_offset + 1):(s_offset + n_stratum_cols)]);
num_strata = sum (Idx);
XS = replace (target = XS, pattern = 0, replacement = "Infinity");
XS = XS * Idx;
XS = replace (target = XS, pattern = "NaN", replacement = 0);
S_cols = removeEmpty (target = XS, margin = "rows");
S_cols = replace (target = S_cols, pattern = "Infinity", replacement = 0);
SB = removeEmpty (target = seq (1,num_records), margin = "rows", select = Idx); # indices of stratum boundaries
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
Si = B %*% seq(1, ncol(B)); # stratum ids
} else { # there is only one stratum
Si = matrix (1, rows = num_records, cols = 1);
}
X = cbind (X[,1:3],Si);
}
if (n_group_cols == 0 & n_stratum_cols == 0) {
X = cbind (X, matrix (1, rows = num_records, cols = 2));
SB = matrix (1, rows = 1, cols = 1);
} else if (n_group_cols == 0) {
X = cbind (X[,1:2], matrix (1, rows = num_records, cols = 1), X[,3]);
} else if (n_stratum_cols == 0) {
X = cbind (X, matrix (1, rows = num_records, cols = 1));
SB = matrix (1, rows = 1, cols = 1);
}
######## BEGIN KAPLAN-MEIER ANALYSIS
print ("BEGIN KAPLAN-MEIER SURVIVAL FIT SCRIPT");
KM = matrix (0, rows = num_records, cols = num_groups * num_strata * 7);
KM_cols_select = matrix (1, rows = num_groups * num_strata * 7, cols = 1);
GSI = matrix (0, rows = num_groups * num_strata, cols = 2);
a = 1/0;
M = matrix (a, rows = num_groups * num_strata, cols = 5);
M_cols = seq (1, num_groups * num_strata);
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
if (num_groups > 1 & test_type != "none") {
str = "";
TEST = matrix (0, rows = num_groups, cols = 5);
TEST_GROUPS_OE = matrix (0, rows = 1, cols = 4);
U = matrix (0, rows = num_groups, cols = num_strata);
U_OE = matrix (0, rows = num_groups, cols = num_strata);
OBS = matrix (0, rows = num_groups, cols = num_strata);
EXP = matrix (0, rows = num_groups, cols = num_strata);
V_sum_total = matrix (0, rows = num_groups-1, cols = (num_groups-1) * num_strata);
n_event_all_global = matrix(1, rows=num_groups, cols=num_strata);
} else if (num_groups == 1 & test_type != "none") {
stop ("Data contains only one group or no groups, at least two groups are required for test!");
}
parfor (s in 1:num_strata, check = 0) {
start_ind = as.scalar (SB[s,]);
end_ind = num_records;
if (s != num_strata) {
end_ind = as.scalar (SB[s + 1,]) - 1;
}
######## RECODING TIMESTAMPS PRESERVING THE ORDER
X_cur = X[start_ind:end_ind,];
range = end_ind - start_ind + 1;
X_cur = order (target = X_cur, by = 1);
Idx1 = matrix (1, rows = range, cols = 1);
num_timestamps = 1;
if (range == 1) {
RT = matrix (1, rows = 1, cols = 1);
} else {
Idx1[2:range,1] = (X_cur[1:(range - 1),1] != X_cur[2:range,1]);
num_timestamps = sum (Idx1);
A1 = removeEmpty (target = diag (Idx1), margin = "cols");
if (ncol (A1) > 1) {
A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
B1 = cumsum (A1);
RT = B1 %*% seq(1, ncol(B1));
} else { # there is only one group
RT = matrix (1, rows = range, cols = 1);
}
}
T = X_cur[,1];
E = X_cur[,2];
G = X_cur[,3];
S = X_cur[,4];
n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum
n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. both censored and uncensored of events per stratum
Idx1 = cumsum (n_event_all_stratum);
time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum
time_stratum_has_zero = sum (time_stratum == 0) > 0;
if (time_stratum_has_zero) {
time_stratum = replace (target = time_stratum, pattern = 0, replacement = "Infinity");
}
n_time_all1 = nrow (n_event_stratum); # no. of distinct timestamps both censored and uncensored per stratum
n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1);
if (n_time_all1 > 1) {
n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),];
}
n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum
if (num_groups > 1 & test_type != "none") { # needed for log-rank or wilcoxon test
n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
}
parfor (g in 1:num_groups, check = 0) {
group_ind = (G == g);
KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
M_offset = (s - 1) * num_groups + g;
if (sum (group_ind) != 0) { # group g is present in the stratum s
GSI_offset = (s - 1) * num_groups + g;
GSI[GSI_offset,1] = g;
GSI[GSI_offset,2] = s;
E_cur = E * group_ind;
######## COMPUTE NO. AT RISK AND NO.OF EVENTS FOR EACH TIMESTAMP
n_event = aggregate (target = E_cur, groups = RT, fn = "sum"); # no. of uncensored events per stratum per group
n_event_all = aggregate (target = group_ind, groups = RT, fn = "sum"); # no. of both censored and uncensored events per stratum per group
Idx1 = cumsum (n_event_all);
event_occurred = (n_event > 0);
if (time_stratum_has_zero) {
time = replace (target = time_stratum * event_occurred, pattern = "NaN", replacement = 0);
time = removeEmpty (target = time, margin = "rows");
time = replace (target = time, pattern = "Infinity", replacement = 0);
} else {
time = removeEmpty (target = time_stratum * event_occurred, margin = "rows");
}
n_time_all2 = nrow (n_event); # no. of distinct timestamps both censored and uncensored per stratum per group
n_event_all_agg = matrix (0, rows = n_time_all2, cols = 1);
if (n_time_all2 > 1) {
n_event_all_agg[2:n_time_all2,] = Idx1[1:(n_time_all2 - 1),];
}
n_risk = sum (group_ind) - n_event_all_agg; # no. at risk per stratum per group
if (num_groups > 1 & test_type != "none") {
n_risk_n_event_stratum[,(g - 1) * 2 + 1] = n_risk;
n_risk_n_event_stratum[,(g - 1) * 2 + 2] = n_event;
}
# Extract only rows corresponding to events, i.e., for which n_event is nonzero
Idx1 = (n_event != 0);
KM_1 = matrix (0, rows = n_time_all2, cols = 2);
KM_1[,1] = n_risk;
KM_1[,2] = n_event;
KM_1 = removeEmpty (target = KM_1, margin = "rows", select = Idx1);
n_risk = KM_1[,1];
n_event = KM_1[,2];
n_time = nrow (time);
######## ESTIMATE SERVIVOR FUNCTION SURV, ITS STANDARD ERROR SE_SURV, AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
surv = cumprod ((n_risk - n_event) / n_risk);
tmp = n_event / (n_risk * (n_risk - n_event));
se_surv = sqrt (cumsum (tmp)) * surv;
if (err_type == "peto") {
se_surv = (surv * sqrt(1 - surv) / sqrt(n_risk));
}
if (conf_type == "plain") {
# True survivor function is in [surv +- z_alpha_2 * se_surv],
# values less than 0 are replaced by 0, values larger than 1are replaced by 1!
CI_l = max (surv - (z_alpha_2 * se_surv), 0);
CI_r = min (surv + (z_alpha_2 * se_surv), 1);
} else if (conf_type == "log") {
# True survivor function is in [surv * exp(+- z_alpha_2 * se_surv / surv)]
CI_l = max (surv * exp (- z_alpha_2 * se_surv / surv), 0);
CI_r = min (surv * exp ( z_alpha_2 * se_surv / surv), 1);
} else { # conf_type == "log-log"
# True survivor function is in [surv ^ exp(+- z_alpha_2 * se(log(-log(surv))))]
CI_l = max (surv ^ exp (- z_alpha_2 * se_surv / log(surv)), 0);
CI_r = min (surv ^ exp ( z_alpha_2 * se_surv / log(surv)), 1);
}
#
if (as.scalar (n_risk[n_time,]) == as.scalar (n_event[n_time,])) {
CI_l[n_time,] = 0/0;
CI_r[n_time,] = 0/0;
}
n_event_sum = sum (n_event);
n_event_sum_all = sum(n_event_all);
if (n_event_sum > 0) {
# KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
KM[1:n_time,KM_offset + 1] = time;
KM[1:n_time,KM_offset + 2] = n_risk;
KM[1:n_time,KM_offset + 3] = n_event;
KM[1:n_time,KM_offset + 4] = surv;
KM[1:n_time,KM_offset + 5] = se_surv;
KM[1:n_time,KM_offset + 6] = CI_l;
KM[1:n_time,KM_offset + 7] = CI_r;
}
######## ESTIMATE MEDIAN OF SERVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
p_5 = (surv <= 0.5);
pn_5 = sum (p_5);
#M_offset = (s - 1) * num_groups + g;
# if the estimated survivor function is larger than 0.5 for all timestamps median does not exist!
p_5_exists = (pn_5 != 0);
M[M_offset,2] = n_event_sum;
M[M_offset,1] = n_event_sum_all;
if (p_5_exists) {
if ( as.scalar (surv[n_time - pn_5 + 1,1]) == 0.5 ) { # if the estimated survivor function is exactly equal to 0.5
if (pn_5 > 1) {
t_5 = as.scalar ((time[n_time - pn_5 + 1,1] + time[n_time - pn_5 + 2,1])/2);
} else {
t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
}
} else {
t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
}
l_ind = (CI_l <= 0.5);
r_ind = (CI_r <= 0.5);
l_ind_sum = sum (l_ind);
r_ind_sum = sum (r_ind);
l_min_ind = as.scalar (rowIndexMin (t(l_ind)));
r_min_ind = as.scalar (rowIndexMin (t(r_ind)));
if (l_min_ind == n_time) {
if (l_ind_sum > 0) {
if (as.scalar (l_ind[n_time,1]) == 0) { # NA at last position
M[M_offset,4] = time[n_time - l_ind_sum,1];
} else {
M[M_offset,4] = time[1,1];
}
}
} else {
M[M_offset,4] = time[l_min_ind + 1,1];
}
#
if (r_min_ind == n_time) {
if (r_ind_sum > 0) {
if (as.scalar (r_ind[n_time,1]) == 0) { # NA at last position
M[M_offset,5] = time[n_time - r_ind_sum,1];
} else {
M[M_offset,5] = time[1,1];
}
}
} else {
M[M_offset,5] = time[r_min_ind + 1,1];
}
M[M_offset,3] = t_5;
if (test_type != "none"){
n_event_all_global[g,s] = n_event_sum_all;
}
}
} else {
print ("group " + g + " is not present in the stratum " + s);
KM_cols_select[(KM_offset + 1):(KM_offset + 7),1] = matrix (0, rows = 7, cols = 1);
M_cols[M_offset,1] = 0;
}
}
######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST
if (num_groups > 1 & test_type != "none") {
V = matrix (0, rows = num_groups-1, cols = num_groups-1);
parfor (g in 0:(num_groups-1), check = 0) {
n_risk = n_risk_n_event_stratum[,g * 2 + 1];
n_event = n_risk_n_event_stratum[,g * 2 + 2];
if (test_type == "log-rank") {
O = n_event;
E = n_risk * n_event_stratum / n_risk_stratum;
} else { ### test_type == "wilcoxon"
O = n_risk_stratum * n_event / range;
E = n_risk * n_event_stratum / range;
}
U[(g + 1),s] = sum (O - E);
U_OE[g + 1, s] = (sum (O - E)*sum (O - E))/sum(E);
OBS[g + 1, s] = sum(O);
EXP[g + 1, s] = sum(E);
}
# parfor (i1 in 0:(num_groups - 2), check = 0) {
for (i1 in 0:(num_groups - 2), check = 0) {
n_risk = n_risk_n_event_stratum[,1 + i1 * 2];
n_event = n_risk_n_event_stratum[,2 + i1 * 2];
for (i2 in 0:(num_groups - 2)) {
n_risk_i2j = n_risk_n_event_stratum[,1 + i2 * 2];
I_i1i2 = 0;
if (i1 == i2) {
I_i1i2 = 1;
}
if (test_type == "log-rank") {
V1 = n_risk * n_event_stratum * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
V1 = replace (target = V1, pattern = "NaN", replacement = 0);
V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
V[(i1 + 1),(i2 + 1)] = sum (V1 * V2);
} else { ### test_type == "wilcoxon"
V1 = (n_risk_stratum ^ 2) * (n_risk * n_event_stratum) * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
V1 = replace (target = V1, pattern = "NaN", replacement = 0);
V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
V[(i1 + 1),(i2 + 1)] = sum (V1 * V2) / (range ^ 2);
}
}
}
V_start_ind = (s - 1) * (num_groups - 1) + 1;
V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)] = V;
}
}
if (num_groups > 1 & test_type != "none") {
V_sum = matrix (0, rows = num_groups-1, cols = num_groups-1);
for (s in 1:num_strata) {
V_start_ind = (s - 1) * (num_groups - 1) + 1;
V_sum_total_part = V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)];
V_sum = V_sum + V_sum_total_part;
}
U_sum = rowSums (U);
test_st = as.scalar (t(U_sum[1:(num_groups-1),1]) %*% inv(V_sum) %*% U_sum[1:(num_groups-1),1]);
p_val = 1 - cdf (target = test_st, dist = "chisq", df = num_groups-1 );
if (test_type != "none") {
U_OE_sum = rowSums(U_OE);
V_OE =rowSums((U*U) /sum(V_sum));
TEST_GROUPS_OE[1,1] = num_groups;
TEST_GROUPS_OE[1,2] = num_groups - 1;
TEST_GROUPS_OE[1,3] = test_st;
TEST_GROUPS_OE[1,4] = p_val;
TEST[,1] = rowSums(n_event_all_global);
TEST[,2] = rowSums(OBS);
TEST[,3] = rowSums(EXP);
TEST[,4] = rowSums(U_OE_sum);
TEST[,5] = rowSums(V_OE);
str = append (str, test_type + " test for " + num_groups + " groups: Chi-squared = " + test_st + " on " + (num_groups - 1) + " df, p = " + p_val + " ");
}
}
GSI = removeEmpty (target = GSI, margin = "rows");
if (n_group_cols > 0) {
# making a copy of unique groups before adding new rows depending on strata
G_cols_original = G_cols;
GSI_1 = GSI[,1];
tab_G = table (seq (1, nrow (GSI_1)), GSI_1, nrow (GSI_1), nrow (G_cols));
G_cols = tab_G %*% G_cols;
}
if (n_stratum_cols > 0) {
GSI_2 = GSI[,2];
tab_S = table (seq (1, nrow (GSI_2)), GSI_2, nrow (GSI_2), nrow (S_cols));
S_cols = tab_S %*% S_cols;
}
# pull out non-empty rows from M
M_cols = removeEmpty (target = M_cols, margin = "rows");
tab_M = table (seq (1, nrow (M_cols)), M_cols, nrow (M_cols), nrow (M));
M = tab_M %*% M;
M = replace (target = M, pattern = "Infinity", replacement = "NaN");
# pull out non-empty rows from TEST
if (n_group_cols > 0 & n_stratum_cols > 0) {
M = cbind (G_cols, S_cols, M);
if (test_type != "none") {
TEST = cbind (G_cols_original, TEST);
}
} else if (n_group_cols > 0) {
M = cbind (G_cols, M);
if (test_type != "none") {
TEST = cbind (G_cols_original, TEST);
}
} else if (n_stratum_cols > 0) {
M = cbind (S_cols, M);
}
# pull out non-empty columns from KM
KM = t (cbind (t (KM), KM_cols_select) * KM_cols_select);
KM = removeEmpty (target = KM, margin = "cols");
KM = removeEmpty (target = KM, margin = "rows");
KM = KM[1:(nrow (KM) - 1),];
# write output matrices
write (M, fileM, format=fmtO);
write (KM, fileO, format=fmtO);
if (test_type != "none") {
if (num_groups > 1 & fileT != " ") {
write (TEST, fileT, format=fmtO);
write (TEST_GROUPS_OE, fileT+".groups.oe", format=fmtO);
} else {
print (str);
}
}