scripts.algorithms.stratstats.dml Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#
# STRATIFIED BIVARIATE STATISTICS, VERSION 4
# 
# INPUT PARAMETERS:
# -----------------------------------------------------------------------------
# NAME  TYPE   DEFAULT  MEANING
# -----------------------------------------------------------------------------
# X     String  ---     Location to read matrix X that has all 1-st covariates
# Y     String  " "     Location to read matrix Y that has all 2-nd covariates
#                       the default value " " means "use X in place of Y"
# S     String  " "     Location to read matrix S that has the stratum column
#                       the default value " " means "use X in place of S"
# Xcid  String  " "     Location to read the 1-st covariate X-column indices
#                       the default value " " means "use columns 1 : ncol(X)"
# Ycid  String  " "     Location to read the 2-nd covariate Y-column indices
#                       the default value " " means "use columns 1 : ncol(Y)"
# Scid  Int      1      Column index of the stratum column in S
# O     String  ---     Location to store the output matrix (see below)
# fmt   String "text"   Matrix output format, usually "text" or "csv"
# -----------------------------------------------------------------------------
# Note: the stratum column must contain small positive integers; all fractional
# values are rounded; strata with ID <= 0 or NaN are treated as missing.
#
# OUTPUT MATRIX:
# One row per each distinct pair (1st covariate, 2nd covariate)
# 40 columns containing the following information:
#     Col 01: 1st covariate X-column number
#     Col 02: 1st covariate global presence count
#     Col 03: 1st covariate global mean
#     Col 04: 1st covariate global standard deviation
#     Col 05: 1st covariate stratified standard deviation
#     Col 06: R-squared, 1st covariate vs. strata
#     Col 07: adjusted R-squared, 1st covariate vs. strata
#     Col 08: P-value, 1st covariate vs. strata
#     Col 09-10: Reserved
#     Col 11: 2nd covariate Y-column number
#     Col 12: 2nd covariate global presence count
#     Col 13: 2nd covariate global mean
#     Col 14: 2nd covariate global standard deviation
#     Col 15: 2nd covariate stratified standard deviation
#     Col 16: R-squared, 2nd covariate vs. strata
#     Col 17: adjusted R-squared, 2nd covariate vs. strata
#     Col 18: P-value, 2nd covariate vs. strata
#     Col 19-20: Reserved
#     Col 21: Global 1st & 2nd covariate presence count
#     Col 22: Global regression slope (2nd vs. 1st covariate)
#     Col 23: Global regression slope standard deviation
#     Col 24: Global correlation = +/- sqrt(R-squared)
#     Col 25: Global residual standard deviation
#     Col 26: Global R-squared
#     Col 27: Global adjusted R-squared
#     Col 28: Global P-value for hypothesis "slope = 0"
#     Col 29-30: Reserved
#     Col 31: Stratified 1st & 2nd covariate presence count
#     Col 32: Stratified regression slope (2nd vs. 1st covariate)
#     Col 33: Stratified regression slope standard deviation
#     Col 34: Stratified correlation = +/- sqrt(R-squared)
#     Col 35: Stratified residual standard deviation
#     Col 36: Stratified R-squared
#     Col 37: Stratified adjusted R-squared
#     Col 38: Stratified P-value for hypothesis "slope = 0"
#     Col 39: Number of strata with at least two counted points
#     Col 40: Reserved
#
# EXAMPLES:
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx
#     Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx
#     Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx

fileX = $X;
fileY = ifdef ($Y, " ");
fileS = ifdef ($S, " ");
fileO = $O;
fmtO  = ifdef ($fmt, "text");

fileXcid = ifdef ($Xcid, " ");
fileYcid = ifdef ($Ycid, " ");
stratum_column_id = ifdef ($Scid, 1);

print ("BEGIN STRATIFIED STATISTICS SCRIPT");

print ("Reading the input matrices...");

XwithNaNs = read (fileX);
if (fileY != " ") {
    YwithNaNs = read (fileY);
} else {
    YwithNaNs = XwithNaNs;
}
if (fileS != " ") {
    SwithNaNsFull = read (fileS);
    SwithNaNs = SwithNaNsFull [, stratum_column_id];
} else {
    SwithNaNs = XwithNaNs [, stratum_column_id];
}
if (fileXcid != " ") {
    Xcols = read (fileXcid);
} else {
    Xcols = t(seq (1, ncol (XwithNaNs), 1));
}
if (fileYcid != " ") {
    Ycols = read (fileYcid);
} else {
    Ycols = t(seq (1, ncol (YwithNaNs), 1));
}
tXcols = t(Xcols);
tYcols = t(Ycols);

num_records  = nrow (XwithNaNs);
num_attrs    = ncol (XwithNaNs);
num_attrs_X  = ncol (Xcols);
num_attrs_Y  = ncol (Ycols);
num_attrs_XY = num_attrs_X * num_attrs_Y;

print ("Preparing the covariates...");

XnoNaNs = replace (target = XwithNaNs, pattern = 0.0/0.0, replacement = 0);
YnoNaNs = replace (target = YwithNaNs, pattern = 0.0/0.0, replacement = 0);
XNaNmask = (XwithNaNs == XwithNaNs);
YNaNmask = (YwithNaNs == YwithNaNs);
one_to_num_attrs_X = seq (1, num_attrs_X, 1);
one_to_num_attrs_Y = seq (1, num_attrs_Y, 1);
ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X);
ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y);

ProjX_ctable = table (tXcols, one_to_num_attrs_X);
ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable;

ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable;

X = XnoNaNs %*% ProjX;
Y = YnoNaNs %*% ProjY;
X_mask = XNaNmask %*% ProjX;
Y_mask = YNaNmask %*% ProjY;

print ("Preparing the strata...");

SnoNaNs = replace (target = SwithNaNs, pattern = 0.0/0.0, replacement = 0);
S = round (SnoNaNs) * (SnoNaNs > 0);
Proj_good_stratumID = diag (S > 0);
Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows");
vector_of_good_stratumIDs = Proj_good_stratumID %*% S;
vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs));
num_records_with_good_stratumID = nrow (Proj_good_stratumID);
one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1);

# Create a group-by summation matrix for records over stratum IDs
# "with_empty" means with stratum IDs that never occur in records

num_strata_with_empty = max (vector_of_good_stratumIDs);
StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID);
StrataSummator = removeEmpty (target = StrataSummator_with_empty, margin = "rows");
StrataSummator = StrataSummator %*% Proj_good_stratumID;
num_strata = nrow (StrataSummator);
num_empty_strata = num_strata_with_empty - num_strata;
print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata.");

print ("Computing the global single-variate statistics...");

cnt_X_global = colSums (X_mask);
cnt_Y_global = colSums (Y_mask);
avg_X_global = colSums (X) / cnt_X_global;
avg_Y_global = colSums (Y) / cnt_Y_global;
var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
                 sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
                 sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1);
stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);

print ("Computing the stratified single-variate statistics...");

# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata

Cnt_X_per_stratum = StrataSummator %*% X_mask;
Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
Is_none_X_per_stratum = (Cnt_X_per_stratum == 0);
Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0);
One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);

Sum_X_per_stratum  = StrataSummator %*% X;
Sum_Y_per_stratum  = StrataSummator %*% Y;

# Recompute some global statistics to exclude bad stratum-ID records

cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum);
cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum);
sum_X_with_good_stratumID = colSums (Sum_X_per_stratum);
sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum);
var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID;
var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID;

# Compute the stratified statistics

var_sumX_stratified   = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
var_sumY_stratified   = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
                        sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata);
stdev_X_stratified    = sqrt_failsafe (sqrt_failsafe_input_3);
                        sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
stdev_Y_stratified    = sqrt_failsafe (sqrt_failsafe_input_4);
r_sqr_X_vs_strata     = 1 - var_sumX_stratified / var_sumX_with_good_stratumID;
r_sqr_Y_vs_strata     = 1 - var_sumY_stratified / var_sumY_with_good_stratumID;
adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1));
adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1));
fStat_X_vs_strata     = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata));
fStat_Y_vs_strata     = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata));
p_val_X_vs_strata     = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata);
p_val_Y_vs_strata     = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata);

print ("Computing the global bivariate statistics...");

# Compute the aggregate X vs. Y statistics and map them into proper positions

cnt_XY_rectangle       = t(X_mask) %*% Y_mask;
sum_X_forXY_rectangle  = t(X)      %*% Y_mask;
sum_XX_forXY_rectangle = t(X * X)  %*% Y_mask;
sum_Y_forXY_rectangle  = t(X_mask) %*% Y;
sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
sum_XY_rectangle       = t(X)      %*% Y;
cnt_XY_global       = matrix (cnt_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_X_forXY_global  = matrix (sum_X_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_Y_forXY_global  = matrix (sum_Y_forXY_rectangle,  rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XY_global       = matrix (sum_XY_rectangle,       rows = 1, cols = num_attrs_XY, byrow = TRUE);
ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);

# Compute the global bivariate statistics for output

cov_sumX_sumY_global    = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
var_sumX_forXY_global   = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
var_sumY_forXY_global   = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
slope_XY_global         = cov_sumX_sumY_global / var_sumX_forXY_global;
                                                 sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
                                                 sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
corr_XY_global          = cov_sumX_sumY_global / sqrt_failsafe_output_5;
r_sqr_X_vs_Y_global     = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2);
                          sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
stdev_slope_XY_global   = sqrt_failsafe (sqrt_failsafe_input_6);
                          sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
stdev_errY_vs_X_global  = sqrt_failsafe (sqrt_failsafe_input_7);
fStat_Y_vs_X_global     = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
p_val_Y_vs_X_global     = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);

print ("Computing the stratified bivariate statistics...");

# Create projections to "intermingle" X and Y into attribute pairs

Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
ones_Y_col   = matrix (1.0, rows = num_attrs_Y, cols = 1);
for (i in 1:num_attrs_X) {
    start_cid = (i - 1) * num_attrs_Y + 1;
    end_cid = i * num_attrs_Y;
    Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col);
    Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_col);
}

# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata

Cnt_XY_per_stratum       = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_X_forXY_per_stratum  = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_Y_forXY_per_stratum  = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));
Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
Sum_XY_per_stratum       = StrataSummator %*% (( X      %*% Proj_X_to_XY) * ( Y      %*% Proj_Y_to_XY));

Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0);
One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);

# Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records

cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum);
sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum);
sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum);
sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum);

# Compute the stratified bivariate statistics

var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
cov_sumX_sumY_stratified  = sum_XY_with_good_stratumID       - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);

slope_XY_stratified     = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
                                                     sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
                                                     sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
corr_XY_stratified      = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
temp_X_vs_Y_stratified  = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata);
                              sqrt_failsafe_input_9  = temp_X_vs_Y_stratified * var_sumY_forXY_stratified;
stdev_errY_vs_X_stratified  = sqrt_failsafe (sqrt_failsafe_input_9);
                              sqrt_failsafe_input_10 = sqrt_failsafe_input_9  / var_sumX_forXY_stratified;
stdev_slope_XY_stratified   = sqrt_failsafe (sqrt_failsafe_input_10);
fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);

print ("Preparing the output matrix...");
OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);

OutMtx [ 1, ] = Xcols                 %*% Proj_X_to_XY;  # 1st covariate column number
OutMtx [ 2, ] = cnt_X_global          %*% Proj_X_to_XY;  # 1st covariate global presence count
OutMtx [ 3, ] = avg_X_global          %*% Proj_X_to_XY;  # 1st covariate global mean
OutMtx [ 4, ] = stdev_X_global        %*% Proj_X_to_XY;  # 1st covariate global standard deviation
OutMtx [ 5, ] = stdev_X_stratified    %*% Proj_X_to_XY;  # 1st covariate stratified standard deviation
OutMtx [ 6, ] = r_sqr_X_vs_strata     %*% Proj_X_to_XY;  # R-squared, 1st covariate vs. strata
OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY;  # adjusted R-squared, 1st covariate vs. strata
OutMtx [ 8, ] = p_val_X_vs_strata     %*% Proj_X_to_XY;  # P-value, 1st covariate vs. strata
OutMtx [11, ] = Ycols                 %*% Proj_Y_to_XY;  # 2nd covariate column number
OutMtx [12, ] = cnt_Y_global          %*% Proj_Y_to_XY;  # 2nd covariate global presence count
OutMtx [13, ] = avg_Y_global          %*% Proj_Y_to_XY;  # 2nd covariate global mean
OutMtx [14, ] = stdev_Y_global        %*% Proj_Y_to_XY;  # 2nd covariate global standard deviation
OutMtx [15, ] = stdev_Y_stratified    %*% Proj_Y_to_XY;  # 2nd covariate stratified standard deviation
OutMtx [16, ] = r_sqr_Y_vs_strata     %*% Proj_Y_to_XY;  # R-squared, 2nd covariate vs. strata
OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY;  # adjusted R-squared, 2nd covariate vs. strata
OutMtx [18, ] = p_val_Y_vs_strata     %*% Proj_Y_to_XY;  # P-value, 2nd covariate vs. strata

OutMtx [21, ] = cnt_XY_global;               # Global 1st & 2nd covariate presence count
OutMtx [22, ] = slope_XY_global;             # Global regression slope (2nd vs. 1st covariate)
OutMtx [23, ] = stdev_slope_XY_global;       # Global regression slope standard deviation
OutMtx [24, ] = corr_XY_global;              # Global correlation = +/- sqrt(R-squared)
OutMtx [25, ] = stdev_errY_vs_X_global;      # Global residual standard deviation
OutMtx [26, ] = r_sqr_X_vs_Y_global;         # Global R-squared
OutMtx [27, ] = adj_r_sqr_X_vs_Y_global;     # Global adjusted R-squared
OutMtx [28, ] = p_val_Y_vs_X_global;         # Global P-value for hypothesis "slope = 0"
OutMtx [31, ] = cnt_XY_with_good_stratumID;  # Stratified 1st & 2nd covariate presence count
OutMtx [32, ] = slope_XY_stratified;         # Stratified regression slope (2nd vs. 1st covariate)
OutMtx [33, ] = stdev_slope_XY_stratified;   # Stratified regression slope standard deviation
OutMtx [34, ] = corr_XY_stratified;          # Stratified correlation = +/- sqrt(R-squared)
OutMtx [35, ] = stdev_errY_vs_X_stratified;  # Stratified residual standard deviation
OutMtx [36, ] = r_sqr_X_vs_Y_stratified;     # Stratified R-squared
OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared
OutMtx [38, ] = p_val_Y_vs_X_stratified;     # Stratified P-value for hypothesis "slope = 0"
OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2);  # Number of strata with at least two counted points

OutMtx = t(OutMtx);

print ("Writing the output matrix...");
write (OutMtx, fileO, format=fmtO);
print ("END STRATIFIED STATISTICS SCRIPT");


fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
{ # TEMPORARY IMPLEMENTATION
    tailprob = fStat;
    for (i in 1:nrow(fStat)) {
      for (j in 1:ncol(fStat)) {
        q = as.scalar (fStat [i, j]);
        d1 = as.scalar (df_1 [i, j]);
        d2 = as.scalar (df_2 [i, j]);
        if (d1 >= 1 & d2 >= 1 & q >= 0) {
            tailprob  [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
        } else {
            tailprob  [i, j] = 0/0;
        }
    } }
}

sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
{
    mask_A = (input_A >= 0);
    prep_A = input_A * mask_A;
    mask_A = mask_A * (prep_A == prep_A);
    prep_A = replace (target = prep_A, pattern = 0.0/0.0, replacement = 0);
    output_A = sqrt (prep_A) / mask_A;
}