scripts.algorithms.stratstats.dml Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# STRATIFIED BIVARIATE STATISTICS, VERSION 4
#
# INPUT PARAMETERS:
# -----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# -----------------------------------------------------------------------------
# X String --- Location to read matrix X that has all 1-st covariates
# Y String " " Location to read matrix Y that has all 2-nd covariates
# the default value " " means "use X in place of Y"
# S String " " Location to read matrix S that has the stratum column
# the default value " " means "use X in place of S"
# Xcid String " " Location to read the 1-st covariate X-column indices
# the default value " " means "use columns 1 : ncol(X)"
# Ycid String " " Location to read the 2-nd covariate Y-column indices
# the default value " " means "use columns 1 : ncol(Y)"
# Scid Int 1 Column index of the stratum column in S
# O String --- Location to store the output matrix (see below)
# fmt String "text" Matrix output format, usually "text" or "csv"
# -----------------------------------------------------------------------------
# Note: the stratum column must contain small positive integers; all fractional
# values are rounded; strata with ID <= 0 or NaN are treated as missing.
#
# OUTPUT MATRIX:
# One row per each distinct pair (1st covariate, 2nd covariate)
# 40 columns containing the following information:
# Col 01: 1st covariate X-column number
# Col 02: 1st covariate global presence count
# Col 03: 1st covariate global mean
# Col 04: 1st covariate global standard deviation
# Col 05: 1st covariate stratified standard deviation
# Col 06: R-squared, 1st covariate vs. strata
# Col 07: adjusted R-squared, 1st covariate vs. strata
# Col 08: P-value, 1st covariate vs. strata
# Col 09-10: Reserved
# Col 11: 2nd covariate Y-column number
# Col 12: 2nd covariate global presence count
# Col 13: 2nd covariate global mean
# Col 14: 2nd covariate global standard deviation
# Col 15: 2nd covariate stratified standard deviation
# Col 16: R-squared, 2nd covariate vs. strata
# Col 17: adjusted R-squared, 2nd covariate vs. strata
# Col 18: P-value, 2nd covariate vs. strata
# Col 19-20: Reserved
# Col 21: Global 1st & 2nd covariate presence count
# Col 22: Global regression slope (2nd vs. 1st covariate)
# Col 23: Global regression slope standard deviation
# Col 24: Global correlation = +/- sqrt(R-squared)
# Col 25: Global residual standard deviation
# Col 26: Global R-squared
# Col 27: Global adjusted R-squared
# Col 28: Global P-value for hypothesis "slope = 0"
# Col 29-30: Reserved
# Col 31: Stratified 1st & 2nd covariate presence count
# Col 32: Stratified regression slope (2nd vs. 1st covariate)
# Col 33: Stratified regression slope standard deviation
# Col 34: Stratified correlation = +/- sqrt(R-squared)
# Col 35: Stratified residual standard deviation
# Col 36: Stratified R-squared
# Col 37: Stratified adjusted R-squared
# Col 38: Stratified P-value for hypothesis "slope = 0"
# Col 39: Number of strata with at least two counted points
# Col 40: Reserved
#
# EXAMPLES:
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx
# Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv
#
# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx
# Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx
fileX = $X;
fileY = ifdef ($Y, " ");
fileS = ifdef ($S, " ");
fileO = $O;
fmtO = ifdef ($fmt, "text");
fileXcid = ifdef ($Xcid, " ");
fileYcid = ifdef ($Ycid, " ");
stratum_column_id = ifdef ($Scid, 1);
print ("BEGIN STRATIFIED STATISTICS SCRIPT");
print ("Reading the input matrices...");
XwithNaNs = read (fileX);
if (fileY != " ") {
YwithNaNs = read (fileY);
} else {
YwithNaNs = XwithNaNs;
}
if (fileS != " ") {
SwithNaNsFull = read (fileS);
SwithNaNs = SwithNaNsFull [, stratum_column_id];
} else {
SwithNaNs = XwithNaNs [, stratum_column_id];
}
if (fileXcid != " ") {
Xcols = read (fileXcid);
} else {
Xcols = t(seq (1, ncol (XwithNaNs), 1));
}
if (fileYcid != " ") {
Ycols = read (fileYcid);
} else {
Ycols = t(seq (1, ncol (YwithNaNs), 1));
}
tXcols = t(Xcols);
tYcols = t(Ycols);
num_records = nrow (XwithNaNs);
num_attrs = ncol (XwithNaNs);
num_attrs_X = ncol (Xcols);
num_attrs_Y = ncol (Ycols);
num_attrs_XY = num_attrs_X * num_attrs_Y;
print ("Preparing the covariates...");
XnoNaNs = replace (target = XwithNaNs, pattern = 0.0/0.0, replacement = 0);
YnoNaNs = replace (target = YwithNaNs, pattern = 0.0/0.0, replacement = 0);
XNaNmask = (XwithNaNs == XwithNaNs);
YNaNmask = (YwithNaNs == YwithNaNs);
one_to_num_attrs_X = seq (1, num_attrs_X, 1);
one_to_num_attrs_Y = seq (1, num_attrs_Y, 1);
ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X);
ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y);
ProjX_ctable = table (tXcols, one_to_num_attrs_X);
ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable;
ProjY_ctable = table (tYcols, one_to_num_attrs_Y);
ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable;
X = XnoNaNs %*% ProjX;
Y = YnoNaNs %*% ProjY;
X_mask = XNaNmask %*% ProjX;
Y_mask = YNaNmask %*% ProjY;
print ("Preparing the strata...");
SnoNaNs = replace (target = SwithNaNs, pattern = 0.0/0.0, replacement = 0);
S = round (SnoNaNs) * (SnoNaNs > 0);
Proj_good_stratumID = diag (S > 0);
Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows");
vector_of_good_stratumIDs = Proj_good_stratumID %*% S;
vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs));
num_records_with_good_stratumID = nrow (Proj_good_stratumID);
one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1);
# Create a group-by summation matrix for records over stratum IDs
# "with_empty" means with stratum IDs that never occur in records
num_strata_with_empty = max (vector_of_good_stratumIDs);
StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID);
StrataSummator = removeEmpty (target = StrataSummator_with_empty, margin = "rows");
StrataSummator = StrataSummator %*% Proj_good_stratumID;
num_strata = nrow (StrataSummator);
num_empty_strata = num_strata_with_empty - num_strata;
print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata.");
print ("Computing the global single-variate statistics...");
cnt_X_global = colSums (X_mask);
cnt_Y_global = colSums (Y_mask);
avg_X_global = colSums (X) / cnt_X_global;
avg_Y_global = colSums (Y) / cnt_Y_global;
var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global);
var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global);
sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1);
stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1);
sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1);
stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2);
print ("Computing the stratified single-variate statistics...");
# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
Cnt_X_per_stratum = StrataSummator %*% X_mask;
Cnt_Y_per_stratum = StrataSummator %*% Y_mask;
Is_none_X_per_stratum = (Cnt_X_per_stratum == 0);
Is_none_Y_per_stratum = (Cnt_Y_per_stratum == 0);
One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum);
One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum);
num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum);
num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum);
Sum_X_per_stratum = StrataSummator %*% X;
Sum_Y_per_stratum = StrataSummator %*% Y;
# Recompute some global statistics to exclude bad stratum-ID records
cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum);
cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum);
sum_X_with_good_stratumID = colSums (Sum_X_per_stratum);
sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum);
var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID;
var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID;
# Compute the stratified statistics
var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum);
var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum);
sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata);
stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3);
sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4);
r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID;
r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID;
adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1));
adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1));
fStat_X_vs_strata = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata));
fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata));
p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata);
p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata);
print ("Computing the global bivariate statistics...");
# Compute the aggregate X vs. Y statistics and map them into proper positions
cnt_XY_rectangle = t(X_mask) %*% Y_mask;
sum_X_forXY_rectangle = t(X) %*% Y_mask;
sum_XX_forXY_rectangle = t(X * X) %*% Y_mask;
sum_Y_forXY_rectangle = t(X_mask) %*% Y;
sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y);
sum_XY_rectangle = t(X) %*% Y;
cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE);
ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY);
# Compute the global bivariate statistics for output
cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global;
var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global;
var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global;
slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global;
sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global;
sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5);
corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5;
r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global);
adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2);
sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2)
stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6);
sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2)
stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7);
fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global);
p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2);
print ("Computing the stratified bivariate statistics...");
# Create projections to "intermingle" X and Y into attribute pairs
Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY);
Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY);
ones_Y_col = matrix (1.0, rows = num_attrs_Y, cols = 1);
for (i in 1:num_attrs_X) {
start_cid = (i - 1) * num_attrs_Y + 1;
end_cid = i * num_attrs_Y;
Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col);
Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_col);
}
# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata
Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY));
Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY));
Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY));
Is_none_XY_per_stratum = (Cnt_XY_per_stratum == 0);
One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum);
num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum);
# Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records
cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum);
sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum);
sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum);
sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum);
# Compute the stratified bivariate statistics
var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum);
var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum);
slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified;
sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified;
sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8);
corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8;
r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified);
temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1)
adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata);
sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified;
stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_9);
sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified;
stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_10);
fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified);
p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1);
print ("Preparing the output matrix...");
OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY);
OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st covariate column number
OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st covariate global presence count
OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st covariate global mean
OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st covariate global standard deviation
OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st covariate stratified standard deviation
OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st covariate vs. strata
OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY; # adjusted R-squared, 1st covariate vs. strata
OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st covariate vs. strata
OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd covariate column number
OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd covariate global presence count
OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd covariate global mean
OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd covariate global standard deviation
OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd covariate stratified standard deviation
OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd covariate vs. strata
OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # adjusted R-squared, 2nd covariate vs. strata
OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd covariate vs. strata
OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd covariate presence count
OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd vs. 1st covariate)
OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation
OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared)
OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation
OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared
OutMtx [27, ] = adj_r_sqr_X_vs_Y_global; # Global adjusted R-squared
OutMtx [28, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0"
OutMtx [31, ] = cnt_XY_with_good_stratumID; # Stratified 1st & 2nd covariate presence count
OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd vs. 1st covariate)
OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation
OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared)
OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation
OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared
OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared
OutMtx [38, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0"
OutMtx [39, ] = colSums (Cnt_XY_per_stratum >= 2); # Number of strata with at least two counted points
OutMtx = t(OutMtx);
print ("Writing the output matrix...");
write (OutMtx, fileO, format=fmtO);
print ("END STRATIFIED STATISTICS SCRIPT");
fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob)
{ # TEMPORARY IMPLEMENTATION
tailprob = fStat;
for (i in 1:nrow(fStat)) {
for (j in 1:ncol(fStat)) {
q = as.scalar (fStat [i, j]);
d1 = as.scalar (df_1 [i, j]);
d2 = as.scalar (df_2 [i, j]);
if (d1 >= 1 & d2 >= 1 & q >= 0) {
tailprob [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE);
} else {
tailprob [i, j] = 0/0;
}
} }
}
sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
{
mask_A = (input_A >= 0);
prep_A = input_A * mask_A;
mask_A = mask_A * (prep_A == prep_A);
prep_A = replace (target = prep_A, pattern = 0.0/0.0, replacement = 0);
output_A = sqrt (prep_A) / mask_A;
}