#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded
# Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded
# R String " " Location to read the matrix R which, for each feature in X, contains the following information
# - R[,1]: column ids
# - R[,2]: start indices
# - R[,3]: end indices
# If R is not provided, all features are assumed to be scale (see the worked example below this table)
# bins Int 20 Number of equi-height bins per scale feature used to choose split thresholds
# depth Int 25 Maximum depth of the learned tree
# num_leaf Int 10 Number of samples at or below which splitting stops and a leaf node is added
# num_samples Int 3000 Number of samples at or below which subtree building switches to in-memory processing
# impurity String "Gini" Impurity measure: entropy or Gini (the default)
# M String --- Location to write matrix M containing the learned tree
# O String " " Location to write the training accuracy; by default the accuracy is printed to standard output
# S_map String " " Location to write the mappings from scale feature ids to global feature ids
# C_map String " " Location to write the mappings from categorical feature ids to global feature ids
# fmt String "text" The output format of the model (matrix M), such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
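#
# For illustration, a hypothetical R for a dataset with three original features,
# where feature 3 was dummy coded into columns 3-5 of X:
#   R = [1 1 1]   feature 1: scale, stored in column 1 of X
#       [2 2 2]   feature 2: scale, stored in column 2 of X
#       [3 3 5]   feature 3: categorical, dummy coded into columns 3-5 of X
# A feature is treated as categorical exactly when its start and end indices differ.
#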
# OUTPUT:
# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information:
# M[1,j]: id of node j (in a complete binary tree)
# M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
# M[3,j]: Index of the feature that node j looks at if j is an internal node (the scale feature id if the feature is scale,
# the categorical feature id if it is categorical), otherwise 0
# M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
# otherwise the label that leaf node j is supposed to predict
# M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
# stored in rows 6,7,... if j is categorical
# If j is a leaf node: number of misclassified samples reaching node j
# M[6:,j]: If j is an internal node: the threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale;
# otherwise, if the feature chosen for j is categorical, rows 6,7,... hold the value subset chosen for j
# If j is a leaf node: 1 if j is impure and the number of samples at j exceeds num_samples, otherwise 0 (a worked example follows)
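# For illustration, two hypothetical columns of M: an internal node with id 4
# splitting on scale feature 2 at threshold 0.5, whose left child is stored
# 3 columns to the right, is (4, 3, 2, 1, 1, 0.5); a pure leaf with id 9
# predicting label 2 with no misclassified samples is (9, 0, 0, 2, 0, 0).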
# -------------------------------------------------------------------------------------------
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f decision-tree.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model
# bins=20 depth=25 num_leaf=10 num_samples=3000 impurity=Gini fmt=csv
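# An equivalent Spark invocation (assuming a SystemML-enabled Spark
# installation; adjust jar and paths to your setup):
# spark-submit SystemML.jar -f decision-tree.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model
# bins=20 depth=25 num_leaf=10 num_samples=3000 impurity=Gini fmt=csv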
# External function for binning
binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef)
implemented in (classname="org.apache.sysml.udf.lib.BinningWrapper",exectype="mem")
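# A sketch of the binning UDF's contract as used below (the Java
# implementation is not shown here): given a sorted column A, a target bin
# size, and a requested number of bins, it returns the column B of
# equi-height bin boundaries and the number of bins actually defined,
# which may be smaller than numbins, e.g. for columns with many repeated values.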
# Default values of some parameters
fileR = ifdef ($R, " ");
fileO = ifdef ($O, " ");
fileS_map = ifdef ($S_map, " ");
fileC_map = ifdef ($C_map, " ");
fileM = $M;
num_bins = ifdef($bins, 20);
depth = ifdef($depth, 25);
num_leaf = ifdef($num_leaf, 10);
threshold = ifdef ($num_samples, 3000);
imp = ifdef($impurity, "Gini");
fmtO = ifdef($fmt, "text");
X = read($X);
Y_bin = read($Y);
num_records = nrow (X);
num_classes = ncol (Y_bin);
# check if there is only one class label
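# (since Y is dummy coded, colSums (Y_bin) counts the samples per class;
# a column summing to num_records is a class that contains every sample.
# Exactly one such column means a single class label; more than one means
# several all-ones columns, i.e. an invalid dummy coding.)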
Y_bin_sum = sum (colSums (Y_bin) == num_records);
if (Y_bin_sum == 1) {
stop ("Y contains only one class label. No model will be learned!");
} else if (Y_bin_sum > 1) {
stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
}
# split data into X_scale and X_cat
if (fileR != " ") {
R = read (fileR);
R = order (target = R, by = 2); # sort by start indices
dummy_coded = (R[,2] != R[,3]);
R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
if (fileS_map != " ") {
scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
write (scale_feature_mapping, fileS_map, format = fmtO);
}
if (fileC_map != " ") {
cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
write (cat_feature_mapping, fileC_map, format = fmtO);
}
sum_dummy = sum (dummy_coded);
if (sum_dummy == nrow (R)) { # all features categorical
print ("All features categorical");
num_cat_features = nrow (R_cat);
num_scale_features = 0;
X_cat = X;
distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
distinct_values_max = max (distinct_values);
distinct_values_offset = cumsum (t (distinct_values));
distinct_values_overall = sum (distinct_values);
} else if (sum_dummy == 0) { # all features scale
print ("All features scale");
num_scale_features = ncol (X);
num_cat_features = 0;
X_scale = X;
distinct_values_max = 1;
} else { # some features scale some features categorical
num_cat_features = nrow (R_cat);
num_scale_features = nrow (R_scale);
distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
distinct_values_max = max (distinct_values);
distinct_values_offset = cumsum (t (distinct_values));
distinct_values_overall = sum (distinct_values);
W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
W = matrix (W, rows = 2 * num_cat_features, cols = 1);
if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
W[2 * num_cat_features,] = 0;
}
last = (R_cat[,2] != ncol (X));
R_cat1 = (R_cat[,2] + 1) * last;
R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1;
R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);
col_tab = table (R_cat_vec, 1, W, ncol (X), 1);
col_ind = cumsum (col_tab);
col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
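# col_ind is a 0/1 indicator over the columns of X (1 = categorical),
# derived from the +1/-1 markers in col_tab via cumsum. table () below
# builds an ncol(X) x k selection matrix with a single 1 per column, so
# each matrix product gathers the categorical (resp. scale) columns of X
# into X_cat (resp. X_scale) in one shot.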
X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
}
} else { # only scale features exist
print ("All features scale");
num_scale_features = ncol (X);
num_cat_features = 0;
X_scale = X;
distinct_values_max = 1;
}
if (num_scale_features > 0) {
print ("COMPUTING BINNING...");
bin_size = max (as.integer (num_records / num_bins), 1);
count_thresholds = matrix (0, rows = 1, cols = num_scale_features);
thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features);
parfor(i1 in 1:num_scale_features) {
col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
[col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
count_thresholds[,i1] = num_bins_defined;
thresholds[,i1] = col_bins;
}
print ("PREPROCESSING SCALE FEATURE MATRIX...");
min_num_bins = min (count_thresholds);
max_num_bins = max (count_thresholds);
total_num_bins = sum (count_thresholds);
cum_count_thresholds = t (cumsum (t (count_thresholds)));
X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
parfor (i2 in 1:num_scale_features, check = 0) {
Xi2 = X_scale[,i2];
count_threshold = as.scalar (count_thresholds[,i2]);
offset_feature = 1;
if (i2 > 1) {
offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
}
ti2 = t(thresholds[1:count_threshold, i2]);
X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
}
}
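# X_scale_ext binarizes every scale feature against its candidate thresholds:
# column k of the block for feature i2 holds the indicator (X_scale[,i2] < thresholds[k,i2]).
# For illustration, with thresholds (2, 5) the value 3 becomes the row (0, 1).
# A candidate left branch "x < t_k" is thus a single column of X_scale_ext,
# which lets one matrix product later score all thresholds of all features at once.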
##### INITIALIZATION
L = matrix (1, rows = num_records, cols = 1); # last visited node id for each training sample
# ---- model ----
# leaf nodes
# NC_large[,1]: node id
# NC_large[,2]: class label
# NC_large[,3]: no. of misclassified samples
# NC_large[,4]: 1 if special leaf (impure and # samples at that leaf > threshold) or 0 otherwise
NC_large = matrix (0, rows = 4, cols = 1);
NC_small = matrix (0, rows = 4, cols = 1); # same schema as NC_large
# internal nodes
# Q_large[,1]: node id
Q_large = matrix (1, rows = 1, cols = 1);
Q_small = matrix (0, rows = 1, cols = 1); # same schema as Q_large
# F_large[,1]: feature
# F_large[,2]: type
# F_large[,3]: offset
F_large = matrix (0, rows = 3, cols = 1);
F_small = matrix (0, rows = 3, cols = 1); # same schema as F_large
S_large = matrix (0, rows = 1, cols = 1); # split points for LARGE internal nodes
S_small = matrix (0, rows = 1, cols = 1); # split points for SMALL internal nodes
# temp matrix for tree traversal
# first row: ids of SMALL internal nodes
# second row: no. of samples for these nodes
cur_nodes_large = matrix (1, rows = 1, cols = 1);
num_cur_nodes_large = 1;
num_cur_nodes_small = 0;
level = 0;
while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) {
level = level + 1;
print (" --- start level " + level + " --- ");
##### ALLOCATE PER-LEVEL BUFFERS FOR LARGE NODES
if (num_cur_nodes_large > 0) { # LARGE nodes to process
cur_Q_large = matrix (0, rows = 1, cols = 2 * num_cur_nodes_large);
cur_NC_large = matrix (0, rows = 4, cols = 2 * num_cur_nodes_large);
cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large);
cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max);
cur_nodes_small = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large);
}
##### LOOP OVER LARGE NODES...
parfor (i6 in 1:num_cur_nodes_large, check = 0) {
cur_node = as.scalar (cur_nodes_large[,i6]);
# --- find best split ---
# samples that reach cur_node
Ix = (L[,1] == cur_node);
cur_Y_bin = Y_bin * Ix;
label_counts_overall = colSums (cur_Y_bin);
label_sum_overall = sum (label_counts_overall);
label_dist_overall = label_counts_overall / label_sum_overall;
if (imp == "entropy") {
label_dist_zero = (label_dist_overall == 0);
cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # impurity before the split (divide by log (2) for bits)
} else { # imp == "Gini"
cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before
}
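# For the class distribution p at a node:
# Gini (p) = sum_i p_i * (1 - p_i) = 1 - sum_i p_i^2
# entropy (p) = - sum_i p_i * log (p_i) (natural log; dividing by log (2),
# as in the commented-out code, would express it in bits)
# Adding label_dist_zero only guards log (0) and does not change the sum,
# since the corresponding p_i is 0.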
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0) {
# main operation
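# t (cur_Y_bin) %*% X_scale_ext has one row per class and one column per
# candidate threshold: entry (c,k) counts the class-c samples reaching
# cur_node that would go left under threshold k. One matrix product thus
# evaluates every candidate split of every scale feature simultaneously.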
label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext);
# compute left and right label distribution
label_sum_left = rowSums (label_counts_left_scale);
label_dist_left = label_counts_left_scale / label_sum_left;
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log (2)
impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
label_sum_right = rowSums (label_counts_right_scale);
label_dist_right = label_counts_right_scale / label_sum_right;
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale);
I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
# determine best feature to split on and the split value
best_scale_gain = max (I_gain_scale);
max_I_gain_ind_scale = as.scalar (rowIndexMax (t (I_gain_scale)));
p = (cum_count_thresholds < max_I_gain_ind_scale);
sum_cum_count_thresholds = sum (p);
best_scale_feature = sum_cum_count_thresholds + 1;
best_scale_split = max_I_gain_ind_scale;
if (best_scale_feature != 1) {
best_scale_split = best_scale_split - as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
}
}
if (num_cat_features > 0){
# initialization
split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
split_values = split_values_bin;
split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
I_gains = split_values_offset;
impurities_left = split_values_offset;
impurities_right = split_values_offset;
best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
# main operation
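# Categorical value subsets are searched greedily: the values of each
# feature are sorted by the impurity of their label distribution, and only
# prefixes of that order are considered as left subsets. cumsum over the
# sorted label counts then yields the left label counts of all prefixes in
# one pass; the full prefix (all values going left) is invalidated later.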
label_counts = t (t (cur_Y_bin) %*% X_cat);
parfor (i9 in 1:num_cat_features, check = 0){
start_ind = 1;
if (i9 > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(i9 - 1),]);
}
offset = as.scalar (distinct_values[1,i9]);
cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
label_sum = rowSums (cur_label_counts);
label_dist = cur_label_counts / label_sum;
if (imp == "entropy") {
label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
log_label_dist = log (label_dist); # / log(2)
impurity = - rowSums (label_dist * log_label_dist);
impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
} else { # imp == "Gini"
impurity = rowSums (label_dist * (1 - label_dist));
}
# sort cur feature by impurity
cur_distinct_values = seq (1, nrow (cur_label_counts));
cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
label_counts_sorted = P %*% cur_label_counts;
# compute left and right label distribution
label_counts_left = cumsum (label_counts_sorted);
label_sum_left = rowSums (label_counts_left);
label_dist_left = label_counts_left / label_sum_left;
label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log(2)
impurity_left = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right = - label_counts_left + label_counts_overall;
label_sum_right = rowSums (label_counts_right);
label_dist_right = label_counts_right / label_sum_right;
label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
Ix_label_sum_left_zero = (label_sum_left == 0);
Ix_label_sum_right_zero = (label_sum_right == 0);
Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
I_gain = I_gain * (1 - Ix_label_sum_zero);
I_gain[nrow (I_gain),] = 0; # last entry invalid
max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
for (i10 in 1:max_I_gain_ind) {
ind = as.scalar (cur_feature_sorted[i10,1]);
if (ind == 1) {
split_values_bin[1,start_ind] = 1.0;
} else {
split_values_bin[1,(start_ind + ind - 1)] = 1.0;
}
}
split_values_offset[1,i9] = max_I_gain_ind;
I_gains[1,i9] = max (I_gain);
impurities_left[1,i9] = as.scalar (impurity_left[max_I_gain_ind,]);
impurities_right[1,i9] = as.scalar (impurity_right[max_I_gain_ind,]);
best_label_counts_left[i9,] = label_counts_left[max_I_gain_ind,];
best_label_counts_right[i9,] = label_counts_right[max_I_gain_ind,];
}
# determine best feature to split on and the split values
best_cat_feature = as.scalar (rowIndexMax (I_gains));
best_cat_gain = max (I_gains);
start_ind = 1;
if (best_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,best_cat_feature]);
best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
}
# compare best scale feature to best cat. feature and pick the best one
if (num_scale_features > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
# --- update model ---
cur_F_large[1,i6] = best_scale_feature;
cur_F_large[2,i6] = 1;
cur_F_large[3,i6] = 1;
cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[best_scale_split, best_scale_feature];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
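# node ids follow a 1-based complete binary tree: the expressions above
# simplify to left = 2 * cur_node and right = 2 * cur_node + 1.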
# samples going to the left subtree
Ix_left = X_scale_ext[, max_I_gain_ind_scale];
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L[,1] = L[,1] * (1 - Ix_left) + (Ix_left * left_child);
L[,1] = L[,1] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left);
right_child_size = sum (Ix_right);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurity_left_scale[max_I_gain_ind_scale,]);
cur_impurity_right = as.scalar(impurity_right_scale[max_I_gain_ind_scale,]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
(left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
(left_child_size <= threshold & (level == depth))) {
cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
(right_child_size <= threshold & (level == depth))) {
cur_label_counts_right = label_counts_right_scale[max_I_gain_ind_scale,];
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
}
} else if (num_cat_features > 0 & best_cat_gain > 0) {
# --- update model ---
cur_F_large[1,i6] = best_cat_feature;
offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
S_start_ind = (i6 - 1) * distinct_values_max + 1;
cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
cur_F_large[3,i6] = offset_nonzero;
cur_F_large[2,i6] = 2;
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
Ix_left = (Ix_left >= 1);
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L[,1] = L[,1] * (1 - Ix_left) + (Ix_left * left_child);
L[,1] = L[,1] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left);
right_child_size = sum (Ix_right);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
(left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
(left_child_size <= threshold & (level == depth))) {
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
(right_child_size <= threshold & (level == depth))) {
cur_label_counts_right = best_label_counts_right[best_cat_feature,];
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
}
} else {
print ("NUMBER OF SAMPLES AT NODE " + cur_node + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
right_pure = TRUE;
left_pure = TRUE;
cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node;
class_label = as.scalar (rowIndexMax (label_counts_overall));
cur_NC_large[2,(2 * (i6 - 1) + 1)] = class_label;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
cur_NC_large[4,(2 * (i6 - 1) + 1)] = 1; # special leaf
}
# add nodes to Q
if (!left_pure) {
if (left_child_size > threshold) {
cur_Q_large[1,(2 * (i6 - 1)+ 1)] = left_child;
} else {
# first row: ids of SMALL internal nodes, second row: no. of samples for these nodes
cur_nodes_small[1,(2 * (i6 - 1)+ 1)] = left_child;
cur_nodes_small[2,(2 * (i6 - 1)+ 1)] = left_child_size;
}
}
if (!right_pure) {
if (right_child_size > threshold) {
cur_Q_large[1,(2 * i6)] = right_child;
} else{
# first row: ids of SMALL internal nodes, second row: no. of samples for these nodes
cur_nodes_small[1,(2 * i6)] = right_child;
cur_nodes_small[2,(2 * i6)] = right_child_size;
}
}
}
##### PREPARE MODEL FOR LARGE NODES
if (num_cur_nodes_large > 0) {
cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols");
if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = cbind (Q_large, cur_Q_large);
cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols");
if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = cbind (NC_large, cur_NC_large);
cur_F_large = removeEmpty (target = cur_F_large, margin = "cols");
if (as.scalar (cur_F_large[1,1]) != 0) F_large = cbind (F_large, cur_F_large);
cur_S_large = removeEmpty (target = cur_S_large, margin = "cols");
if (as.scalar (cur_S_large[1,1]) != 0) S_large = cbind (S_large, cur_S_large);
num_cur_nodes_large_pre = 2 * num_cur_nodes_large;
if (as.scalar (cur_Q_large[1,1]) == 0) {
num_cur_nodes_large = 0;
} else {
cur_nodes_large = cur_Q_large;
num_cur_nodes_large = ncol (cur_Q_large);
}
}
##### ALLOCATE BUFFERS FOR SMALL NODES
cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin = "cols");
if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist
num_cur_nodes_small = ncol (cur_nodes_small_nonzero);
}
if (num_cur_nodes_small > 0) { # SMALL nodes to process
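# reserve_len conservatively upper-bounds the number of nodes the
# in-memory subtrees can produce, using 2^ceil(log2(#samples)) columns
# per SMALL node plus one for the node itself; the buffers are allocated
# once at this size and compacted with removeEmpty afterwards.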
reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))) + num_cur_nodes_small;
cur_Q_small = matrix (0, rows = 1, cols = reserve_len);
cur_NC_small = matrix (0, rows = 4, cols = reserve_len);
cur_F_small = matrix (0, rows = 3, cols = reserve_len);
cur_S_small = matrix (0, rows = 1, cols = reserve_len * distinct_values_max); # split values of the best feature
}
##### LOOP OVER SMALL NODES...
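# each SMALL node holds at most num_samples samples, so its entire
# subtree is grown in memory: the rows reaching the node are compacted
# out of the full matrices once, and the same split search as for LARGE
# nodes is replayed level by level on the small extracts.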
parfor (i7 in 1:num_cur_nodes_small, check = 0) {
cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]);
# build dataset for SMALL node
Ix = (L[,1] == cur_node_small);
if (num_scale_features > 0) {
X_scale_ext_small = removeEmpty (target = X_scale_ext, margin = "rows", select = Ix);
}
if (num_cat_features > 0) {
X_cat_small = removeEmpty (target = X_cat, margin = "rows", select = Ix);
}
L_small = removeEmpty (target = L * Ix, margin = "rows");
Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows");
# compute offset
start_ind_global = 1;
start_ind_S_global = 1;
if (i7 > 1) {
offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log (2))));
start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]);
start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 - 1),]) * distinct_values_max);
}
Q = matrix (cur_node_small, rows = 1, cols = 1);
NC = matrix (0, rows = 4, cols = 1);
F = matrix (0, rows = 3, cols = 1);
S = matrix (0, rows = 1, cols = 1);
cur_nodes_ = matrix (cur_node_small, rows = 1, cols = ncol(cur_Q_small));
num_cur_nodes = 1;
level_ = level;
while (num_cur_nodes > 0 & level_ < depth) {
level_ = level_ + 1;
cur_Q = matrix (0, rows = 1, cols = 2 * num_cur_nodes);
cur_NC = matrix (0, rows = 4, cols = 2 * num_cur_nodes);
cur_F = matrix (0, rows = 3, cols = num_cur_nodes);
cur_S = matrix (0, rows = 1, cols = num_cur_nodes * distinct_values_max);
parfor (i8 in 1:num_cur_nodes, check = 0) {
cur_node = as.scalar (cur_nodes_[,i8]);
# --- find best split ---
# samples that reach cur_node
Ix = (L_small[,1] == cur_node);
cur_Y_bin = Y_bin_small * Ix;
label_counts_overall = colSums (cur_Y_bin);
label_sum_overall = sum (label_counts_overall);
label_dist_overall = label_counts_overall / label_sum_overall;
if (imp == "entropy") {
label_dist_zero = (label_dist_overall == 0);
cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero));
} else { # imp == "Gini"
cur_impurity = sum (label_dist_overall * (1 - label_dist_overall));
}
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0) {
# main operation
label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small);
# compute left and right label distribution
label_sum_left = rowSums (label_counts_left_scale);
label_dist_left = label_counts_left_scale / label_sum_left;
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log (2)
impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
label_sum_right = rowSums (label_counts_right_scale);
label_dist_right = label_counts_right_scale / label_sum_right;
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale);
I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
# determine best feature to split on and the split value
best_scale_gain = max (I_gain_scale);
max_I_gain_ind_scale = as.scalar (rowIndexMax (t (I_gain_scale)));
p = (cum_count_thresholds < max_I_gain_ind_scale);
sum_cum_count_thresholds = sum (p);
best_scale_feature = sum_cum_count_thresholds + 1;
best_scale_split = max_I_gain_ind_scale;
if (best_scale_feature != 1) {
best_scale_split = best_scale_split - as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
}
}
if (num_cat_features > 0){
# initialization
split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
split_values = split_values_bin;
split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
I_gains = split_values_offset;
impurities_left = split_values_offset;
impurities_right = split_values_offset;
best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
# main operation
label_counts = t (t (cur_Y_bin) %*% X_cat_small);
parfor (i9 in 1:num_cat_features, check = 0){
start_ind = 1;
if (i9 > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(i9 - 1),]);
}
offset = as.scalar (distinct_values[1,i9]);
cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
label_sum = rowSums (cur_label_counts);
label_dist = cur_label_counts / label_sum;
if (imp == "entropy") {
label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
log_label_dist = log (label_dist); # / log(2)
impurity = - rowSums (label_dist * log_label_dist);
impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
} else { # imp == "Gini"
impurity = rowSums (label_dist * (1 - label_dist));
}
# sort cur feature by impurity
cur_distinct_values = seq (1, nrow (cur_label_counts));
cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
label_counts_sorted = P %*% cur_label_counts;
# compute left and right label distribution
label_counts_left = cumsum (label_counts_sorted);
label_sum_left = rowSums (label_counts_left);
label_dist_left = label_counts_left / label_sum_left;
label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log(2)
impurity_left = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right = - label_counts_left + label_counts_overall;
label_sum_right = rowSums (label_counts_right);
label_dist_right = label_counts_right / label_sum_right;
label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
Ix_label_sum_left_zero = (label_sum_left == 0);
Ix_label_sum_right_zero = (label_sum_right == 0);
Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
I_gain = I_gain * (1 - Ix_label_sum_zero);
I_gain[nrow (I_gain),] = 0; # last entry invalid
max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
for (i10 in 1:max_I_gain_ind) {
ind = as.scalar (cur_feature_sorted[i10,1]);
if (ind == 1) {
split_values_bin[1,start_ind] = 1.0;
} else {
split_values_bin[1,(start_ind + ind - 1)] = 1.0;
}
}
split_values_offset[1,i9] = max_I_gain_ind;
I_gains[1,i9] = max (I_gain);
impurities_left[1,i9] = as.scalar (impurity_left[max_I_gain_ind,]);
impurities_right[1,i9] = as.scalar (impurity_right[max_I_gain_ind,]);
best_label_counts_left[i9,] = label_counts_left[max_I_gain_ind,];
best_label_counts_right[i9,] = label_counts_right[max_I_gain_ind,];
}
# determine best feature to split on and the split values
best_cat_feature = as.scalar (rowIndexMax (I_gains));
best_cat_gain = max (I_gains);
start_ind = 1;
if (best_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,best_cat_feature]);
best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
}
# compare best scale feature to best cat. feature and pick the best one
if (num_scale_features > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
# --- update model ---
cur_F[1,i8] = best_scale_feature;
cur_F[2,i8] = 1;
cur_F[3,i8] = 1;
cur_S[1,(i8 - 1) * distinct_values_max + 1] = thresholds[best_scale_split, best_scale_feature];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = X_scale_ext_small[, max_I_gain_ind_scale];
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L_small[,1] = L_small[,1] * (1 - Ix_left) + (Ix_left * left_child);
L_small[,1] = L_small[,1] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left);
right_child_size = sum (Ix_right);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurity_left_scale[max_I_gain_ind_scale,]);
cur_impurity_right = as.scalar(impurity_right_scale[max_I_gain_ind_scale,]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) {
cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) {
cur_label_counts_right = label_counts_right_scale[max_I_gain_ind_scale,];
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
}
} else if (num_cat_features > 0 & best_cat_gain > 0) {
# --- update model ---
cur_F[1,i8] = best_cat_feature;
offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
S_start_ind = (i8 - 1) * distinct_values_max + 1;
cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
cur_F[3,i8] = offset_nonzero;
cur_F[2,i8] = 2;
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
Ix_left = (Ix_left >= 1);
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L_small[,1] = L_small[,1] * (1 - Ix_left) + (Ix_left * left_child);
L_small[,1] = L_small[,1] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left);
right_child_size = sum (Ix_right);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) {
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) {
cur_label_counts_right = best_label_counts_right[best_cat_feature,];
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples
}
} else {
print ("NUMBER OF SAMPLES AT NODE " + cur_node + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
right_pure = TRUE;
left_pure = TRUE;
cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node;
class_label = as.scalar (rowIndexMax (label_counts_overall));
cur_NC[2,(2 * (i8 - 1) + 1)] = class_label;
cur_NC[3,(2 * (i8 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
cur_NC[4,(2 * (i8 - 1) + 1)] = 1; # special leaf
}
# add nodes to Q
if (!left_pure) {
cur_Q[1,(2 * (i8 - 1)+ 1)] = left_child;
}
if (!right_pure) {
cur_Q[1,(2 * i8)] = right_child;
}
}
cur_Q = removeEmpty (target = cur_Q, margin = "cols");
Q = cbind (Q, cur_Q);
NC = cbind (NC, cur_NC);
F = cbind (F, cur_F);
S = cbind (S, cur_S);
num_cur_nodes_pre = 2 * num_cur_nodes;
if (as.scalar (cur_Q[1,1]) == 0) {
num_cur_nodes = 0;
} else {
cur_nodes_[,1:ncol(cur_Q)] = cur_Q;
num_cur_nodes = ncol (cur_Q);
}
}
cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q;
cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC;
cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F;
cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S;
}
##### PREPARE MODEL FOR SMALL NODES
if (num_cur_nodes_small > 0) { # small nodes already processed
cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols");
if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = cbind (Q_small, cur_Q_small);
cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols");
if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = cbind (NC_small, cur_NC_small);
cur_F_small = removeEmpty (target = cur_F_small, margin = "cols");
if (as.scalar (cur_F_small[1,1]) != 0) F_small = cbind (F_small, cur_F_small);
cur_S_small = removeEmpty (target = cur_S_small, margin = "cols");
if (as.scalar (cur_S_small[1,1]) != 0) S_small = cbind (S_small, cur_S_small);
num_cur_nodes_small = 0; # reset
}
print (" --- end level " + level + ", remaining no. of LARGE nodes to expand " + num_cur_nodes_large + " --- ");
}
num_misclassified = sum (NC_large[3,]) + sum (NC_small[3,]);
accuracy = (1 - (num_misclassified / num_records)) * 100;
str_o = "no. of misclassified " + num_misclassified + " training accuracy " + accuracy + " %";
if (fileO != " ") {
ACCURACY = matrix (accuracy, rows = 1, cols = 1);
write (ACCURACY, fileO, format = fmtO)
} else {
print (str_o);
}
#### prepare model
print ("PREPARING MODEL...")
# large nodes
if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1) {
Q_large = Q_large[,2:ncol (Q_large)];
}
if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1) {
NC_large = NC_large[,2:ncol (NC_large)];
}
if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1) {
S_large = S_large[,2:ncol (S_large)];
}
if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1) {
F_large = F_large[,2:ncol (F_large)];
}
# small nodes
if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1) {
Q_small = Q_small[,2:ncol (Q_small)];
}
if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1) {
NC_small = NC_small[,2:ncol (NC_small)];
}
if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1) {
S_small = S_small[,2:ncol (S_small)];
}
if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1) {
F_small = F_small[,2:ncol (F_small)];
}
# check for special leaves and if there are any remove them from Q_large and Q_small
special_large_leaves_ind = NC_large[4,];
if (sum (special_large_leaves_ind) > 0) {
print ("PROCESSING SPECIAL LARGE LEAVES...");
special_large_leaves = removeEmpty (target = NC_large[1,] * special_large_leaves_ind, margin = "cols");
large_internal_ind = 1 - colSums (outer (t (special_large_leaves), Q_large[1,], "=="));
Q_large = removeEmpty (target = Q_large * large_internal_ind, margin = "cols");
F_large = removeEmpty (target = F_large, margin = "cols"); # remove special leaves from F
}
special_small_leaves_ind = NC_small[4,];
if (sum (special_small_leaves_ind) > 0) {
print ("PROCESSING SPECIAL SMALL LEAVES...");
special_small_leaves = removeEmpty (target = NC_small[1,] * special_small_leaves_ind, margin = "cols");
small_internal_ind = 1 - colSums (outer (t (special_small_leaves), Q_small[1,], "=="));
Q_small = removeEmpty (target = Q_small * small_internal_ind, margin = "cols");
F_small = removeEmpty (target = F_small, margin = "cols"); # remove special leaves from F
}
# model corresponding to large internal nodes
max_offset = 1;
no_large_internal_node = FALSE;
if (as.scalar (Q_large[1,1]) != 0) {
print ("PROCESSING LARGE INTERNAL NODES...");
num_large_internal = ncol (Q_large);
max_offset = max (max (F_large[3,]), max (F_small[3,]));
M1_large = matrix (0, rows = 5 + max_offset, cols = num_large_internal);
M1_large[1,] = Q_large;
M1_large[3:5,] = F_large;
# process S_large
cum_offsets_large = cumsum (t (F_large[3,]));
parfor (it in 1:num_large_internal, check = 0) {
start_ind = 1;
if (it > 1) {
start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]);
}
offset = as.scalar (F_large[3,it]);
M1_large[6:(6 + offset - 1),it] = t (S_large[1,start_ind:(start_ind + offset - 1)]);
}
} else {
print ("No LARGE internal nodes available");
no_large_internal_node = TRUE;
}
# model corresponding to small internal nodes
no_small_internal_node = FALSE;
if (as.scalar (Q_small[1,1]) != 0) {
print ("PROCESSING SMALL INTERNAL NODES...");
num_small_internal = ncol (Q_small);
M1_small = matrix (0, rows = 5 + max_offset, cols = num_small_internal);
M1_small[1,] = Q_small;
M1_small[3:5,] = F_small;
# process S_small
cum_offsets_small = cumsum (t (F_small[3,]));
parfor (it in 1:num_small_internal, check = 0) {
start_ind = 1;
if (it > 1) {
start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]);
}
offset = as.scalar (F_small[3,it]);
M1_small[6:(6 + offset - 1),it] = t (S_small[1,start_ind:(start_ind + offset - 1)]);
}
} else {
print ("No SMALL internal nodes available");
no_small_internal_node = TRUE;
}
# model corresponding to large leaf nodes
no_large_leaf_node = FALSE;
if (as.scalar (NC_large[1,1]) != 0) {
print ("PROCESSING LARGE LEAF NODES...");
num_large_leaf = ncol (NC_large);
M2_large = matrix (0, rows = 5 + max_offset, cols = num_large_leaf);
M2_large[1,] = NC_large[1,];
M2_large[4:6,] = NC_large[2:4,];
} else {
print ("No LARGE leaf nodes available");
no_large_leaf_node = TRUE;
}
# model corresponding to small leaf nodes
no_small_leaf_node = FALSE;
if (as.scalar (NC_small[1,1]) != 0) {
print ("PROCESSING SMALL LEAF NODES...");
num_small_leaf = ncol (NC_small);
M2_small = matrix (0, rows = 5 + max_offset, cols = num_small_leaf);
M2_small[1,] = NC_small[1,];
M2_small[4:6,] = NC_small[2:4,];
} else {
print ("No SMALL leaf nodes available");
no_small_leaf_node = TRUE;
}
if (no_large_internal_node) {
M1 = M1_small;
} else if (no_small_internal_node) {
M1 = M1_large;
} else {
M1 = cbind (M1_large, M1_small);
}
if (no_large_leaf_node) {
M2 = M2_small;
} else if (no_small_leaf_node) {
M2 = M2_large;
} else {
M2 = cbind (M2_large, M2_small);
}
M = cbind (M1, M2);
M = t (order (target = t (M), by = 1));
# removing redundant subtrees
if (ncol (M) > 1) {
print ("CHECKING FOR REDUNDANT SUBTREES...");
red_leaf = TRUE;
process_red_subtree = FALSE;
invalid_node_ind = matrix (0, rows = 1, cols = ncol (M));
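# two sibling leaves predicting the same label make their parent's split
# useless: the parent is turned into a leaf with that label (merging the
# misclassified counts of both children) and the two child leaves are
# marked invalid; the loop repeats until no such pair remains.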
while (red_leaf & ncol (M) > 1) {
leaf_ind = (M[3,] == 0);
labels = M[4,] * leaf_ind;
parent_ids = floor (M[1,] / 2);
cond1 = (labels[,1:(ncol (M) - 1)] == labels[,2:ncol (M)]); # sibling leaves with the same label
cond2 = (parent_ids[,1:(ncol (M) - 1)] == parent_ids[,2:ncol (M)]); # same parents
red_leaf_ind = cond1 * cond2 * leaf_ind[,2:ncol (M)];
if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist
red_leaf_ids = red_leaf_ind * M[1,2:ncol (M)];
red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin = "cols");
parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){
cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]);
cur_parent_id = floor (cur_right_leaf_id / 2);
cur_right_leaf_pos = as.scalar (rowIndexMax ((M[1,] == cur_right_leaf_id)));
cur_parent_pos = as.scalar(rowIndexMax (M[1,] == cur_parent_id));
M[2:nrow (M), cur_parent_pos] = M[2:nrow (M), cur_right_leaf_pos];
M[3,cur_right_leaf_pos] = -1;
M[3,cur_right_leaf_pos - 1] = -1;
M[5,cur_parent_pos] = M[5,cur_parent_pos] + M[5,cur_right_leaf_pos - 1];
invalid_node_ind[1,cur_right_leaf_pos] = 1;
invalid_node_ind[1,cur_right_leaf_pos - 1] = 1;
}
process_red_subtree = TRUE;
} else {
red_leaf = FALSE;
}
}
if (process_red_subtree) {
print ("REMOVING REDUNDANT SUBTREES...");
valid_node_ind = (invalid_node_ind == 0);
M = removeEmpty (target = M * valid_node_ind, margin = "cols");
}
}
if (ncol (M) > 1) { # if internal nodes exist
print ("COMPUTING OFFSETS TO THE LEFT CHILD FOR INTERNAL NODES...");
internal_ind = (M[3,] > 0);
internal_ids = internal_ind * M[1,];
internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols");
a1 = internal_ids_nonzero;
a2 = internal_ids_nonzero * 2;
pos_a1 = rowIndexMax( outer(t(a1), M[1,], "==") );
pos_a2 = rowIndexMax( outer(t(a2), M[1,], "==") );
M[2,] = t(table(pos_a1, 1, pos_a2 - pos_a1, ncol(M), 1));
} else {
print ("The tree model contains only one leaf with label " + as.scalar (M[4,]));
}
write (M, fileM, format = fmtO);