#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded
# Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded
# R String " " Location to read the matrix R which for each feature in X contains the following information
# - R[,1]: column ids
# - R[,2]: start indices
# - R[,3]: end indices
# If R is not provided, by default all variables are assumed to be scale
# bins Int 20 Number of equi-height bins per scale feature used to choose split thresholds
# depth Int 25 Maximum depth of the learned tree
# num_leaf Int 10 Number of samples at or below which splitting stops and a leaf node is added
# num_samples Int 3000 Number of samples at a node at or below which we switch to in-memory subtree building
# num_trees Int 1 Number of trees to be learned in the random forest model
# subsamp_rate Double 1.0 Parameter controlling the size of each tree in the forest; samples are selected from a
# Poisson distribution with parameter subsamp_rate (the default value is 1.0)
# feature_subset Double 0.5 Parameter that controls the number of features used as candidates for splitting at each tree node,
# given as a power of the number of features in the dataset;
# by default the square root of the number of features (i.e., feature_subset = 0.5) is used at each tree node
# impurity String "Gini" Impurity measure: entropy or Gini (the default)
# M String --- Location to write matrix M containing the learned trees
# C String " " Location to write matrix C containing the number of times samples are chosen in each tree of the random forest
# S_map String " " Location to write the mappings from scale feature ids to global feature ids
# C_map String " " Location to write the mappings from categorical feature ids to global feature ids
# fmt String "text" The output format of the model (matrix M), such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# Matrix M where each column corresponds to a node in one of the learned trees and each row contains the following information:
# M[1,j]: id of node j (in a complete binary tree)
# M[2,j]: tree id to which node j belongs
# M[3,j]: Offset (no. of columns) to left child of j
# M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
# M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
# otherwise the label that leaf node j is supposed to predict
# M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of the subset of values
# stored in rows 7,8,... if j is categorical
# M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is compared to is stored at M[7,j] if the feature chosen for j is scale;
# If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j
# -------------------------------------------------------------------------------------------
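# Illustrative (hypothetical) example: a forest consisting of a single tree that splits the root
# on scale feature 3 at threshold 0.5, with leaves predicting labels 2 and 1, would yield three
# columns along the following lines (leaf columns carry auxiliary bookkeeping in rows 6 and 7):
#                root   left leaf   right leaf
#   M[1,]:        1         2           3       (node ids)
#   M[2,]:        1         1           1       (tree id)
#   M[3,]:        1         0           0       (offset to left child)
#   M[4,]:        3         0           0       (feature index; 0 for leaves)
#   M[5,]:        1         2           1       (feature type at the root; predicted labels at the leaves)
#   M[6,]:        1         .           .       (1 since feature 3 is scale)
#   M[7,]:       0.5        .           .       (split threshold)
# -------------------------------------------------------------------------------------------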
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f random-forest.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model
# bins=20 depth=25 num_leaf=10 num_samples=3000 num_trees=10 impurity=Gini fmt=csv
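# A minimal follow-up sketch for inspecting the learned model from DML (assumes the model was
# written with fmt=csv to OUTPUT_DIR/model; the path and format below are illustrative):
#   M = read ("OUTPUT_DIR/model", format = "csv");
#   num_nodes  = ncol (M);
#   num_leaves = sum (M[4,] == 0);
#   print ("nodes: " + num_nodes + ", leaves: " + num_leaves);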
# External function for binning
binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef)
implemented in (classname="org.apache.sysml.udf.lib.BinningWrapper",exectype="mem")
# Default values of some parameters
fileR = ifdef ($R, " ");
fileC = ifdef ($C, " ");
fileS_map = ifdef ($S_map, " ");
fileC_map = ifdef ($C_map, " ");
fileM = $M;
num_bins = ifdef($bins, 20);
depth = ifdef($depth, 25);
num_leaf = ifdef($num_leaf, 10);
num_trees = ifdef($num_trees, 1);
threshold = ifdef ($num_samples, 3000);
imp = ifdef($impurity, "Gini");
rate = ifdef ($subsamp_rate, 1);
fpow = ifdef ($feature_subset, 0.5);
fmtO = ifdef($fmt, "text");
X = read($X);
Y_bin = read($Y);
num_records = nrow (X);
num_classes = ncol (Y_bin);
# check if there is only one class label
Y_bin_sum = sum (colSums (Y_bin) == num_records);
if (Y_bin_sum == 1) {
stop ("Y contains only one class label. No model will be learned!");
} else if (Y_bin_sum > 1) {
stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
}
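# Since Y is dummy coded, each row of Y_bin has exactly one 1 and colSums (Y_bin) holds the
# per-class sample counts; a column summing to num_records therefore means all samples share
# that class. For example, with num_records = 4 and Y_bin = [1 0; 1 0; 1 0; 1 0] we get
# colSums = [4 0], hence Y_bin_sum = 1 and the first stop above fires.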
# split data into X_scale and X_cat
if (fileR != " ") {
R = read (fileR);
R = order (target = R, by = 2); # sort by start indices
dummy_coded = (R[,2] != R[,3]);
R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
if (fileS_map != " ") {
scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
write (scale_feature_mapping, fileS_map, format = fmtO);
}
if (fileC_map != " ") {
cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
write (cat_feature_mapping, fileC_map, format = fmtO);
}
sum_dummy = sum (dummy_coded);
if (sum_dummy == nrow (R)) { # all features categorical
print ("All features categorical");
num_cat_features = nrow (R_cat);
num_scale_features = 0;
X_cat = X;
distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
distinct_values_max = max (distinct_values);
distinct_values_offset = cumsum (t (distinct_values));
distinct_values_overall = sum (distinct_values);
} else if (sum_dummy == 0) { # all features scale
print ("All features scale");
num_scale_features = ncol (X);
num_cat_features = 0;
X_scale = X;
distinct_values_max = 1;
} else { # some features scale some features categorical
num_cat_features = nrow (R_cat);
num_scale_features = nrow (R_scale);
distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
distinct_values_max = max (distinct_values);
distinct_values_offset = cumsum (t (distinct_values));
distinct_values_overall = sum (distinct_values);
W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
W = matrix (W, rows = 2 * num_cat_features, cols = 1);
if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
W[2 * num_cat_features,] = 0;
}
last = (R_cat[,2] != ncol (X));
R_cat1 = (R_cat[,2] + 1) * last;
R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1;
R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);
col_tab = table (R_cat_vec, 1, W, ncol (X), 1);
col_ind = cumsum (col_tab);
col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
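# The two table () calls above build ncol (X) x k selection matrices whose (i,j)-th entry is
# 1 iff global column i is the j-th categorical (resp. scale) column, so the two multiplies
# extract the column groups in one pass. Small sketch: with ncol (X) = 3 and
# col_ind_cat = [2; 3], table (col_ind_cat, seq (1, 2), 3, 2) = [0 0; 1 0; 0 1], and
# X %*% [0 0; 1 0; 0 1] keeps exactly columns 2 and 3 of X.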
}
} else { # only scale features exist
print ("All features scale");
num_scale_features = ncol (X);
num_cat_features = 0;
X_scale = X;
distinct_values_max = 1;
}
if (num_scale_features > 0) {
print ("COMPUTING BINNING...");
bin_size = max (as.integer (num_records / num_bins), 1);
count_thresholds = matrix (0, rows = 1, cols = num_scale_features);
thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features);
parfor(i1 in 1:num_scale_features) {
col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
[col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
count_thresholds[,i1] = num_bins_defined;
thresholds[,i1] = col_bins;
}
print ("PREPROCESSING SCALE FEATURE MATRIX...");
min_num_bins = min (count_thresholds);
max_num_bins = max (count_thresholds);
total_num_bins = sum (count_thresholds);
cum_count_thresholds = t (cumsum (t (count_thresholds)));
X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
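# X_scale_ext is a binary matrix with one block of columns per scale feature: within a
# feature's block, entry (i, b) is 1 iff sample i's value lies strictly below the feature's
# b-th bin threshold. This encoding lets the label counts of every candidate split be
# computed later with a single matrix multiply (see label_counts_left_scale below).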
parfor (i2 in 1:num_scale_features, check = 0) {
Xi2 = X_scale[,i2];
count_threshold = as.scalar (count_thresholds[,i2]);
offset_feature = 1;
if (i2 > 1) {
offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
}
ti2 = t(thresholds[1:count_threshold, i2]);
X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
}
}
num_features_total = num_scale_features + num_cat_features;
num_feature_samples = as.integer (floor (num_features_total ^ fpow));
##### INITIALIZATION
L = matrix (1, rows = num_records, cols = num_trees); # last visited node id for each training sample
# create matrix of counts (generated by Poisson distribution) storing how many times each sample appears in each tree
print ("CONPUTING COUNTS...");
C = rand (rows = num_records, cols = num_trees, pdf = "poisson", lambda = rate);
Ix_nonzero = (C != 0);
L = L * Ix_nonzero;
total_counts = sum (C);
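# Drawing the per-sample counts from a Poisson distribution approximates bootstrap sampling
# with replacement: with subsamp_rate = 1.0, a fraction of about exp (-1), i.e., roughly 37%
# of the samples receives count 0 and is left out of a given tree, matching the expected
# out-of-bag fraction of a classical bootstrap.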
# model
# LARGE leaf nodes
# NC_large[,1]: node id
# NC_large[,2]: tree id
# NC_large[,3]: class label
# NC_large[,4]: no. of misclassified samples
# NC_large[,5]: 1 if special leaf (impure and no. of samples at that leaf > threshold) or 0 otherwise
NC_large = matrix (0, rows = 5, cols = 1);
# SMALL leaf nodes
# same schema as for LARGE leaf nodes (to be initialized)
NC_small = matrix (0, rows = 5, cols = 1);
# LARGE internal nodes
# Q_large[,1]: node id
# Q_large[,2]: tree id
Q_large = matrix (0, rows = 2, cols = num_trees);
Q_large[1,] = matrix (1, rows = 1, cols = num_trees);
Q_large[2,] = t (seq (1, num_trees));
# SMALL internal nodes
# same schema as for LARGE internal nodes (to be initialized)
Q_small = matrix (0, rows = 2, cols = 1);
# F_large[,1]: feature
# F_large[,2]: type
# F_large[,3]: offset
F_large = matrix (0, rows = 3, cols = 1);
# same schema as for LARGE nodes
F_small = matrix (0, rows = 3, cols = 1);
# split points for LARGE internal nodes
S_large = matrix (0, rows = 1, cols = 1);
# split points for SMALL internal nodes
S_small = matrix (0, rows = 1, cols = 1);
# initialize queue
cur_nodes_large = matrix (1, rows = 2, cols = num_trees);
cur_nodes_large[2,] = t (seq (1, num_trees));
num_cur_nodes_large = num_trees;
num_cur_nodes_small = 0;
level = 0;
while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) {
level = level + 1;
print (" --- start level " + level + " --- ");
##### PREPARE MODEL
if (num_cur_nodes_large > 0) { # LARGE nodes to process
cur_Q_large = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large);
cur_NC_large = matrix (0, rows = 5, cols = 2 * num_cur_nodes_large);
cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large);
cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max);
cur_nodes_small = matrix (0, rows = 3, cols = 2 * num_cur_nodes_large);
}
##### LOOP OVER LARGE NODES...
parfor (i6 in 1:num_cur_nodes_large, check = 0) {
cur_node = as.scalar (cur_nodes_large[1,i6]);
cur_tree = as.scalar (cur_nodes_large[2,i6]);
# select sample features WOR
feature_samples = sample (num_features_total, num_feature_samples);
feature_samples = order (target = feature_samples, by = 1);
num_scale_feature_samples = sum (feature_samples <= num_scale_features);
num_cat_feature_samples = num_feature_samples - num_scale_feature_samples;
# --- find best split ---
# samples that reach cur_node
Ix = (L[,cur_tree] == cur_node);
cur_Y_bin = Y_bin * (Ix * C[,cur_tree]);
label_counts_overall = colSums (cur_Y_bin);
label_sum_overall = sum (label_counts_overall);
label_dist_overall = label_counts_overall / label_sum_overall;
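# Impurity of the label distribution p at the current node:
# entropy = -sum_k p_k * log (p_k) (the label_dist_zero term maps 0 * log (0) to 0),
# Gini = sum_k p_k * (1 - p_k) = 1 - sum_k p_k^2.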
if (imp == "entropy") {
label_dist_zero = (label_dist_overall == 0);
cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before
} else { # imp == "Gini"
cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before
}
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0 & num_scale_feature_samples > 0) {
scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
# main operation
label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext);
# compute left and right label distribution
label_sum_left = rowSums (label_counts_left_scale);
label_dist_left = label_counts_left_scale / label_sum_left;
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log (2)
impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
label_sum_right = rowSums (label_counts_right_scale);
label_dist_right = label_counts_right_scale / label_sum_right;
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
}
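# Information gain of a candidate split:
# gain = impurity (parent) - (n_left / n) * impurity (left) - (n_right / n) * impurity (right),
# evaluated below for all bin thresholds of all scale features at once; NaNs stemming from
# empty partitions are replaced by 0 so such splits are never selected.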
I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale);
I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
# determine best feature to split on and the split value
feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
feature_start_ind[1,1] = 1;
if (num_scale_features > 1) {
feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
}
max_I_gain_found = 0;
max_I_gain_found_ind = 0;
best_i = 0;
for (i in 1:num_scale_feature_samples) { # scale_feature_samples is num_scale_feature_samples x 1
cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]);
cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]);
I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
cur_max_I_gain = max (I_gain_portion);
cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
if (cur_max_I_gain > max_I_gain_found) {
max_I_gain_found = cur_max_I_gain;
max_I_gain_found_ind = cur_max_I_gain_ind;
best_i = i;
}
}
best_scale_gain = max_I_gain_found;
max_I_gain_ind_scale = max_I_gain_found_ind;
best_scale_feature = 0;
if (best_i > 0) {
best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
}
best_scale_split = max_I_gain_ind_scale;
if (best_scale_feature > 1) {
best_scale_split = best_scale_split + as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
}
}
if (num_cat_features > 0 & num_cat_feature_samples > 0){
cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
# initialization
split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
split_values = split_values_bin;
split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
I_gains = split_values_offset;
impurities_left = split_values_offset;
impurities_right = split_values_offset;
best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
# main operation
label_counts = t (t (cur_Y_bin) %*% X_cat);
parfor (i9 in 1:num_cat_feature_samples, check = 0){
cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
start_ind = 1;
if (cur_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,cur_cat_feature]);
cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
label_sum = rowSums (cur_label_counts);
label_dist = cur_label_counts / label_sum;
if (imp == "entropy") {
label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
log_label_dist = log (label_dist); # / log(2)
impurity = - rowSums (label_dist * log_label_dist);
impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
} else { # imp == "Gini"
impurity = rowSums (label_dist * (1 - label_dist));
}
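# Enumerating all 2^(d-1) - 1 nontrivial value subsets of a d-valued categorical feature is
# infeasible; instead the distinct values are sorted by their individual impurity and only
# prefixes of this ordering are evaluated (via cumulative sums below) as candidate left
# subsets, a greedy heuristic that needs only d - 1 candidate splits per feature.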
# sort cur feature by impurity
cur_distinct_values = seq (1, nrow (cur_label_counts));
cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
label_counts_sorted = P %*% cur_label_counts;
# compute left and right label distribution
label_counts_left = cumsum (label_counts_sorted);
label_sum_left = rowSums (label_counts_left);
label_dist_left = label_counts_left / label_sum_left;
label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log(2)
impurity_left = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right = - label_counts_left + label_counts_overall;
label_sum_right = rowSums (label_counts_right);
label_dist_right = label_counts_right / label_sum_right;
label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
Ix_label_sum_left_zero = (label_sum_left == 0);
Ix_label_sum_right_zero = (label_sum_right == 0);
Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
I_gain = I_gain * (1 - Ix_label_sum_zero);
I_gain[nrow (I_gain),] = 0; # last entry invalid
max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
for (i10 in 1:max_I_gain_ind) {
ind = as.scalar (cur_feature_sorted[i10,1]);
if (ind == 1) {
split_values_bin[1,start_ind] = 1.0;
} else {
split_values_bin[1,(start_ind + ind - 1)] = 1.0;
}
}
split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
I_gains[1,cur_cat_feature] = max (I_gain);
impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]);
impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]);
best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,];
best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,];
}
# determine best feature to split on and the split values
best_cat_feature = as.scalar (rowIndexMax (I_gains));
best_cat_gain = max (I_gains);
start_ind = 1;
if (best_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,best_cat_feature]);
best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
}
# compare best scale feature to best cat. feature and pick the best one
if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
# --- update model ---
cur_F_large[1,i6] = best_scale_feature;
cur_F_large[2,i6] = 1;
cur_F_large[3,i6] = 1;
cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature];
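# Node ids follow standard binary-heap numbering: the children of node j are 2 * j and
# 2 * j + 1 (written below as 2 * (j - 1) + 1 + 1 and 2 * (j - 1) + 2 + 1), so the parent
# of any node can later be recovered as floor (id / 2).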
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = X_scale_ext[,best_scale_split];
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C[,cur_tree]);
right_child_size = sum (Ix_right * C[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]); # max_I_gain_ind_scale
cur_impurity_right = as.scalar(impurity_right_scale[best_scale_split,]); # max_I_gain_ind_scale
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
(left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
(left_child_size <= threshold & (level == depth))) {
cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
(right_child_size <= threshold & (level == depth))) {
cur_label_counts_right = label_counts_right_scale[best_scale_split,]; # max_I_gain_ind_scale
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
}
} else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) {
# --- update model ---
cur_F_large[1,i6] = best_cat_feature;
cur_F_large[2,i6] = 2;
offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
S_start_ind = (i6 - 1) * distinct_values_max + 1;
cur_F_large[3,i6] = offset_nonzero;
cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
Ix_left = (Ix_left >= 1);
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C[,cur_tree]);
right_child_size = sum (Ix_right * C[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
(left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
(left_child_size <= threshold & (level == depth))) {
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
(right_child_size <= threshold & (level == depth))) {
cur_label_counts_right = best_label_counts_right[best_cat_feature,];
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
}
} else {
print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
right_pure = TRUE;
left_pure = TRUE;
cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
class_label = as.scalar (rowIndexMax (label_counts_overall));
cur_NC_large[3,(2 * (i6 - 1) + 1)] = class_label;
cur_NC_large[4,(2 * (i6 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
cur_NC_large[5,(2 * (i6 - 1) + 1)] = 1; # special leaf
}
# add nodes to Q
if (!left_pure) {
if (left_child_size > threshold) {
cur_Q_large[1,(2 * (i6 - 1)+ 1)] = left_child;
cur_Q_large[2,(2 * (i6 - 1)+ 1)] = cur_tree;
} else {
cur_nodes_small[1,(2 * (i6 - 1)+ 1)] = left_child;
cur_nodes_small[2,(2 * (i6 - 1)+ 1)] = left_child_size;
cur_nodes_small[3,(2 * (i6 - 1)+ 1)] = cur_tree;
}
}
if (!right_pure) {
if (right_child_size > threshold) {
cur_Q_large[1,(2 * i6)] = right_child;
cur_Q_large[2,(2 * i6)] = cur_tree;
} else{
cur_nodes_small[1,(2 * i6)] = right_child;
cur_nodes_small[2,(2 * i6)] = right_child_size;
cur_nodes_small[3,(2 * i6)] = cur_tree;
}
}
}
##### PREPARE MODEL FOR LARGE NODES
if (num_cur_nodes_large > 0) {
cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols");
if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = cbind (Q_large, cur_Q_large);
cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols");
if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = cbind (NC_large, cur_NC_large);
cur_F_large = removeEmpty (target = cur_F_large, margin = "cols");
if (as.scalar (cur_F_large[1,1]) != 0) F_large = cbind (F_large, cur_F_large);
cur_S_large = removeEmpty (target = cur_S_large, margin = "cols");
if (as.scalar (cur_S_large[1,1]) != 0) S_large = cbind (S_large, cur_S_large);
num_cur_nodes_large_pre = 2 * num_cur_nodes_large;
if (as.scalar (cur_Q_large[1,1]) == 0) {
num_cur_nodes_large = 0;
} else {
cur_nodes_large = cur_Q_large;
num_cur_nodes_large = ncol (cur_Q_large);
}
}
##### PREPARE MODEL FOR SMALL NODES
cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin = "cols");
if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist
num_cur_nodes_small = ncol (cur_nodes_small_nonzero);
}
if (num_cur_nodes_small > 0) { # SMALL nodes to process
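# Conservative preallocation: a subtree grown in memory from n samples contains at most on
# the order of 2^ceil(log2(n)) nodes, so that much space is reserved per small node, plus
# one extra column per node itself.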
reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))) + num_cur_nodes_small;
cur_Q_small = matrix (0, rows = 2, cols = reserve_len);
cur_F_small = matrix (0, rows = 3, cols = reserve_len);
cur_NC_small = matrix (0, rows = 5, cols = reserve_len);
cur_S_small = matrix (0, rows = 1, cols = reserve_len * distinct_values_max); # split values of the best feature
}
##### LOOP OVER SMALL NODES...
parfor (i7 in 1:num_cur_nodes_small, check = 0) {
cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]);
cur_tree_small = as.scalar (cur_nodes_small_nonzero[3,i7]);
# build dataset for SMALL node
Ix = (L[,cur_tree_small] == cur_node_small);
if (num_scale_features > 0) {
X_scale_ext_small = removeEmpty (target = X_scale_ext, margin = "rows", select = Ix);
}
if (num_cat_features > 0) {
X_cat_small = removeEmpty (target = X_cat, margin = "rows", select = Ix);
}
L_small = removeEmpty (target = L * Ix, margin = "rows");
C_small = removeEmpty (target = C * Ix, margin = "rows");
Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows");
# compute offset
offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log (2))));
start_ind_global = 1;
if (i7 > 1) {
start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]);
}
start_ind_S_global = 1;
if (i7 > 1) {
start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 - 1),]) * distinct_values_max);
}
Q = matrix (0, rows = 2, cols = 1);
Q[1,1] = cur_node_small;
Q[2,1] = cur_tree_small;
F = matrix (0, rows = 3, cols = 1);
NC = matrix (0, rows = 5, cols = 1);
S = matrix (0, rows = 1, cols = 1);
cur_nodes_ = matrix (cur_node_small, rows = 2, cols = 1);
cur_nodes_[1,1] = cur_node_small;
cur_nodes_[2,1] = cur_tree_small;
num_cur_nodes = 1;
level_ = level;
while (num_cur_nodes > 0 & level_ < depth) {
level_ = level_ + 1;
cur_Q = matrix (0, rows = 2, cols = 2 * num_cur_nodes);
cur_F = matrix (0, rows = 3, cols = num_cur_nodes);
cur_NC = matrix (0, rows = 5, cols = 2 * num_cur_nodes);
cur_S = matrix (0, rows = 1, cols = num_cur_nodes * distinct_values_max);
parfor (i8 in 1:num_cur_nodes, check = 0) {
cur_node = as.scalar (cur_nodes_[1,i8]);
cur_tree = as.scalar (cur_nodes_[2,i8]);
# select sample features WOR
feature_samples = sample (num_features_total, num_feature_samples);
feature_samples = order (target = feature_samples, by = 1);
num_scale_feature_samples = sum (feature_samples <= num_scale_features);
num_cat_feature_samples = num_feature_samples - num_scale_feature_samples;
# --- find best split ---
# samples that reach cur_node
Ix = (L_small[,cur_tree] == cur_node);
cur_Y_bin = Y_bin_small * (Ix * C_small[,cur_tree]);
label_counts_overall = colSums (cur_Y_bin);
label_sum_overall = sum (label_counts_overall);
label_dist_overall = label_counts_overall / label_sum_overall;
if (imp == "entropy") {
label_dist_zero = (label_dist_overall == 0);
cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2);
} else { # imp == "Gini"
cur_impurity = sum (label_dist_overall * (1 - label_dist_overall));
}
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0 & num_scale_feature_samples > 0) {
scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
# main operation
label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small);
# compute left and right label distribution
label_sum_left = rowSums (label_counts_left_scale);
label_dist_left = label_counts_left_scale / label_sum_left;
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log (2)
impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
label_sum_right = rowSums (label_counts_right_scale);
label_dist_right = label_counts_right_scale / label_sum_right;
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale);
I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
# determine best feature to split on and the split value
feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
feature_start_ind[1,1] = 1;
if (num_scale_features > 1) {
feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
}
max_I_gain_found = 0;
max_I_gain_found_ind = 0;
best_i = 0;
for (i in 1:num_scale_feature_samples) { # scale_feature_samples is num_scale_feature_samples x 1
cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]);
cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]);
I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
cur_max_I_gain = max (I_gain_portion);
cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
if (cur_max_I_gain > max_I_gain_found) {
max_I_gain_found = cur_max_I_gain;
max_I_gain_found_ind = cur_max_I_gain_ind;
best_i = i;
}
}
best_scale_gain = max_I_gain_found;
max_I_gain_ind_scale = max_I_gain_found_ind;
best_scale_feature = 0;
if (best_i > 0) {
best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
}
best_scale_split = max_I_gain_ind_scale;
if (best_scale_feature > 1) {
best_scale_split = best_scale_split + as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
}
}
if (num_cat_features > 0 & num_cat_feature_samples > 0){
cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
# initialization
split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
split_values = split_values_bin;
split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
I_gains = split_values_offset;
impurities_left = split_values_offset;
impurities_right = split_values_offset;
best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
# main operation
label_counts = t (t (cur_Y_bin) %*% X_cat_small);
parfor (i9 in 1:num_cat_feature_samples, check = 0){
cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
start_ind = 1;
if (cur_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,cur_cat_feature]);
cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
label_sum = rowSums (cur_label_counts);
label_dist = cur_label_counts / label_sum;
if (imp == "entropy") {
label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
log_label_dist = log (label_dist); # / log(2)
impurity = - rowSums (label_dist * log_label_dist);
impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
} else { # imp == "Gini"
impurity = rowSums (label_dist * (1 - label_dist));
}
# sort cur feature by impurity
cur_distinct_values = seq (1, nrow (cur_label_counts));
cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
label_counts_sorted = P %*% cur_label_counts;
# compute left and right label distribution
label_counts_left = cumsum (label_counts_sorted);
label_sum_left = rowSums (label_counts_left);
label_dist_left = label_counts_left / label_sum_left;
label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log(2)
impurity_left = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right = - label_counts_left + label_counts_overall;
label_sum_right = rowSums (label_counts_right);
label_dist_right = label_counts_right / label_sum_right;
label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
Ix_label_sum_left_zero = (label_sum_left == 0);
Ix_label_sum_right_zero = (label_sum_right == 0);
Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
I_gain = I_gain * (1 - Ix_label_sum_zero);
I_gain[nrow (I_gain),] = 0; # last entry invalid
max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
for (i10 in 1:max_I_gain_ind) {
ind = as.scalar (cur_feature_sorted[i10,1]);
if (ind == 1) {
split_values_bin[1,start_ind] = 1.0;
} else {
split_values_bin[1,(start_ind + ind - 1)] = 1.0;
}
}
split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
I_gains[1,cur_cat_feature] = max (I_gain);
impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]);
impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]);
best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,];
best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,];
}
# determine best feature to split on and the split values
best_cat_feature = as.scalar (rowIndexMax (I_gains));
best_cat_gain = max (I_gains);
start_ind = 1;
if (best_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,best_cat_feature]);
best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
}
# compare best scale feature to best cat. feature and pick the best one
if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
# --- update model ---
cur_F[1,i8] = best_scale_feature;
cur_F[2,i8] = 1;
cur_F[3,i8] = 1;
cur_S[1,(i8 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = X_scale_ext_small[, best_scale_split];
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C_small[,cur_tree]);
right_child_size = sum (Ix_right * C_small[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]);
cur_impurity_right = as.scalar(impurity_right_scale[best_scale_split,]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf
cur_label_counts_left = label_counts_left_scale[best_scale_split,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) {
cur_label_counts_left = label_counts_left_scale[best_scale_split,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) {
cur_label_counts_right = label_counts_right_scale[best_scale_split,];
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
}
} else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) {
# --- update model ---
cur_F[1,i8] = best_cat_feature;
cur_F[2,i8] = 2;
offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
S_start_ind = (i8 - 1) * distinct_values_max + 1;
cur_F[3,i8] = offset_nonzero;
cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
Ix_left = (Ix_left >= 1);
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C_small[,cur_tree]);
right_child_size = sum (Ix_right * C_small[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) {
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) {
cur_label_counts_right = best_label_counts_right[best_cat_feature,];
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
}
} else {
print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
right_pure = TRUE;
left_pure = TRUE;
cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
class_label = as.scalar (rowIndexMax (label_counts_overall));
cur_NC[3,(2 * (i8 - 1) + 1)] = class_label;
cur_NC[4,(2 * (i8 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
cur_NC[5,(2 * (i8 - 1) + 1)] = 1; # special leaf
}
# add nodes to Q
if (!left_pure) {
cur_Q[1,(2 * (i8 - 1)+ 1)] = left_child;
cur_Q[2,(2 * (i8 - 1)+ 1)] = cur_tree;
}
if (!right_pure) {
cur_Q[1,(2 * i8)] = right_child;
cur_Q[2,(2 * i8)] = cur_tree;
}
}
cur_Q = removeEmpty (target = cur_Q, margin = "cols");
Q = cbind (Q, cur_Q);
NC = cbind (NC, cur_NC);
F = cbind (F, cur_F);
S = cbind (S, cur_S);
num_cur_nodes_pre = 2 * num_cur_nodes;
if (as.scalar (cur_Q[1,1]) == 0) {
num_cur_nodes = 0;
} else {
cur_nodes_ = cur_Q;
num_cur_nodes = ncol (cur_Q);
}
}
cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q;
cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC;
cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F;
cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S;
}
##### PREPARE MODEL FOR SMALL NODES
if (num_cur_nodes_small > 0) { # small nodes already processed
cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols");
if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = cbind (Q_small, cur_Q_small);
cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols");
if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = cbind (NC_small, cur_NC_small);
cur_F_small = removeEmpty (target = cur_F_small, margin = "cols"); #
if (as.scalar (cur_F_small[1,1]) != 0) F_small = cbind (F_small, cur_F_small);
cur_S_small = removeEmpty (target = cur_S_small, margin = "cols"); #
if (as.scalar (cur_S_small[1,1]) != 0) S_small = cbind (S_small, cur_S_small);
num_cur_nodes_small = 0; # reset
}
print (" --- end level " + level + ", remaining no. of LARGE nodes to expand " + num_cur_nodes_large + " --- ");
}
#### prepare model
print ("PREPARING MODEL...")
### large nodes
if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1) {
Q_large = Q_large[,2:ncol (Q_large)];
}
if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1) {
NC_large = NC_large[,2:ncol (NC_large)];
}
if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1) {
S_large = S_large[,2:ncol (S_large)];
}
if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1) {
F_large = F_large[,2:ncol (F_large)];
}
### small nodes
if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1) {
Q_small = Q_small[,2:ncol (Q_small)];
}
if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1) {
NC_small = NC_small[,2:ncol (NC_small)];
}
if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1) {
S_small = S_small[,2:ncol (S_small)];
}
if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1) {
F_small = F_small[,2:ncol (F_small)];
}
# check for special leaves and if there are any remove them from Q_large and Q_small
special_large_leaves_ind = NC_large[5,];
num_special_large_leaf = sum (special_large_leaves_ind);
if (num_special_large_leaf > 0) {
print ("PROCESSING " + num_special_large_leaf + " SPECIAL LARGE LEAVES...");
special_large_leaves = removeEmpty (target = NC_large[1:2,] * special_large_leaves_ind, margin = "cols");
large_internal_ind = 1 - colSums (outer (t (special_large_leaves[1,]), Q_large[1,], "==") * outer (t (special_large_leaves[2,]), Q_large[2,], "=="));
Q_large = removeEmpty (target = Q_large * large_internal_ind, margin = "cols");
F_large = removeEmpty (target = F_large, margin = "cols"); # remove special leaves from F
}
special_small_leaves_ind = NC_small[5,];
num_special_small_leaf = sum (special_small_leaves_ind);
if (num_special_small_leaf > 0) {
print ("PROCESSING " + num_special_small_leaf + " SPECIAL SMALL LEAVES...");
special_small_leaves = removeEmpty (target = NC_small[1:2,] * special_small_leaves_ind, margin = "cols");
small_internal_ind = 1 - colSums (outer (t (special_small_leaves[1,]), Q_small[1,], "==") * outer (t (special_small_leaves[2,]), Q_small[2,], "=="));
Q_small = removeEmpty (target = Q_small * small_internal_ind, margin = "cols");
F_small = removeEmpty (target = F_small, margin = "cols"); # remove special leaves from F
}
# model corresponding to large internal nodes
no_large_internal_node = FALSE;
if (as.scalar (Q_large[1,1]) != 0) {
print ("PROCESSING LARGE INTERNAL NODES...");
num_large_internal = ncol (Q_large);
max_offset = max (max (F_large[3,]), max (F_small[3,]));
M1_large = matrix (0, rows = 6 + max_offset, cols = num_large_internal);
M1_large[1:2,] = Q_large;
M1_large[4:6,] = F_large;
# process S_large
cum_offsets_large = cumsum (t (F_large[3,]));
parfor (it in 1:num_large_internal, check = 0) {
start_ind = 1;
if (it > 1) {
start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]);
}
offset = as.scalar (F_large[3,it]);
M1_large[7:(7 + offset - 1),it] = t (S_large[1,start_ind:(start_ind + offset - 1)]);
}
} else {
print ("No LARGE internal nodes available");
no_large_internal_node = TRUE;
}
# model corresponding to small internal nodes
no_small_internal_node = FALSE;
if (as.scalar (Q_small[1,1]) != 0) {
print ("PROCESSING SMALL INTERNAL NODES...");
num_small_internal = ncol (Q_small);
M1_small = matrix (0, rows = 6 + max_offset, cols = num_small_internal);
M1_small[1:2,] = Q_small;
M1_small[4:6,] = F_small;
# process S_small
cum_offsets_small = cumsum (t (F_small[3,]));
parfor (it in 1:num_small_internal, check = 0) {
start_ind = 1;
if (it > 1) {
start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]);
}
offset = as.scalar (F_small[3,it]);
M1_small[7:(7 + offset - 1),it] = t (S_small[1,start_ind:(start_ind + offset - 1)]);
}
} else {
print ("No SMALL internal nodes available");
no_small_internal_node = TRUE;
}
# model corresponding to large leaf nodes
no_large_leaf_node = FALSE;
if (as.scalar (NC_large[1,1]) != 0) {
print ("PROCESSING LARGE LEAF NODES...");
num_large_leaf = ncol (NC_large);
M2_large = matrix (0, rows = 6 + max_offset, cols = num_large_leaf);
M2_large[1:2,] = NC_large[1:2,];
M2_large[5:7,] = NC_large[3:5,];
} else {
print ("No LARGE leaf nodes available");
no_large_leaf_node = TRUE;
}
# model corresponding to small leaf nodes
no_small_leaf_node = FALSE;
if (as.scalar (NC_small[1,1]) != 0) {
print ("PROCESSING SMALL LEAF NODES...");
num_small_leaf = ncol (NC_small);
M2_small = matrix (0, rows = 6 + max_offset, cols = num_small_leaf);
M2_small[1:2,] = NC_small[1:2,];
M2_small[5:7,] = NC_small[3:5,];
} else {
print ("No SMALL leaf nodes available");
no_small_leaf_node = TRUE;
}
if (no_large_internal_node) {
M1 = M1_small;
} else if (no_small_internal_node) {
M1 = M1_large;
} else {
M1 = cbind (M1_large, M1_small);
}
if (no_large_leaf_node) {
M2 = M2_small;
} else if (no_small_leaf_node) {
M2 = M2_large;
} else {
M2 = cbind (M2_large, M2_small);
}
M = cbind (M1, M2);
M = t (order (target = t (M), by = 1)); # sort by node id
M = t (order (target = t (M), by = 2)); # sort by tree id
# removing redundant subtrees
if (ncol (M) > 1) {
print ("CHECKING FOR REDUNDANT SUBTREES...");
red_leaf = TRUE;
process_red_subtree = FALSE;
invalid_node_ind = matrix (0, rows = 1, cols = ncol (M));
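# Iteratively collapse redundant splits: whenever two sibling leaves of the same tree predict
# the same label, the split at their parent is useless, so the parent inherits the leaf's
# rows 3 and below (turning it into a leaf with the shared label) and both leaves are marked
# invalid (M[4,] = -1). Repeating until no such pair remains prunes entire redundant subtrees
# bottom-up.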
while (red_leaf & ncol (M) > 1) {
leaf_ind = (M[4,] == 0);
labels = M[5,] * leaf_ind;
tree_ids = M[2,];
parent_ids = floor (M[1,] /2);
cond1 = (labels[,1:(ncol (M) - 1)] == labels[,2:ncol (M)]); # sibling leaves with same label
cond2 = (parent_ids[,1:(ncol (M) - 1)] == parent_ids[,2:ncol (M)]); # same parents
cond3 = (tree_ids[,1:(ncol (M) - 1)] == tree_ids[,2:ncol (M)]); # same tree
red_leaf_ind = cond1 * cond2 * cond3 * leaf_ind[,2:ncol (M)];
if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist
red_leaf_ids = M[1:2,2:ncol (M)] * red_leaf_ind;
red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin = "cols");
parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){
cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]);
cur_parent_id = floor (cur_right_leaf_id / 2);
cur_tree_id = as.scalar (red_leaf_ids_nonzero[2,it]);
cur_right_leaf_pos = as.scalar (rowIndexMax ((M[1,] == cur_right_leaf_id) * (M[2,] == cur_tree_id)));
cur_parent_pos = as.scalar(rowIndexMax ((M[1,] == cur_parent_id) * (M[2,] == cur_tree_id)));
M[3:nrow (M), cur_parent_pos] = M[3:nrow (M), cur_right_leaf_pos];
M[4,cur_right_leaf_pos] = -1;
M[4,cur_right_leaf_pos - 1] = -1;
invalid_node_ind[1,cur_right_leaf_pos] = 1;
invalid_node_ind[1,cur_right_leaf_pos - 1] = 1;
}
process_red_subtree = TRUE;
} else {
red_leaf = FALSE;
}
}
if (process_red_subtree) {
print ("REMOVING REDUNDANT SUBTREES...");
valid_node_ind = (invalid_node_ind == 0);
M = removeEmpty (target = M * valid_node_ind, margin = "cols");
}
}
internal_ind = (M[4,] > 0);
internal_ids = M[1:2,] * internal_ind;
internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols");
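# Row 3 of M stores, for each internal node, the column offset from the node to its left
# child: the outer () comparisons locate the column position of every internal node id (a1)
# and of its left child id (a2 = 2 * a1) within the same tree, and table () scatters the
# differences pos_a2 - pos_a1 into M[3,].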
if (as.scalar (internal_ids_nonzero[1,1]) > 0) { # if internal nodes exist
a1 = internal_ids_nonzero[1,];
a2 = internal_ids_nonzero[1,] * 2;
vcur_tree_id = internal_ids_nonzero[2,];
pos_a1 = rowIndexMax( outer(t(a1), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") );
pos_a2 = rowIndexMax( outer(t(a2), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") );
M[3,] = t(table(pos_a1, 1, pos_a2 - pos_a1, ncol(M), 1));
}
else {
print ("All trees in the random forest contain only one leaf!");
}
if (fileC != " ") {
write (C, fileC, format = fmtO);
}
write (M, fileM, format = fmtO);