#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded
# Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded
# R String " " Location to read the matrix R which for each feature in X contains the following information
# - R[,1]: column ids
# - R[,2]: start indices
# - R[,3]: end indices
# If R is not provided, by default all variables are assumed to be scale
# bins Int 20 Number of equi-height bins per scale feature used to choose split thresholds
# depth Int 25 Maximum depth of the learned tree
# num_leaf Int 10 Number of samples at or below which splitting stops and a leaf node is added
# num_samples Int 3000 Number of samples at a node at or below which we switch to in-memory subtree building
# num_trees Int 1 Number of trees to be learned in the random forest model
# subsamp_rate Double 1.0 Parameter controlling the size of each tree in the forest; samples are selected from a
# Poisson distribution with parameter subsamp_rate (the default value is 1.0)
# feature_subset Double 0.5 Parameter that controls the number of features used as candidates for splitting at each tree node,
# given as a power of the number of features in the dataset;
# by default the square root of the number of features (i.e., feature_subset = 0.5) is used at each tree node
# impurity String "Gini" Impurity measure: entropy or Gini (the default)
# M String --- Location to write matrix M containing the learned trees
# C String " " Location to write matrix C containing the number of times samples are chosen in each tree of the random forest
# S_map String " " Location to write the mappings from scale feature ids to global feature ids
# C_map String " " Location to write the mappings from categorical feature ids to global feature ids
# fmt String "text" The output format of the model (matrix M), such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# Matrix M where each column corresponds to a node in one of the learned trees and each row contains the following information:
# M[1,j]: id of node j (in a complete binary tree)
# M[2,j]: tree id to which node j belongs
# M[3,j]: Offset (no. of columns) to left child of j
# M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
# M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
# otherwise the label that leaf node j is supposed to predict
# M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of the subset of values
# stored in rows 7,8,... if j is categorical
# M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is compared to is stored at M[7,j] if the feature chosen for j is scale;
# If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j
# -------------------------------------------------------------------------------------------
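# Illustrative (hypothetical) example: a forest consisting of a single tree that splits the root
# on scale feature 3 at threshold 0.5, with leaves predicting labels 2 and 1, would yield three
# columns along the following lines (leaf columns carry auxiliary bookkeeping in rows 6 and 7):
#                root   left leaf   right leaf
#   M[1,]:        1         2           3       (node ids)
#   M[2,]:        1         1           1       (tree id)
#   M[3,]:        1         0           0       (offset to left child)
#   M[4,]:        3         0           0       (feature index; 0 for leaves)
#   M[5,]:        1         2           1       (feature type at the root; predicted labels at the leaves)
#   M[6,]:        1         .           .       (1 since feature 3 is scale)
#   M[7,]:       0.5        .           .       (split threshold)
# -------------------------------------------------------------------------------------------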
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f random-forest.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model
# bins=20 depth=25 num_leaf=10 num_samples=3000 num_trees=10 impurity=Gini fmt=csv
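# A minimal follow-up sketch for inspecting the learned model from DML (assumes the model was
# written with fmt=csv to OUTPUT_DIR/model; the path and format below are illustrative):
#   M = read ("OUTPUT_DIR/model", format = "csv");
#   num_nodes  = ncol (M);
#   num_leaves = sum (M[4,] == 0);
#   print ("nodes: " + num_nodes + ", leaves: " + num_leaves);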
# External function for binning
binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef)
implemented in (classname="org.apache.sysml.udf.lib.BinningWrapper",exectype="mem")
# Default values of some parameters
fileR = ifdef ($R, " ");
fileC = ifdef ($C, " ");
fileS_map = ifdef ($S_map, " ");
fileC_map = ifdef ($C_map, " ");
fileM = $M;
num_bins = ifdef($bins, 20);
depth = ifdef($depth, 25);
num_leaf = ifdef($num_leaf, 10);
num_trees = ifdef($num_trees, 1);
threshold = ifdef ($num_samples, 3000);
imp = ifdef($impurity, "Gini");
rate = ifdef ($subsamp_rate, 1);
fpow = ifdef ($feature_subset, 0.5);
fmtO = ifdef($fmt, "text");
X = read($X);
Y_bin = read($Y);
num_records = nrow (X);
num_classes = ncol (Y_bin);
# check if there is only one class label
Y_bin_sum = sum (colSums (Y_bin) == num_records);
if (Y_bin_sum == 1) {
stop ("Y contains only one class label. No model will be learned!");
} else if (Y_bin_sum > 1) {
stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
}
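# Since Y is dummy coded, each row of Y_bin has exactly one 1 and colSums (Y_bin) holds the
# per-class sample counts; a column summing to num_records therefore means all samples share
# that class. For example, with num_records = 4 and Y_bin = [1 0; 1 0; 1 0; 1 0] we get
# colSums = [4 0], hence Y_bin_sum = 1 and the first stop above fires.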
# split data into X_scale and X_cat
if (fileR != " ") {
R = read (fileR);
R = order (target = R, by = 2); # sort by start indices
dummy_coded = (R[,2] != R[,3]);
R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
if (fileS_map != " ") {
scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
write (scale_feature_mapping, fileS_map, format = fmtO);
}
if (fileC_map != " ") {
cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
write (cat_feature_mapping, fileC_map, format = fmtO);
}
sum_dummy = sum (dummy_coded);
if (sum_dummy == nrow (R)) { # all features categorical
print ("All features categorical");
num_cat_features = nrow (R_cat);
num_scale_features = 0;
X_cat = X;
distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
distinct_values_max = max (distinct_values);
distinct_values_offset = cumsum (t (distinct_values));
distinct_values_overall = sum (distinct_values);
} else if (sum_dummy == 0) { # all features scale
print ("All features scale");
num_scale_features = ncol (X);
num_cat_features = 0;
X_scale = X;
distinct_values_max = 1;
} else { # some features scale some features categorical
num_cat_features = nrow (R_cat);
num_scale_features = nrow (R_scale);
distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
distinct_values_max = max (distinct_values);
distinct_values_offset = cumsum (t (distinct_values));
distinct_values_overall = sum (distinct_values);
W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
W = matrix (W, rows = 2 * num_cat_features, cols = 1);
if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
W[2 * num_cat_features,] = 0;
}
last = (R_cat[,2] != ncol (X));
R_cat1 = (R_cat[,2] + 1) * last;
R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1;
R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);
col_tab = table (R_cat_vec, 1, W, ncol (X), 1);
col_ind = cumsum (col_tab);
col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
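# The two table () calls above build ncol (X) x k selection matrices whose (i,j)-th entry is
# 1 iff global column i is the j-th categorical (resp. scale) column, so the two multiplies
# extract the column groups in one pass. Small sketch: with ncol (X) = 3 and
# col_ind_cat = [2; 3], table (col_ind_cat, seq (1, 2), 3, 2) = [0 0; 1 0; 0 1], and
# X %*% [0 0; 1 0; 0 1] keeps exactly columns 2 and 3 of X.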
}
} else { # only scale features exist
print ("All features scale");
num_scale_features = ncol (X);
num_cat_features = 0;
X_scale = X;
distinct_values_max = 1;
}
if (num_scale_features > 0) {
print ("COMPUTING BINNING...");
bin_size = max (as.integer (num_records / num_bins), 1);
count_thresholds = matrix (0, rows = 1, cols = num_scale_features);
thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features);
parfor(i1 in 1:num_scale_features) {
col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
[col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
count_thresholds[,i1] = num_bins_defined;
thresholds[,i1] = col_bins;
}
print ("PREPROCESSING SCALE FEATURE MATRIX...");
min_num_bins = min (count_thresholds);
max_num_bins = max (count_thresholds);
total_num_bins = sum (count_thresholds);
cum_count_thresholds = t (cumsum (t (count_thresholds)));
X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
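# X_scale_ext is a binary matrix with one block of columns per scale feature: within a
# feature's block, entry (i, b) is 1 iff sample i's value lies strictly below the feature's
# b-th bin threshold. This encoding lets the label counts of every candidate split be
# computed later with a single matrix multiply (see label_counts_left_scale below).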
parfor (i2 in 1:num_scale_features, check = 0) {
Xi2 = X_scale[,i2];
count_threshold = as.scalar (count_thresholds[,i2]);
offset_feature = 1;
if (i2 > 1) {
offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
}
ti2 = t(thresholds[1:count_threshold, i2]);
X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
}
}
num_features_total = num_scale_features + num_cat_features;
num_feature_samples = as.integer (floor (num_features_total ^ fpow));
##### INITIALIZATION
L = matrix (1, rows = num_records, cols = num_trees); # last visited node id for each training sample
# create matrix of counts (generated by Poisson distribution) storing how many times each sample appears in each tree
print ("CONPUTING COUNTS...");
C = rand (rows = num_records, cols = num_trees, pdf = "poisson", lambda = rate);
Ix_nonzero = (C != 0);
L = L * Ix_nonzero;
total_counts = sum (C);
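# Drawing the per-sample counts from a Poisson distribution approximates bootstrap sampling
# with replacement: with subsamp_rate = 1.0, a fraction of about exp (-1), i.e., roughly 37%
# of the samples receives count 0 and is left out of a given tree, matching the expected
# out-of-bag fraction of a classical bootstrap.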
# model
# LARGE leaf nodes
# NC_large[,1]: node id
# NC_large[,2]: tree id
# NC_large[,3]: class label
# NC_large[,4]: no. of misclassified samples
# NC_large[,5]: 1 if special leaf (impure and no. of samples at that leaf > threshold) or 0 otherwise
NC_large = matrix (0, rows = 5, cols = 1);
# SMALL leaf nodes
# same schema as for LARGE leaf nodes (to be initialized)
NC_small = matrix (0, rows = 5, cols = 1);
# LARGE internal nodes
# Q_large[,1]: node id
# Q_large[,2]: tree id
Q_large = matrix (0, rows = 2, cols = num_trees);
Q_large[1,] = matrix (1, rows = 1, cols = num_trees);
Q_large[2,] = t (seq (1, num_trees));
# SMALL internal nodes
# same schema as for LARGE internal nodes (to be initialized)
Q_small = matrix (0, rows = 2, cols = 1);
# F_large[,1]: feature
# F_large[,2]: type
# F_large[,3]: offset
F_large = matrix (0, rows = 3, cols = 1);
# same schema as for LARGE nodes
F_small = matrix (0, rows = 3, cols = 1);
# split points for LARGE internal nodes
S_large = matrix (0, rows = 1, cols = 1);
# split points for SMALL internal nodes
S_small = matrix (0, rows = 1, cols = 1);
# initialize queue
cur_nodes_large = matrix (1, rows = 2, cols = num_trees);
cur_nodes_large[2,] = t (seq (1, num_trees));
num_cur_nodes_large = num_trees;
num_cur_nodes_small = 0;
level = 0;
while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) {
level = level + 1;
print (" --- start level " + level + " --- ");
##### PREPARE MODEL
if (num_cur_nodes_large > 0) { # LARGE nodes to process
cur_Q_large = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large);
cur_NC_large = matrix (0, rows = 5, cols = 2 * num_cur_nodes_large);
cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large);
cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max);
cur_nodes_small = matrix (0, rows = 3, cols = 2 * num_cur_nodes_large);
}
##### LOOP OVER LARGE NODES...
parfor (i6 in 1:num_cur_nodes_large, check = 0) {
cur_node = as.scalar (cur_nodes_large[1,i6]);
cur_tree = as.scalar (cur_nodes_large[2,i6]);
# select sample features WOR
feature_samples = sample (num_features_total, num_feature_samples);
feature_samples = order (target = feature_samples, by = 1);
num_scale_feature_samples = sum (feature_samples <= num_scale_features);
num_cat_feature_samples = num_feature_samples - num_scale_feature_samples;
# --- find best split ---
# samples that reach cur_node
Ix = (L[,cur_tree] == cur_node);
cur_Y_bin = Y_bin * (Ix * C[,cur_tree]);
label_counts_overall = colSums (cur_Y_bin);
label_sum_overall = sum (label_counts_overall);
label_dist_overall = label_counts_overall / label_sum_overall;
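# Impurity of the label distribution p at the current node:
# entropy = -sum_k p_k * log (p_k) (the label_dist_zero term maps 0 * log (0) to 0),
# Gini = sum_k p_k * (1 - p_k) = 1 - sum_k p_k^2.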
if (imp == "entropy") {
label_dist_zero = (label_dist_overall == 0);
cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before
} else { # imp == "Gini"
cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before
}
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0 & num_scale_feature_samples > 0) {
scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
# main operation
label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext);
# compute left and right label distribution
label_sum_left = rowSums (label_counts_left_scale);
label_dist_left = label_counts_left_scale / label_sum_left;
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log (2)
impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
label_sum_right = rowSums (label_counts_right_scale);
label_dist_right = label_counts_right_scale / label_sum_right;
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
}
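# Information gain of a candidate split:
# gain = impurity (parent) - (n_left / n) * impurity (left) - (n_right / n) * impurity (right),
# evaluated below for all bin thresholds of all scale features at once; NaNs stemming from
# empty partitions are replaced by 0 so such splits are never selected.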
I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale);
I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
# determine best feature to split on and the split value
feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
feature_start_ind[1,1] = 1;
if (num_scale_features > 1) {
feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
}
max_I_gain_found = 0;
max_I_gain_found_ind = 0;
best_i = 0;
for (i in 1:num_scale_feature_samples) { # scale_feature_samples is num_scale_feature_samples x 1
cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]);
cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]);
I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
cur_max_I_gain = max (I_gain_portion);
cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
if (cur_max_I_gain > max_I_gain_found) {
max_I_gain_found = cur_max_I_gain;
max_I_gain_found_ind = cur_max_I_gain_ind;
best_i = i;
}
}
best_scale_gain = max_I_gain_found;
max_I_gain_ind_scale = max_I_gain_found_ind;
best_scale_feature = 0;
if (best_i > 0) {
best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
}
best_scale_split = max_I_gain_ind_scale;
if (best_scale_feature > 1) {
best_scale_split = best_scale_split + as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
}
}
if (num_cat_features > 0 & num_cat_feature_samples > 0){
cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
# initialization
split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
split_values = split_values_bin;
split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
I_gains = split_values_offset;
impurities_left = split_values_offset;
impurities_right = split_values_offset;
best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
# main operation
label_counts = t (t (cur_Y_bin) %*% X_cat);
parfor (i9 in 1:num_cat_feature_samples, check = 0){
cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
start_ind = 1;
if (cur_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,cur_cat_feature]);
cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
label_sum = rowSums (cur_label_counts);
label_dist = cur_label_counts / label_sum;
if (imp == "entropy") {
label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
log_label_dist = log (label_dist); # / log(2)
impurity = - rowSums (label_dist * log_label_dist);
impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
} else { # imp == "Gini"
impurity = rowSums (label_dist * (1 - label_dist));
}
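# Enumerating all 2^(d-1) - 1 nontrivial value subsets of a d-valued categorical feature is
# infeasible; instead the distinct values are sorted by their individual impurity and only
# prefixes of this ordering are evaluated (via cumulative sums below) as candidate left
# subsets, a greedy heuristic that needs only d - 1 candidate splits per feature.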
# sort cur feature by impurity
cur_distinct_values = seq (1, nrow (cur_label_counts));
cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
label_counts_sorted = P %*% cur_label_counts;
# compute left and right label distribution
label_counts_left = cumsum (label_counts_sorted);
label_sum_left = rowSums (label_counts_left);
label_dist_left = label_counts_left / label_sum_left;
label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log(2)
impurity_left = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right = - label_counts_left + label_counts_overall;
label_sum_right = rowSums (label_counts_right);
label_dist_right = label_counts_right / label_sum_right;
label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
Ix_label_sum_left_zero = (label_sum_left == 0);
Ix_label_sum_right_zero = (label_sum_right == 0);
Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
I_gain = I_gain * (1 - Ix_label_sum_zero);
I_gain[nrow (I_gain),] = 0; # last entry invalid
max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
for (i10 in 1:max_I_gain_ind) {
ind = as.scalar (cur_feature_sorted[i10,1]);
if (ind == 1) {
split_values_bin[1,start_ind] = 1.0;
} else {
split_values_bin[1,(start_ind + ind - 1)] = 1.0;
}
}
split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
I_gains[1,cur_cat_feature] = max (I_gain);
impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]);
impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]);
best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,];
best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,];
}
# determine best feature to split on and the split values
best_cat_feature = as.scalar (rowIndexMax (I_gains));
best_cat_gain = max (I_gains);
start_ind = 1;
if (best_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,best_cat_feature]);
best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
}
# compare best scale feature to best cat. feature and pick the best one
if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
# --- update model ---
cur_F_large[1,i6] = best_scale_feature;
cur_F_large[2,i6] = 1;
cur_F_large[3,i6] = 1;
cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature];
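# Node ids follow standard binary-heap numbering: the children of node j are 2 * j and
# 2 * j + 1 (written below as 2 * (j - 1) + 1 + 1 and 2 * (j - 1) + 2 + 1), so the parent
# of any node can later be recovered as floor (id / 2).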
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = X_scale_ext[,best_scale_split];
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C[,cur_tree]);
right_child_size = sum (Ix_right * C[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]); # max_I_gain_ind_scale
cur_impurity_right = as.scalar(impurity_right_scale[best_scale_split,]); # max_I_gain_ind_scale
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
(left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
(left_child_size <= threshold & (level == depth))) {
cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
(right_child_size <= threshold & (level == depth))) {
cur_label_counts_right = label_counts_right_scale[best_scale_split,]; # max_I_gain_ind_scale
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
}
} else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) {
# --- update model ---
cur_F_large[1,i6] = best_cat_feature;
cur_F_large[2,i6] = 2;
offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
S_start_ind = (i6 - 1) * distinct_values_max + 1;
cur_F_large[3,i6] = offset_nonzero;
cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
Ix_left = (Ix_left >= 1);
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C[,cur_tree]);
right_child_size = sum (Ix_right * C[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
(left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
(left_child_size <= threshold & (level == depth))) {
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
(right_child_size <= threshold & (level == depth))) {
cur_label_counts_right = best_label_counts_right[best_cat_feature,];
cur_NC_large[1,(2 * i6)] = right_child;
cur_NC_large[2,(2 * i6)] = cur_tree;
cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
}
} else {
print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
right_pure = TRUE;
left_pure = TRUE;
cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node;
cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
class_label = as.scalar (rowIndexMax (label_counts_overall));
cur_NC_large[3,(2 * (i6 - 1) + 1)] = class_label;
cur_NC_large[4,(2 * (i6 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
cur_NC_large[5,(2 * (i6 - 1) + 1)] = 1; # special leaf
}
# add nodes to Q
if (!left_pure) {
if (left_child_size > threshold) {
cur_Q_large[1,(2 * (i6 - 1)+ 1)] = left_child;
cur_Q_large[2,(2 * (i6 - 1)+ 1)] = cur_tree;
} else {
cur_nodes_small[1,(2 * (i6 - 1)+ 1)] = left_child;
cur_nodes_small[2,(2 * (i6 - 1)+ 1)] = left_child_size;
cur_nodes_small[3,(2 * (i6 - 1)+ 1)] = cur_tree;
}
}
if (!right_pure) {
if (right_child_size > threshold) {
cur_Q_large[1,(2 * i6)] = right_child;
cur_Q_large[2,(2 * i6)] = cur_tree;
} else{
cur_nodes_small[1,(2 * i6)] = right_child;
cur_nodes_small[2,(2 * i6)] = right_child_size;
cur_nodes_small[3,(2 * i6)] = cur_tree;
}
}
}
##### PREPARE MODEL FOR LARGE NODES
if (num_cur_nodes_large > 0) {
cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols");
if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = cbind (Q_large, cur_Q_large);
cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols");
if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = cbind (NC_large, cur_NC_large);
cur_F_large = removeEmpty (target = cur_F_large, margin = "cols");
if (as.scalar (cur_F_large[1,1]) != 0) F_large = cbind (F_large, cur_F_large);
cur_S_large = removeEmpty (target = cur_S_large, margin = "cols");
if (as.scalar (cur_S_large[1,1]) != 0) S_large = cbind (S_large, cur_S_large);
num_cur_nodes_large_pre = 2 * num_cur_nodes_large;
if (as.scalar (cur_Q_large[1,1]) == 0) {
num_cur_nodes_large = 0;
} else {
cur_nodes_large = cur_Q_large;
num_cur_nodes_large = ncol (cur_Q_large);
}
}
##### PREPARE MODEL FOR SMALL NODES
cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin = "cols");
if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist
num_cur_nodes_small = ncol (cur_nodes_small_nonzero);
}
if (num_cur_nodes_small > 0) { # SMALL nodes to process
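# Conservative preallocation: a subtree grown in memory from n samples contains at most on
# the order of 2^ceil(log2(n)) nodes, so that much space is reserved per small node, plus
# one extra column per node itself.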
reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))) + num_cur_nodes_small;
cur_Q_small = matrix (0, rows = 2, cols = reserve_len);
cur_F_small = matrix (0, rows = 3, cols = reserve_len);
cur_NC_small = matrix (0, rows = 5, cols = reserve_len);
cur_S_small = matrix (0, rows = 1, cols = reserve_len * distinct_values_max); # split values of the best feature
}
##### LOOP OVER SMALL NODES...
parfor (i7 in 1:num_cur_nodes_small, check = 0) {
cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]);
cur_tree_small = as.scalar (cur_nodes_small_nonzero[3,i7]);
# build dataset for SMALL node
Ix = (L[,cur_tree_small] == cur_node_small);
if (num_scale_features > 0) {
X_scale_ext_small = removeEmpty (target = X_scale_ext, margin = "rows", select = Ix);
}
if (num_cat_features > 0) {
X_cat_small = removeEmpty (target = X_cat, margin = "rows", select = Ix);
}
L_small = removeEmpty (target = L * Ix, margin = "rows");
C_small = removeEmpty (target = C * Ix, margin = "rows");
Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows");
# compute offset
offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log (2))));
start_ind_global = 1;
if (i7 > 1) {
start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]);
}
start_ind_S_global = 1;
if (i7 > 1) {
start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 - 1),]) * distinct_values_max);
}
Q = matrix (0, rows = 2, cols = 1);
Q[1,1] = cur_node_small;
Q[2,1] = cur_tree_small;
F = matrix (0, rows = 3, cols = 1);
NC = matrix (0, rows = 5, cols = 1);
S = matrix (0, rows = 1, cols = 1);
cur_nodes_ = matrix (cur_node_small, rows = 2, cols = 1);
cur_nodes_[1,1] = cur_node_small;
cur_nodes_[2,1] = cur_tree_small;
num_cur_nodes = 1;
level_ = level;
while (num_cur_nodes > 0 & level_ < depth) {
level_ = level_ + 1;
cur_Q = matrix (0, rows = 2, cols = 2 * num_cur_nodes);
cur_F = matrix (0, rows = 3, cols = num_cur_nodes);
cur_NC = matrix (0, rows = 5, cols = 2 * num_cur_nodes);
cur_S = matrix (0, rows = 1, cols = num_cur_nodes * distinct_values_max);
parfor (i8 in 1:num_cur_nodes, check = 0) {
cur_node = as.scalar (cur_nodes_[1,i8]);
cur_tree = as.scalar (cur_nodes_[2,i8]);
# select sample features WOR
feature_samples = sample (num_features_total, num_feature_samples);
feature_samples = order (target = feature_samples, by = 1);
num_scale_feature_samples = sum (feature_samples <= num_scale_features);
num_cat_feature_samples = num_feature_samples - num_scale_feature_samples;
# --- find best split ---
# samples that reach cur_node
Ix = (L_small[,cur_tree] == cur_node);
cur_Y_bin = Y_bin_small * (Ix * C_small[,cur_tree]);
label_counts_overall = colSums (cur_Y_bin);
label_sum_overall = sum (label_counts_overall);
label_dist_overall = label_counts_overall / label_sum_overall;
if (imp == "entropy") {
label_dist_zero = (label_dist_overall == 0);
cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2);
} else { # imp == "Gini"
cur_impurity = sum (label_dist_overall * (1 - label_dist_overall));
}
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0 & num_scale_feature_samples > 0) {
scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
# main operation
label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small);
# compute left and right label distribution
label_sum_left = rowSums (label_counts_left_scale);
label_dist_left = label_counts_left_scale / label_sum_left;
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log (2)
impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
label_sum_right = rowSums (label_counts_right_scale);
label_dist_right = label_counts_right_scale / label_sum_right;
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale);
I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
# determine best feature to split on and the split value
feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
feature_start_ind[1,1] = 1;
if (num_scale_features > 1) {
feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
}
max_I_gain_found = 0;
max_I_gain_found_ind = 0;
best_i = 0;
for (i in 1:num_scale_feature_samples) { # scale_feature_samples is num_scale_feature_samples x 1
cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]);
cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]);
I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
cur_max_I_gain = max (I_gain_portion);
cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
if (cur_max_I_gain > max_I_gain_found) {
max_I_gain_found = cur_max_I_gain;
max_I_gain_found_ind = cur_max_I_gain_ind;
best_i = i;
}
}
best_scale_gain = max_I_gain_found;
max_I_gain_ind_scale = max_I_gain_found_ind;
best_scale_feature = 0;
if (best_i > 0) {
best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
}
best_scale_split = max_I_gain_ind_scale;
if (best_scale_feature > 1) {
best_scale_split = best_scale_split + as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
}
}
if (num_cat_features > 0 & num_cat_feature_samples > 0){
cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
# initialization
split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
split_values = split_values_bin;
split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
I_gains = split_values_offset;
impurities_left = split_values_offset;
impurities_right = split_values_offset;
best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
# main operation
label_counts = t (t (cur_Y_bin) %*% X_cat_small);
parfor (i9 in 1:num_cat_feature_samples, check = 0){
cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
start_ind = 1;
if (cur_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,cur_cat_feature]);
cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
label_sum = rowSums (cur_label_counts);
label_dist = cur_label_counts / label_sum;
if (imp == "entropy") {
label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
log_label_dist = log (label_dist); # / log(2)
impurity = - rowSums (label_dist * log_label_dist);
impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
} else { # imp == "Gini"
impurity = rowSums (label_dist * (1 - label_dist));
}
# sort cur feature by impurity
cur_distinct_values = seq (1, nrow (cur_label_counts));
cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
label_counts_sorted = P %*% cur_label_counts;
# compute left and right label distribution
label_counts_left = cumsum (label_counts_sorted);
label_sum_left = rowSums (label_counts_left);
label_dist_left = label_counts_left / label_sum_left;
label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
log_label_dist_left = log (label_dist_left); # / log(2)
impurity_left = - rowSums (label_dist_left * log_label_dist_left);
} else { # imp == "Gini"
impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
}
#
label_counts_right = - label_counts_left + label_counts_overall;
label_sum_right = rowSums (label_counts_right);
label_dist_right = label_counts_right / label_sum_right;
label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
if (imp == "entropy") {
label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
log_label_dist_right = log (label_dist_right); # / log (2)
impurity_right = - rowSums (label_dist_right * log_label_dist_right);
} else { # imp == "Gini"
impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
}
I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
Ix_label_sum_left_zero = (label_sum_left == 0);
Ix_label_sum_right_zero = (label_sum_right == 0);
Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
I_gain = I_gain * (1 - Ix_label_sum_zero);
I_gain[nrow (I_gain),] = 0; # last entry invalid
max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
for (i10 in 1:max_I_gain_ind) {
ind = as.scalar (cur_feature_sorted[i10,1]);
if (ind == 1) {
split_values_bin[1,start_ind] = 1.0;
} else {
split_values_bin[1,(start_ind + ind - 1)] = 1.0;
}
}
split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
I_gains[1,cur_cat_feature] = max (I_gain);
impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]);
impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]);
best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,];
best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,];
}
# determine best feature to split on and the split values
best_cat_feature = as.scalar (rowIndexMax (I_gains));
best_cat_gain = max (I_gains);
start_ind = 1;
if (best_cat_feature > 1) {
start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
}
offset = as.scalar (distinct_values[1,best_cat_feature]);
best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
}
# compare best scale feature to best cat. feature and pick the best one
if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
# --- update model ---
cur_F[1,i8] = best_scale_feature;
cur_F[2,i8] = 1;
cur_F[3,i8] = 1;
cur_S[1,(i8 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = X_scale_ext_small[, best_scale_split];
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C_small[,cur_tree]);
right_child_size = sum (Ix_right * C_small[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]);
cur_impurity_right = as.scalar(impurity_right_scale[best_scale_split,]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf
cur_label_counts_left = label_counts_left_scale[best_scale_split,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) {
cur_label_counts_left = label_counts_left_scale[best_scale_split,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) {
cur_label_counts_right = label_counts_right_scale[best_scale_split,];
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
}
} else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) {
# --- update model ---
cur_F[1,i8] = best_cat_feature;
cur_F[2,i8] = 2;
offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
S_start_ind = (i8 - 1) * distinct_values_max + 1;
cur_F[3,i8] = offset_nonzero;
cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
left_child = 2 * (cur_node - 1) + 1 + 1;
right_child = 2 * (cur_node - 1) + 2 + 1;
# samples going to the left subtree
Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
Ix_left = (Ix_left >= 1);
Ix_left = Ix * Ix_left;
Ix_right = Ix * (1 - Ix_left);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
left_child_size = sum (Ix_left * C_small[,cur_tree]);
right_child_size = sum (Ix_right * C_small[,cur_tree]);
# check if left or right child is a leaf
left_pure = FALSE;
right_pure = FALSE;
cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) &
(right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
cur_label_counts_right = label_counts_overall - cur_label_counts_left;
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) {
cur_label_counts_left = best_label_counts_left[best_cat_feature,];
cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
left_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) {
cur_label_counts_right = best_label_counts_right[best_cat_feature,];
cur_NC[1,(2 * i8)] = right_child;
cur_NC[2,(2 * i8)] = cur_tree;
cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
right_pure = TRUE;
# compute number of misclassified points
cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
}
} else {
print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
right_pure = TRUE;
left_pure = TRUE;
cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node;
cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
class_label = as.scalar (rowIndexMax (label_counts_overall));
cur_NC[3,(2 * (i8 - 1) + 1)] = class_label;
cur_NC[4,(2 * (i8 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
cur_NC[5,(2 * (i8 - 1) + 1)] = 1; # special leaf
}
# add nodes to Q
if (!left_pure) {
cur_Q[1,(2 * (i8 - 1)+ 1)] = left_child;
cur_Q[2,(2 * (i8 - 1)+ 1)] = cur_tree;
}
if (!right_pure) {
cur_Q[1,(2 * i8)] = right_child;
cur_Q[2,(2 * i8)] = cur_tree;
}
}
cur_Q = removeEmpty (target = cur_Q, margin = "cols");
Q = cbind (Q, cur_Q);
NC = cbind (NC, cur_NC);
F = cbind (F, cur_F);
S = cbind (S, cur_S);
num_cur_nodes_pre = 2 * num_cur_nodes;
if (as.scalar (cur_Q[1,1]) == 0) {
num_cur_nodes = 0;
} else {
cur_nodes_ = cur_Q;
num_cur_nodes = ncol (cur_Q);
}
}
cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q;
cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC;
cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F;
cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S;
}
##### PREPARE MODEL FOR SMALL NODES
if (num_cur_nodes_small > 0) { # small nodes already processed
cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols");
if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = cbind (Q_small, cur_Q_small);
cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols");
if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = cbind (NC_small, cur_NC_small);
cur_F_small = removeEmpty (target = cur_F_small, margin = "cols"); #
if (as.scalar (cur_F_small[1,1]) != 0) F_small = cbind (F_small, cur_F_small);
cur_S_small = removeEmpty (target = cur_S_small, margin = "cols"); #
if (as.scalar (cur_S_small[1,1]) != 0) S_small = cbind (S_small, cur_S_small);
num_cur_nodes_small = 0; # reset
}
print (" --- end level " + level + ", remaining no. of LARGE nodes to expand " + num_cur_nodes_large + " --- ");
}
#### prepare model
print ("PREPARING MODEL...")
### large nodes
if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1) {
Q_large = Q_large[,2:ncol (Q_large)];
}
if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1) {
NC_large = NC_large[,2:ncol (NC_large)];
}
if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1) {
S_large = S_large[,2:ncol (S_large)];
}
if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1) {
F_large = F_large[,2:ncol (F_large)];
}
### small nodes
if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1) {
Q_small = Q_small[,2:ncol (Q_small)];
}
if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1) {
NC_small = NC_small[,2:ncol (NC_small)];
}
if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1) {
S_small = S_small[,2:ncol (S_small)];
}
if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1) {
F_small = F_small[,2:ncol (F_small)];
}
# check for special leaves and if there are any remove them from Q_large and Q_small
special_large_leaves_ind = NC_large[5,];
num_special_large_leaf = sum (special_large_leaves_ind);
if (num_special_large_leaf > 0) {
print ("PROCESSING " + num_special_large_leaf + " SPECIAL LARGE LEAVES...");
special_large_leaves = removeEmpty (target = NC_large[1:2,] * special_large_leaves_ind, margin = "cols");
large_internal_ind = 1 - colSums (outer (t (special_large_leaves[1,]), Q_large[1,], "==") * outer (t (special_large_leaves[2,]), Q_large[2,], "=="));
Q_large = removeEmpty (target = Q_large * large_internal_ind, margin = "cols");
F_large = removeEmpty (target = F_large, margin = "cols"); # remove special leaves from F
}
special_small_leaves_ind = NC_small[5,];
num_special_small_leaf = sum (special_small_leaves_ind);
if (num_special_small_leaf > 0) {
print ("PROCESSING " + num_special_small_leaf + " SPECIAL SMALL LEAVES...");
special_small_leaves = removeEmpty (target = NC_small[1:2,] * special_small_leaves_ind, margin = "cols");
small_internal_ind = 1 - colSums (outer (t (special_small_leaves[1,]), Q_small[1,], "==") * outer (t (special_small_leaves[2,]), Q_small[2,], "=="));
Q_small = removeEmpty (target = Q_small * small_internal_ind, margin = "cols");
F_small = removeEmpty (target = F_small, margin = "cols"); # remove special leaves from F
}
# model corresponding to large internal nodes
no_large_internal_node = FALSE;
if (as.scalar (Q_large[1,1]) != 0) {
print ("PROCESSING LARGE INTERNAL NODES...");
num_large_internal = ncol (Q_large);
max_offset = max (max (F_large[3,]), max (F_small[3,]));
M1_large = matrix (0, rows = 6 + max_offset, cols = num_large_internal);
M1_large[1:2,] = Q_large;
M1_large[4:6,] = F_large;
# process S_large
cum_offsets_large = cumsum (t (F_large[3,]));
parfor (it in 1:num_large_internal, check = 0) {
start_ind = 1;
if (it > 1) {
start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]);
}
offset = as.scalar (F_large[3,it]);
M1_large[7:(7 + offset - 1),it] = t (S_large[1,start_ind:(start_ind + offset - 1)]);
}
} else {
print ("No LARGE internal nodes available");
no_large_internal_node = TRUE;
}
# model corresponding to small internal nodes
no_small_internal_node = FALSE;
if (as.scalar (Q_small[1,1]) != 0) {
print ("PROCESSING SMALL INTERNAL NODES...");
num_small_internal = ncol (Q_small);
M1_small = matrix (0, rows = 6 + max_offset, cols = num_small_internal);
M1_small[1:2,] = Q_small;
M1_small[4:6,] = F_small;
# process S_small
cum_offsets_small = cumsum (t (F_small[3,]));
parfor (it in 1:num_small_internal, check = 0) {
start_ind = 1;
if (it > 1) {
start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]);
}
offset = as.scalar (F_small[3,it]);
M1_small[7:(7 + offset - 1),it] = t (S_small[1,start_ind:(start_ind + offset - 1)]);
}
} else {
print ("No SMALL internal nodes available");
no_small_internal_node = TRUE;
}
# model corresponding to large leaf nodes
no_large_leaf_node = FALSE;
if (as.scalar (NC_large[1,1]) != 0) {
print ("PROCESSING LARGE LEAF NODES...");
num_large_leaf = ncol (NC_large);
M2_large = matrix (0, rows = 6 + max_offset, cols = num_large_leaf);
M2_large[1:2,] = NC_large[1:2,];
M2_large[5:7,] = NC_large[3:5,];
} else {
print ("No LARGE leaf nodes available");
no_large_leaf_node = TRUE;
}
# model corresponding to small leaf nodes
no_small_leaf_node = FALSE;
if (as.scalar (NC_small[1,1]) != 0) {
print ("PROCESSING SMALL LEAF NODES...");
num_small_leaf = ncol (NC_small);
M2_small = matrix (0, rows = 6 + max_offset, cols = num_small_leaf);
M2_small[1:2,] = NC_small[1:2,];
M2_small[5:7,] = NC_small[3:5,];
} else {
print ("No SMALL leaf nodes available");
no_small_leaf_node = TRUE;
}
if (no_large_internal_node) {
M1 = M1_small;
} else if (no_small_internal_node) {
M1 = M1_large;
} else {
M1 = cbind (M1_large, M1_small);
}
if (no_large_leaf_node) {
M2 = M2_small;
} else if (no_small_leaf_node) {
M2 = M2_large;
} else {
M2 = cbind (M2_large, M2_small);
}
M = cbind (M1, M2);
M = t (order (target = t (M), by = 1)); # sort by node id
M = t (order (target = t (M), by = 2)); # sort by tree id
# removing redundant subtrees
if (ncol (M) > 1) {
print ("CHECKING FOR REDUNDANT SUBTREES...");
red_leaf = TRUE;
process_red_subtree = FALSE;
invalid_node_ind = matrix (0, rows = 1, cols = ncol (M));
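# Iteratively collapse redundant splits: whenever two sibling leaves of the same tree predict
# the same label, the split at their parent is useless, so the parent inherits the leaf's
# rows 3 and below (turning it into a leaf with the shared label) and both leaves are marked
# invalid (M[4,] = -1). Repeating until no such pair remains prunes entire redundant subtrees
# bottom-up.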
while (red_leaf & ncol (M) > 1) {
leaf_ind = (M[4,] == 0);
labels = M[5,] * leaf_ind;
tree_ids = M[2,];
parent_ids = floor (M[1,] /2);
cond1 = (labels[,1:(ncol (M) - 1)] == labels[,2:ncol (M)]); # sibling leaves with same label
cond2 = (parent_ids[,1:(ncol (M) - 1)] == parent_ids[,2:ncol (M)]); # same parents
cond3 = (tree_ids[,1:(ncol (M) - 1)] == tree_ids[,2:ncol (M)]); # same tree
red_leaf_ind = cond1 * cond2 * cond3 * leaf_ind[,2:ncol (M)];
if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist
red_leaf_ids = M[1:2,2:ncol (M)] * red_leaf_ind;
red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin = "cols");
parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){
cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]);
cur_parent_id = floor (cur_right_leaf_id / 2);
cur_tree_id = as.scalar (red_leaf_ids_nonzero[2,it]);
cur_right_leaf_pos = as.scalar (rowIndexMax ((M[1,] == cur_right_leaf_id) * (M[2,] == cur_tree_id)));
cur_parent_pos = as.scalar(rowIndexMax ((M[1,] == cur_parent_id) * (M[2,] == cur_tree_id)));
M[3:nrow (M), cur_parent_pos] = M[3:nrow (M), cur_right_leaf_pos];
M[4,cur_right_leaf_pos] = -1;
M[4,cur_right_leaf_pos - 1] = -1;
invalid_node_ind[1,cur_right_leaf_pos] = 1;
invalid_node_ind[1,cur_right_leaf_pos - 1] = 1;
}
process_red_subtree = TRUE;
} else {
red_leaf = FALSE;
}
}
if (process_red_subtree) {
print ("REMOVING REDUNDANT SUBTREES...");
valid_node_ind = (invalid_node_ind == 0);
M = removeEmpty (target = M * valid_node_ind, margin = "cols");
}
}
internal_ind = (M[4,] > 0);
internal_ids = M[1:2,] * internal_ind;
internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols");
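# Row 3 of M stores, for each internal node, the column offset from the node to its left
# child: the outer () comparisons locate the column position of every internal node id (a1)
# and of its left child id (a2 = 2 * a1) within the same tree, and table () scatters the
# differences pos_a2 - pos_a1 into M[3,].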
if (as.scalar (internal_ids_nonzero[1,1]) > 0) { # if internal nodes exist
a1 = internal_ids_nonzero[1,];
a2 = internal_ids_nonzero[1,] * 2;
vcur_tree_id = internal_ids_nonzero[2,];
pos_a1 = rowIndexMax( outer(t(a1), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") );
pos_a2 = rowIndexMax( outer(t(a2), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") );
M[3,] = t(table(pos_a1, 1, pos_a2 - pos_a1, ncol(M), 1));
}
else {
print ("All trees in the random forest contain only one leaf!");
}
if (fileC != " ") {
write (C, fileC, format = fmtO);
}
write (M, fileM, format = fmtO);