#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#
# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME    TYPE    DEFAULT    MEANING
# --------------------------------------------------------------------------------------------
# X       String   ---      Location (on HDFS) to read the matrix X of feature vectors
# Y       String   ---      Location (on HDFS) to read the 1-column matrix Y of response values
# B       String   ---      Location to store the estimated regression parameters (the betas)
# S       String   ---      Location to write the selected features, ordered as computed by the algorithm
# O       String   " "      Location to write the printed statistics; defaults to standard output
# icpt    Int      0        Intercept presence, shifting and rescaling the columns of X:
#                           0 = no intercept, no shifting, no rescaling;
#                           1 = add intercept, but neither shift nor rescale X;
#                           2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# thr     Double   0.01     Threshold to stop the algorithm: if the decrease in AIC falls below thr,
#                           no further features are checked and the algorithm stops
# fmt     String   "text"   Matrix output format for B (the betas) only, usually "text" or "csv"
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value:
#         OUTPUT SIZE:   OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
# icpt=0: ncol(X)   x 1  Betas for X only                Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
# icpt=1: ncol(X)+1 x 1  Betas for X and intercept       Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# icpt=2: ncol(X)+1 x 2  Col.1: betas for X & intercept  Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
#                        Col.2: betas for shifted/rescaled X and intercept
#
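# For example, given B written with icpt=2, a prediction sketch in DML (the file names and
# the new-data matrix X_new are hypothetical, not produced by this script):
#
#   B = read ("OUTPUT_DIR/betas");            # (ncol(X)+1) x 2 matrix for icpt=2
#   X_new = read ("INPUT_DIR/X_new");         # new data with the same columns as X
#   w  = B[1:ncol(X_new), 1];                 # column 1: betas for the original (unscaled) X
#   b0 = as.scalar (B[ncol(X_new)+1, 1]);     # intercept term
#   y_pred = X_new %*% w + b0;                # predicted response
#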
# In addition, the last run of the linear regression produces some statistics in CSV format,
# one comma-separated name-value pair per line, as follows:
#
# NAME                  MEANING
# -------------------------------------------------------------------------------------
# AVG_TOT_Y             Average of the response value Y
# STDEV_TOT_Y           Standard Deviation of the response value Y
# AVG_RES_Y             Average of the residual Y - pred(Y|X), i.e. residual bias
# STDEV_RES_Y           Standard Deviation of the residual Y - pred(Y|X)
# DISPERSION            GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
# PLAIN_R2              Plain R^2 of residual with bias included vs. total average
# ADJUSTED_R2           Adjusted R^2 of residual with bias included vs. total average
# PLAIN_R2_NOBIAS       Plain R^2 of residual with bias subtracted vs. total average
# ADJUSTED_R2_NOBIAS    Adjusted R^2 of residual with bias subtracted vs. total average
# PLAIN_R2_VS_0         * Plain R^2 of residual with bias included vs. zero constant
# ADJUSTED_R2_VS_0      * Adjusted R^2 of residual with bias included vs. zero constant
# -------------------------------------------------------------------------------------
# * The last two statistics are only printed if there is no intercept (icpt=0).
# If the best AIC is achieved without any features, the matrix of selected features contains 0;
# moreover, in this case no further statistics are produced.
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
#     O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
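#
# A Spark invocation follows the same pattern (a sketch; the exact spark-submit options
# depend on the cluster setup and are an assumption here):
# spark-submit SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
#     O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv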

fileX = $X;
fileY = $Y;
fileB = $B;
fileS = $S;

# currently only the forward selection strategy is supported: start from one feature and
# iteratively add features as long as AIC improves
dir = "forward";

fmt  = ifdef ($fmt, "text");
intercept_status = ifdef ($icpt, 0);   
thr = ifdef ($thr, 0.01);  

print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
print ("Reading X and Y...");
X_orig = read (fileX);
y = read (fileY);

n = nrow (X_orig);
m_orig = ncol (X_orig);

# BEGIN STEPWISE LINEAR REGRESSION

if (dir == "forward") {  
	
	continue = TRUE;
	columns_fixed = matrix (0, rows = 1, cols = m_orig);
	columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
	
	# X_global stores the best model found at each step 
	X_global = matrix (0, rows = n, cols = 1);
	
	if (intercept_status == 1 | intercept_status == 2) {
		beta = mean (y);
		AIC_best = 2 + n * log(sum((beta - y)^2) / n);
	} else {
		beta = 0;
		AIC_best = n * log(sum(y^2) / n);
	}
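	# Note: AIC = 2 * (#parameters) + n * log(RSS / n), up to an additive constant;
	# with an intercept the baseline model has one parameter (the mean of y), hence the "2 +",
	# while without an intercept the baseline is the zero model with no parameters.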
	AICs = matrix (AIC_best, rows = 1, cols = m_orig);
	print ("Best AIC without any features: " + AIC_best);
	
	# First pass to examine single features
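	# (parfor evaluates the m_orig single-feature candidate models independently in parallel)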
	parfor (i in 1:m_orig) { 	
		[AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");					
		AICs[1,i] = AIC_1;
	}

	# Determine the best AIC 
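	# (a feature qualifies only if it lowers AIC by more than thr * |AIC_best|,
	# i.e. the decrease must exceed a thr-fraction of the current best AIC)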
	column_best = 0;	
	for (k in 1:m_orig) {
		AIC_cur = as.scalar (AICs[1,k]);
		if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
			column_best = k;
			AIC_best = as.scalar(AICs[1,k]);
		}
	}
	
	if (column_best == 0) {
		print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
		S = matrix (0, rows=1, cols=1);
		if (intercept_status == 0) {
			B = matrix (beta, rows = m_orig, cols = 1);
		} else {
			B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
			B_tmp[m_orig + 1,] = beta;
			B = B_tmp;
		}
		write (S, fileS, format=fmt);
		write (B, fileB, format=fmt);
		stop ("");
	}
	print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);	
	columns_fixed[1,column_best] = 1;
	columns_fixed_ordered[1,1] = column_best;
	X_global = X_orig[,column_best];		
		
	while (continue) {
		# Subsequent passes over the features
		parfor (i in 1:m_orig) { 
			if (as.scalar(columns_fixed[1,i]) == 0) {	
			
				# Construct the feature matrix
				X = cbind (X_global, X_orig[,i]);
				
				[AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
				AICs[1,i] = AIC_2;
			}	
		}
	
		# Determine the best AIC
		for (k in 1:m_orig) {
			AIC_cur = as.scalar (AICs[1,k]);
			if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
				column_best = k;
				AIC_best = as.scalar(AICs[1,k]);
			}
		}
				
		# cbind the best-found feature (column) to X_global
		if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
			print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
			columns_fixed[1,column_best] = 1;
			columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best));
			X_global = cbind (X_global, X_orig[,column_best]);
			if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
				continue = FALSE;
			}
		} else {
			continue = FALSE;
		}
	}
	
	# run linear regression with selected set of features
	print ("Running linear regression with selected features...");
	[AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB); 
	
} else {
	stop ("Currently only forward selection strategy is supported!");
} 


/*
* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
* It also outputs the AIC of the computed model.  
*/
linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
		
	intercept_status = ifdef ($icpt, 0);
	fmt  = ifdef ($fmt, "text");	 		
	n = nrow (X);	
	m = ncol (X);
	
	# Introduce the intercept, shift and rescale the columns of X if needed
	if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
		ones_n = matrix (1, rows = n, cols = 1);
		X = cbind (X, ones_n);
		m = m - 1;
	}
	m_ext = ncol(X);
	
	if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
							     # Important assumption: X [, m_ext] = ones_n
		avg_X_cols = t(colSums(X)) / n;
		var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
		is_unsafe = (var_X_cols <= 0);
		scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
		scale_X [m_ext, 1] = 1;
		shift_X = - avg_X_cols * scale_X;
		shift_X [m_ext, 1] = 0;
	} else {
		scale_X = matrix (1, rows = m_ext, cols = 1);
		shift_X = matrix (0, rows = m_ext, cols = 1);
	}
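	# scale_X and shift_X encode the standardization as a linear map T: T equals
	# diag (scale_X) with t(shift_X) added as its last row, so the standardized
	# matrix is X %*% T (the last, all-ones column of X is left untouched by T).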

	# BEGIN THE DIRECT SOLVE ALGORITHM

	A = t(X) %*% X;
	b = t(X) %*% y;
	if (intercept_status == 2) {
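		# Fold the standardization into the normal equations rather than materializing
		# the scaled X: the two updates below compute A = t(T) %*% A %*% T and
		# b = t(T) %*% b for the map T described above.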
		A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
		A =   diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
		b =   diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
	}

	beta_unscaled = solve (A, b);
	
	# END THE DIRECT SOLVE ALGORITHM

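	# Map the solution back to the original scale: beta = T %*% beta_unscaled,
	# i.e. rescale each coefficient and absorb the shifts into the intercept.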
	if (intercept_status == 2) {
		beta = scale_X * beta_unscaled;
		beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
	} else {
		beta = beta_unscaled;
	}
	
	# COMPUTE AIC
	y_residual = y - X %*% beta;
	ss_res = sum (y_residual ^ 2);
	eq_deg_of_freedom = m_ext;
	AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
	
	if (fileB != " ") {
	
		fileO = ifdef ($O, " ");
		fileS = $S;
		
		print ("Computing the statistics...");
		avg_tot = sum (y) / n;
		ss_tot = sum (y ^ 2);
		ss_avg_tot = ss_tot - n * avg_tot ^ 2;
		var_tot = ss_avg_tot / (n - 1);
		# y_residual and ss_res were already computed above for the AIC
		avg_res = sum (y_residual) / n;
		ss_avg_res = ss_res - n * avg_res ^ 2;

		plain_R2 = 1 - ss_res / ss_avg_tot;
		if (n > m_ext) {
			dispersion  = ss_res / (n - m_ext);
			adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
		} else {
			dispersion  = 0.0 / 0.0;
			adjusted_R2 = 0.0 / 0.0;
		}

		plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
		deg_freedom = n - m - 1;
		if (deg_freedom > 0) {
			var_res = ss_avg_res / deg_freedom;
			adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
		} else {
			var_res = 0.0 / 0.0;
			adjusted_R2_nobias = 0.0 / 0.0;
			print ("Warning: zero or negative number of degrees of freedom.");
		}

		plain_R2_vs_0 = 1 - ss_res / ss_tot;
		if (n > m) {
			adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
		} else {
			adjusted_R2_vs_0 = 0.0 / 0.0;
		}

		str = "AVG_TOT_Y," + avg_tot;                                    #  Average of the response value Y
		str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));             #  Standard Deviation of the response value Y
		str = append (str, "AVG_RES_Y," + avg_res);                      #  Average of the residual Y - pred(Y|X), i.e. residual bias
		str = append (str, "STDEV_RES_Y," + sqrt (var_res));             #  Standard Deviation of the residual Y - pred(Y|X)
		str = append (str, "DISPERSION," + dispersion);                  #  GLM-style dispersion, i.e. residual sum of squares / # d.f.
		str = append (str, "PLAIN_R2," + plain_R2);                      #  Plain R^2 of residual with bias included vs. total average
		str = append (str, "ADJUSTED_R2," + adjusted_R2);                #  Adjusted R^2 of residual with bias included vs. total average
		str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias);        #  Plain R^2 of residual with bias subtracted vs. total average
		str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);  #  Adjusted R^2 of residual with bias subtracted vs. total average
		if (intercept_status == 0) {
			str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0);        #  Plain R^2 of residual with bias included vs. zero constant
			str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);  #  Adjusted R^2 of residual with bias included vs. zero constant
		}

		if (fileO != " ") {
			write (str, fileO);
		} else {
			print (str);
		}

		# Prepare the output matrix
		print ("Writing the output matrix...");
		if (intercept_status == 2) {
			beta_out = cbind (beta, beta_unscaled);
		} else {
			beta_out = beta;
		}
		
		# Output which features give the best AIC and are being used for linear regression 
		write (Selected, fileS, format=fmt);
		
		no_selected = ncol (Selected);
		max_selected = max (Selected);
		last = max_selected + 1;	
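		# "last" is the scatter target for the intercept coefficient; if some original
		# columns were never selected, it is relocated to position m_orig+1 further below.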
		
		if (intercept_status != 0) {
		
			Selected_ext = cbind (Selected, as.matrix (last));			
			P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
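			# table() builds a 0/1 scatter matrix with P1[i, Selected_ext[1,i]] = 1;
			# P1 * beta followed by colSums places each coefficient back at its
			# original column position in X_orig.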

			if (intercept_status == 2) {
			
				P1_beta = P1 * beta;
				P2_beta = colSums (P1_beta);
				P1_beta_unscaled = P1 * beta_unscaled;
				P2_beta_unscaled = colSums(P1_beta_unscaled);
				
				if (max_selected < m_orig) {
					P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
					P2_beta_unscaled = cbind (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected)));
					
					P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1]; 
					P2_beta[1, max_selected + 1] = 0;
				
					P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1]; 
					P2_beta_unscaled[1, max_selected + 1] = 0;
				}
				beta_out = cbind (t(P2_beta), t(P2_beta_unscaled));
				
			} else {
			
				P1_beta = P1 * beta;
				P2_beta = colSums (P1_beta);
				
				if (max_selected < m_orig) {
					P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
					P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1] ; 
					P2_beta[1, max_selected + 1] = 0;
				}
				beta_out = t(P2_beta);
				
			}
		} else {
		
			P1 = table (seq (1, no_selected), t(Selected)); 
			P1_beta = P1 * beta;
			P2_beta = colSums (P1_beta);	

			if (max_selected < m_orig) {
				P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
			}		

			beta_out = t(P2_beta);	
		}
		
		write ( beta_out, fileB, format=fmt );		
	}
}




