# scripts.algorithms.Univar-Stats.dml (systemml - Declarative Machine Learning)
# NOTE: this copy was taken from an artifact listing (Maven / Gradle / Ivy),
# which reports that there is a newer version of systemml: 1.2.0
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#
# DML Script to compute univariate statistics for all attributes in a given data set
#
# INPUT PARAMETERS:
# -------------------------------------------------------------------------------------------------
# NAME           TYPE     DEFAULT  MEANING
# -------------------------------------------------------------------------------------------------
# X              String   ---      Location of INPUT data matrix
# TYPES          String   ---      Location of INPUT matrix that lists the types of the features:
#                                     1 for scale, 2 for nominal, 3 for ordinal
# CONSOLE_OUTPUT Boolean  FALSE    If TRUE, print summary statistics to console
# STATS          String   ---      Location of OUTPUT matrix with summary statistics computed for
#                                  all features (17 statistics - 14 scale, 3 categorical)
# -------------------------------------------------------------------------------------------------
# OUTPUT: Matrix of summary statistics
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv
#    STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
#

# Optional flag (defaults to FALSE): when TRUE, the statistics are also
# printed to the console after they have been computed.
consoleOutput = ifdef($CONSOLE_OUTPUT, FALSE);

# Inputs. K is read as K[1,i] further down, i.e. one type code per column of A
# (1 = scale, 2 = nominal, 3 = ordinal, as listed in the header above).
A = read($X); # data file
K = read($TYPES); # attribute kind file

# number of features/attributes
n = ncol(A);

# number of data records
m = nrow(A);

# number of statistics
numBaseStats = 17; # (14 scale stats, 3 categorical stats)

# matrix to store computed statistics: one row per statistic, one column per feature
baseStats = matrix(0, rows=numBaseStats, cols=n);

# Compute max domain size among all categorical attributes.
# (K > 1) acts as a 0/1 mask over the columns, so columns whose type code is 1
# (scale) contribute 0 and only nominal/ordinal columns determine the result.
maxs = colMaxs(A);
maxDomainSize = max( (K > 1) * maxs );
maxDomain = as.integer(maxDomainSize);

parfor(i in 1:n, check=0) {

	# i-th attribute and its declared type code (taken from the first row of K)
	feat = A[,i];
	featKind = as.scalar(K[1,i]);

	if ( featKind == 1 ) {
		# ---------------- SCALE attribute: fills rows 1-14 ----------------

		# extremes and range
		featMin = min(feat);
		featMax = max(feat);
		spread = featMax - featMin;
		baseStats[1,i] = featMin;
		baseStats[2,i] = featMax;
		baseStats[3,i] = spread;

		# mean and the 2nd/3rd/4th moments as returned by moment()
		meanVal = mean(feat);
		mom2 = moment(feat, 2);
		mom3 = moment(feat, 3);
		mom4 = moment(feat, 4);

		# variance is the 2nd moment rescaled by m/(m-1); the dispersion
		# measures below are all derived from it
		variance = m/(m-1.0)*mom2;
		stdDev = sqrt(variance);
		stdErrMean = stdDev/sqrt(m);
		coeffVar = stdDev/meanVal;
		baseStats[4,i] = meanVal;
		baseStats[5,i] = variance;
		baseStats[6,i] = stdDev;
		baseStats[7,i] = stdErrMean;
		baseStats[8,i] = coeffVar;

		# skewness and kurtosis (the latter with 3 subtracted)
		skewness = mom3/(stdDev^3);
		kurtosis = mom4/(stdDev^4) - 3;

		# standard errors of skewness and kurtosis; these depend only on the
		# number of records m. Each is written as a product of small ratios;
		# the single-fraction equivalent is kept in the comment above it.
		#stdErrSkew=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) );
		stdErrSkew=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );

		#stdErrKurt= sqrt( (4*(m^2-1)*stdErrSkew^2)/((m+5.0)*(m-3.0)) );
		stdErrKurt=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * stdErrSkew^2 );

		baseStats[9,i] = skewness;
		baseStats[10,i] = kurtosis;
		baseStats[11,i] = stdErrSkew;
		baseStats[12,i] = stdErrKurt;

		# median and interquartile mean
		medianVal = median(feat); #quantile(feat, 0.5);
		iqMean = interQuartileMean(feat);
		baseStats[13,i] = medianVal;
		baseStats[14,i] = iqMean;
	}
	else if (featKind == 2 | featKind == 3) {
		# ------------- CATEGORICAL attribute: fills rows 15-17 -------------

		# category codes must be positive; otherwise report the problem and
		# leave this feature's categorical statistics at their initial 0
		smallest = min(feat);
		if (smallest <=0) {
			print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + smallest + " in attribute " + i);
		}
		else {
			# largest code observed is reported as the number of categories
			numCategories = max(feat);

			# per-category counts, padded to maxDomain rows
			freq = table(feat,1, maxDomain, 1);

			# index of a most frequent category, and how many categories
			# share that highest count
			topFreq = max(freq)
			modeIdx = rowIndexMax(t(freq));
			modeCount = sum(freq == topFreq);

			baseStats[15,i] = numCategories;
			baseStats[16,i] = modeIdx;
			baseStats[17,i] = modeCount;
		}
	}
}

# Optional human-readable report: one section per feature, in column order.
if (consoleOutput == TRUE) {
	for(j in 1:n) {
		print("-------------------------------------------------");

		# type code and computed statistics of the j-th feature
		featKind = as.scalar(K[1,j]);
		colStats = baseStats[,j];

		if (featKind == 1) {
			# scale feature: statistics 1-14
			print("Feature [" + j + "]: Scale");
			print(" (01) Minimum             | " + as.scalar(colStats[1,1]));
			print(" (02) Maximum             | " + as.scalar(colStats[2,1]));
			print(" (03) Range               | " + as.scalar(colStats[3,1]));
			print(" (04) Mean                | " + as.scalar(colStats[4,1]));
			print(" (05) Variance            | " + as.scalar(colStats[5,1]));
			print(" (06) Std deviation       | " + as.scalar(colStats[6,1]));
			print(" (07) Std err of mean     | " + as.scalar(colStats[7,1]));
			print(" (08) Coeff of variation  | " + as.scalar(colStats[8,1]));
			print(" (09) Skewness            | " + as.scalar(colStats[9,1]));
			print(" (10) Kurtosis            | " + as.scalar(colStats[10,1]));
			print(" (11) Std err of skewness | " + as.scalar(colStats[11,1]));
			print(" (12) Std err of kurtosis | " + as.scalar(colStats[12,1]));
			print(" (13) Median              | " + as.scalar(colStats[13,1]));
			print(" (14) Interquartile mean  | " + as.scalar(colStats[14,1]));
		}
		else if (featKind == 2 | featKind == 3) {
			# categorical feature: statistics 15-17, printed as integers
			if (featKind == 3) {
				print("Feature [" + j + "]: Categorical (Ordinal)");
			} else {
				print("Feature [" + j + "]: Categorical (Nominal)");
			}
			print(" (15) Num of categories   | " + as.integer(as.scalar(colStats[15,1])));
			print(" (16) Mode                | " + as.integer(as.scalar(colStats[16,1])));
			print(" (17) Num of modes        | " + as.integer(as.scalar(colStats[17,1])));
		}
	}
}

# Persist the full statistics matrix to the location given by $STATS.
write(baseStats, $STATS);