scripts.algorithms.PCA.dml Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# 
# This script performs Principal Component Analysis (PCA) on the given input data.
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME     TYPE   DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# INPUT    String ---      Location to read the matrix A of feature vectors
# K        Int    ncol(A)  Dimension of the new vector space constructed from the eigenvectors
# CENTER   Int    0        Indicates whether or not to center data
# SCALE    Int    0        Indicates whether or not to scale data
# OFMT     String CSV      Output data format
# PROJDATA Int    0        Indicates whether or not the data should be projected
# MODEL    String ""       Location of an already existing model (eigenvectors and eigenvalues);
#                            if empty, a new model is computed and written to OUTPUT
# OUTPUT   String /        Location to write output matrices (dominant eigenvalues, their square
#                            roots, new basis vectors, and data projected onto new basis vectors)
# Example:
# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
#   OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
# ---------------------------------------------------------------------------------------------

# ---- Read the input matrix and the named script arguments (with defaults) ----
A = read($INPUT);
K = ifdef($K, ncol(A));
ofmt = ifdef($OFMT, "CSV");
projectData = ifdef($PROJDATA,0);
model = ifdef($MODEL,"");
center = ifdef($CENTER,0);
scale = ifdef($SCALE,0);
output = ifdef($OUTPUT,"/");

# Remember whether the caller supplied an existing model BEFORE `model` is
# redirected to `output` below. The projection step must key off this flag:
# testing `model != ""` afterwards would always be true and would silently
# ignore PROJDATA=0.
reuseModel = (model != "");

# Placeholder so that evec_dominant is defined before the branches below.
evec_dominant = matrix(0,cols=1,rows=1);

if (reuseModel) {
	# Reuse an existing model: only the dominant eigenvectors are loaded.
	# On this path A is used exactly as read (the centering/scaling below is skipped).
	# NOTE(review): if the stored model was built with CENTER=1 / SCALE=1, the
	# data projected here is not preprocessed the same way -- confirm this is intended.
	evec_dominant = read(model+"/dominant.eigen.vectors");
} else {
	# No model supplied: compute one and store it under the output location.
	model = output;

	N = nrow(A);
	D = ncol(A);

	# Fail early with a clear message instead of an index-out-of-range error
	# after the (expensive) eigen decomposition.
	if (K < 1 | K > D) {
		stop("PCA: K must be between 1 and the number of columns of the input (" + D + "), but was " + K);
	}

	# Optional centering: subtract the column means.
	if (center == 1) {
		cm = colMeans(A);
		A = A - cm;
	}

	# Optional scaling: divide every column by sqrt(cvars).
	if (scale == 1) {
		cvars = colSums(A^2);
		if (center == 1) {
			# A was centered above, so cvars becomes the sample variance of each
			# column (the means recomputed here are those of the centered data).
			cm = colMeans(A);
			cvars = (cvars - N*(cm^2))/(N-1);
		}
		# Without centering, cvars stays the plain column sum of squares,
		# i.e. each column is divided by its L2 norm.
		A = A/sqrt(cvars);
	}

	# Covariance matrix of the (possibly preprocessed) data.
	mu = colSums(A)/N;
	C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;

	# Eigen decomposition of the covariance matrix.
	[evalues, evectors] = eigen(C);

	# Build a permutation matrix that orders the eigenvalues decreasingly.
	decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
	diagmat = table(seq(1,D),decreasing_Idx);
	# Sort the eigenvalues in decreasing order ...
	evalues = diagmat %*% evalues;
	# ... and reorder the eigenvector columns to match.
	evectors = evectors %*% diagmat;

	# Keep the K dominant eigenvalues / eigenvectors.
	eval_dominant = evalues[1:K, 1];
	evec_dominant = evectors[,1:K];

	# Square roots of the dominant eigenvalues.
	eval_stdev_dominant = sqrt(eval_dominant);

	write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
	write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
	write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
}

# Project the data when explicitly requested (PROJDATA=1) or when an existing
# model was supplied via MODEL.
if (projectData == 1 | reuseModel) {
	# Construct the new data set by treating the dominant eigenvectors as basis vectors.
	newA = A %*% evec_dominant;
	write(newA, output+"/projected.data", format=ofmt);
}