
package hex.pca;
import Jama.Matrix;
import Jama.SingularValueDecomposition;
import hex.DataInfo;
import hex.ModelBuilder;
import hex.ModelMetrics;
import hex.ModelCategory;
import hex.glrm.EmbeddedGLRM;
import hex.glrm.GLRM;
import hex.glrm.GLRMModel;
import hex.gram.Gram;
import hex.gram.Gram.GramTask;
import hex.schemas.ModelBuilderSchema;
import hex.schemas.PCAV3;
import hex.pca.PCAModel.PCAParameters;
import hex.svd.EmbeddedSVD;
import hex.svd.SVD;
import hex.svd.SVDModel;
import water.*;
import water.util.ArrayUtils;
import water.util.Log;
import water.util.PrettyPrint;
import water.util.TwoDimTable;
import java.util.Arrays;
/**
 * Principal Components Analysis.
 * Computes the principal components of the training frame from a singular value
 * decomposition, obtained either directly from the Gram matrix (GramSVD), via the
 * power method or randomized subspace iteration (Power, Randomized), or recovered
 * from a generalized low-rank decomposition (GLRM).
 * Reference: SVD via Power Method Algorithm
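 * <p>Usage sketch (illustrative only; assumes a parsed {@code Frame fr} is already in the DKV):
 * <pre>
 *   PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
 *   parms._train = fr._key;
 *   parms._k = 3;
 *   parms._pca_method = PCAModel.PCAParameters.Method.GramSVD;
 *   PCAModel model = new PCA(parms).trainModel().get();
 * </pre>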
 * @author anqi_fu
 */
public class PCA extends ModelBuilder<PCAModel, PCAModel.PCAParameters, PCAModel.PCAOutput> {
  // Number of columns in training set (p)
  private transient int _ncolExp;   // With categoricals expanded into 0/1 indicator cols

  @Override public ModelBuilderSchema schema() {
    return new PCAV3();
  }

  @Override protected Job<PCAModel> trainModelImpl(long work, boolean restartTimer) {
    return start(new PCADriver(), work, restartTimer);
  }
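
  // One progress unit per update(1, ...) checkpoint in the driver below: the GramSVD
  // path reports five checkpoints, the Power/Randomized and GLRM paths report three.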
  @Override public long progressUnits() {
    return _parms._pca_method == PCAParameters.Method.GramSVD ? 5 : 3;
  }

  @Override public ModelCategory[] can_build() {
    return new ModelCategory[]{ ModelCategory.Clustering };
  }

  @Override public BuilderVisibility builderVisibility() { return BuilderVisibility.Stable; }
  @Override protected void checkMemoryFootPrint() {
    HeartBeat hb = H2O.SELF._heartbeat;
    double p = _train.degreesOfFreedom();
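    // Rough size estimate (explanatory note): each allowed core holds one p x p Gram
    // of 8-byte doubles, and the tree-style reduce keeps roughly log2(#chunks)
    // partial Grams alive at once, hence the product below.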
    long mem_usage = (long) (hb._cpus_allowed * p * p * 8/*doubles*/ * Math.log((double) _train.lastVec().nChunks()) / Math.log(2.));   // one Gram per core
    long max_mem = hb.get_max_mem();
    if (mem_usage > max_mem) {
      String msg = "Gram matrices (one per thread) won't fit in the driver node's memory ("
              + PrettyPrint.bytes(mem_usage) + " > " + PrettyPrint.bytes(max_mem)
              + ") - try reducing the number of columns and/or the number of categorical factors.";
      error("_train", msg);
      cancel(msg);
    }
  }
  // Called from an http request
  public PCA(PCAParameters parms) {
    super("PCA", parms);
    init(false);
  }
  @Override public void init(boolean expensive) {
    super.init(expensive);
    if (_parms._max_iterations < 1 || _parms._max_iterations > 1e6)
      error("_max_iterations", "max_iterations must be between 1 and 1e6 inclusive");
    if (_train == null) return;
    _ncolExp = _train.numColsExp(_parms._use_all_factor_levels, false);
    // if (_ncolExp < 2) error("_train", "_train must have more than one column when categoricals are expanded");
    // TODO: Initialize _parms._k = min(ncolExp(_train), nrow(_train)) if not set
    int k_min = (int) Math.min(_ncolExp, _train.numRows());
    if (_parms._k < 1 || _parms._k > k_min) error("_k", "_k must be between 1 and " + k_min);
    if (!_parms._use_all_factor_levels && _parms._pca_method == PCAParameters.Method.GLRM)
      error("_use_all_factor_levels", "GLRM only implemented for _use_all_factor_levels = true");
    if (_parms._pca_method != PCAParameters.Method.GLRM && expensive && error_count() == 0) checkMemoryFootPrint();
  }
  class PCADriver extends H2O.H2OCountedCompleter<PCADriver> {
    protected PCADriver() { super(true); }   // bump driver priority
    protected void buildTables(PCAModel pca, String[] rowNames) {
      // Eigenvectors are just the V matrix
      String[] colTypes = new String[_parms._k];
      String[] colFormats = new String[_parms._k];
      String[] colHeaders = new String[_parms._k];
      Arrays.fill(colTypes, "double");
      Arrays.fill(colFormats, "%5f");
      assert rowNames.length == pca._output._eigenvectors_raw.length;
      for (int i = 0; i < colHeaders.length; i++) colHeaders[i] = "PC" + String.valueOf(i + 1);
      pca._output._eigenvectors = new TwoDimTable("Rotation", null, rowNames, colHeaders, colTypes, colFormats, "",
              new String[pca._output._eigenvectors_raw.length][], pca._output._eigenvectors_raw);
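
      // Variance explained by each component is the squared standard deviation;
      // the proportions below are taken relative to the total variance.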
      double[] vars = new double[pca._output._std_deviation.length];
      for (int i = 0; i < vars.length; i++)
        vars[i] = pca._output._std_deviation[i] * pca._output._std_deviation[i];

      // Importance of principal components
      double[] prop_var = new double[vars.length];   // Proportion of total variance
      double[] cum_var = new double[vars.length];    // Cumulative proportion of total variance
      for (int i = 0; i < vars.length; i++) {
        prop_var[i] = vars[i] / pca._output._total_variance;
        cum_var[i] = i == 0 ? prop_var[0] : cum_var[i-1] + prop_var[i];
      }
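      // The three-row table below mirrors the layout of R's summary(prcomp):
      // standard deviation, proportion of variance, and cumulative proportion per PC.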
      pca._output._importance = new TwoDimTable("Importance of components", null,
              new String[]{"Standard deviation", "Proportion of Variance", "Cumulative Proportion"},
              colHeaders, colTypes, colFormats, "", new String[3][], new double[][]{pca._output._std_deviation, prop_var, cum_var});
      pca._output._model_summary = pca._output._importance;
    }
    protected void computeStatsFillModel(PCAModel pca, SVDModel svd) {
      // Fill PCA model with additional info needed for scoring
      pca._output._normSub = svd._output._normSub;
      pca._output._normMul = svd._output._normMul;
      pca._output._permutation = svd._output._permutation;
      pca._output._nnums = svd._output._nnums;
      pca._output._ncats = svd._output._ncats;
      pca._output._catOffsets = svd._output._catOffsets;
      pca._output._nobs = svd._output._nobs;

      // Fill model with eigenvectors and standard deviations
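      // Each singular value d_i of the (transformed) training matrix satisfies
      // Var_i = d_i^2 / (nobs - 1), so the standard deviations are d_i / sqrt(nobs - 1).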
      pca._output._std_deviation = ArrayUtils.mult(svd._output._d, 1.0 / Math.sqrt(svd._output._nobs - 1.0));
      pca._output._eigenvectors_raw = svd._output._v;
      pca._output._total_variance = svd._output._total_variance;
      buildTables(pca, svd._output._names_expanded);
    }
    protected void computeStatsFillModel(PCAModel pca, GLRMModel glrm) {
      assert glrm._parms._recover_svd;

      // Fill model with additional info needed for scoring
      pca._output._normSub = glrm._output._normSub;
      pca._output._normMul = glrm._output._normMul;
      pca._output._permutation = glrm._output._permutation;
      pca._output._nnums = glrm._output._nnums;
      pca._output._ncats = glrm._output._ncats;
      pca._output._catOffsets = glrm._output._catOffsets;
      pca._output._objective = glrm._output._objective;

      // Fill model with eigenvectors and standard deviations
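      // The singular values recovered from the GLRM factors get the same
      // d_i / sqrt(n - 1) correction as in the direct SVD case above.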
      double dfcorr = 1.0 / Math.sqrt(_train.numRows() - 1.0);
      pca._output._std_deviation = new double[_parms._k];
      pca._output._eigenvectors_raw = glrm._output._eigenvectors_raw;
      pca._output._total_variance = 0;
      for (int i = 0; i < glrm._output._singular_vals.length; i++) {
        pca._output._std_deviation[i] = dfcorr * glrm._output._singular_vals[i];
        pca._output._total_variance += pca._output._std_deviation[i] * pca._output._std_deviation[i];
      }
      buildTables(pca, glrm._output._names_expanded);
    }
    protected void computeStatsFillModel(PCAModel pca, DataInfo dinfo, SingularValueDecomposition svd, Gram gram, long nobs) {
      // Save adapted frame info for scoring later
      pca._output._normSub = dinfo._normSub == null ? new double[dinfo._nums] : dinfo._normSub;
      if (dinfo._normMul == null) {
        pca._output._normMul = new double[dinfo._nums];
        Arrays.fill(pca._output._normMul, 1.0);
      } else
        pca._output._normMul = dinfo._normMul;
      pca._output._permutation = dinfo._permutation;
      pca._output._nnums = dinfo._nums;
      pca._output._ncats = dinfo._cats;
      pca._output._catOffsets = dinfo._catOffsets;
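
      // The Gram matrix is X'X/n, so its eigenvalues are d_i^2/n; scaling by
      // n/(n-1) turns them into unbiased variances d_i^2/(n-1) before taking the sqrt.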
      double dfcorr = nobs / (nobs - 1.0);
      double[] sval = svd.getSingularValues();
      pca._output._std_deviation = new double[_parms._k];   // Only want first k standard deviations
      for (int i = 0; i < _parms._k; i++) {
        sval[i] = dfcorr * sval[i];   // Degrees of freedom = n-1, where n = nobs = # row observations processed
        pca._output._std_deviation[i] = Math.sqrt(sval[i]);
      }

      double[][] eigvec = svd.getV().getArray();
      pca._output._eigenvectors_raw = new double[eigvec.length][_parms._k];   // Only want first k eigenvectors
      for (int i = 0; i < eigvec.length; i++)
        System.arraycopy(eigvec[i], 0, pca._output._eigenvectors_raw[i], 0, _parms._k);
      pca._output._total_variance = dfcorr * gram.diagSum();   // Since gram = X'X/n, but variance requires n-1 in denominator
      buildTables(pca, dinfo.coefNames());
    }
    // Main worker thread
    @Override protected void compute2() {
      PCAModel model = null;
      DataInfo dinfo = null;
      try {
        init(true);   // Initialize parameters
        _parms.read_lock_frames(PCA.this);   // Fetch & read-lock input frames
        if (error_count() > 0) throw new IllegalArgumentException("Found validation errors: " + validationErrors());

        // The model to be built
        model = new PCAModel(dest(), _parms, new PCAModel.PCAOutput(PCA.this));
        model.delete_and_lock(_key);

        if (_parms._pca_method == PCAParameters.Method.GramSVD) {
          dinfo = new DataInfo(Key.make(), _train, _valid, 0, _parms._use_all_factor_levels, _parms._transform,
                  DataInfo.TransformType.NONE, /* skipMissing */ !_parms._impute_missing, /* imputeMissing */ _parms._impute_missing,
                  /* missingBucket */ false, /* weights */ false, /* offset */ false, /* fold */ false, /* intercept */ false);
          DKV.put(dinfo._key, dinfo);
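          // Note: the skipMissing/imputeMissing flags above are complementary, so rows
          // with NAs are either dropped or imputed depending on _impute_missing.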

          // Calculate and save Gram matrix of training data
          // NOTE: Gram computes A'A/n where n = nrow(A) = number of rows in training set (excluding rows with NAs)
          update(1, "Begin distributed calculation of Gram matrix");
          GramTask gtsk = new GramTask(self(), dinfo).doAll(dinfo._adaptedFrame);
          Gram gram = gtsk._gram;   // TODO: This ends up with all NaNs if training data has too many missing values
          assert gram.fullN() == _ncolExp;
          model._output._nobs = gtsk._nobs;

          // Compute SVD of Gram A'A/n using JAMA library
          // Note: Singular values ordered in weakly descending order by algorithm
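          // Since the Gram matrix is symmetric positive semidefinite, its SVD coincides
          // with its eigendecomposition: the columns of V are the principal directions.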
update(1, "Calculating SVD of Gram matrix locally");
Matrix gramJ = new Matrix(gtsk._gram.getXX());
SingularValueDecomposition svdJ = gramJ.svd();
update(1, "Computing stats from SVD");
computeStatsFillModel(model, dinfo, svdJ, gram, gtsk._nobs);
} else if(_parms._pca_method == PCAParameters.Method.Power || _parms._pca_method == PCAParameters.Method.Randomized) {
SVDModel.SVDParameters parms = new SVDModel.SVDParameters();
parms._train = _parms._train;
parms._valid = _parms._valid;
parms._ignored_columns = _parms._ignored_columns;
parms._ignore_const_cols = _parms._ignore_const_cols;
parms._score_each_iteration = _parms._score_each_iteration;
parms._use_all_factor_levels = _parms._use_all_factor_levels;
parms._transform = _parms._transform;
parms._nv = _parms._k;
parms._max_iterations = _parms._max_iterations;
parms._seed = _parms._seed;
// Set method for computing SVD accordingly
if(_parms._pca_method == PCAParameters.Method.Power)
parms._svd_method = SVDModel.SVDParameters.Method.Power;
else if(_parms._pca_method == PCAParameters.Method.Randomized)
parms._svd_method = SVDModel.SVDParameters.Method.Randomized;
// Calculate standard deviation, but not projection
parms._only_v = false;
parms._keep_u = false;
parms._save_v_frame = false;
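          // The flags above request the singular values d in addition to V; U and the
          // exported V frame are not needed to recover the PCA standard deviations and
          // rotation, and skipping them saves memory.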

          SVDModel svd = null;
          SVD job = null;
          try {
            job = new EmbeddedSVD(_key, _progressKey, parms);
            svd = job.trainModel().get();
            if (job.isCancelledOrCrashed())
              PCA.this.cancel();
          } finally {
            if (job != null) job.remove();
            if (svd != null) svd.remove();
          }

          // Recover PCA results from SVD model
          update(1, "Computing stats from SVD");
          computeStatsFillModel(model, svd);
        } else if (_parms._pca_method == PCAParameters.Method.GLRM) {
          GLRMModel.GLRMParameters parms = new GLRMModel.GLRMParameters();
          parms._train = _parms._train;
          parms._valid = _parms._valid;
          parms._ignored_columns = _parms._ignored_columns;
          parms._ignore_const_cols = _parms._ignore_const_cols;
          parms._score_each_iteration = _parms._score_each_iteration;
          parms._transform = _parms._transform;
          parms._k = _parms._k;
          parms._max_iterations = _parms._max_iterations;
          parms._seed = _parms._seed;
          parms._recover_svd = true;

          parms._loss = GLRMModel.GLRMParameters.Loss.Quadratic;
          parms._gamma_x = parms._gamma_y = 0;
          parms._regularization_x = GLRMModel.GLRMParameters.Regularizer.None;
          parms._regularization_y = GLRMModel.GLRMParameters.Regularizer.None;
          parms._init = GLRM.Initialization.PlusPlus;
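          // With quadratic loss and no regularization, the GLRM objective is plain
          // squared reconstruction error, whose rank-k optimum is the truncated SVD
          // (Eckart-Young), so _recover_svd can back out the PCA solution from the factors.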

          GLRMModel glrm = null;
          GLRM job = null;
          try {
            job = new EmbeddedGLRM(_key, _progressKey, parms);
            glrm = job.trainModel().get();
            if (job.isCancelledOrCrashed())
              PCA.this.cancel();
          } finally {
            if (job != null) job.remove();
            if (glrm != null) {
              // glrm._parms._representation_key.get().delete();
              glrm._output._representation_key.get().delete();
              glrm.remove();
            }
          }

          // Recover PCA results from GLRM model
          update(1, "Computing stats from GLRM decomposition");
          computeStatsFillModel(model, glrm);
        }

        update(1, "Scoring and computing metrics on training data");
        if (_parms._compute_metrics) {
          model.score(_parms.train()).delete();   // This scores on the training data and appends a ModelMetrics
          ModelMetrics mm = ModelMetrics.getFromDKV(model, _parms.train());
          model._output._training_metrics = mm;
        }

        // At the end: validation scoring (no need to gather scoring history)
        update(1, "Scoring and computing metrics on validation data");
        if (_valid != null) {
          model.score(_parms.valid()).delete();   // This appends a ModelMetrics on the validation set
          model._output._validation_metrics = ModelMetrics.getFromDKV(model, _parms.valid());
        }

        model.update(self());
        done();
      } catch (Throwable t) {
        Job thisJob = DKV.getGet(_key);
        if (thisJob._state == Job.JobState.CANCELLED) {
          Log.info("Job cancelled by user.");
        } else {
          t.printStackTrace();
          failed(t);
          throw t;
        }
      } finally {
        updateModelOutput();
        _parms.read_unlock_frames(PCA.this);
        if (model != null) model.unlock(_key);
        if (dinfo != null) dinfo.remove();
      }
      tryComplete();
    }
    Key self() {
      return _key;
    }
  }
}