
hex.pca.PCA Maven / Gradle / Ivy
package hex.pca;
import hex.DataInfo;
import hex.ModelBuilder;
import hex.ModelCategory;
import hex.schemas.ModelBuilderSchema;
import hex.schemas.PCAV3;
import hex.svd.SVD;
import hex.svd.SVDModel;
import water.*;
import water.fvec.Frame;
import water.util.Log;
import water.util.PrettyPrint;
import water.util.TwoDimTable;
import java.util.Arrays;
/**
* Quadratically Regularized PCA
* This is an algorithm for dimensionality reduction of numerical data.
* It is a general, parallelized implementation of PCA with quadratic regularization.
* Generalized Low Rank Models
* @author anqi_fu
*
*/
public class PCA extends ModelBuilder {
// Number of columns in training set (p)
private transient int _ncolExp; // With categoricals expanded into 0/1 indicator cols
@Override
public ModelBuilderSchema schema() {
return new PCAV3();
}
@Override
public Job trainModel() {
return start(new PCADriver(), 0);
}
@Override
public ModelCategory[] can_build() {
return new ModelCategory[]{ModelCategory.Clustering};
}
@Override public BuilderVisibility builderVisibility() { return BuilderVisibility.Experimental; };
@Override
protected void checkMemoryFootPrint() {
HeartBeat hb = H2O.SELF._heartbeat;
double p = _train.degreesOfFreedom();
long mem_usage = (long)(hb._cpus_allowed * p*p * 8/*doubles*/ * Math.log((double)_train.lastVec().nChunks())/Math.log(2.)); //one gram per core
long max_mem = hb.get_max_mem();
if (mem_usage > max_mem) {
String msg = "Gram matrices (one per thread) won't fit in the driver node's memory ("
+ PrettyPrint.bytes(mem_usage) + " > " + PrettyPrint.bytes(max_mem)
+ ") - try reducing the number of columns and/or the number of categorical factors.";
error("_train", msg);
cancel(msg);
}
}
public enum Initialization {
SVD, PlusPlus, User
}
// Called from an http request
public PCA(PCAModel.PCAParameters parms) {
super("PCA", parms);
init(false);
}
@Override
public void init(boolean expensive) {
super.init(expensive);
if (_parms._loading_key == null) _parms._loading_key = Key.make("PCALoading_" + Key.rand());
if (_parms._max_iterations < 1 || _parms._max_iterations > 1e6)
error("_max_iterations", "max_iterations must be between 1 and 1e6 inclusive");
if (_train == null) return;
if (_train.numCols() < 2) error("_train", "_train must have more than one column");
_ncolExp = _train.numColsExp(_parms._useAllFactorLevels, false);
// TODO: Initialize _parms._k = min(ncolExp(_train), nrow(_train)) if not set
int k_min = (int)Math.min(_ncolExp, _train.numRows());
if (_parms._k < 1 || _parms._k > k_min) error("_k", "_k must be between 1 and " + k_min);
if (expensive && error_count() == 0) checkMemoryFootPrint();
}
/**
* Given a n by k matrix X, form its Gram matrix
* @param x Matrix of real numbers
* @param transpose If true, compute n by n Gram of rows = XX'
* If false, compute k by k Gram of cols = X'X
* @return A symmetric positive semi-definite Gram matrix
*/
public static double[][] formGram(double[][] x, boolean transpose) {
if (x == null) return null;
int dim_in = transpose ? x[0].length : x.length;
int dim_out = transpose ? x.length : x[0].length;
double[][] xgram = new double[dim_out][dim_out];
// Compute all entries on and above diagonal
if(transpose) {
for (int i = 0; i < dim_in; i++) {
// Outer product = x[i] * x[i]', where x[i] is col i
for (int j = 0; j < dim_out; j++) {
for (int k = j; k < dim_out; k++)
xgram[j][k] += x[j][i] * x[k][i];
}
}
} else {
for (int i = 0; i < dim_in; i++) {
// Outer product = x[i]' * x[i], where x[i] is row i
for (int j = 0; j < dim_out; j++) {
for (int k = j; k < dim_out; k++)
xgram[j][k] += x[i][j] * x[i][k];
}
}
}
// Fill in entries below diagonal since Gram is symmetric
for (int i = 0; i < dim_in; i++) {
for (int j = 0; j < dim_out; j++) {
for (int k = 0; k < j; k++)
xgram[j][k] = xgram[k][j];
}
}
return xgram;
}
public static double[][] formGram(double[][] x) { return formGram(x, false); }
class PCADriver extends H2O.H2OCountedCompleter {
protected void computeStatsFillModel(PCAModel pca, SVDModel svd) {
// Eigenvectors are just the V matrix
String[] colTypes = new String[_parms._k];
String[] colFormats = new String[_parms._k];
String[] colHeaders = new String[_parms._k];
Arrays.fill(colTypes, "double");
Arrays.fill(colFormats, "%5f");
for (int i = 0; i < colHeaders.length; i++) colHeaders[i] = "PC" + String.valueOf(i + 1);
pca._output._eigenvectors_raw = svd._output._v;
pca._output._eigenvectors = new TwoDimTable("Rotation", null, _train.names(),
colHeaders, colTypes, colFormats, "", new String[_train.numCols()][], svd._output._v);
// Compute standard deviation
double[] sdev = new double[svd._output._d.length];
double[] vars = new double[svd._output._d.length];
double totVar = 0;
double dfcorr = 1.0 / Math.sqrt(_train.numRows() - 1.0);
for (int i = 0; i < sdev.length; i++) {
sdev[i] = dfcorr * svd._output._d[i];
vars[i] = sdev[i] * sdev[i];
totVar += vars[i];
}
pca._output._std_deviation = sdev;
// Importance of principal components
double[] prop_var = new double[vars.length]; // Proportion of total variance
double[] cum_var = new double[vars.length]; // Cumulative proportion of total variance
for (int i = 0; i < vars.length; i++) {
prop_var[i] = vars[i] / totVar;
cum_var[i] = i == 0 ? prop_var[0] : cum_var[i - 1] + prop_var[i];
}
pca._output._pc_importance = new TwoDimTable("Importance of components", null,
new String[]{"Standard deviation", "Proportion of Variance", "Cumulative Proportion"},
colHeaders, colTypes, colFormats, "", new String[3][], new double[][]{sdev, prop_var, cum_var});
// Fill PCA model with additional info needed for scoring
if(_parms._keep_loading) pca._output._loading_key = svd._output._u_key;
pca._output._normSub = svd._output._normSub;
pca._output._normMul = svd._output._normMul;
pca._output._permutation = svd._output._permutation;
pca._output._nnums = svd._output._nnums;
pca._output._ncats = svd._output._ncats;
pca._output._catOffsets = svd._output._catOffsets;
}
// Main worker thread
@Override protected void compute2() {
PCAModel model = null;
DataInfo dinfo = null;
DataInfo xinfo = null;
Frame x = null;
try {
_parms.read_lock_frames(PCA.this); // Fetch & read-lock input frames
init(true);
if (error_count() > 0) throw new IllegalArgumentException("Found validation errors: " + validationErrors());
// The model to be built
model = new PCAModel(dest(), _parms, new PCAModel.PCAOutput(PCA.this));
model.delete_and_lock(_key);
SVDModel.SVDParameters parms = new SVDModel.SVDParameters();
parms._train = _parms._train;
parms._ignored_columns = _parms._ignored_columns;
parms._ignore_const_cols = _parms._ignore_const_cols;
parms._score_each_iteration = _parms._score_each_iteration;
parms._useAllFactorLevels = _parms._useAllFactorLevels;
parms._transform = _parms._transform;
parms._nv = _parms._k;
parms._max_iterations = _parms._max_iterations;
parms._seed = _parms._seed;
// Calculate standard deviation and projection as well
parms._only_v = false;
parms._u_key = _parms._loading_key;
parms._keep_u = _parms._keep_loading;
SVDModel svd = null;
SVD job = null;
try {
job = new SVD(parms);
svd = job.trainModel().get();
} finally {
if (job != null) job.remove();
if (svd != null) svd.remove();
}
// Recover PCA results from SVD model
computeStatsFillModel(model, svd);
model.update(_key);
update(1);
done();
} catch (Throwable t) {
Job thisJob = DKV.getGet(_key);
if (thisJob._state == JobState.CANCELLED) {
Log.info("Job cancelled by user.");
} else {
t.printStackTrace();
failed(t);
throw t;
}
} finally {
_parms.read_unlock_frames(PCA.this);
if (model != null) model.unlock(_key);
if (dinfo != null) dinfo.remove();
if (xinfo != null) xinfo.remove();
if (x != null && !_parms._keep_loading) x.delete();
}
tryComplete();
}
Key self() {
return _key;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy