package hex.svd;
import Jama.Matrix;
import Jama.QRDecomposition;
import Jama.SingularValueDecomposition;
import hex.DataInfo;
import hex.DataInfo.Row;
import hex.FrameTask;
import hex.ModelBuilder;
import hex.ModelCategory;
import hex.glrm.GLRMModel;
import hex.gram.Gram;
import hex.gram.Gram.GramTask;
import hex.svd.SVDModel.SVDParameters;
import hex.util.LinearAlgebraUtils;
import hex.util.LinearAlgebraUtils.BMulInPlaceTask;
import hex.util.LinearAlgebraUtils.BMulTask;
import hex.util.LinearAlgebraUtils.SMulTask;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.rapids.Rapids;
import water.util.ArrayUtils;
import water.util.PrettyPrint;
import water.util.TwoDimTable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import static hex.util.DimensionReductionUtils.createScoringHistoryTableDR;
import static hex.util.DimensionReductionUtils.getTransformedEigenvectors;
import static java.lang.StrictMath.sqrt;
import static water.util.ArrayUtils.*;
/**
* Singular Value Decomposition.
* References:
* - SVD via the Power Method algorithm
* - Proof of convergence for the Power Method
* - Randomized algorithms for matrix approximation (Halko et al, http://arxiv.org/pdf/0909.4061.pdf)
* @author anqi_fu
*/
public class SVD extends ModelBuilder<SVDModel, SVDModel.SVDParameters, SVDModel.SVDOutput> {
// Convergence tolerance
private final double TOLERANCE = 1e-16; // Cutoff for estimation error of right singular vector
private final double EPS = 1e-16; // cutoff if vector norm is too small
// Maximum number of columns when categoricals expanded
private final int MAX_COLS_EXPANDED = 5000;
private boolean _callFromGLRM; // when SVD is used as an init method for GLRM, need to initialize properly
private GLRMModel _glrmModel;
// Number of columns in training set (p)
private transient int _ncolExp; // With categoricals expanded into 0/1 indicator cols
boolean _wideDataset = false; // defaults to false; may be enabled by checkMemoryFootPrint_impl or setWideDataset
private double[] _estimatedSingularValues; // store estimated singular values for power method
private boolean _matrixRankReached = false; // stop if the eigenvector norm becomes too small, i.e. the rank of the matrix has been reached
private boolean _failedConvergence = false; // warn if the power method failed to converge for some eigenvector calculation
@Override protected SVDDriver trainModelImpl() { return new SVDDriver(); }
@Override public ModelCategory[] can_build() { return new ModelCategory[]{ ModelCategory.DimReduction }; }
@Override public BuilderVisibility builderVisibility() { return BuilderVisibility.Experimental; }
@Override public boolean isSupervised() { return false; }
@Override public boolean havePojo() { return true; }
@Override public boolean haveMojo() { return false; }
// Called from an http request
public SVD(SVDModel.SVDParameters parms ) { super(parms ); init(false); _glrmModel=null; _callFromGLRM=false;}
public SVD(SVDModel.SVDParameters parms, Job job) { super(parms,job); init(false); _glrmModel=null; _callFromGLRM=false;}
public SVD(SVDModel.SVDParameters parms, Job job, boolean callFromGlrm, GLRMModel gmodel) {
super(parms,job);
init(false);
_callFromGLRM = callFromGlrm;
if (gmodel == null)
error("_train SVD for GLRM", "Your GLRM model parameter is null.");
_glrmModel = gmodel;
}
public SVD(boolean startup_once) { super(new SVDParameters(),startup_once); }
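/*
Minimal usage sketch (illustrative, not part of the original source; assumes an initialized
H2O cloud and a parsed training Frame `trainFrame` already in the DKV):
  SVDModel.SVDParameters parms = new SVDModel.SVDParameters();
  parms._train = trainFrame._key; // key of the training Frame
  parms._nv = 4; // number of right singular vectors to extract
  parms._svd_method = SVDParameters.Method.GramSVD;
  SVDModel model = new SVD(parms).trainModel().get(); // blocks until training finishes
*/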
@Override
protected void checkMemoryFootPrint_impl() {
HeartBeat hb = H2O.SELF._heartbeat;
double p = LinearAlgebraUtils.numColsExp(_train, true);
double r = _train.numRows();
boolean useGramSVD = _parms._svd_method == SVDParameters.Method.GramSVD;
boolean usePower = _parms._svd_method == SVDParameters.Method.Power;
boolean useRandomized = _parms._svd_method == SVDParameters.Method.Randomized;
double gramSize = _train.lastVec().nChunks()==1 ? 1 :
Math.log((double) _train.lastVec().nChunks()) / Math.log(2.); // log2(nChunks), clamped to 1 when nChunks==1 since log2(1)=0 would zero out the estimate
long mem_usage = (useGramSVD || usePower || useRandomized) ? (long) (hb._cpus_allowed * p * p * 8/*doubles*/
* gramSize) : 1; //one gram per core
long mem_usage_w = (useGramSVD || usePower || useRandomized) ? (long) (hb._cpus_allowed * r * r * 8/*doubles*/
* gramSize) : 1; //one gram per core
long max_mem = hb.get_free_mem();
if ((mem_usage > max_mem) && (mem_usage_w > max_mem)) {
String msg = "Gram matrices (one per thread) won't fit in the driver node's memory ("
+ PrettyPrint.bytes(mem_usage) + " > " + PrettyPrint.bytes(max_mem)
+ ") - try reducing the number of columns and/or the number of categorical factors.";
error("_train", msg);
}
// _wideDataset is true if original memory does not fit.
if (mem_usage > max_mem) {
_wideDataset = true; // have to set _wideDataset in this case
} else { // both ways fit into memory. Want to choose wideDataset if p is too big.
if ((p > 5000) && ( r < 5000)) {
_wideDataset = true;
}
}
}
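/*
Worked example of the estimate above (illustrative numbers only): with 4 allowed CPUs,
p = 1,000 expanded columns and 8 chunks, gramSize = log2(8) = 3, so
mem_usage ~= 4 * 1000 * 1000 * 8 bytes * 3 ~= 96 MB of driver memory for the per-core Grams.
*/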
/*
Set value of wideDataset. Note that this routine is used for test purposes only and is not intended
for users.
*/
public void setWideDataset(boolean isWide) {
_wideDataset = isWide;
}
@Override public void init(boolean expensive) {
super.init(expensive);
if (_parms._max_iterations < 1)
error("_max_iterations", "max_iterations must be at least 1");
if(_train == null) return;
if (_callFromGLRM) // when used to initialize GLRM, need to treat binary numeric columns with binary loss as numeric columns
_ncolExp = _glrmModel._output._catOffsets[_glrmModel._output._catOffsets.length-1]+_glrmModel._output._nnums;
else
_ncolExp = LinearAlgebraUtils.numColsExp(_train,_parms._use_all_factor_levels);
if (_ncolExp > MAX_COLS_EXPANDED) {
warn("_train", "_train has " + _ncolExp + " columns when categoricals are expanded. " +
"Algorithm may be slow.");
}
if(_parms._nv < 1 || _parms._nv > _ncolExp)
error("_nv", "Number of right singular values must be between 1 and " + _ncolExp);
if (expensive && error_count() == 0) {
if (!(_train.hasNAs()) || _parms._impute_missing) {
checkMemoryFootPrint(); // perform memory check here if dataset contains no NAs or if impute_missing enabled
}
}
}
// Compute ivv_sum - vec * vec' for symmetric array ivv_sum
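// Example of the deflation this supports (illustrative): starting from ivv_sum = I,
//   updateIVVSum(ivv_sum, v1); // ivv_sum becomes I - v1*v1'
//   updateIVVSum(ivv_sum, v2); // ivv_sum becomes I - v1*v1' - v2*v2'
// Only the lower triangle is computed and mirrored, since ivv_sum stays symmetric.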
public static double[][] updateIVVSum(double[][] ivv_sum, double[] vec) {
double diff;
for(int i = 0; i < vec.length; i++) {
for(int j = 0; j < i; j++) {
diff = ivv_sum[i][j] - vec[i] * vec[j];
ivv_sum[i][j] = ivv_sum[j][i] = diff;
}
ivv_sum[i][i] -= vec[i] * vec[i];
}
return ivv_sum;
}
class SVDDriver extends Driver {
SVDModel _model;
private double[] powerLoop(Gram gram, long seed, SVDModel model, double[] randomInitialV, double[] finalV, int k)
{
// Arrays.fill(randomInitialV,0);
randomInitialV = ArrayUtils.gaussianVector(seed+k, randomInitialV); // random vector for each iteration!
div(randomInitialV, l2norm(randomInitialV)); // normalize initial vector
return powerLoop(gram, randomInitialV, model, finalV, k);
}
/**
* The problem I have with the original powerLoop is that err gets very small very quickly and
* we never really iterate much. I am changing the stopping condition to the one used for
* symmetric matrices, as follows:
*
* Let x = A^m * x_0 and lambda_1 = (Ax . x) / (x . x).
* Stop if err = sqrt((Ax . Ax) / (x . x) - lambda_1^2) < tolerance or max iterations is reached.
*
* @param gram Gram matrix whose dominant eigenvector is sought
* @param v initial (normalized) eigenvector estimate, overwritten during iteration
* @param model model whose scoring history is updated each iteration
* @param vnew scratch buffer for the updated eigenvector, same length as v
* @param k index of the eigenvector/singular value currently being computed
* @return the converged eigenvector estimate v
*/
private double[] powerLoop(Gram gram, double[] v, SVDModel model, double[] vnew, int k) {
// TODO: What happens if Gram matrix is essentially zero? Numerical inaccuracies in PUBDEV-1161.
assert v.length == gram.fullN();
// Set initial value v_0 to standard normal distribution
int iters = 0;
double err = 2 * TOLERANCE;
double lambda1_calc = 0; // this is the actual singular value that we are looking for as well
double lambda_est = 0;
int eigIndex = model._output._iterations+1; // we start counting at 1 and not zero.
// Update v_i <- (A'Av_{i-1})/||A'Av_{i-1}|| where A'A = Gram matrix of training frame
while(iters < _parms._max_iterations && err > TOLERANCE) {
if (stop_requested()) break;
// Compute x_i <- A'Av_{i-1} and ||x_i||
gram.mul(v, vnew);
lambda1_calc = innerProduct(vnew, v);
lambda_est = innerProduct(vnew, vnew);
double norm = l2norm(vnew);
double invnorm = 0;
err = 0;
if (norm > EPS) { // norm is not too small
invnorm = 1 / norm;
for (int i = 0; i < v.length; i++) {
vnew[i] *= invnorm; // Compute singular vector v_i = x_i/||x_i||
v[i] = vnew[i]; // Update v_i for next iteration
}
err = Math.sqrt(lambda_est - lambda1_calc * lambda1_calc);
iters++; // TODO: Should output vector of final iterations for each k
// store variables for scoring history
model._output._training_time_ms.add(System.currentTimeMillis());
model._output._history_err.add(err);
model._output._history_eigenVectorIndex.add((double) eigIndex);
} else {
_job.warn("_train SVD: Dataset is rank deficient. User specified "+_parms._nv);
_matrixRankReached = true;
break;
}
}
if (err > TOLERANCE) {
_failedConvergence=true;
_job.warn("_train: PCA Power method failed to converge within TOLERANCE. Increase max_iterations or reduce " +
"TOLERANCE to mitigate this problem.");
}
_estimatedSingularValues[k] = lambda1_calc;
return v;
}
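/*
Illustrative sketch only (not used by the driver, added for exposition): classic power
iteration on a small dense symmetric matrix B with the same stopping rule as powerLoop
above, i.e. lambda_1 = (Bv . v)/(v . v) and err = sqrt((Bv . Bv)/(v . v) - lambda_1^2).
Assumes v is normalized on entry.
*/
private double[] powerIterationSketch(double[][] b, double[] v, int maxIters, double tol) {
int n = v.length;
double[] bv = new double[n];
for (int it = 0; it < maxIters; it++) {
for (int i = 0; i < n; i++) { // bv = B*v
double s = 0;
for (int j = 0; j < n; j++) s += b[i][j] * v[j];
bv[i] = s;
}
double lambda1 = 0, bvbv = 0;
for (int i = 0; i < n; i++) { lambda1 += bv[i] * v[i]; bvbv += bv[i] * bv[i]; } // v has unit norm
double norm = Math.sqrt(bvbv);
if (norm <= EPS) break; // matrix is effectively rank deficient; cannot normalize
for (int i = 0; i < n; i++) v[i] = bv[i] / norm; // normalize for the next pass
if (Math.sqrt(Math.max(0, bvbv - lambda1 * lambda1)) < tol) return v; // converged: err < tol
}
return v;
}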
private double computeSigmaU(DataInfo dinfo, SVDModel model, int k, double[][] ivv_sum, Vec[] uvecs, double[] vresult) {
double[] ivv_vk = ArrayUtils.multArrVec(ivv_sum, model._output._v[k], vresult);
CalcSigmaU ctsk = new CalcSigmaU(_job._key, dinfo, ivv_vk).doAll(Vec.T_NUM, dinfo._adaptedFrame);
model._output._d[k] = ctsk._sval;
assert ctsk._nobs == model._output._nobs : "Processed " + ctsk._nobs + " rows but expected " + model._output._nobs; // Check same number of skipped rows as Gram
Frame tmp = ctsk.outputFrame();
uvecs[k] = tmp.vec(0); // Save output column of U
tmp.unlock(_job);
return model._output._d[k];
}
/*
// Algorithm 4.4: Randomized subspace iteration from Halko et al (http://arxiv.org/pdf/0909.4061.pdf)
private Frame randSubIterInPlace(DataInfo dinfo, SVDModel model) {
DataInfo yinfo = null;
Frame yqfrm = null;
try {
// 1) Initialize Y = AG where G ~ N(0,1) and compute Y = QR factorization
_job.update(1, "Initializing random subspace of training data Y");
double[][] gt = ArrayUtils.gaussianArray(_parms._nv, _ncolExp, _parms._seed);
RandSubInit rtsk = new RandSubInit(_job._key, dinfo, gt);
rtsk.doAll(_parms._nv, Vec.T_NUM, dinfo._adaptedFrame);
yqfrm = rtsk.outputFrame(Key.make(), null, null); // Alternates between Y and Q from Y = QR
// Make input frame [A,Q] where A = read-only training data, Y = A \tilde{Q}, Q from Y = QR factorization
// Note: If A is n by p (p = num cols with categoricals expanded), then \tilde{Q} is p by k and Q is n by k
// Q frame is used to save both intermediate Y calculation and final orthonormal Q matrix
Frame aqfrm = new Frame(dinfo._adaptedFrame);
aqfrm.add(yqfrm);
// Calculate Cholesky of Y Gram to get R' = L matrix
_job.update(1, "Computing QR factorization of Y");
yinfo = new DataInfo(yqfrm, null, true, DataInfo.TransformType.NONE, true, false, false);
DKV.put(yinfo._key, yinfo);
LinearAlgebraUtils.computeQInPlace(_job._key, yinfo);
model._output._iterations = 0;
while (model._output._iterations < _parms._max_iterations) {
if(stop_requested()) break;
_job.update(1, "Iteration " + String.valueOf(model._output._iterations+1) + " of randomized subspace iteration");
// 2) Form \tilde{Y}_j = A'Q_{j-1} and compute \tilde{Y}_j = \tilde{Q}_j \tilde{R}_j factorization
SMulTask stsk = new SMulTask(dinfo, _parms._nv);
stsk.doAll(aqfrm);
Matrix ysmall = new Matrix(stsk._atq);
QRDecomposition ysmall_qr = new QRDecomposition(ysmall);
double[][] qtilde = ysmall_qr.getQ().getArray();
// 3) [A,Q_{j-1}] -> [A,Y_j]: Form Y_j = A\tilde{Q}_j and compute Y_j = Q_jR_j factorization
BMulInPlaceTask tsk = new BMulInPlaceTask(dinfo, ArrayUtils.transpose(qtilde));
tsk.doAll(aqfrm);
LinearAlgebraUtils.computeQInPlace(_job._key, yinfo);
model._output._iterations++;
model.update(_job);
}
} finally {
if( yinfo != null ) yinfo.remove();
}
return yqfrm;
}
*/
// Algorithm 4.4: Randomized subspace iteration from Halko et al (http://arxiv.org/pdf/0909.4061.pdf)
// This function keeps track of change in Q each iteration ||Q_j - Q_{j-1}||_2 to check convergence
private Frame randSubIter(DataInfo dinfo, SVDModel model) {
DataInfo yinfo = null;
Frame ybig = null, qfrm = null, ysmallF = null, ysmallqfrm = null;
final int ncolA = dinfo._adaptedFrame.numCols();
double[][] xx = null;
double[][] ysmall_q = null;
DataInfo ysmallInfo = null;
try {
// 1) Initialize Y = AG where G ~ N(0,1) and compute Y = QR factorization
_job.update(1, "Initializing random subspace of training data Y");
double[][] gt = ArrayUtils.gaussianArray(_parms._nv, _ncolExp, _parms._seed);
RandSubInit rtsk = new RandSubInit(_job._key, dinfo, gt);
rtsk.doAll(_parms._nv, Vec.T_NUM, dinfo._adaptedFrame);
ybig = rtsk.outputFrame(Key.make(), null, null);
Frame yqfrm = new Frame(ybig);
for (int i = 0; i < _parms._nv; i++)
yqfrm.add("qcol_" + i, yqfrm.anyVec().makeZero());
// Calculate Cholesky of Gram to get R' = L matrix
_job.update(1, "Computing QR factorization of Y");
yinfo = new DataInfo(ybig, null, true, DataInfo.TransformType.NONE, true, false, false);
DKV.put(yinfo._key, yinfo);
LinearAlgebraUtils.computeQ(_job._key, yinfo, yqfrm, xx);
if (yqfrm.hasInfs()) { // dataset is rank deficient, reduce _nv to fit the true rank better
_matrixRankReached=true; // infinities or NaNs in Q denote a rank-deficiency problem
String warnMessage = "_train SVD: Dataset is rank deficient. _parms._nv was "+_parms._nv;
for (int colIndex = ybig.numCols(); colIndex < yqfrm.numCols(); colIndex++) {
if (yqfrm.vec(colIndex).pinfs() > 0) {
_parms._nv = colIndex-ybig.numCols();
break;
}
}
_job.warn(warnMessage+" and is now set to "+_parms._nv);
// redo with correct _nv number
gt = ArrayUtils.gaussianArray(_parms._nv, _ncolExp, _parms._seed);
rtsk = new RandSubInit(_job._key, dinfo, gt);
rtsk.doAll(_parms._nv, Vec.T_NUM, dinfo._adaptedFrame);
ybig.remove();
yinfo.remove();
ybig = rtsk.outputFrame(Key.make(), null, null);
yinfo = new DataInfo(ybig, null, true, DataInfo.TransformType.NONE, true, false, false);
DKV.put(yinfo._key, yinfo);
}
// Make input frame [A,Q,Y] where A = read-only training data, Y = A \tilde{Q}, Q from Y = QR factorization
// Note: If A is n by p (p = num cols with categoricals expanded), then \tilde{Q} is p by k and Q is n by k
Frame ayqfrm = new Frame(dinfo._adaptedFrame);
ayqfrm.add(ybig);
for (int i = 0; i < _parms._nv; i++)
ayqfrm.add("qcol_" + i, ayqfrm.anyVec().makeZero());
Frame ayfrm = ayqfrm.subframe(0, ncolA + _parms._nv); // [A,Y]
Frame aqfrm = ayqfrm.subframe(0, ncolA);
aqfrm.add(ayqfrm.subframe(ncolA + _parms._nv, ayqfrm.numCols())); // [A,Q]
yqfrm = ayqfrm.subframe(ncolA, ayqfrm.numCols()); // [Y,Q]
xx = MemoryManager.malloc8d(_parms._nv, _parms._nv);
LinearAlgebraUtils.computeQ(_job._key, yinfo, yqfrm, xx);
model._output._iterations = 0;
long qobs = dinfo._adaptedFrame.numRows() * _parms._nv; // Number of observations in Q
double qerr = 2 * TOLERANCE * qobs; // Stop when average SSE between Q_j and Q_{j-2} below tolerance
double average_SEE = qerr / qobs;
int wEndCol = 2*_parms._nv-1;
int wEndColR = _parms._nv-1;
while ((model._output._iterations < 10 || average_SEE > TOLERANCE) && model._output._iterations < _parms._max_iterations) { // Run at least 10 iterations before tolerance cutoff
if(stop_requested()) {
if (timeout())
_job.warn("_train SVD: max_runtime_secs is reached. Not all iterations are computed.");
break;
}
_job.update(1, "Iteration " + String.valueOf(model._output._iterations+1) + " of randomized subspace iteration");
// 2) Form \tilde{Y}_j = A'Q_{j-1} and compute \tilde{Y}_j = \tilde{Q}_j \tilde{R}_j factorization
SMulTask stsk = new SMulTask(dinfo, _parms._nv, _ncolExp);
stsk.doAll(aqfrm); // Pass in [A,Q]
if (_wideDataset) {
if (model._output._iterations==0) {
ysmallF = new water.util.ArrayUtils().frame(stsk._atq);
ysmallInfo = new DataInfo(ysmallF, null, true, DataInfo.TransformType.NONE,
true, false, false);
DKV.put(ysmallInfo._key, ysmallInfo);
ysmall_q = MemoryManager.malloc8d(_ncolExp, _parms._nv);
ysmallqfrm = new Frame(ysmallF);
for (int i = 0; i < _parms._nv; i++) // pray that _nv is small
ysmallqfrm.add("qcol_" + i, ysmallqfrm.anyVec().makeZero());
} else { // replace content of ysmallqfrm with new contents in _atq,
new CopyArrayToFrame(0, wEndColR, _ncolExp, stsk._atq).doAll(ysmallqfrm);
}
LinearAlgebraUtils.computeQ(_job._key, ysmallInfo, ysmallqfrm, xx);
ysmall_q = new FrameToArray(_parms._nv, wEndCol, _ncolExp, ysmall_q).doAll(ysmallqfrm).getArray();
} else { // keep ysmall as a 2-D double array
Matrix ysmall = new Matrix(stsk._atq); // small only for n_exp << m. Not for wide dataset.
QRDecomposition ysmall_qr = new QRDecomposition(ysmall);
ysmall_q = ysmall_qr.getQ().getArray(); // memory allocation here too.
}
// 3) Form Y_j = A\tilde{Q}_j and compute Y_j = Q_jR_j factorization (ybig)
BMulInPlaceTask tsk = new BMulInPlaceTask(dinfo, ArrayUtils.transpose(ysmall_q), _ncolExp);
tsk.doAll(ayfrm);
qerr = LinearAlgebraUtils.computeQ(_job._key, yinfo, yqfrm, xx);
average_SEE = qerr/qobs;
model._output._iterations++;
// store variables for scoring history
model._output._training_time_ms.add(System.currentTimeMillis());
model._output._history_average_SEE.add(average_SEE);
model.update(_job);
}
model._output._nobs = ybig.numRows(); // update nobs parameter
model.update(_job);
// 4) Extract and save final Q_j from [A,Q] frame
qfrm = ayqfrm.extractFrame(ncolA + _parms._nv, ayqfrm.numCols());
qfrm = new Frame(Key.make(), qfrm.names(), qfrm.vecs());
DKV.put(qfrm);
} finally {
if( yinfo != null ) yinfo.remove();
if( ybig != null ) ybig.delete();
if (ysmallInfo != null) ysmallInfo.remove();
if (ysmallF != null) ysmallF.delete();
if (ysmallqfrm != null) ysmallqfrm.delete();
}
return qfrm;
}
// Algorithm 5.1: Direct SVD from Halko et al (http://arxiv.org/pdf/0909.4061.pdf)
private Frame directSVD(DataInfo dinfo, Frame qfrm, SVDModel model) {
String u_name = (_parms._u_name == null || _parms._u_name.length() == 0) ? "SVDUMatrix_" + Key.rand() : _parms._u_name;
return directSVD(dinfo, qfrm, model, u_name);
}
private Frame directSVD(DataInfo dinfo, Frame qfrm, SVDModel model, String u_name) {
DataInfo qinfo = null;
Frame u = null;
final int ncolA = dinfo._adaptedFrame.numCols();
try {
Vec[] vecs = new Vec[ncolA + _parms._nv];
for (int i = 0; i < ncolA; i++) vecs[i] = dinfo._adaptedFrame.vec(i);
for (int i = 0; i < _parms._nv; i++) vecs[ncolA + i] = qfrm.vec(i);
Frame aqfrm = new Frame(vecs);
// 1) Form the matrix B' = A'Q = (Q'A)'
_job.update(1, "Forming small matrix B = Q'A for direct SVD");
SMulTask stsk = new SMulTask(dinfo, _parms._nv, _ncolExp);
stsk.doAll(aqfrm); // _atq size is _ncolExp by _nv
if (_wideDataset) { // for wide dataset, calculate gram of B*T(B), get the SVD and proceed from there.
/* double[][] xgram = ArrayUtils.formGram(stsk._atq, false);
Matrix gramJ2 = new Matrix(xgram); // form outer gram*/
Frame tB = new water.util.ArrayUtils().frame(stsk._atq);
DataInfo tbInfo = new DataInfo(tB, null, true, DataInfo.TransformType.NONE,
false, false, false);
GramTask gtsk = new GramTask(_job._key, tbInfo).doAll(tB);
Matrix gramJ = new Matrix(gtsk._gram.getXX()); // form outer gram
SingularValueDecomposition svdJ = gramJ.svd();
// 3) Form orthonormal matrix U = QV
_job.update(1, "Forming distributed orthonormal matrix U");
u=makeUVec(model, u_name, u, qfrm, new Matrix(stsk._atq), svdJ);
model._output._d = ArrayUtils.mult((Arrays.copyOfRange(ArrayUtils.sqrtArr(svdJ.getSingularValues()),
0, _parms._nv)), sqrt(tB.numRows()));
// to get v, we need to do T(A)*U*D^-1
// stuff A and U into a frame
Vec[] tvecs = new Vec[ncolA];
for (int i = 0; i < ncolA; i++) tvecs[i] = dinfo._adaptedFrame.vec(i);
Frame avfrm = new Frame(tvecs);
Frame fromSVD = null;
avfrm.add(u);
model._output._v = (new SMulTask(dinfo, _parms._nv, _ncolExp).doAll(avfrm))._atq;
// Perform T(A)*U and V is in _atq. Need to be scaled by svd.
model._output._v = ArrayUtils.mult(ArrayUtils.transpose(ArrayUtils.div(ArrayUtils.transpose(model._output._v),
model._output._d)), 1);
if (fromSVD != null) fromSVD.delete();
if (tB != null) tB.delete();
} else {
// 2) Compute SVD of small matrix: If B' = WDV', then B = VDW'
_job.update(1, "Calculating SVD of small matrix locally");
Matrix atqJ = new Matrix(stsk._atq);
SingularValueDecomposition svdJ = atqJ.svd();
// 3) Form orthonormal matrix U = QV
_job.update(1, "Forming distributed orthonormal matrix U");
if (_parms._keep_u) {
u=makeUVec(model, u_name, u, qfrm, atqJ, svdJ);
}
model._output._d = Arrays.copyOfRange(svdJ.getSingularValues(), 0, _parms._nv);
model._output._v = svdJ.getU().getMatrix(0, atqJ.getRowDimension() - 1, 0, _parms._nv - 1).getArray();
}
} finally {
if( qinfo != null ) qinfo.remove();
}
return u;
}
/*
Form orthonormal matrix U = QV
*/
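/*
Shape note (illustrative): atqJ holds B' = A'Q (p-by-k). If B' = W*D*Z' is its SVD
(W = svdJ.getU(), Z = svdJ.getV()), then B = Q'A = Z*D*W', so the left singular vectors
of A are U = Q*Z (n rows by _parms._nv columns). BMulTask streams over Q and multiplies
each row by Z, which is passed transposed so each output entry is a single row inner product.
*/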
public Frame makeUVec(SVDModel model, String u_name, Frame u, Frame qfrm, Matrix atqJ, SingularValueDecomposition svdJ ) {
model._output._u_key = Key.make(u_name);
double[][] svdJ_u = svdJ.getV().getMatrix(0, atqJ.getColumnDimension() - 1, 0,
_parms._nv - 1).getArray();
DataInfo qinfo = new DataInfo(qfrm, null, true, DataInfo.TransformType.NONE,
false, false, false);
DKV.put(qinfo._key, qinfo);
BMulTask btsk = new BMulTask(_job._key, qinfo, ArrayUtils.transpose(svdJ_u));
btsk.doAll(_parms._nv, Vec.T_NUM, qinfo._adaptedFrame);
qinfo.remove();
return btsk.outputFrame(model._output._u_key, null, null);
// DKV.remove(qinfo._key);
}
@Override
public void computeImpl() {
SVDModel model = null;
DataInfo dinfo = null, tinfo = null;
Frame u = null, qfrm = null;
Vec[] uvecs = null;
try {
init(true); // Initialize parameters
if (error_count() > 0) throw new IllegalArgumentException("Found validation errors: " + validationErrors());
// The model to be built
model = new SVDModel(dest(), _parms, new SVDModel.SVDOutput(SVD.this));
model.delete_and_lock(_job);
// store (possibly) rebalanced input train to pass it to nested SVD job
Frame tranRebalanced = new Frame(_train);
boolean frameHasNas = tranRebalanced.hasNAs();
// 0) Transform training data and save standardization vectors for use in scoring later
if ((!_parms._impute_missing) && frameHasNas) { // remove NAs rows
tinfo = new DataInfo(_train, _valid, 0, _parms._use_all_factor_levels, _parms._transform,
DataInfo.TransformType.NONE, /* skipMissing */ !_parms._impute_missing, /* imputeMissing */
_parms._impute_missing, /* missingBucket */ false, /* weights */ false,
/* offset */ false, /* fold */ false, /* intercept */ false);
DKV.put(tinfo._key, tinfo);
DKV.put(tranRebalanced._key, tranRebalanced);
_train = Rapids.exec(String.format("(na.omit %s)", tranRebalanced._key)).getFrame(); // remove NA rows
DKV.remove(tranRebalanced._key);
checkMemoryFootPrint();
}
dinfo = new DataInfo(_train, _valid, 0, _parms._use_all_factor_levels, _parms._transform,
DataInfo.TransformType.NONE, /* skipMissing */ !_parms._impute_missing, /* imputeMissing */
_parms._impute_missing, /* missingBucket */ false, /* weights */ false,
/* offset */ false, /* fold */ false, /* intercept */ false);
DKV.put(dinfo._key, dinfo);
if (!_parms._impute_missing && frameHasNas) {
// fixed the std and mean of dinfo to that of the frame before removing NA rows
dinfo._normMul = tinfo._normMul;
dinfo._numMeans = tinfo._numMeans;
dinfo._numNAFill = dinfo._numMeans; // NAs will be imputed with means
dinfo._normSub = tinfo._normSub;
}
// Save adapted frame info for scoring later
setSVDModel(model, dinfo);
String u_name = (_parms._u_name == null || _parms._u_name.length() == 0) ? "SVDUMatrix_" + Key.rand() : _parms._u_name;
String v_name = (_parms._v_name == null || _parms._v_name.length() == 0) ? "SVDVMatrix_" + Key.rand() : _parms._v_name;
if(_parms._svd_method == SVDParameters.Method.GramSVD) {
// Calculate and save Gram matrix of training data
// NOTE: Gram computes A'A/n where n = nrow(A) = number of rows in training set (excluding rows with NAs)
_job.update(1, "Begin distributed calculation of Gram matrix");
GramTask gtsk = new GramTask(_job._key, dinfo).doAll(dinfo._adaptedFrame);
Gram gram = gtsk._gram; // TODO: This ends up with all NaNs if training data has too many missing values
assert gram.fullN() == _ncolExp;
model._output._nobs = gtsk._nobs;
model._output._total_variance = gram.diagSum() * gtsk._nobs / (gtsk._nobs-1); // Since gram = X'X/nobs, but variance requires nobs-1 in denominator
model.update(_job);
// Cannot calculate SVD if all rows contain missing value(s) and hence were skipped
if(gtsk._nobs == 0)
error("_train", "Every row in _train contains at least one missing value. Consider setting impute_missing = TRUE.");
if (error_count() > 0) throw new IllegalArgumentException("Found validation errors: " + validationErrors());
// Calculate SVD of G = A'A/n and back out SVD of A. If SVD of A = UDV' then A'A/n = V(D^2/n)V'
_job.update(1, "Calculating SVD of Gram matrix locally");
Matrix gramJ = new Matrix(gtsk._gram.getXX());
SingularValueDecomposition svdJ = gramJ.svd();
// Output diagonal of D
_job.update(1, "Computing stats from SVD");
double[] sval = svdJ.getSingularValues();
model._output._d = MemoryManager.malloc8d(_parms._nv);
// model._output._d = new double[_parms._nv]; // Only want rank = nv diagonal values
for(int k = 0; k < _parms._nv; k++)
model._output._d[k] = Math.sqrt(sval[k] * model._output._nobs);
// Output right singular vectors V
double[][] v = svdJ.getV().getArray();
assert v.length == _ncolExp && LinearAlgebraUtils.numColsExp(dinfo._adaptedFrame,_parms._use_all_factor_levels) == _ncolExp;
model._output._v = MemoryManager.malloc8d(_ncolExp, _parms._nv);
// model._output._v = new double[_ncolExp][_parms._nv]; // Only want rank = nv decomposition
for(int i = 0; i < v.length; i++)
System.arraycopy(v[i], 0, model._output._v[i], 0, _parms._nv);
// Calculate left singular vectors U = AVD^(-1) if requested
if(_parms._keep_u) {
model._output._u_key = Key.make(u_name);
double[][] vt = ArrayUtils.transpose(model._output._v);
for (int k = 0; k < _parms._nv; k++)
ArrayUtils.div(vt[k], model._output._d[k]);
BMulTask tsk = new BMulTask(_job._key, dinfo, vt).doAll(_parms._nv, Vec.T_NUM, dinfo._adaptedFrame);
u = tsk.outputFrame(model._output._u_key, null, null);
}
} else if(_parms._svd_method == SVDParameters.Method.Power) {
// Calculate and save Gram matrix of training data
// NOTE: Gram computes A'A/n where n = nrow(A) = number of rows in training set (excluding rows with NAs)
// NOTE: the Gram also will apply the specified Transforms on the data before performing the operation.
// NOTE: valid transforms are NONE, DEMEAN, STANDARDIZE...
_job.update(1, "Begin distributed calculation of Gram matrix");
GramTask gtsk = null;
Gram.OuterGramTask ogtsk = null;
Gram gram = null, gram_update=null;
double[] randomInitialV = null; // store random initial eigenvectors, actually referring to V'
double[] finalV = null; // store eigenvectors obtained from powerLoop
int eigVecLen = _ncolExp; // size of one eigenvector
GramUpdate guptsk = null;
double[][] gramArrays = null; // store outergram as a double array
double[][] gramUpdatesW = null; // store the result of (I-sum vi*T(vi))*A*T(A)*(I-sum vi*T(vi))
//_estimatedSingularValues = new double[_parms._nv]; // allocate memory once
_estimatedSingularValues = MemoryManager.malloc8d(_parms._nv);
if (_wideDataset) {
ogtsk = new Gram.OuterGramTask(_job._key, dinfo).doAll(dinfo._adaptedFrame);
gram = ogtsk._gram;
model._output._nobs = ogtsk._nobs;
eigVecLen = (int) gram.fullN();
} else {
gtsk = new GramTask(_job._key, dinfo).doAll(dinfo._adaptedFrame);
gram = gtsk._gram; // TODO: This ends up with all NaNs if training data has too many missing values
assert gram.fullN() == _ncolExp;
model._output._nobs = gtsk._nobs;
}
model._output._total_variance = gram.diagSum() * model._output._nobs / (model._output._nobs-1); // Since gram = X'X/nobs, but variance requires nobs-1 in denominator
model.update(_job);
// 1) Run one iteration of power method
_job.update(1, "Iteration 1 of power method"); // One unit of work
// 1a) Initialize right singular vector v_1
model._output._v = MemoryManager.malloc8d(_parms._nv, eigVecLen);
// model._output._v = new double[_parms._nv][eigVecLen]; // Store V' for ease of use and transpose back at end
randomInitialV = MemoryManager.malloc8d(eigVecLen);
// randomInitialV = new double[eigVecLen]; // allocate memory for randomInitialV and finalV once, save time
finalV = MemoryManager.malloc8d(eigVecLen);
//finalV = new double[eigVecLen];
model._output._v[0] = Arrays.copyOf(powerLoop(gram, _parms._seed, model, randomInitialV, finalV, 0),
eigVecLen);
// Keep track of I - \sum_i v_iv_i' where v_i = eigenvector i
double[][] ivv_sum = new double[eigVecLen][eigVecLen];
for (int i = 0; i < eigVecLen; i++) ivv_sum[i][i] = 1; //generate matrix I
// 1b) Initialize singular value \sigma_1 and update u_1 <- Av_1
if (!_parms._only_v) {
model._output._d = new double[_parms._nv]; // allocate memory once
if (!_wideDataset) {
model._output._u_key = Key.make(u_name);
uvecs = new Vec[_parms._nv];
computeSigmaU(dinfo, model, 0, ivv_sum, uvecs, finalV); // Compute first singular value \sigma_1
}
}
model._output._iterations = 1;
model.update(_job); // Update model in K/V store
// 1c) Update Gram matrix A_1'A_1 = (I - v_1v_1')A'A(I - v_1v_1')
updateIVVSum(ivv_sum, model._output._v[0]);
// double[][] gram_update = ArrayUtils.multArrArr(ArrayUtils.multArrArr(ivv_sum, gram), ivv_sum);
if (_wideDataset) {
gramArrays = new double[eigVecLen][eigVecLen]; // memory allocation is done once here
gramUpdatesW = new double[eigVecLen][eigVecLen];
gram_update = new Gram(eigVecLen, 0, dinfo.numNums(), dinfo._cats,false);
updateGram(ivv_sum, gramArrays, gramUpdatesW, gram, gram_update);
} else {
guptsk = new GramUpdate(_job._key, dinfo, ivv_sum).doAll(dinfo._adaptedFrame);
gram_update = guptsk._gram;
}
for (int k = 1; k < _parms._nv; k++) { // loop through for each eigenvalue/eigenvector...
if (_matrixRankReached || stop_requested()) { // number of eigenvalues found is less than _nv
if (timeout()) {
_job.warn("_train SVD: max_runtime_secs is reached. Not all eigenvalues/eigenvectors are computed.");
}
int newk = k;
_job.warn("_train SVD: Dataset is rank deficient. _parms._nv was "+_parms._nv+" and is now set to "+newk);
_parms._nv = newk; // change number of eigenvector parameters to be the actual number of eigenvectors found
break;
}
_job.update(1, "Iteration " + String.valueOf(k+1) + " of power method"); // One unit of work
// 2) Iterate x_i <- (A_k'A_k/n)x_{i-1} until convergence and set v_k = x_i/||x_i||
model._output._v[k] = Arrays.copyOf(powerLoop(gram_update, _parms._seed, model, randomInitialV, finalV,
k),
eigVecLen);
// 3) Residual data A_k = A - \sum_{i=1}^k \sigma_i u_iv_i' = A - \sum_{i=1}^k Av_iv_i' = A(I - \sum_{i=1}^k v_iv_i')
// 3a) Compute \sigma_k = ||A_{k-1}v_k|| and u_k = A_{k-1}v_k/\sigma_k
if (!_parms._only_v && !_wideDataset)
computeSigmaU(dinfo, model, k, ivv_sum, uvecs, finalV);
// 3b) Compute Gram of residual A_k'A_k = (I - \sum_{i=1}^k v_jv_j')A'A(I - \sum_{i=1}^k v_jv_j')
updateIVVSum(ivv_sum, model._output._v[k]); // Update I - \sum_{i=1}^k v_iv_i' with sum up to current singular value
// gram_update = ArrayUtils.multArrArr(ivv_sum, ArrayUtils.multArrArr(gram, ivv_sum)); // Too slow on wide arrays
if (_wideDataset) {
updateGram(ivv_sum, gramArrays, gramUpdatesW, gram, gram_update);
} else {
guptsk = new GramUpdate(_job._key, dinfo, ivv_sum).doAll(dinfo._adaptedFrame);
gram_update = guptsk._gram;
}
model._output._iterations++;
model.update(_job); // Update model in K/V store
} // end iteration to find eigenvectors
if (!_parms._only_v && !_parms._keep_u && _wideDataset) { // dealing with wide dataset per request from PCA, won't want U
for (int vecIndex = 0; vecIndex < _parms._nv; vecIndex++) {
model._output._d[vecIndex] = Math.sqrt(model._output._nobs*_estimatedSingularValues[vecIndex]);
}
model._output._v = getTransformedEigenvectors(dinfo, transpose(model._output._v));
}
if (!_wideDataset) {
// 4) Normalize output frame columns by singular values to get left singular vectors
model._output._v = ArrayUtils.transpose(model._output._v); // Transpose to get V (since vectors were stored as rows)
if (!_parms._only_v && !_parms._keep_u) { // Delete U vecs if computed, but user does not want it returned
for (int index=0; index < _parms._nv; index++){
uvecs[index].remove();
}
model._output._u_key = null;
} else if (!_parms._only_v && _parms._keep_u) { // Divide U cols by singular values and save to DKV
u = new Frame(model._output._u_key, null, uvecs);
DKV.put(u._key, u);
DivideU utsk = new DivideU(model._output._d);
utsk.doAll(u);
}
}
if (_failedConvergence) {
_job.warn("_train: PCA Power method failed to converge within TOLERANCE. Increase max_iterations or " +
"reduce TOLERANCE to mitigate this problem.");
}
LinkedHashMap<String, ArrayList> scoreTable = new LinkedHashMap<>();
scoreTable.put("Timestamp", model._output._training_time_ms);
scoreTable.put("err", model._output._history_err);
scoreTable.put("Principal Component #", model._output._history_eigenVectorIndex);
model._output._scoring_history = createScoringHistoryTableDR(scoreTable,
"Scoring History from Power SVD", _job.start_time());
} else if(_parms._svd_method == SVDParameters.Method.Randomized) {
qfrm = randSubIter(dinfo, model);
u = directSVD(dinfo, qfrm, model, u_name);
model._output._training_time_ms.add(System.currentTimeMillis());
if (stop_requested() && model._output._history_average_SEE.size()==0) {
model._output._history_average_SEE.add(Double.POSITIVE_INFINITY);
}
model._output._history_average_SEE.add(model._output._history_average_SEE.get(model._output._history_average_SEE.size()-1)); // add last err back to it
LinkedHashMap<String, ArrayList> scoreTable = new LinkedHashMap<>();
scoreTable.put("Timestamp", model._output._training_time_ms);
scoreTable.put("average SEE", model._output._history_average_SEE);
model._output._scoring_history = createScoringHistoryTableDR(scoreTable,
"Scoring History from Randomized SVD", _job.start_time());
} else
error("_svd_method", "Unrecognized SVD method " + _parms._svd_method);
if (_parms._save_v_frame) {
model._output._v_key = Key.make(v_name);
ArrayUtils.frame(model._output._v_key, null, model._output._v);
}
if ((model._output._d != null) && _parms._nv < model._output._d.length) { // need to shorten the correct eigen stuff
model._output._d = Arrays.copyOf(model._output._d, _parms._nv);
for (int index=0; index < model._output._v.length; index++) {
model._output._v[index] = Arrays.copyOf(model._output._v[index], _parms._nv);
}
}
model._output._model_summary = createModelSummaryTable(model._output);
model.update(_job);
} finally {
if( model != null ) model.unlock(_job);
if( dinfo != null ) dinfo.remove();
if (tinfo != null) tinfo.remove();
if( u != null && !_parms._keep_u ) u.delete();
if( qfrm != null ) qfrm.delete();
List<Key<Vec>> keep = new ArrayList<>();
if (model != null && model._output != null) {
if (model._output._u_key != null) {
Frame uFrm = DKV.getGet(model._output._u_key);
if (uFrm != null) for (Vec vec : uFrm.vecs()) keep.add(vec._key);
}
Frame vFrm = DKV.getGet(model._output._v_key);
if (vFrm != null) for (Vec vec : vFrm.vecs()) keep.add(vec._key);
}
Scope.untrack(keep);
}
}
}
/*
This method calculates (I - v1*T(v1)) * A*T(A) * (I - v1*T(v1)). Note that we already have
the A*T(A) part as a Gram matrix. The ivv_sum argument provides the (I - v1*T(v1)) factor.
All we need to do here is form the product and store it in a brand new Gram matrix.
*/
private void updateGram(double[][] ivv_sum, double[][] gramToArray, double[][] resultGram, Gram gram, Gram gramUpdate)
{
int numRows = gram.fullN();
// grab gram matrix (A*T(A)) and expand into full matrix represented as 2D double array.
for (int row_index=0; row_index < numRows; row_index++) {
for (int col_index=0; col_index < numRows; col_index++) {
if (col_index <= row_index) {
gramToArray[row_index][col_index] = gram._xx[row_index][col_index];
} else {
gramToArray[row_index][col_index] = gram._xx[col_index][row_index];
}
}
}
resultGram = multArrArr(ivv_sum, gramToArray); // resultGram = (I-v1*T(v1))*A*T(A)
gramToArray = multArrArr(resultGram, ivv_sum); // overwrite gramToArray with final result resultGram*(I-v1*T(v1))
// copy over results from matrix multiplication output to resultGram
for (int row_index = 0; row_index < numRows; row_index++) {
for (int col_index = 0; col_index <= row_index; col_index++) {
gramUpdate._xx[row_index][col_index] = gramToArray[row_index][col_index];
}
}
}
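/*
Shape note for updateGram above (illustrative): on the wide-dataset path the Gram is the
outer Gram A*T(A), an n-by-n matrix, so ivv_sum, gramToArray and resultGram are all n-by-n
with n = gram.fullN(). Only the lower triangle of gramUpdate._xx is written back, because
Gram stores symmetric matrices in packed lower-triangular form.
*/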
/*
This method may make changes to the dinfo parameters if SVD is called by GLRM as an init method.
*/
private void setSVDModel(SVDModel model, DataInfo dinfo) {
if (_callFromGLRM) {
dinfo._normSub = Arrays.copyOf(_glrmModel._output._normSub, _glrmModel._output._normSub.length);
dinfo._normMul = Arrays.copyOf(_glrmModel._output._normMul, _glrmModel._output._normMul.length);
dinfo._permutation = Arrays.copyOf(_glrmModel._output._permutation, _glrmModel._output._permutation.length);
dinfo._numMeans = Arrays.copyOf(dinfo._normSub, dinfo._normSub.length);
dinfo._numNAFill = dinfo._numMeans; // NAs will be imputed with means
dinfo._nums = _glrmModel._output._nnums;
dinfo._cats = _glrmModel._output._ncats;
dinfo._catOffsets = Arrays.copyOf(_glrmModel._output._catOffsets, _glrmModel._output._catOffsets.length);
model._output._names_expanded = Arrays.copyOf(_glrmModel._output._names_expanded,
_glrmModel._output._names_expanded.length);
} else
model._output._names_expanded = dinfo.coefNames();
model._output._normSub = dinfo._normSub == null ? new double[dinfo._nums] : dinfo._normSub;
if (dinfo._normMul == null) {
model._output._normMul = new double[dinfo._nums];
Arrays.fill(model._output._normMul, 1.0);
} else
model._output._normMul = dinfo._normMul;
model._output._permutation = dinfo._permutation;
model._output._nnums = dinfo._nums;
model._output._ncats = dinfo._cats;
model._output._catOffsets = dinfo._catOffsets;
}
private TwoDimTable createModelSummaryTable(SVDModel.SVDOutput output) {
if(null == output._d) return null;
String[] colTypes = new String[_parms._nv];
String[] colFormats = new String[_parms._nv];
String[] colHeaders = new String[_parms._nv];
Arrays.fill(colTypes, "double");
Arrays.fill(colFormats, "%5f");
for(int i = 0; i < colHeaders.length; i++) colHeaders[i] = "sval" + String.valueOf(i + 1);
return new TwoDimTable("Singular values", null, new String[1],
colHeaders, colTypes, colFormats, "", new String[1][],
new double[][]{output._d});
}
private static class CalcSigmaU extends FrameTask<CalcSigmaU> {
final double[] _svec;
public double _sval;
public long _nobs;
public CalcSigmaU(Key jobKey, DataInfo dinfo, double[] svec) {
super(jobKey, dinfo);
_svec = svec;
_sval = 0;
}
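// Map emits u_i = r.innerProduct(_svec) per row and accumulates the sum of squares;
// reduce adds the partial sums and postGlobal takes the square root, so after the task
// completes _sval = ||A*svec|| (the singular value) and the output column holds A*svec.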
@Override protected void processRow(long gid, DataInfo.Row r, NewChunk[] outputs) {
double num = r.innerProduct(_svec);
outputs[0].addNum(num);
_sval += num * num;
++_nobs;
}
@Override public void reduce(CalcSigmaU other) {
_nobs += other._nobs;
_sval += other._sval;
}
@Override protected void postGlobal() {
_sval = Math.sqrt(_sval);
}
}
private static class GramUpdate extends FrameTask<GramUpdate> {
final double[][] _ivv;
public Gram _gram;
public long _nobs;
public GramUpdate(Key jobKey, DataInfo dinfo, double[][] ivv) {
super(jobKey, dinfo);
assert null != ivv && ivv.length == ivv[0].length;
_ivv = ivv;
}
@Override protected boolean chunkInit(){
// To avoid memory allocation during every iteration.
_gram = new Gram(_dinfo.fullN(), 0, _ivv.length, 0, false);
_numRow = _dinfo.newDenseRow(MemoryManager.malloc8d(_ivv.length),0);
return true;
}
private transient Row _numRow;
@Override protected void processRow(long gid, DataInfo.Row r) {
double w = 1; // TODO: add weights to dinfo?
double[] nums = _numRow.numVals;
for(int row = 0; row < _ivv.length; row++)
nums[row] = r.innerProduct(_ivv[row]);
_gram.addRow(_numRow, w);
++_nobs;
}
@Override protected void chunkDone(long n){
double r = 1.0/_nobs;
_gram.mul(r);
}
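// Each chunk-local Gram is scaled by 1/nobs in chunkDone; reduce then re-weights the two
// partial averages by their row counts, so the merged Gram remains the average over all rows.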
@Override public void reduce(GramUpdate gt){
double r1 = (double)_nobs/(_nobs+gt._nobs);
_gram.mul(r1);
double r2 = (double)gt._nobs/(_nobs+gt._nobs);
gt._gram.mul(r2);
_gram.add(gt._gram);
_nobs += gt._nobs;
}
}
private static class DivideU extends MRTask<DivideU> {
final double[] _sigma;
public DivideU(double[] sigma) {
_sigma = sigma;
}
@Override public void map(Chunk cs[]) {
assert _sigma.length == cs.length;
for (int col = 0; col < cs.length; col++) {
for(int row = 0; row < cs[0].len(); row++) {
double x = cs[col].atd(row);
cs[col].set(row, x / _sigma[col]);
}
}
}
}
// Compute Y = AG where A is n by p and G is a p by k standard Gaussian matrix
private static class RandSubInit extends FrameTask<RandSubInit> {
final double[][] _gaus; // G' is k by p for convenient multiplication
public RandSubInit(Key jobKey, DataInfo dinfo, double[][] gaus) {
super(jobKey, dinfo);
_gaus = gaus;
}
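// Each call emits one row of Y = A*G, i.e. (r.g_1, ..., r.g_k). Storing G transposed
// (k-by-p) lets every output entry be computed as a single row inner product.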
@Override protected void processRow(long gid, DataInfo.Row row, NewChunk[] outputs) {
for(int k = 0; k < _gaus.length; k++) {
double y = row.innerProduct(_gaus[k]);
outputs[k].addNum(y);
}
}
}
}