package hex.svd;
import Jama.Matrix;
import Jama.QRDecomposition;
import Jama.SingularValueDecomposition;
import hex.*;
import hex.gram.Gram;
import hex.gram.Gram.GramTask;
import hex.schemas.ModelBuilderSchema;
import hex.schemas.SVDV99;
import hex.svd.SVDModel.SVDParameters;
import hex.util.LinearAlgebraUtils;
import hex.util.LinearAlgebraUtils.*;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.Log;
import water.util.PrettyPrint;
import water.util.TwoDimTable;
import java.util.Arrays;
/**
 * Singular Value Decomposition
 *
 * References:
 *   - SVD via Power Method Algorithm
 *   - Proof of Convergence for Power Method
 *   - Randomized Algorithms for Matrix Approximation
 *
 * @author anqi_fu
 */
public class SVD extends ModelBuilder<SVDModel,SVDModel.SVDParameters,SVDModel.SVDOutput> {
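/*
 * Example usage (a sketch, assuming the standard H2O ModelBuilder flow used
 * elsewhere in this codebase; trainModel()/get() is the usual builder pattern,
 * and trainingFrame is a hypothetical input Frame):
 *
 *   SVDModel.SVDParameters parms = new SVDModel.SVDParameters();
 *   parms._train = trainingFrame._key;                        // hypothetical training Frame
 *   parms._nv = 4;                                            // number of right singular vectors
 *   parms._svd_method = SVDModel.SVDParameters.Method.Power;  // or GramSVD / Randomized
 *   SVDModel model = new SVD(parms).trainModel().get();
 */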
// Convergence tolerance
private final double TOLERANCE = 1e-6; // Cutoff for estimation error of right singular vector
// Maximum number of columns when categoricals expanded
private final int MAX_COLS_EXPANDED = 5000;
// Number of columns in training set (p)
private transient int _ncolExp; // With categoricals expanded into 0/1 indicator cols
@Override public ModelBuilderSchema schema() {
return new SVDV99();
}
@Override protected Job<SVDModel> trainModelImpl(long work, boolean restartTimer) {
return start(new SVDDriver(), work, restartTimer);
}
@Override
public long progressUnits() {
switch(_parms._svd_method) {
case GramSVD:
return 2;
case Power:
return 1 + _parms._nv;
case Randomized:
return 5 + _parms._max_iterations;
default: return _parms._nv;
}
}
@Override public ModelCategory[] can_build() {
return new ModelCategory[]{ ModelCategory.DimReduction };
}
@Override public BuilderVisibility builderVisibility() { return BuilderVisibility.Experimental; }
// Called from an http request
public SVD(SVDModel.SVDParameters parms) {
super("SVD", parms);
init(false);
}
@Override
protected void checkMemoryFootPrint() {
HeartBeat hb = H2O.SELF._heartbeat;
double p = _train.degreesOfFreedom();
long mem_usage = (long)(hb._cpus_allowed * p*p * 8/*doubles*/ * Math.log((double)_train.lastVec().nChunks())/Math.log(2.)); //one gram per core
long max_mem = hb.get_max_mem();
if (mem_usage > max_mem) {
String msg = "Gram matrices (one per thread) won't fit in the driver node's memory ("
+ PrettyPrint.bytes(mem_usage) + " > " + PrettyPrint.bytes(max_mem)
+ ") - try reducing the number of columns and/or the number of categorical factors.";
error("_train", msg);
cancel(msg);
}
}
@Override public void init(boolean expensive) {
super.init(expensive);
if (_parms._max_iterations < 1)
error("_max_iterations", "max_iterations must be at least 1");
if(_train == null) return;
_ncolExp = _train.numColsExp(_parms._use_all_factor_levels, false);
if (_ncolExp > MAX_COLS_EXPANDED)
warn("_train", "_train has " + _ncolExp + " columns when categoricals are expanded. Algorithm may be slow.");
if(_parms._nv < 1 || _parms._nv > _ncolExp)
error("_nv", "Number of right singular values must be between 1 and " + _ncolExp);
if (_parms._svd_method != SVDParameters.Method.Randomized && expensive && error_count() == 0) checkMemoryFootPrint();
}
/**
 * Compute ivv_sum - vec * vec' in place for the symmetric array ivv_sum.
 * This maintains the deflation projector I - \sum_i v_i v_i' as each new
 * right singular vector v_i is found; only the lower triangle is computed
 * and mirrored to the upper triangle.
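 *
 * Illustration: starting from the 2x2 identity with vec = (1, 0), the result
 * is [[0, 0], [0, 1]], i.e. the projector onto the orthogonal complement of vec.
 */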
public static double[][] updateIVVSum(double[][] ivv_sum, double[] vec) {
double diff;
for(int i = 0; i < vec.length; i++) {
for(int j = 0; j < i; j++) {
diff = ivv_sum[i][j] - vec[i] * vec[j];
ivv_sum[i][j] = ivv_sum[j][i] = diff;
}
ivv_sum[i][i] -= vec[i] * vec[i];
}
return ivv_sum;
}
class SVDDriver extends H2O.H2OCountedCompleter<SVDDriver> {
protected SVDDriver() { super(true); } // bump driver priority
// private double[] powerLoop(Gram gram) { return powerLoop(gram, ArrayUtils.gaussianVector(gram.fullN())); }
private double[] powerLoop(Gram gram, long seed) { return powerLoop(gram, ArrayUtils.gaussianVector(gram.fullN(), seed)); }
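// Power iteration: v_t converges to the top eigenvector of the Gram matrix
// (equivalently, the top right singular vector of A) at a rate governed by the
// eigengap; the angle error shrinks roughly like (lambda_2/lambda_1)^t.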
private double[] powerLoop(Gram gram, double[] vinit) {
// TODO: What happens if Gram matrix is essentially zero? Numerical inaccuracies in PUBDEV-1161.
assert vinit.length == gram.fullN();
// Set initial value v_0 to standard normal distribution
int iters = 0;
double err = 2 * TOLERANCE;
double[] v = vinit.clone();
double[] vnew = new double[v.length];
// Update v_i <- (A'Av_{i-1})/||A'Av_{i-1}|| where A'A = Gram matrix of training frame
while(iters < _parms._max_iterations && err > TOLERANCE) {
// Compute x_i <- A'Av_{i-1} and ||x_i||
gram.mul(v, vnew);
double norm = ArrayUtils.l2norm(vnew);
double diff; err = 0;
for (int i = 0; i < v.length; i++) {
vnew[i] /= norm; // Compute singular vector v_i = x_i/||x_i||
diff = v[i] - vnew[i]; // Save error ||v_i - v_{i-1}||
err += diff * diff;
v[i] = vnew[i]; // Update v_i for next iteration
}
err = Math.sqrt(err);
iters++; // TODO: Should output vector of final iterations for each k
}
return v;
}
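// Computes sigma_k = ||A_{k-1} v_k|| where A_{k-1} = A(I - \sum_{i<k} v_i v_i')
// is the data with the first k-1 singular directions deflated. The output column
// holds the unnormalized A_{k-1} v_k; DivideU later rescales it to u_k.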
private double computeSigmaU(DataInfo dinfo, SVDModel model, int k, double[][] ivv_sum, Vec[] uvecs) {
double[] ivv_vk = ArrayUtils.multArrVec(ivv_sum, model._output._v[k]);
CalcSigmaU ctsk = new CalcSigmaU(self(), dinfo, ivv_vk).doAll(Vec.T_NUM, dinfo._adaptedFrame);
model._output._d[k] = ctsk._sval;
assert ctsk._nobs == model._output._nobs : "Processed " + ctsk._nobs + " rows but expected " + model._output._nobs; // Check same number of skipped rows as Gram
Frame tmp = ctsk.outputFrame();
uvecs[k] = tmp.vec(0); // Save output column of U
tmp.unlock(self());
return model._output._d[k];
}
// Algorithm 4.4: Randomized subspace iteration from Halko et al (http://arxiv.org/pdf/0909.4061.pdf)
private Frame randSubIterInPlace(DataInfo dinfo, SVDModel model) {
DataInfo yinfo = null;
Frame yqfrm = null;
try {
// 1) Initialize Y = AG where G ~ N(0,1) and compute Y = QR factorization
update(1, "Initializing random subspace of training data Y");
double[][] gt = ArrayUtils.gaussianArray(_parms._nv, _ncolExp, _parms._seed);
RandSubInit rtsk = new RandSubInit(self(), dinfo, gt);
rtsk.doAll(_parms._nv, Vec.T_NUM, dinfo._adaptedFrame);
yqfrm = rtsk.outputFrame(Key.make(), null, null); // Alternates between Y and Q from Y = QR
// Make input frame [A,Q] where A = read-only training data, Y = A \tilde{Q}, Q from Y = QR factorization
// Note: If A is n by p (p = num cols with categoricals expanded), then \tilde{Q} is p by k and Q is n by k
// Q frame is used to save both intermediate Y calculation and final orthonormal Q matrix
Frame aqfrm = new Frame(dinfo._adaptedFrame);
aqfrm.add(yqfrm);
// Calculate Cholesky of Y Gram to get R' = L matrix
update(1, "Computing QR factorization of Y");
yinfo = new DataInfo(Key.make(), yqfrm, null, true, DataInfo.TransformType.NONE, true, false, false);
DKV.put(yinfo._key, yinfo);
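// computeQInPlace forms Q via the Cholesky of the Y Gram (Y'Y = R'R, so L = R')
// and a forward solve, a tall-skinny QR that avoids distributed Householder steps.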
LinearAlgebraUtils.computeQInPlace(self(), yinfo);
model._output._iterations = 0;
while (model._output._iterations < _parms._max_iterations) {
if(!isRunning()) break;
update(1, "Iteration " + String.valueOf(model._output._iterations+1) + " of randomized subspace iteration");
// 2) Form \tilde{Y}_j = A'Q_{j-1} and compute \tilde{Y}_j = \tilde{Q}_j \tilde{R}_j factorization
SMulTask stsk = new SMulTask(dinfo, _parms._nv);
stsk.doAll(aqfrm);
Matrix ysmall = new Matrix(stsk._atq);
QRDecomposition ysmall_qr = new QRDecomposition(ysmall);
double[][] qtilde = ysmall_qr.getQ().getArray();
// 3) [A,Q_{j-1}] -> [A,Y_j]: Form Y_j = A\tilde{Q}_j and compute Y_j = Q_jR_j factorization
BMulInPlaceTask tsk = new BMulInPlaceTask(dinfo, ArrayUtils.transpose(qtilde));
tsk.doAll(aqfrm);
LinearAlgebraUtils.computeQInPlace(self(), yinfo);
model._output._iterations++;
model.update(self());
}
} catch( Throwable t ) {
Job thisJob = DKV.getGet(_key);
if (thisJob._state == JobState.CANCELLED) {
Log.info("Job cancelled by user.");
} else {
t.printStackTrace();
failed(t);
throw t;
}
} finally {
if( yinfo != null ) yinfo.remove();
}
return yqfrm;
}
// Algorithm 4.4: Randomized subspace iteration from Halko et al (http://arxiv.org/pdf/0909.4061.pdf)
// This function keeps track of change in Q each iteration ||Q_j - Q_{j-1}||_2 to check convergence
private Frame randSubIter(DataInfo dinfo, SVDModel model) {
DataInfo yinfo = null;
Frame ybig = null, qfrm = null;
final int ncolA = dinfo._adaptedFrame.numCols();
try {
// 1) Initialize Y = AG where G ~ N(0,1) and compute Y = QR factorization
update(1, "Initializing random subspace of training data Y");
double[][] gt = ArrayUtils.gaussianArray(_parms._nv, _ncolExp, _parms._seed);
RandSubInit rtsk = new RandSubInit(self(), dinfo, gt);
rtsk.doAll(_parms._nv, Vec.T_NUM, dinfo._adaptedFrame);
ybig = rtsk.outputFrame(Key.make(), null, null);
// Make input frame [A,Q,Y] where A = read-only training data, Y = A \tilde{Q}, Q from Y = QR factorization
// Note: If A is n by p (p = num cols with categoricals expanded), then \tilde{Q} is p by k and Q is n by k
Frame ayqfrm = new Frame(dinfo._adaptedFrame);
ayqfrm.add(ybig);
for (int i = 0; i < _parms._nv; i++)
ayqfrm.add("qcol_" + i, ayqfrm.anyVec().makeZero());
Frame ayfrm = ayqfrm.subframe(0, ncolA + _parms._nv); // [A,Y]
Frame yqfrm = ayqfrm.subframe(ncolA, ayqfrm.numCols()); // [Y,Q]
Frame aqfrm = ayqfrm.subframe(0, ncolA);
aqfrm.add(ayqfrm.subframe(ncolA + _parms._nv, ayqfrm.numCols())); // [A,Q]
// Calculate Cholesky of Gram to get R' = L matrix
update(1, "Computing QR factorization of Y");
yinfo = new DataInfo(Key.make(), ybig, null, true, DataInfo.TransformType.NONE, true, false, false);
DKV.put(yinfo._key, yinfo);
LinearAlgebraUtils.computeQ(self(), yinfo, yqfrm);
model._output._iterations = 0;
long qobs = dinfo._adaptedFrame.numRows() * _parms._nv; // Number of observations in Q
double qerr = 2 * TOLERANCE * qobs; // Stop when average SSE between Q_j and Q_{j-1} falls below tolerance
while ((model._output._iterations < 10 || qerr / qobs > TOLERANCE) && model._output._iterations < _parms._max_iterations) { // Run at least 10 iterations before tolerance cutoff
if(!isRunning()) break;
update(1, "Iteration " + String.valueOf(model._output._iterations+1) + " of randomized subspace iteration");
// 2) Form \tilde{Y}_j = A'Q_{j-1} and compute \tilde{Y}_j = \tilde{Q}_j \tilde{R}_j factorization
SMulTask stsk = new SMulTask(dinfo, _parms._nv);
stsk.doAll(aqfrm); // Pass in [A,Q]
Matrix ysmall = new Matrix(stsk._atq);
QRDecomposition ysmall_qr = new QRDecomposition(ysmall);
double[][] ysmall_q = ysmall_qr.getQ().getArray();
// 3) Form Y_j = A\tilde{Q}_j and compute Y_j = Q_jR_j factorization
BMulInPlaceTask tsk = new BMulInPlaceTask(dinfo, ArrayUtils.transpose(ysmall_q));
tsk.doAll(ayfrm);
qerr = LinearAlgebraUtils.computeQ(self(), yinfo, yqfrm);
model._output._iterations++;
model.update(self());
}
// 4) Extract and save final Q_j from [A,Q] frame
qfrm = ayqfrm.extractFrame(ncolA + _parms._nv, ayqfrm.numCols());
qfrm = new Frame(Key.make(), qfrm.names(), qfrm.vecs());
DKV.put(qfrm);
} catch( Throwable t ) {
Job thisJob = DKV.getGet(_key);
if (thisJob._state == JobState.CANCELLED) {
Log.info("Job cancelled by user.");
} else {
t.printStackTrace();
failed(t);
throw t;
}
} finally {
if( yinfo != null ) yinfo.remove();
if( ybig != null ) ybig.delete();
}
return qfrm;
}
// Algorithm 5.1: Direct SVD from Halko et al (http://arxiv.org/pdf/0909.4061.pdf)
private Frame directSVD(DataInfo dinfo, Frame qfrm, SVDModel model) {
String u_name = (_parms._u_name == null || _parms._u_name.length() == 0) ? "SVDUMatrix_" + Key.rand() : _parms._u_name;
return directSVD(dinfo, qfrm, model, u_name);
}
private Frame directSVD(DataInfo dinfo, Frame qfrm, SVDModel model, String u_name) {
DataInfo qinfo = null;
Frame u = null;
final int ncolA = dinfo._adaptedFrame.numCols();
try {
// 0) Make input frame [A,Q], where A = read-only training data, Q = matrix from randomized subspace iteration
Vec[] vecs = new Vec[ncolA + _parms._nv];
for (int i = 0; i < ncolA; i++) vecs[i] = dinfo._adaptedFrame.vec(i);
for (int i = 0; i < _parms._nv; i++) vecs[ncolA + i] = qfrm.vec(i);
Frame aqfrm = new Frame(vecs);
// 1) Form the matrix B' = A'Q = (Q'A)'
update(1, "Forming small matrix B = Q'A for direct SVD");
SMulTask stsk = new SMulTask(dinfo, _parms._nv);
stsk.doAll(aqfrm);
// 2) Compute SVD of small matrix: If B' = WDV', then B = VDW'
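// Rationale: A ~= QQ'A = QB with B = Q'A, so an exact SVD of the small k-by-p
// matrix B gives an approximate SVD of A. Here B' = A'Q = WDV' is computed, so
// B = VDW' and A ~= (QV)DW': left singular vectors U = QV, right vectors = W.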
update(1, "Calculating SVD of small matrix locally");
Matrix atqJ = new Matrix(stsk._atq);
SingularValueDecomposition svdJ = atqJ.svd();
// 3) Form orthonormal matrix U = QV
update(1, "Forming distributed orthonormal matrix U");
if (_parms._keep_u) {
model._output._u_key = Key.make(u_name);
double[][] svdJ_u = svdJ.getV().getMatrix(0,atqJ.getColumnDimension()-1,0,_parms._nv-1).getArray();
qinfo = new DataInfo(Key.make(), qfrm, null, true, DataInfo.TransformType.NONE, false, false, false);
DKV.put(qinfo._key, qinfo);
BMulTask btsk = new BMulTask(self(), qinfo, ArrayUtils.transpose(svdJ_u));
btsk.doAll(_parms._nv, Vec.T_NUM, qinfo._adaptedFrame);
u = btsk.outputFrame(model._output._u_key, null, null);
}
model._output._d = Arrays.copyOfRange(svdJ.getSingularValues(),0,_parms._nv);
model._output._v = svdJ.getU().getMatrix(0,atqJ.getRowDimension()-1,0,_parms._nv-1).getArray();
} catch( Throwable t ) {
Job thisJob = DKV.getGet(_key);
if (thisJob._state == JobState.CANCELLED) {
Log.info("Job cancelled by user.");
} else {
t.printStackTrace();
failed(t);
throw t;
}
} finally {
if( qinfo != null ) qinfo.remove();
}
return u;
}
@Override protected void compute2() {
SVDModel model = null;
DataInfo dinfo = null;
Frame u = null, qfrm = null;
Vec[] uvecs = null;
try {
init(true); // Initialize parameters
_parms.read_lock_frames(SVD.this); // Fetch & read-lock input frames
if (error_count() > 0) throw new IllegalArgumentException("Found validation errors: " + validationErrors());
// The model to be built
model = new SVDModel(dest(), _parms, new SVDModel.SVDOutput(SVD.this));
model.delete_and_lock(self());
// 0) Transform training data and save standardization vectors for use in scoring later
dinfo = new DataInfo(Key.make(), _train, _valid, 0, _parms._use_all_factor_levels, _parms._transform, DataInfo.TransformType.NONE, /* skipMissing */ !_parms._impute_missing, /* imputeMissing */ _parms._impute_missing, /* missingBucket */ false, /* weights */ false, /* offset */ false, /* fold */ false, /* intercept */ false);
DKV.put(dinfo._key, dinfo);
// Save adapted frame info for scoring later
model._output._normSub = dinfo._normSub == null ? new double[dinfo._nums] : dinfo._normSub;
if(dinfo._normMul == null) {
model._output._normMul = new double[dinfo._nums];
Arrays.fill(model._output._normMul, 1.0);
} else
model._output._normMul = dinfo._normMul;
model._output._permutation = dinfo._permutation;
model._output._nnums = dinfo._nums;
model._output._ncats = dinfo._cats;
model._output._catOffsets = dinfo._catOffsets;
model._output._names_expanded = dinfo.coefNames();
String u_name = (_parms._u_name == null || _parms._u_name.length() == 0) ? "SVDUMatrix_" + Key.rand() : _parms._u_name;
String v_name = (_parms._v_name == null || _parms._v_name.length() == 0) ? "SVDVMatrix_" + Key.rand() : _parms._v_name;
if(_parms._svd_method == SVDParameters.Method.GramSVD) {
// Calculate and save Gram matrix of training data
// NOTE: Gram computes A'A/n where n = nrow(A) = number of rows in training set (excluding rows with NAs)
update(1, "Begin distributed calculation of Gram matrix");
GramTask gtsk = new GramTask(self(), dinfo).doAll(dinfo._adaptedFrame);
Gram gram = gtsk._gram; // TODO: This ends up with all NaNs if training data has too many missing values
assert gram.fullN() == _ncolExp;
model._output._nobs = gtsk._nobs;
model._output._total_variance = gram.diagSum() * gtsk._nobs / (gtsk._nobs-1); // Rescale by nobs/(nobs-1): gram = X'X/nobs, but the sample variance uses nobs-1 in the denominator
model.update(self());
// Calculate SVD of G = A'A/n and back out SVD of A. If SVD of A = UDV' then A'A/n = V(D^2/n)V'
update(1, "Calculating SVD of Gram matrix locally");
Matrix gramJ = new Matrix(gtsk._gram.getXX());
SingularValueDecomposition svdJ = gramJ.svd();
// Output diagonal of D
update(1, "Computing stats from SVD");
double[] sval = svdJ.getSingularValues();
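// Eigenvalues of A'A/n are sigma_k^2 / n, so back out sigma_k = sqrt(n * sval[k])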
model._output._d = new double[_parms._nv]; // Only want rank = nv diagonal values
for(int k = 0; k < _parms._nv; k++)
model._output._d[k] = Math.sqrt(sval[k] * model._output._nobs);
// Output right singular vectors V
double[][] v = svdJ.getV().getArray();
assert v.length == _ncolExp && dinfo._adaptedFrame.numColsExp(_parms._use_all_factor_levels, false) == _ncolExp;
model._output._v = new double[_ncolExp][_parms._nv]; // Only want rank = nv decomposition
for(int i = 0; i < v.length; i++)
System.arraycopy(v[i], 0, model._output._v[i], 0, _parms._nv);
// Calculate left singular vectors U = AVD^(-1) if requested
if(_parms._keep_u) {
model._output._u_key = Key.make(u_name);
double[][] vt = ArrayUtils.transpose(model._output._v);
for (int k = 0; k < _parms._nv; k++)
ArrayUtils.div(vt[k], model._output._d[k]);
BMulTask tsk = new BMulTask(self(), dinfo, vt).doAll(_parms._nv, Vec.T_NUM, dinfo._adaptedFrame);
u = tsk.outputFrame(model._output._u_key, null, null);
}
} else if(_parms._svd_method == SVDParameters.Method.Power) {
// Calculate and save Gram matrix of training data
// NOTE: Gram computes A'A/n where n = nrow(A) = number of rows in training set (excluding rows with NAs)
update(1, "Begin distributed calculation of Gram matrix");
GramTask gtsk = new GramTask(self(), dinfo).doAll(dinfo._adaptedFrame);
Gram gram = gtsk._gram; // TODO: This ends up with all NaNs if training data has too many missing values
assert gram.fullN() == _ncolExp;
model._output._nobs = gtsk._nobs;
model._output._total_variance = gram.diagSum() * gtsk._nobs / (gtsk._nobs-1); // Rescale by nobs/(nobs-1): gram = X'X/nobs, but the sample variance uses nobs-1 in the denominator
model.update(self());
// 1) Run one iteration of power method
update(1, "Iteration 1 of power method"); // One unit of work
// 1a) Initialize right singular vector v_1
model._output._v = new double[_parms._nv][_ncolExp]; // Store V' for ease of use and transpose back at end
model._output._v[0] = powerLoop(gram, _parms._seed);
// Keep track of I - \sum_i v_iv_i' where v_i = eigenvector i
double[][] ivv_sum = new double[_ncolExp][_ncolExp];
for (int i = 0; i < _ncolExp; i++) ivv_sum[i][i] = 1;
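// ivv_sum starts as the identity; after k deflation updates it equals
// I - \sum_{i<=k} v_i v_i', the projector used to form each residual Gram.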
// 1b) Initialize singular value \sigma_1 and update u_1 <- Av_1
if (!_parms._only_v) {
model._output._d = new double[_parms._nv];
model._output._u_key = Key.make(u_name);
uvecs = new Vec[_parms._nv];
computeSigmaU(dinfo, model, 0, ivv_sum, uvecs); // Compute first singular value \sigma_1
}
model._output._iterations = 1;
model.update(self()); // Update model in K/V store
// 1c) Update Gram matrix A_1'A_1 = (I - v_1v_1')A'A(I - v_1v_1')
updateIVVSum(ivv_sum, model._output._v[0]);
// double[][] gram_update = ArrayUtils.multArrArr(ArrayUtils.multArrArr(ivv_sum, gram), ivv_sum);
GramUpdate guptsk = new GramUpdate(self(), dinfo, ivv_sum).doAll(dinfo._adaptedFrame);
Gram gram_update = guptsk._gram;
for (int k = 1; k < _parms._nv; k++) {
if (!isRunning()) break;
update(1, "Iteration " + String.valueOf(k+1) + " of power method"); // One unit of work
// 2) Iterate x_i <- (A_k'A_k/n)x_{i-1} until convergence and set v_k = x_i/||x_i||
model._output._v[k] = powerLoop(gram_update, _parms._seed);
// 3) Residual data A_k = A - \sum_{i=1}^k \sigma_i u_iv_i' = A - \sum_{i=1}^k Av_iv_i' = A(I - \sum_{i=1}^k v_iv_i')
// 3a) Compute \sigma_k = ||A_{k-1}v_k|| and u_k = A_{k-1}v_k/\sigma_k
if (!_parms._only_v)
computeSigmaU(dinfo, model, k, ivv_sum, uvecs);
// 3b) Compute Gram of residual A_k'A_k = (I - \sum_{i=1}^k v_jv_j')A'A(I - \sum_{i=1}^k v_jv_j')
updateIVVSum(ivv_sum, model._output._v[k]); // Update I - \sum_{i=1}^k v_iv_i' with sum up to current singular value
// gram_update = ArrayUtils.multArrArr(ivv_sum, ArrayUtils.multArrArr(gram, ivv_sum)); // Too slow on wide arrays
guptsk = new GramUpdate(self(), dinfo, ivv_sum).doAll(dinfo._adaptedFrame);
gram_update = guptsk._gram;
model._output._iterations++;
model.update(self()); // Update model in K/V store
}
// 4) Normalize output frame columns by singular values to get left singular vectors
model._output._v = ArrayUtils.transpose(model._output._v); // Transpose to get V (since vectors were stored as rows)
if (!_parms._only_v && !_parms._keep_u) { // Delete U vecs if computed, but user does not want it returned
for (int i = 0; i < uvecs.length; i++) uvecs[i].remove();
model._output._u_key = null;
} else if (!_parms._only_v && _parms._keep_u) { // Divide U cols by singular values and save to DKV
u = new Frame(model._output._u_key, null, uvecs);
DKV.put(u._key, u);
DivideU utsk = new DivideU(model._output._d);
utsk.doAll(u);
}
} else if(_parms._svd_method == SVDParameters.Method.Randomized) {
qfrm = randSubIter(dinfo, model);
u = directSVD(dinfo, qfrm, model, u_name);
} else
error("_svd_method", "Unrecognized SVD method " + _parms._svd_method);
if (_parms._save_v_frame) {
model._output._v_key = Key.make(v_name);
ArrayUtils.frame(model._output._v_key, null, model._output._v);
}
model._output._model_summary = createModelSummaryTable(model._output);
model.update(self());
done();
} catch( Throwable t ) {
Job thisJob = DKV.getGet(_key);
if (thisJob._state == JobState.CANCELLED) {
Log.info("Job cancelled by user.");
} else {
t.printStackTrace();
failed(t);
throw t;
}
} finally {
updateModelOutput();
if( model != null ) model.unlock(_key);
if( dinfo != null ) dinfo.remove();
if( u != null && !_parms._keep_u ) u.delete();
if( qfrm != null ) qfrm.delete();
_parms.read_unlock_frames(SVD.this);
}
// Job thisJob = DKV.getGet(_key);
// System.out.println("------------- JOB status: " + Arrays.toString(Job.jobs()));
tryComplete();
}
Key self() {
return _key;
}
}
private TwoDimTable createModelSummaryTable(SVDModel.SVDOutput output) {
if(null == output._d) return null;
String[] colTypes = new String[_parms._nv];
String[] colFormats = new String[_parms._nv];
String[] colHeaders = new String[_parms._nv];
Arrays.fill(colTypes, "double");
Arrays.fill(colFormats, "%5f");
for(int i = 0; i < colHeaders.length; i++) colHeaders[i] = "sval" + String.valueOf(i + 1);
return new TwoDimTable("Singular values", null, new String[1],
colHeaders, colTypes, colFormats, "", new String[1][], new double[][]{output._d});
}
private static class CalcSigmaU extends FrameTask<CalcSigmaU> {
final double[] _svec;
public double _sval;
public long _nobs;
public CalcSigmaU(Key jobKey, DataInfo dinfo, double[] svec) {
super(jobKey, dinfo);
_svec = svec;
_sval = 0;
}
@Override protected void processRow(long gid, DataInfo.Row r, NewChunk[] outputs) {
double num = r.innerProduct(_svec);
outputs[0].addNum(num);
_sval += num * num;
++_nobs;
}
@Override public void reduce(CalcSigmaU other) {
_nobs += other._nobs;
_sval += other._sval;
}
@Override protected void postGlobal() {
_sval = Math.sqrt(_sval);
}
}
private static class GramUpdate extends FrameTask<GramUpdate> {
final double[][] _ivv;
public Gram _gram;
public long _nobs;
public GramUpdate(Key jobKey, DataInfo dinfo, double[][] ivv) {
super(jobKey, dinfo);
assert null != ivv && ivv.length == ivv[0].length;
_ivv = ivv;
}
@Override protected boolean chunkInit(){
_gram = new Gram(_dinfo.fullN(), 0, _ivv.length, 0, false);
return true;
}
@Override protected void processRow(long gid, DataInfo.Row r) {
double w = 1; // TODO: add weights to dinfo?
double[] nums = new double[_ivv.length];
for(int row = 0; row < _ivv.length; row++)
nums[row] = r.innerProduct(_ivv[row]);
_gram.addRow(_dinfo.newDenseRow(nums), w);
++_nobs;
}
@Override protected void chunkDone(long n){
double r = 1.0/_nobs;
_gram.mul(r);
}
@Override public void reduce(GramUpdate gt){
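// Each partial Gram is an average over its own _nobs rows (see chunkDone), so
// re-weight both sides by their share of the combined count before adding.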
double r1 = (double)_nobs/(_nobs+gt._nobs);
_gram.mul(r1);
double r2 = (double)gt._nobs/(_nobs+gt._nobs);
gt._gram.mul(r2);
_gram.add(gt._gram);
_nobs += gt._nobs;
}
}
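// Rescale each column of AV by 1/sigma_j in place, turning column j into the
// left singular vector u_j = A v_j / sigma_j.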
private static class DivideU extends MRTask<DivideU> {
final double[] _sigma;
public DivideU(double[] sigma) {
_sigma = sigma;
}
@Override public void map(Chunk[] cs) {
assert _sigma.length == cs.length;
for (int col = 0; col < cs.length; col++) {
for(int row = 0; row < cs[0].len(); row++) {
double x = cs[col].atd(row);
cs[col].set(row, x / _sigma[col]);
}
}
}
}
// Compute Y = AG where A is n by p and G is a p by k standard Gaussian matrix
private static class RandSubInit extends FrameTask<RandSubInit> {
final double[][] _gaus; // G' is k by p for convenient multiplication
public RandSubInit(Key jobKey, DataInfo dinfo, double[][] gaus) {
super(jobKey, dinfo);
_gaus = gaus;
}
@Override protected void processRow(long gid, DataInfo.Row row, NewChunk[] outputs) {
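// Each output entry is the inner product of this (expanded) data row with one
// Gaussian column of G, i.e. one entry of this row of Y = AG.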
for(int k = 0; k < _gaus.length; k++) {
double y = row.innerProduct(_gaus[k]);
outputs[k].addNum(y);
}
}
}
}