Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.ssvd;
import org.apache.mahout.math.CholeskyDecomposition;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixWritable;
import org.apache.mahout.math.RandomTrinaryMatrix;
import org.apache.mahout.math.SingularValueDecomposition;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.Functions;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
/**
* Sequential block-oriented out of core SVD algorithm.
*
* The basic algorithm (in-core version) is that we do a random projects, get a basis of that and
* then re-project the original matrix using that basis. This re-projected matrix allows us to get
* an approximate SVD of the original matrix.
*
* The input to this program is a list of files that contain the sub-matrices A_i. The result is a
* vector of singular values and optionally files that contain the left and right singular vectors.
*
* Mathematically, to decompose A, we do this:
*
* Y = A * \Omega
*
* Q R = Y
*
* B = Q" A
*
* U D V' = B
*
* (Q U) D V' \approx A
*
* To do this out of core, we break A into blocks each with the same number of rows. This gives a
* block-wise version of Y. As we are computing Y, we can also accumulate Y' Y and when done, we
* can use a Cholesky decomposition to do the QR decomposition of Y in a latent form. That gives us
* B in block-wise form and we can do the same trick to get an LQ of B. The L part can be
* decomposed in memory. Then we can recombine to get the final decomposition.
*
* The details go like this. Start with a block form of A.
*
* Y_i = A_i * \Omega
*
* Instead of doing a QR decomposition of Y, we do a Cholesky decomposition of Y' Y. This is a
* small in-memory operation. Q is large and dense and won't fit in memory.
*
* R' R = \sum_i Y_i' Y_i
*
* For reference, R is all we need to compute explicitly. Q will be computed on the fly when
* needed.
*
* Q = Y R^-1
*
* B = Q" A = \sum_i (A \Omega R^-1)' A_i
*
* As B is generated, it needs to be segmented in row-wise blocks since it is wide but not tall.
* This storage requires something like a map-reduce to accumulate the partial sums. In this code,
* we do this by re-reading previously computed chunks and augmenting them.
*
* While the pieces of B are being computed, we can accumulate B B' in preparation for a second
* Cholesky decomposition
*
* L L' = B B' = sum B_j B_j'
*
* Again, this is an LQ decomposition of BB', but we don't compute the Q part explicitly. L will be
* small and thus tractable.
*
* Finally, we do the actual SVD decomposition.
*
* U_0 D V_0' = L
*
* D contains the singular values of A. The left and right singular values can be reconstructed
* using Y and B. Note that both of these reconstructions can be done with single passes through
* the blocked forms of Y and B.
*
* U = A \Omega R^{-1} U_0
*
* V = B' L'^{-1} V_0
*/
public class SequentialOutOfCoreSvd {
private final CholeskyDecomposition l2;
private final SingularValueDecomposition svd;
private final CholeskyDecomposition r2;
private final int columnsPerSlice;
private final int seed;
private final int dim;
public SequentialOutOfCoreSvd(Iterable partsOfA, File tmpDir, int internalDimension, int columnsPerSlice)
throws IOException {
this.columnsPerSlice = columnsPerSlice;
this.dim = internalDimension;
seed = 1;
Matrix y2 = null;
// step 1, compute R as in R'R = Y'Y where Y = A \Omega
for (File file : partsOfA) {
MatrixWritable m = new MatrixWritable();
try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
m.readFields(in);
}
Matrix aI = m.get();
Matrix omega = new RandomTrinaryMatrix(seed, aI.columnSize(), internalDimension, false);
Matrix y = aI.times(omega);
if (y2 == null) {
y2 = y.transpose().times(y);
} else {
y2.assign(y.transpose().times(y), Functions.PLUS);
}
}
r2 = new CholeskyDecomposition(y2);
// step 2, compute B
int ncols = 0;
for (File file : partsOfA) {
MatrixWritable m = new MatrixWritable();
try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
m.readFields(in);
}
Matrix aI = m.get();
ncols = Math.max(ncols, aI.columnSize());
Matrix omega = new RandomTrinaryMatrix(seed, aI.numCols(), internalDimension, false);
for (int j = 0; j < aI.numCols(); j += columnsPerSlice) {
Matrix yI = aI.times(omega);
Matrix aIJ = aI.viewPart(0, aI.rowSize(), j, Math.min(columnsPerSlice, aI.columnSize() - j));
Matrix bIJ = r2.solveRight(yI).transpose().times(aIJ);
addToSavedCopy(bFile(tmpDir, j), bIJ);
}
}
// step 3, compute BB', L and SVD(L)
Matrix b2 = new DenseMatrix(internalDimension, internalDimension);
MatrixWritable bTmp = new MatrixWritable();
for (int j = 0; j < ncols; j += columnsPerSlice) {
if (bFile(tmpDir, j).exists()) {
try (DataInputStream in = new DataInputStream(new FileInputStream(bFile(tmpDir, j)))) {
bTmp.readFields(in);
}
b2.assign(bTmp.get().times(bTmp.get().transpose()), Functions.PLUS);
}
}
l2 = new CholeskyDecomposition(b2);
svd = new SingularValueDecomposition(l2.getL());
}
public void computeV(File tmpDir, int ncols) throws IOException {
// step 5, compute pieces of V
for (int j = 0; j < ncols; j += columnsPerSlice) {
File bPath = bFile(tmpDir, j);
if (bPath.exists()) {
MatrixWritable m = new MatrixWritable();
try (DataInputStream in = new DataInputStream(new FileInputStream(bPath))) {
m.readFields(in);
}
m.set(l2.solveRight(m.get().transpose()).times(svd.getV()));
try (DataOutputStream out = new DataOutputStream(new FileOutputStream(
new File(tmpDir, String.format("V-%s", bPath.getName().replaceAll(".*-", "")))))) {
m.write(out);
}
}
}
}
public void computeU(Iterable partsOfA, File tmpDir) throws IOException {
// step 4, compute pieces of U
for (File file : partsOfA) {
MatrixWritable m = new MatrixWritable();
m.readFields(new DataInputStream(new FileInputStream(file)));
Matrix aI = m.get();
Matrix y = aI.times(new RandomTrinaryMatrix(seed, aI.numCols(), dim, false));
Matrix uI = r2.solveRight(y).times(svd.getU());
m.set(uI);
try (DataOutputStream out = new DataOutputStream(new FileOutputStream(
new File(tmpDir, String.format("U-%s", file.getName().replaceAll(".*-", "")))))) {
m.write(out);
}
}
}
private static void addToSavedCopy(File file, Matrix matrix) throws IOException {
MatrixWritable mw = new MatrixWritable();
if (file.exists()) {
try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
mw.readFields(in);
}
mw.get().assign(matrix, Functions.PLUS);
} else {
mw.set(matrix);
}
try (DataOutputStream out = new DataOutputStream(new FileOutputStream(file))) {
mw.write(out);
}
}
private static File bFile(File tmpDir, int j) {
return new File(tmpDir, String.format("B-%09d", j));
}
public Vector getSingularValues() {
return new DenseVector(svd.getSingularValues());
}
}