org.apache.mahout.math.ssvd.SequentialOutOfCoreSvd Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.ssvd;

import org.apache.mahout.math.CholeskyDecomposition;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixWritable;
import org.apache.mahout.math.RandomTrinaryMatrix;
import org.apache.mahout.math.SingularValueDecomposition;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.Functions;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Sequential block-oriented out of core SVD algorithm.
 * 
 * The basic algorithm (in-core version) is that we do a random projects, get a basis of that and
 * then re-project the original matrix using that basis.  This re-projected matrix allows us to get
 * an approximate SVD of the original matrix.
 * 

 * The input to this program is a list of files that contain the sub-matrices A_i.  The result is a
 * vector of singular values and optionally files that contain the left and right singular vectors.
 * 

 * Mathematically, to decompose A, we do this:
 * 

 * Y = A * \Omega
 * 

 * Q R = Y
 * 

 * B = Q" A
 * 

 * U D V' = B
 * 

 * (Q U) D V' \approx A
 * 

 * To do this out of core, we break A into blocks each with the same number of rows.  This gives a
 * block-wise version of Y.  As we are computing Y, we can also accumulate Y' Y and when done, we
 * can use a Cholesky decomposition to do the QR decomposition of Y in a latent form.  That gives us
 * B in block-wise form and we can do the same trick to get an LQ of B.  The L part can be
 * decomposed in memory.  Then we can recombine to get the final decomposition.
 * 

 * The details go like this.  Start with a block form of A.
 * 

 * Y_i = A_i * \Omega
 * 

 * Instead of doing a QR decomposition of Y, we do a Cholesky decomposition of Y' Y.  This is a
 * small in-memory operation.  Q is large and dense and won't fit in memory.
 * 

 * R' R = \sum_i Y_i' Y_i
 * 

 * For reference, R is all we need to compute explicitly.  Q will be computed on the fly when
 * needed.
 * 

 * Q = Y R^-1
 * 

 * B = Q" A = \sum_i (A \Omega R^-1)' A_i
 * 

 * As B is generated, it needs to be segmented in row-wise blocks since it is wide but not tall.
 * This storage requires something like a map-reduce to accumulate the partial sums.  In this code,
 * we do this by re-reading previously computed chunks and augmenting them.
 * 

 * While the pieces of B are being computed, we can accumulate B B' in preparation for a second
 * Cholesky decomposition
 * 

 * L L' = B B' = sum B_j B_j'
 * 

 * Again, this is an LQ decomposition of BB', but we don't compute the Q part explicitly.  L will be
 * small and thus tractable.
 * 

 * Finally, we do the actual SVD decomposition.
 * 

 * U_0 D V_0' = L
 * 

 * D contains the singular values of A.  The left and right singular values can be reconstructed
 * using Y and B.  Note that both of these reconstructions can be done with single passes through
 * the blocked forms of Y and B.
 * 

 * U = A \Omega R^{-1} U_0
 * 
 * V = B' L'^{-1} V_0
 */
public class SequentialOutOfCoreSvd {

  private final CholeskyDecomposition l2;
  private final SingularValueDecomposition svd;
  private final CholeskyDecomposition r2;
  private final int columnsPerSlice;
  private final int seed;
  private final int dim;

  public SequentialOutOfCoreSvd(Iterable partsOfA, File tmpDir, int internalDimension, int columnsPerSlice)
    throws IOException {
    this.columnsPerSlice = columnsPerSlice;
    this.dim = internalDimension;

    seed = 1;
    Matrix y2 = null;

    // step 1, compute R as in R'R = Y'Y where Y = A \Omega
    for (File file : partsOfA) {
      MatrixWritable m = new MatrixWritable();
      try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        m.readFields(in);
      }

      Matrix aI = m.get();
      Matrix omega = new RandomTrinaryMatrix(seed, aI.columnSize(), internalDimension, false);
      Matrix y = aI.times(omega);

      if (y2 == null) {
        y2 = y.transpose().times(y);
      } else {
        y2.assign(y.transpose().times(y), Functions.PLUS);
      }
    }
    r2 = new CholeskyDecomposition(y2);

    // step 2, compute B
    int ncols = 0;
    for (File file : partsOfA) {
      MatrixWritable m = new MatrixWritable();
      try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        m.readFields(in);
      }
      Matrix aI = m.get();
      ncols = Math.max(ncols, aI.columnSize());

      Matrix omega = new RandomTrinaryMatrix(seed, aI.numCols(), internalDimension, false);
      for (int j = 0; j < aI.numCols(); j += columnsPerSlice) {
        Matrix yI = aI.times(omega);
        Matrix aIJ = aI.viewPart(0, aI.rowSize(), j, Math.min(columnsPerSlice, aI.columnSize() - j));
        Matrix bIJ = r2.solveRight(yI).transpose().times(aIJ);
        addToSavedCopy(bFile(tmpDir, j), bIJ);
      }
    }

    // step 3, compute BB', L and SVD(L)
    Matrix b2 = new DenseMatrix(internalDimension, internalDimension);
    MatrixWritable bTmp = new MatrixWritable();
    for (int j = 0; j < ncols; j += columnsPerSlice) {
      if (bFile(tmpDir, j).exists()) {
        try (DataInputStream in = new DataInputStream(new FileInputStream(bFile(tmpDir, j)))) {
          bTmp.readFields(in);
        }

        b2.assign(bTmp.get().times(bTmp.get().transpose()), Functions.PLUS);
      }
    }
    l2 = new CholeskyDecomposition(b2);
    svd = new SingularValueDecomposition(l2.getL());
  }

  public void computeV(File tmpDir, int ncols) throws IOException {
    // step 5, compute pieces of V
    for (int j = 0; j < ncols; j += columnsPerSlice) {
      File bPath = bFile(tmpDir, j);
      if (bPath.exists()) {
        MatrixWritable m = new MatrixWritable();
        try (DataInputStream in = new DataInputStream(new FileInputStream(bPath))) {
          m.readFields(in);
        }
        m.set(l2.solveRight(m.get().transpose()).times(svd.getV()));
        try (DataOutputStream out = new DataOutputStream(new FileOutputStream(
            new File(tmpDir, String.format("V-%s", bPath.getName().replaceAll(".*-", "")))))) {
          m.write(out);
        }
      }
    }
  }

  public void computeU(Iterable partsOfA, File tmpDir) throws IOException {
    // step 4, compute pieces of U
    for (File file : partsOfA) {
      MatrixWritable m = new MatrixWritable();
      m.readFields(new DataInputStream(new FileInputStream(file)));
      Matrix aI = m.get();

      Matrix y = aI.times(new RandomTrinaryMatrix(seed, aI.numCols(), dim, false));
      Matrix uI = r2.solveRight(y).times(svd.getU());
      m.set(uI);
      try (DataOutputStream out = new DataOutputStream(new FileOutputStream(
          new File(tmpDir, String.format("U-%s", file.getName().replaceAll(".*-", "")))))) {
        m.write(out);
      }
    }
  }

  private static void addToSavedCopy(File file, Matrix matrix) throws IOException {
    MatrixWritable mw = new MatrixWritable();
    if (file.exists()) {
      try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        mw.readFields(in);
      }
      mw.get().assign(matrix, Functions.PLUS);
    } else {
      mw.set(matrix);
    }
    try (DataOutputStream out = new DataOutputStream(new FileOutputStream(file))) {
      mw.write(out);
    }
  }

  private static File bFile(File tmpDir, int j) {
    return new File(tmpDir, String.format("B-%09d", j));
  }

  public Vector getSingularValues() {
    return new DenseVector(svd.getSingularValues());
  }
}