/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.decomposer.hebbian;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Random;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.decomposer.AsyncEigenVerifier;
import org.apache.mahout.math.decomposer.EigenStatus;
import org.apache.mahout.math.decomposer.SingularVectorVerifier;
import org.apache.mahout.math.function.PlusMult;
import org.apache.mahout.math.function.TimesFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The Hebbian solver is an iterative, sparse singular value decomposition solver, based on the paper
* Generalized Hebbian Algorithm for
* Latent Semantic Analysis (2005) by Genevieve Gorrell and Brandyn Webb (a.k.a. Simon Funk).
* TODO: more description here! For now: read the inline comments, and the comments for the constructors.
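*
* <p>A minimal usage sketch (the numeric values below are illustrative, not recommendations;
* {@code corpus} is assumed to be an already-loaded {@link Matrix}):</p>
* <pre>{@code
*   HebbianSolver solver = new HebbianSolver(1e-6, 50); // convergenceTarget, maxPassesPerEigen
*   TrainingState state = solver.solve(corpus, 10);     // find the top 10 singular vectors
*   Matrix singularVectors = state.getCurrentEigens();  // one unit-normalized vector per row
* }</pre>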
*/
public class HebbianSolver {
private static final Logger log = LoggerFactory.getLogger(HebbianSolver.class);
private static final boolean DEBUG = false;
private final EigenUpdater updater;
private final SingularVectorVerifier verifier;
private final double convergenceTarget;
private final int maxPassesPerEigen;
private final Random rng = RandomUtils.getRandom();
private int numPasses = 0;
/**
* Creates a new HebbianSolver
*
* @param updater
* {@link EigenUpdater} used to do the actual work of iteratively updating the current "best guess"
* singular vector one data-point presentation at a time.
* @param verifier
* {@link SingularVectorVerifier } an object which repeatedly checks how close to
* convergence the current singular vector is (typically an
* {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } which does this
* in the background in another thread, while the main thread continues iterating)
* @param convergenceTarget a small "epsilon" value which tells the solver how small you want 1 minus the cosine
* of the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
* input corpus to be before declaring convergence
* @param maxPassesPerEigen a cutoff telling the solver how many convergence checks (performed by the
* verifier) to allow before giving up on the current eigenvector, even if it has not reached the convergenceTarget.
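*
* <p>A sketch of direct construction (values illustrative):
* {@code new HebbianSolver(new HebbianUpdater(), new AsyncEigenVerifier(), 1e-6, 100)}</p>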
*/
public HebbianSolver(EigenUpdater updater,
SingularVectorVerifier verifier,
double convergenceTarget,
int maxPassesPerEigen) {
this.updater = updater;
this.verifier = verifier;
this.convergenceTarget = convergenceTarget;
this.maxPassesPerEigen = maxPassesPerEigen;
}
/**
* Creates a new HebbianSolver with maxPassesPerEigen = Integer.MAX_VALUE (i.e. keep iterating until
* convergenceTarget is reached). Not recommended unless you are only looking for
* the first few (5, maybe 10?) singular
* vectors, as small errors introduced early on compound and put a floor on the error of subsequent vectors.
*
* @param updater {@link EigenUpdater} used to do the actual work of iteratively updating the current "best guess"
* singular vector one data-point presentation at a time.
* @param verifier {@link org.apache.mahout.math.decomposer.SingularVectorVerifier }
* an object which repeatedly checks how close to
* convergence the current singular vector is (typically an
* {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } which does this
* in the background in another thread, while the main thread continues iterating)
* @param convergenceTarget a small "epsilon" value which tells the solver how small you want 1 minus the cosine
* of the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
* input corpus to be before declaring convergence
*/
public HebbianSolver(EigenUpdater updater,
SingularVectorVerifier verifier,
double convergenceTarget) {
this(updater,
verifier,
convergenceTarget,
Integer.MAX_VALUE);
}
/**
* This is the recommended constructor to use if you're not sure which constructor to choose.
* Creates a new HebbianSolver with the default {@link HebbianUpdater } to do the updating work, and the default
* {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } to check for convergence in a
* (single) background thread.
*
* @param convergenceTarget a small "epsilon" value which tells the solver how small you want 1 minus the cosine
* of the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
* input corpus to be before declaring convergence
* @param maxPassesPerEigen a cutoff telling the solver how many convergence checks (performed by the
* verifier) to allow before giving up on the current eigenvector, even if it has not reached the convergenceTarget.
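*
* <p>E.g. (illustrative values only): {@code new HebbianSolver(1e-6, 100)}</p>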
*/
public HebbianSolver(double convergenceTarget, int maxPassesPerEigen) {
this(new HebbianUpdater(),
new AsyncEigenVerifier(),
convergenceTarget,
maxPassesPerEigen);
}
/**
* Creates a new HebbianSolver with the default {@link HebbianUpdater } to do the updating work, and the default
* {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } to check for convergence in a (single)
* background thread, with
* maxPassesPerEigen set to Integer.MAX_VALUE. Not recommended unless you are only looking
* for the first few (5, maybe 10?) singular
* vectors, as small errors introduced early on compound and put a floor on the error of subsequent vectors.
*
* @param convergenceTarget a small "epsilon" value which tells the solver how small you want 1 minus the cosine
* of the angle between a proposed eigenvector and that same vector after being multiplied by the (square of the)
* input corpus to be before declaring convergence
*/
public HebbianSolver(double convergenceTarget) {
this(convergenceTarget, Integer.MAX_VALUE);
}
/**
* Creates a new HebbianSolver with the default {@link HebbianUpdater } to do the updating work, and the default
* {@link org.apache.mahout.math.decomposer.AsyncEigenVerifier } to check for convergence in a (single)
* background thread, with
* convergenceTarget set to 0, which means that the solver will not use convergence as a loop-exiting
* criterion (it still checks for convergence anyway, so the status is logged and singular values are
* saved).
*
* @param numPassesPerEigen the exact number of times the verifier will check convergence status in the background
* before the solver will move on to the next eigen-vector.
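*
* <p>E.g. {@code new HebbianSolver(10)} checks convergence 10 times per singular vector before
* moving on (illustrative value).</p>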
*/
public HebbianSolver(int numPassesPerEigen) {
this(0.0, numPassesPerEigen);
}
/**
* Primary singular vector solving method.
*
* @param corpus input matrix to find singular vectors of. Need not be symmetric, but should probably be sparse (in
* fact the input vectors are not mutated, and are accessed only via dot-products and sums, so they should be
* {@link org.apache.mahout.math.SequentialAccessSparseVector })
* @param desiredRank the number of singular vectors to find (in roughly decreasing order by singular value)
* @return the final {@link TrainingState } of the solver, after desiredRank singular vectors (and approximate
* singular values) have been found.
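*
* <p>A sketch of consuming the returned state (accessors as used elsewhere in this class):</p>
* <pre>{@code
*   TrainingState state = solver.solve(corpus, desiredRank);
*   Matrix eigens = state.getCurrentEigens(); // row i holds the i-th (unit-normalized) singular vector
*   Vector first = eigens.viewRow(0);
* }</pre>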
*/
public TrainingState solve(Matrix corpus,
int desiredRank) {
int cols = corpus.numCols();
Matrix eigens = new DenseMatrix(desiredRank, cols);
List<Double> eigenValues = new ArrayList<>();
log.info("Finding {} singular vectors of matrix with {} rows, via Hebbian", desiredRank, corpus.numRows());
/*
* The corpusProjections matrix is a running cache of the residual projection of each corpus vector against all
* of the previously found singular vectors. Without this, if multiple passes over the data is made (per
* singular vector), recalculating these projections eventually dominates the computational complexity of the
* solver.
*/
Matrix corpusProjections = new DenseMatrix(corpus.numRows(), desiredRank);
TrainingState state = new TrainingState(eigens, corpusProjections);
for (int i = 0; i < desiredRank; i++) {
Vector currentEigen = new DenseVector(cols);
Vector previousEigen = null;
while (hasNotConverged(currentEigen, corpus, state)) {
int randomStartingIndex = getRandomStartingIndex(corpus, eigens);
Vector initialTrainingVector = corpus.viewRow(randomStartingIndex);
state.setTrainingIndex(randomStartingIndex);
updater.update(currentEigen, initialTrainingVector, state);
for (int corpusRow = 0; corpusRow < corpus.numRows(); corpusRow++) {
state.setTrainingIndex(corpusRow);
if (corpusRow != randomStartingIndex) {
updater.update(currentEigen, corpus.viewRow(corpusRow), state);
}
}
state.setFirstPass(false);
if (DEBUG) {
if (previousEigen == null) {
previousEigen = currentEigen.clone();
} else {
double dot = currentEigen.dot(previousEigen);
if (dot > 0.0) {
dot /= currentEigen.norm(2) * previousEigen.norm(2);
}
// log.info("Current pass * previous pass = {}", dot);
}
}
}
// converged!
double eigenValue = state.getStatusProgress().get(state.getStatusProgress().size() - 1).getEigenValue();
// it's actually more efficient to do this to normalize than to call currentEigen = currentEigen.normalize(),
// because the latter does a clone, which isn't necessary here.
currentEigen.assign(new TimesFunction(), 1 / currentEigen.norm(2));
eigens.assignRow(i, currentEigen);
eigenValues.add(eigenValue);
state.setCurrentEigenValues(eigenValues);
log.info("Found eigenvector {}, eigenvalue: {}", i, eigenValue);
// TODO: Persist intermediate output!
state.setFirstPass(true);
state.setNumEigensProcessed(state.getNumEigensProcessed() + 1);
state.setActivationDenominatorSquared(0);
state.setActivationNumerator(0);
state.getStatusProgress().clear();
numPasses = 0;
}
return state;
}
/**
* You have to start somewhere...
* TODO: start instead wherever you find a vector with maximum residual length after subtracting off the projection
* TODO: onto all previous eigenvectors.
*
* @param corpus the corpus matrix
* @param eigens not currently used, but should be (see above TODO)
* @return the index into the corpus where the "starting seed" input vector lies.
*/
private int getRandomStartingIndex(Matrix corpus, Matrix eigens) {
int index;
Vector v;
do {
double r = rng.nextDouble();
index = (int) (r * corpus.numRows());
v = corpus.viewRow(index);
} while (v == null || v.norm(2) == 0 || v.getNumNondefaultElements() < 5);
return index;
}
/**
* Uses the {@link SingularVectorVerifier } to check for convergence
*
* @param currentPseudoEigen the purported singular vector whose convergence is being checked
* @param corpus the corpus to check against
* @param state contains the previous eigens, various other solving state {@link TrainingState}
* @return true if another pass is needed, i.e. we have neither converged nor exceeded maxPassesPerEigen.
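*
* <p>Concretely (sketch, with an illustrative target): for convergenceTarget = 1e-6 the solver keeps
* iterating while {@code 1.0 - status.getCosAngle() > 1e-6} and no more than maxPassesPerEigen
* verifier results have been recorded.</p>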
*/
protected boolean hasNotConverged(Vector currentPseudoEigen,
Matrix corpus,
TrainingState state) {
numPasses++;
if (state.isFirstPass()) {
log.info("First pass through the corpus, no need to check convergence...");
return true;
}
Matrix previousEigens = state.getCurrentEigens();
log.info("Have made {} passes through the corpus, checking convergence...", numPasses);
/*
* Step 1: orthogonalize currentPseudoEigen by subtracting off eigen(i) * helper.get(i)
* Step 2: zero-out the helper vector because it has already helped.
*/
for (int i = 0; i < state.getNumEigensProcessed(); i++) {
Vector previousEigen = previousEigens.viewRow(i);
currentPseudoEigen.assign(previousEigen, new PlusMult(-state.getHelperVector().get(i)));
state.getHelperVector().set(i, 0);
}
if (currentPseudoEigen.norm(2) > 0) {
for (int i = 0; i < state.getNumEigensProcessed(); i++) {
Vector previousEigen = previousEigens.viewRow(i);
log.info("dot with previous: {}", previousEigen.dot(currentPseudoEigen) / currentPseudoEigen.norm(2));
}
}
/*
* Step 3: verify how eigen-like the prospective eigen is. This is potentially asynchronous.
*/
EigenStatus status = verify(corpus, currentPseudoEigen);
if (status.inProgress()) {
log.info("Verifier not finished, making another pass...");
} else {
log.info("Has 1 - cosAngle: {}, convergence target is: {}", 1.0 - status.getCosAngle(), convergenceTarget);
state.getStatusProgress().add(status);
}
return state.getStatusProgress().size() <= maxPassesPerEigen
&& 1.0 - status.getCosAngle() > convergenceTarget;
}
protected EigenStatus verify(Matrix corpus, Vector currentPseudoEigen) {
return verifier.verify(corpus, currentPseudoEigen);
}
public static void main(String[] args) throws IOException {
Properties props = new Properties();
String propertiesFile = args.length > 0 ? args[0] : "config/solver.properties";
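/*
 * The properties file is expected to define (paths and numbers below are illustrative):
 *   solver.input.dir=/path/to/corpus
 *   solver.output.dir=/path/to/output
 *   solver.output.desiredRank=50
 *   solver.convergence=1e-6
 *   solver.maxPasses=100
 */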
try (FileInputStream in = new FileInputStream(propertiesFile)) {
  props.load(in);
}
String corpusDir = props.getProperty("solver.input.dir");
String outputDir = props.getProperty("solver.output.dir");
if (corpusDir == null || corpusDir.isEmpty() || outputDir == null || outputDir.isEmpty()) {
log.error("{} must contain values for solver.input.dir and solver.output.dir", propertiesFile);
return;
}
//int inBufferSize = Integer.parseInt(props.getProperty("solver.input.bufferSize"));
int rank = Integer.parseInt(props.getProperty("solver.output.desiredRank"));
double convergence = Double.parseDouble(props.getProperty("solver.convergence"));
int maxPasses = Integer.parseInt(props.getProperty("solver.maxPasses"));
//int numThreads = Integer.parseInt(props.getProperty("solver.verifier.numThreads"));
HebbianUpdater updater = new HebbianUpdater();
SingularVectorVerifier verifier = new AsyncEigenVerifier();
HebbianSolver solver = new HebbianSolver(updater, verifier, convergence, maxPasses);
Matrix corpus = null;
/*
if (numThreads <= 1) {
// corpus = new DiskBufferedDoubleMatrix(new File(corpusDir), inBufferSize);
} else {
// corpus = new ParallelMultiplyingDiskBufferedDoubleMatrix(new File(corpusDir), inBufferSize, numThreads);
}
*/
long now = System.currentTimeMillis();
TrainingState finalState = solver.solve(corpus, rank);
long time = (System.currentTimeMillis() - now) / 1000;
log.info("Solved {} eigenVectors in {} seconds. Persisted to {}",
finalState.getCurrentEigens().rowSize(), time, outputDir);
}
}