All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.math.hadoop.stochasticsvd.qr;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.iterator.CopyConstructorIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.stochasticsvd.DenseBlockWritable;
import org.apache.mahout.math.UpperTriangular;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;

/**
 * QR first step without MR abstractions, expressed purely in terms of
 * iterators and collectors (although Collector is probably an outdated api).
 * <p>
 * Rows are pushed in through {@link #collect(Writable, Vector)}. They are held
 * in a look-ahead buffer of up to {@code k + p} rows before being appended to a
 * {@link GivensThinSolver}. Every time the solver fills up, its Q<sup>t</sup>
 * block is spilled to a temporary local sequence file and its R is kept in
 * memory. On {@link #close()} the remaining rows are flushed and the resulting
 * Q-hat blocks and the single merged R are emitted to the two collectors passed
 * to the constructor.
 */
@SuppressWarnings("deprecation")
public class QRFirstStep implements Closeable, OutputCollector<Writable, Vector> {

  /** Configuration property holding {@code k}; summed with {@code p} to give the row width. */
  public static final String PROP_K = "ssvd.k";
  /** Configuration property holding {@code p}; summed with {@code k} to give the row width. */
  public static final String PROP_P = "ssvd.p";
  /** Configuration property holding the row count the solver is created with. */
  public static final String PROP_AROWBLOCK_SIZE = "ssvd.arowblock.size";

  /** {@code k + p}: width of every buffered row and capacity of the look-ahead buffer. */
  private int kp;
  /** Rows received but not yet handed to the solver (at most {@code kp} of them). */
  private List<double[]> yLookahead;
  private GivensThinSolver qSolver;
  /** Number of solver blocks produced so far. */
  private int blockCnt;
  /** Reusable writable used for both temp-file I/O and Q-hat output. */
  private final DenseBlockWritable value = new DenseBlockWritable();
  private final Writable tempKey = new IntWritable();
  private MultipleOutputs outputs;
  /** Resources released, in deque order, at the end of {@link #cleanup()}. */
  private final Deque<Closeable> closeables = Lists.newLinkedList();
  /** Lazily created writer for spilled Q blocks; see {@link #getTempQw()}. */
  private SequenceFile.Writer tempQw;
  private Path tempQPath;
  /** R factors of the blocks flushed so far, in flush order. */
  private final List<UpperTriangular> rSubseq = Lists.newArrayList();
  private final Configuration jobConf;

  private final OutputCollector<? super Writable, ? super DenseBlockWritable> qtHatOut;
  private final OutputCollector<? super Writable, ? super VectorWritable> rHatOut;

  /**
   * Creates the step and immediately runs {@link #setup()}.
   *
   * @param jobConf  configuration that must define {@link #PROP_K},
   *                 {@link #PROP_P} and {@link #PROP_AROWBLOCK_SIZE} as integers
   * @param qtHatOut receives the Q-hat blocks (keyed by {@link NullWritable})
   * @param rHatOut  receives the final R as a dense vector (keyed by
   *                 {@link NullWritable})
   */
  public QRFirstStep(Configuration jobConf,
                     OutputCollector<? super Writable, ? super DenseBlockWritable> qtHatOut,
                     OutputCollector<? super Writable, ? super VectorWritable> rHatOut) {
    this.jobConf = jobConf;
    this.qtHatOut = qtHatOut;
    this.rHatOut = rHatOut;
    // NOTE(review): setup() is protected and therefore overridable; an override
    // would run before the subclass constructor body.
    setup();
  }

  /**
   * Flushes all pending rows, emits the results and releases resources by
   * delegating to {@link #cleanup()}.
   *
   * @throws IOException if flushing or closing fails
   */
  @Override
  public void close() throws IOException {
    cleanup();
  }

  /**
   * @return {@code k + p} as computed in {@link #setup()}
   */
  public int getKP() {
    return kp;
  }

  /**
   * Moves the current solver block out of memory: remembers its R, appends its
   * thin Q<sup>t</sup> to the temp sequence file, then resets the solver.
   */
  private void flushSolver() throws IOException {
    UpperTriangular r = qSolver.getRTilde();
    double[][] qt = qSolver.getThinQtTilde();

    rSubseq.add(r);

    value.setBlock(qt);
    getTempQw().append(tempKey, value);

    /*
     * this probably should be a sparse row matrix, but compressor should get it
     * for disk and in memory we want it dense anyway, sparse random
     * implementations would be mostly a memory management disaster consisting
     * of rehashes and GC thrashing. (IMHO)
     */
    // drop the reference to the block that was just written
    value.setBlock(null);
    qSolver.reset();
  }

  /**
   * Emits the final Q-hat block(s) and R. With a single block everything is
   * taken straight from the solver; otherwise a second pass over the spilled
   * blocks runs a modified version of computeQHatSequence.
   */
  private void flushQBlocks() throws IOException {
    if (blockCnt == 1) {
      /*
       * only one block, no temp file, no second pass. should be the default
       * mode for efficiency in most cases. Sure mapper should be able to load
       * the entire split in memory -- and we don't require even that.
       */
      value.setBlock(qSolver.getThinQtTilde());
      outputQHat(value);
      outputR(new VectorWritable(new DenseVector(qSolver.getRTilde().getData(),
                                                 true)));

    } else {
      secondPass();
    }
  }

  /** Sends one Q-hat block to the Q collector under a {@link NullWritable} key. */
  private void outputQHat(DenseBlockWritable value) throws IOException {
    qtHatOut.collect(NullWritable.get(), value);
  }

  /** Sends the R vector to the R collector under a {@link NullWritable} key. */
  private void outputR(VectorWritable value) throws IOException {
    rHatOut.collect(NullWritable.get(), value);
  }

  /**
   * Re-reads the spilled Q blocks from the temp file, converts each one with
   * {@link GivensThinSolver#computeQtHat} against the collected R sequence and
   * emits it, then emits the single R that remains after merging.
   */
  private void secondPass() throws IOException {
    qSolver = null; // release mem
    FileSystem localFs = FileSystem.getLocal(jobConf);
    SequenceFile.Reader tempQr =
      new SequenceFile.Reader(localFs, tempQPath, jobConf);
    // added first so that it precedes the temp-file deleter in the deque
    closeables.addFirst(tempQr);
    int qCnt = 0;
    while (tempQr.next(tempKey, value)) {
      value
        .setBlock(GivensThinSolver.computeQtHat(value.getBlock(),
                                                qCnt,
                                                new CopyConstructorIterator<>(rSubseq.iterator())));
      if (qCnt == 1) {
        /*
         * just merge r[0] <- r[1] so it doesn't have to repeat in subsequent
         * computeQHat iterators. qCnt deliberately stays at 1 from here on,
         * because the list shrinks by one element on every such iteration.
         */
        GivensThinSolver.mergeR(rSubseq.get(0), rSubseq.remove(1));
      } else {
        qCnt++;
      }
      outputQHat(value);
    }

    assert rSubseq.size() == 1;

    outputR(new VectorWritable(new DenseVector(rSubseq.get(0).getData(), true)));

  }

  /**
   * Accepts one incoming row. Once the look-ahead buffer holds {@code kp} rows,
   * the oldest buffered row is moved into the solver (flushing the solver
   * first if it is full) before the new row is buffered.
   *
   * @param incomingYRow row to add; its first {@code kp} elements are read when
   *                     dense, its non-zero elements when sparse
   * @throws IOException if a solver flush fails
   */
  protected void map(Vector incomingYRow) throws IOException {
    double[] yRow;
    if (yLookahead.size() == kp) {
      if (qSolver.isFull()) {

        flushSolver();
        blockCnt++;

      }
      yRow = yLookahead.remove(0);

      qSolver.appendRow(yRow);
      // The array just handed to the solver is reused below as the buffer for
      // the incoming row (assumes appendRow does not keep a reference to it --
      // TODO confirm against GivensThinSolver).
    } else {
      yRow = new double[kp];
    }

    if (incomingYRow.isDense()) {
      for (int i = 0; i < kp; i++) {
        yRow[i] = incomingYRow.get(i);
      }
    } else {
      // the buffer may be recycled, so clear stale values before scattering
      Arrays.fill(yRow, 0);
      for (Element yEl : incomingYRow.nonZeroes()) {
        yRow[yEl.index()] = yEl.get();
      }
    }

    yLookahead.add(yRow);
  }

  /**
   * Reads {@link #PROP_AROWBLOCK_SIZE}, {@link #PROP_K} and {@link #PROP_P}
   * from the configuration and creates the look-ahead buffer, the solver and
   * the {@link MultipleOutputs} instance (registered for closing).
   *
   * @throws NumberFormatException if any of the three properties is missing or
   *                               not an integer
   */
  protected void setup() {

    int r = Integer.parseInt(jobConf.get(PROP_AROWBLOCK_SIZE));
    int k = Integer.parseInt(jobConf.get(PROP_K));
    int p = Integer.parseInt(jobConf.get(PROP_P));
    kp = k + p;

    yLookahead = Lists.newArrayListWithCapacity(kp);
    qSolver = new GivensThinSolver(r, kp);
    outputs = new MultipleOutputs(new JobConf(jobConf));
    closeables.addFirst(new Closeable() {
      @Override
      public void close() throws IOException {
        outputs.close();
      }
    });

  }

  /**
   * Drains the look-ahead buffer into the solver, emits all Q-hat blocks and
   * the final R, and closes every registered resource even if emission fails.
   *
   * @throws IOException if flushing, emitting or closing fails
   */
  protected void cleanup() throws IOException {
    try {
      // nothing buffered and no solver: there is nothing left to emit
      if (qSolver == null && yLookahead.isEmpty()) {
        return;
      }
      if (qSolver == null) {
        qSolver = new GivensThinSolver(yLookahead.size(), kp);
      }
      // grow q solver up if necessary

      qSolver.adjust(qSolver.getCnt() + yLookahead.size());
      while (!yLookahead.isEmpty()) {

        qSolver.appendRow(yLookahead.remove(0));

      }
      assert qSolver.isFull();
      if (++blockCnt > 1) {
        // multi-block case: spill the last block too, then finish the temp
        // file so the second pass can read it back
        flushSolver();
        assert tempQw != null;
        closeables.remove(tempQw);
        Closeables.close(tempQw, false);
      }
      flushQBlocks();

    } finally {
      IOUtils.close(closeables);
    }

  }

  /**
   * Returns the writer for spilled Q blocks, creating it on first use under
   * {@code java.io.tmpdir} and registering both the writer and a
   * delete-on-close hook for the file.
   */
  private SequenceFile.Writer getTempQw() throws IOException {
    if (tempQw == null) {
      /*
       * temporary Q output hopefully will not exceed size of IO cache in which
       * case it is only good since it is going to be managed by kernel, not
       * java GC. And if IO cache is not good enough, then at least it is always
       * sequential.
       */
      String taskTmpDir = System.getProperty("java.io.tmpdir");

      FileSystem localFs = FileSystem.getLocal(jobConf);
      Path parent = new Path(taskTmpDir);
      Path sub = new Path(parent, "qw_" + System.currentTimeMillis());
      tempQPath = new Path(sub, "q-temp.seq");
      tempQw =
        SequenceFile.createWriter(localFs,
                                  jobConf,
                                  tempQPath,
                                  IntWritable.class,
                                  DenseBlockWritable.class,
                                  CompressionType.BLOCK);
      closeables.addFirst(tempQw);
      closeables.addFirst(new IOUtils.DeleteFileOnClose(new File(tempQPath
        .toString())));
    }
    return tempQw;
  }

  /**
   * Collector entry point; the key is ignored and the vector is passed to
   * {@link #map(Vector)}.
   *
   * @param key ignored
   * @param vw  row to add
   * @throws IOException if a solver flush fails
   */
  @Override
  public void collect(Writable key, Vector vw) throws IOException {
    map(vw);
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy