
org.apache.mahout.math.hadoop.stochasticsvd.ABtDenseOutJob Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.hadoop.stochasticsvd;

import java.io.Closeable;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;
import java.util.regex.Matcher;

import com.google.common.collect.Lists;
import org.apache.commons.lang3.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep;

/**
 * Computes ABt products, then the first step of QR, which is pushed down to
 * the reducer.
 */
@SuppressWarnings("deprecation")
public final class ABtDenseOutJob {

  public static final String PROP_BT_PATH = "ssvd.Bt.path";
  public static final String PROP_BT_BROADCAST = "ssvd.Bt.broadcast";
  public static final String PROP_SB_PATH = "ssvdpca.sb.path";
  public static final String PROP_SQ_PATH = "ssvdpca.sq.path";
  public static final String PROP_XI_PATH = "ssvdpca.xi.path";

  private ABtDenseOutJob() {
  }

  /**
   * So, here, I preload the A block into memory.
   * <p>
   * A sparse matrix seems to be ideal for that, but there are two reasons why
   * I am not using it:
   * <ul>
   * <li>1) I don't know the full block height, so I may need to reallocate it
   * from time to time. Although this is probably not a showstopper.</li>
   * <li>2) I found that RandomAccessSparseVectors seem to take much more
   * memory than SequentialAccessSparseVectors.</li>
   * </ul>
   */
  public static class ABtMapper
      extends Mapper<Writable, VectorWritable, SplitPartitionedWritable, DenseBlockWritable> {

    private SplitPartitionedWritable outKey;
    private final Deque<Closeable> closeables = new ArrayDeque<>();
    private SequenceFileDirIterator<IntWritable, VectorWritable> btInput;
    private Vector[] aCols;
    private double[][] yiCols;
    private int aRowCount;
    private int kp;
    private int blockHeight;
    private boolean distributedBt;
    private Path[] btLocalPath;
    private Configuration localFsConfig;

    /*
     * xi and s_q are PCA-related corrections, per MAHOUT-817
     */
    protected Vector xi;
    protected Vector sq;

    @Override
    protected void map(Writable key, VectorWritable value, Context context)
      throws IOException, InterruptedException {

      Vector vec = value.get();

      int vecSize = vec.size();
      if (aCols == null) {
        aCols = new Vector[vecSize];
      } else if (aCols.length < vecSize) {
        aCols = Arrays.copyOf(aCols, vecSize);
      }

      if (vec.isDense()) {
        for (int i = 0; i < vecSize; i++) {
          extendAColIfNeeded(i, aRowCount + 1);
          aCols[i].setQuick(aRowCount, vec.getQuick(i));
        }
      } else if (vec.size() > 0) {
        for (Vector.Element vecEl : vec.nonZeroes()) {
          int i = vecEl.index();
          extendAColIfNeeded(i, aRowCount + 1);
          aCols[i].setQuick(aRowCount, vecEl.get());
        }
      }
      aRowCount++;
    }

    private void extendAColIfNeeded(int col, int rowCount) {
      if (aCols[col] == null) {
        aCols[col] =
          new SequentialAccessSparseVector(rowCount < blockHeight ? blockHeight
              : rowCount, 1);
      } else if (aCols[col].size() < rowCount) {
        Vector newVec =
          new SequentialAccessSparseVector(rowCount + blockHeight,
                                           aCols[col].getNumNondefaultElements() << 1);
        newVec.viewPart(0, aCols[col].size()).assign(aCols[col]);
        aCols[col] = newVec;
      }
    }
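    /*
     * The actual product is computed here, once the whole split of A has been
     * buffered column-wise by map(): for every pass we re-scan the B' input,
     * accumulate the dense block of A-rows-times-B' for the rows of that pass
     * in yiCols, and emit it keyed by the pass ordinal so the reducer can
     * restore row order within the split.
     */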
    @Override
    protected void cleanup(Context context) throws IOException,
      InterruptedException {
      try {

        yiCols = new double[kp][];
        for (int i = 0; i < kp; i++) {
          yiCols[i] = new double[Math.min(aRowCount, blockHeight)];
        }

        int numPasses = (aRowCount - 1) / blockHeight + 1;

        String propBtPathStr = context.getConfiguration().get(PROP_BT_PATH);
        Validate.notNull(propBtPathStr, "Bt input is not set");
        Path btPath = new Path(propBtPathStr);

        DenseBlockWritable dbw = new DenseBlockWritable();

        /*
         * so it turns out that it may be much more efficient to do a few
         * independent passes over Bt accumulating the entire block in memory
         * than pass huge amount of blocks out to combiner. so we aim of course
         * to fit entire s x (k+p) dense block in memory where s is the number
         * of A rows in this split. If A is much sparser than (k+p) avg # of
         * elements per row then the block may exceed the split size. if this
         * happens, and if the given blockHeight is not high enough to
         * accommodate this (because of memory constraints), then we start
         * splitting s into several passes. since computation is cpu-bound
         * anyway, it should be o.k. for supersparse inputs. (as ok it can be
         * that projection is thicker than the original anyway, why would one
         * use that many k+p then).
         */
        int lastRowIndex = -1;
        for (int pass = 0; pass < numPasses; pass++) {

          if (distributedBt) {
            btInput =
              new SequenceFileDirIterator<>(btLocalPath, true, localFsConfig);
          } else {
            btInput =
              new SequenceFileDirIterator<>(btPath,
                                            PathType.GLOB,
                                            null,
                                            null,
                                            true,
                                            context.getConfiguration());
          }
          closeables.addFirst(btInput);
          Validate.isTrue(btInput.hasNext(), "Empty B' input!");

          int aRowBegin = pass * blockHeight;
          int bh = Math.min(blockHeight, aRowCount - aRowBegin);

          /*
           * check if we need to trim block allocation
           */
          if (pass > 0) {
            if (bh == blockHeight) {
              for (int i = 0; i < kp; i++) {
                Arrays.fill(yiCols[i], 0.0);
              }
            } else {
              for (int i = 0; i < kp; i++) {
                yiCols[i] = null;
              }
              for (int i = 0; i < kp; i++) {
                yiCols[i] = new double[bh];
              }
            }
          }

          while (btInput.hasNext()) {
            Pair<IntWritable, VectorWritable> btRec = btInput.next();
            int btIndex = btRec.getFirst().get();
            Vector btVec = btRec.getSecond().get();
            Vector aCol;
            if (btIndex >= aCols.length || (aCol = aCols[btIndex]) == null
                || aCol.size() == 0) {

              /* 100% zero A column in the block, skip it as sparse */
              continue;
            }
            int j = -1;
            for (Vector.Element aEl : aCol.nonZeroes()) {
              j = aEl.index();

              /*
               * now we compute only swathes between aRowBegin..aRowBegin+bh
               * exclusive. it seems like a deficiency but in fact i think it
               * will balance itself out: either A is dense and then we
               * shouldn't have more than one pass and therefore filter
               * conditions will never kick in. Or, the only situation where we
               * can't fit Y_i block in memory is when A input is much sparser
               * than k+p per row. But if this is the case, then we'd be looking
               * at very few elements without engaging them in any operations
               * so even then it should be ok.
               */
              if (j < aRowBegin) {
                continue;
              }
              if (j >= aRowBegin + bh) {
                break;
              }

              /*
               * assume btVec is dense
               */
              if (xi != null) {
                /*
                 * MAHOUT-817: PCA correction for B'. I rewrite the whole
                 * computation loop so i don't have to check if PCA correction
                 * is needed at individual element level. It looks bulkier this
                 * way but perhaps less wasteful on cpu.
                 */
                for (int s = 0; s < kp; s++) {
                  // code defensively against shortened xi
                  double xii = xi.size() > btIndex ? xi.get(btIndex) : 0.0;
                  yiCols[s][j - aRowBegin] +=
                    aEl.get() * (btVec.getQuick(s) - xii * sq.get(s));
                }
              } else {
                /*
                 * no PCA correction
                 */
                for (int s = 0; s < kp; s++) {
                  yiCols[s][j - aRowBegin] += aEl.get() * btVec.getQuick(s);
                }
              }

            }
            if (lastRowIndex < j) {
              lastRowIndex = j;
            }
          }

          /*
           * so now we have stuff in yi
           */
          dbw.setBlock(yiCols);
          outKey.setTaskItemOrdinal(pass);
          context.write(outKey, dbw);

          closeables.remove(btInput);
          btInput.close();
        }

      } finally {
        IOUtils.close(closeables);
      }
    }

    @Override
    protected void setup(Context context) throws IOException,
      InterruptedException {

      Configuration conf = context.getConfiguration();

      int k = Integer.parseInt(conf.get(QRFirstStep.PROP_K));
      int p = Integer.parseInt(conf.get(QRFirstStep.PROP_P));
      kp = k + p;

      outKey = new SplitPartitionedWritable(context);

      blockHeight = conf.getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, -1);

      distributedBt = conf.get(PROP_BT_BROADCAST) != null;
      if (distributedBt) {
        btLocalPath = HadoopUtil.getCachedFiles(conf);
        localFsConfig = new Configuration();
        localFsConfig.set("fs.default.name", "file:///");
      }

      /*
       * PCA-related corrections (MAHOUT-817)
       */
      String xiPathStr = conf.get(PROP_XI_PATH);
      if (xiPathStr != null) {
        xi = SSVDHelper.loadAndSumUpVectors(new Path(xiPathStr), conf);
        sq =
          SSVDHelper.loadAndSumUpVectors(new Path(conf.get(PROP_SQ_PATH)), conf);
      }
    }
  }
  /**
   * QR first step pushed down to the reducer.
   */
  public static class QRReducer
      extends Reducer<SplitPartitionedWritable, DenseBlockWritable, SplitPartitionedWritable, VectorWritable> {

    /*
     * HACK: partition number formats in hadoop, copied. this may stop working
     * if it gets out of sync with newer hadoop version. But unfortunately
     * rules of forming output file names are not sufficiently exposed so we
     * need to hack it if we write the same split output from either mapper or
     * reducer. alternatively, we probably can replace it by our own output
     * file naming management completely and bypass MultipleOutputs entirely.
     */
    private static final NumberFormat NUMBER_FORMAT =
      NumberFormat.getInstance();
    static {
      NUMBER_FORMAT.setMinimumIntegerDigits(5);
      NUMBER_FORMAT.setGroupingUsed(false);
    }

    private final Deque<Closeable> closeables = Lists.newLinkedList();

    protected int blockHeight;
    protected int lastTaskId = -1;

    protected OutputCollector<Writable, DenseBlockWritable> qhatCollector;
    protected OutputCollector<Writable, VectorWritable> rhatCollector;
    protected QRFirstStep qr;
    protected Vector yiRow;
    protected Vector sb;

    @Override
    protected void setup(Context context) throws IOException,
      InterruptedException {
      Configuration conf = context.getConfiguration();
      blockHeight = conf.getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, -1);

      String sbPathStr = conf.get(PROP_SB_PATH);

      /*
       * PCA-related corrections (MAHOUT-817)
       */
      if (sbPathStr != null) {
        sb = SSVDHelper.loadAndSumUpVectors(new Path(sbPathStr), conf);
      }
    }

    protected void setupBlock(Context context, SplitPartitionedWritable spw)
      throws InterruptedException, IOException {
      IOUtils.close(closeables);
      qhatCollector =
        createOutputCollector(QJob.OUTPUT_QHAT,
                              spw,
                              context,
                              DenseBlockWritable.class);
      rhatCollector =
        createOutputCollector(QJob.OUTPUT_RHAT,
                              spw,
                              context,
                              VectorWritable.class);
      qr =
        new QRFirstStep(context.getConfiguration(),
                        qhatCollector,
                        rhatCollector);
      closeables.addFirst(qr);
      lastTaskId = spw.getTaskId();
    }
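    /*
     * Each incoming value is one dense Y block written by a single ABtMapper
     * pass. The rows of the block are replayed one at a time into QRFirstStep,
     * which streams out the QHat and RHat outputs for this split, applying the
     * optional PCA offset (sb) on the way.
     */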
    @Override
    protected void reduce(SplitPartitionedWritable key,
                          Iterable<DenseBlockWritable> values,
                          Context context) throws IOException,
      InterruptedException {

      if (key.getTaskId() != lastTaskId) {
        setupBlock(context, key);
      }

      Iterator<DenseBlockWritable> iter = values.iterator();
      DenseBlockWritable dbw = iter.next();
      double[][] yiCols = dbw.getBlock();
      if (iter.hasNext()) {
        throw new IOException("Unexpected extra Y_i block in reducer input.");
      }

      long blockBase = key.getTaskItemOrdinal() * blockHeight;
      int bh = yiCols[0].length;
      if (yiRow == null) {
        yiRow = new DenseVector(yiCols.length);
      }

      for (int k = 0; k < bh; k++) {
        for (int j = 0; j < yiCols.length; j++) {
          yiRow.setQuick(j, yiCols[j][k]);
        }

        key.setTaskItemOrdinal(blockBase + k);

        // pca offset correction if any
        if (sb != null) {
          yiRow.assign(sb, Functions.MINUS);
        }

        qr.collect(key, yiRow);
      }
    }

    private Path getSplitFilePath(String name,
                                  SplitPartitionedWritable spw,
                                  Context context) throws InterruptedException,
      IOException {
      String uniqueFileName = FileOutputFormat.getUniqueFile(context, name, "");
      uniqueFileName = uniqueFileName.replaceFirst("-r-", "-m-");
      uniqueFileName =
        uniqueFileName.replaceFirst("\\d+$",
                                    Matcher.quoteReplacement(NUMBER_FORMAT.format(spw.getTaskId())));
      return new Path(FileOutputFormat.getWorkOutputPath(context),
                      uniqueFileName);
    }

    /**
     * key doesn't matter here, only value does. key always gets substituted by
     * SPW.
     *
     * @param <K>
     *          bogus
     */
    private <K, V> OutputCollector<K, V> createOutputCollector(String name,
                                                               final SplitPartitionedWritable spw,
                                                               Context ctx,
                                                               Class<V> valueClass)
      throws IOException, InterruptedException {
      Path outputPath = getSplitFilePath(name, spw, ctx);
      final SequenceFile.Writer w =
        SequenceFile.createWriter(FileSystem.get(outputPath.toUri(),
                                                 ctx.getConfiguration()),
                                  ctx.getConfiguration(),
                                  outputPath,
                                  SplitPartitionedWritable.class,
                                  valueClass);
      closeables.addFirst(w);
      return new OutputCollector<K, V>() {
        @Override
        public void collect(K key, V val) throws IOException {
          w.append(spw, val);
        }
      };
    }

    @Override
    protected void cleanup(Context context) throws IOException,
      InterruptedException {
      IOUtils.close(closeables);
    }
  }

  public static void run(Configuration conf,
                         Path[] inputAPaths,
                         Path inputBtGlob,
                         Path xiPath,
                         Path sqPath,
                         Path sbPath,
                         Path outputPath,
                         int aBlockRows,
                         int minSplitSize,
                         int k,
                         int p,
                         int outerProdBlockHeight,
                         int numReduceTasks,
                         boolean broadcastBInput)
    throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtDenseOutJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
      FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job,
                                                      CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(DenseBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
                                  outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
      job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
      job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
      job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.setNumReduceTasks(numReduceTasks);

    // broadcast Bt files if required.
    if (broadcastBInput) {
      job.getConfiguration().set(PROP_BT_BROADCAST, "y");

      FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
      FileStatus[] fstats = fs.globStatus(inputBtGlob);
      if (fstats != null) {
        for (FileStatus fstat : fstats) {
          /*
           * new api is not enabled yet in our dependencies at this time, still
           * using deprecated one
           */
          DistributedCache.addCacheFile(fstat.getPath().toUri(),
                                        job.getConfiguration());
        }
      }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
      throw new IOException("ABt job unsuccessful.");
    }
  }

}
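For orientation, the sketch below shows roughly how a driver might invoke this job. It is not part of the Mahout source above: the class name, paths and numeric parameters (k, p, block sizes, reducer count) are illustrative assumptions; only the ABtDenseOutJob.run(...) signature it calls is taken from the listing.

// Hypothetical driver sketch; all paths and values below are placeholders.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.math.hadoop.stochasticsvd.ABtDenseOutJob;

public class ABtDenseOutJobDriverSketch {
  public static void main(String[] args)
    throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = new Configuration();
    Path[] aPaths = { new Path("/ssvd/A") };           // assumed A input
    Path btGlob = new Path("/ssvd/Bt-job/part-*");     // assumed B' output glob
    Path outPath = new Path("/ssvd/ABt-job");          // assumed output dir

    // No PCA corrections in this sketch, so the xi/sq/sb paths are null.
    ABtDenseOutJob.run(conf,
                       aPaths,
                       btGlob,
                       null,        // xiPath
                       null,        // sqPath
                       null,        // sbPath
                       outPath,
                       10000,       // aBlockRows
                       -1,          // minSplitSize (disabled)
                       100,         // k
                       15,          // p
                       30000,       // outerProdBlockHeight
                       2,           // numReduceTasks
                       false);      // do not broadcast Bt via DistributedCache
  }
}

As the run() method above shows, the xi/sq/sb paths are only set on the job configuration when a PCA mean correction is requested, so passing null simply skips those corrections.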




