/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop.stochasticsvd;
import java.io.Closeable;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;
import java.util.regex.Matcher;
import com.google.common.collect.Lists;
import org.apache.commons.lang3.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep;
/**
* Computes A * B' (ABt) products and then the first step of QR, which is
* pushed down to the reducer.
*/
@SuppressWarnings("deprecation")
public final class ABtDenseOutJob {
public static final String PROP_BT_PATH = "ssvd.Bt.path";
public static final String PROP_BT_BROADCAST = "ssvd.Bt.broadcast";
public static final String PROP_SB_PATH = "ssvdpca.sb.path";
public static final String PROP_SQ_PATH = "ssvdpca.sq.path";
public static final String PROP_XI_PATH = "ssvdpca.xi.path";
private ABtDenseOutJob() {
}
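/*
* A minimal usage sketch, assuming hypothetical paths and parameter values
* (in the SSVD pipeline this job is normally driven by the solver):
*
*   Configuration conf = new Configuration();
*   ABtDenseOutJob.run(conf,
*                      new Path[] {new Path("/tmp/ssvd/A")}, // A input
*                      new Path("/tmp/ssvd/Bt-job/part-*"),  // B' glob
*                      null, null, null,                     // no PCA paths
*                      new Path("/tmp/ssvd/ABt-job"),        // output
*                      100000,                               // aBlockRows
*                      -1,                                   // minSplitSize
*                      10, 15,                               // k, p
*                      200000,                               // outerProdBlockHeight
*                      1,                                    // numReduceTasks
*                      false);                               // broadcastBInput
*/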
/**
* Here we preload the A block into memory.
*
* A sparse matrix type seems ideal for that, but there are two reasons not to
* use one directly:
*
* - 1) The full block height is not known up front, so the block may need to
* be reallocated from time to time. This is probably not a showstopper.
*
* - 2) RandomAccessSparseVectors seem to take much more memory than
* SequentialAccessSparseVectors.
*/
public static class ABtMapper
extends
Mapper<Writable, VectorWritable, SplitPartitionedWritable, DenseBlockWritable> {
private SplitPartitionedWritable outKey;
private final Deque<Closeable> closeables = new ArrayDeque<>();
private SequenceFileDirIterator<IntWritable, VectorWritable> btInput;
private Vector[] aCols;
private double[][] yiCols;
private int aRowCount;
private int kp;
private int blockHeight;
private boolean distributedBt;
private Path[] btLocalPath;
private Configuration localFsConfig;
/*
* xi and s_q are PCA-related corrections, per MAHOUT-817
*/
protected Vector xi;
protected Vector sq;
@Override
protected void map(Writable key, VectorWritable value, Context context)
throws IOException, InterruptedException {
Vector vec = value.get();
int vecSize = vec.size();
if (aCols == null) {
aCols = new Vector[vecSize];
} else if (aCols.length < vecSize) {
aCols = Arrays.copyOf(aCols, vecSize);
}
if (vec.isDense()) {
for (int i = 0; i < vecSize; i++) {
extendAColIfNeeded(i, aRowCount + 1);
aCols[i].setQuick(aRowCount, vec.getQuick(i));
}
} else if (vec.size() > 0) {
for (Vector.Element vecEl : vec.nonZeroes()) {
int i = vecEl.index();
extendAColIfNeeded(i, aRowCount + 1);
aCols[i].setQuick(aRowCount, vecEl.get());
}
}
aRowCount++;
}
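/*
* Illustrative sketch of the layout built by map() (hypothetical numbers):
* for a split containing two rows of a 3-column A,
*
*   row 0: (1.0, 0.0, 2.0)
*   row 1: (0.0, 3.0, 0.0)
*
* aCols ends up holding one SequentialAccessSparseVector per A column:
*
*   aCols[0] = {0: 1.0}
*   aCols[1] = {1: 3.0}
*   aCols[2] = {0: 2.0}
*
* i.e. A is stored transposed, keyed by column, so that the cleanup pass can
* stream B' rows and multiply each one against the matching A column.
*/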
private void extendAColIfNeeded(int col, int rowCount) {
if (aCols[col] == null) {
aCols[col] =
new SequentialAccessSparseVector(rowCount < blockHeight ? blockHeight
: rowCount, 1);
} else if (aCols[col].size() < rowCount) {
Vector newVec =
new SequentialAccessSparseVector(rowCount + blockHeight,
aCols[col].getNumNondefaultElements() << 1);
newVec.viewPart(0, aCols[col].size()).assign(aCols[col]);
aCols[col] = newVec;
}
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
try {
yiCols = new double[kp][];
for (int i = 0; i < kp; i++) {
yiCols[i] = new double[Math.min(aRowCount, blockHeight)];
}
int numPasses = (aRowCount - 1) / blockHeight + 1;
String propBtPathStr = context.getConfiguration().get(PROP_BT_PATH);
Validate.notNull(propBtPathStr, "Bt input is not set");
Path btPath = new Path(propBtPathStr);
DenseBlockWritable dbw = new DenseBlockWritable();
/*
* It turns out to be much more efficient to do a few independent passes
* over Bt, accumulating the entire block in memory, than to push a huge
* number of blocks out to the combiner. So we aim to fit the entire
* s x (k+p) dense block in memory, where s is the number of A rows in this
* split. If A is much sparser than (k+p) average nonzero elements per row,
* the dense block may exceed the split size; if that happens and the given
* blockHeight is not high enough to accommodate it (because of memory
* constraints), we start splitting s into several passes. Since the
* computation is CPU-bound anyway, this should be acceptable for
* super-sparse inputs (and if the projection is thicker than the original
* input anyway, there is little reason to use that large a k+p).
*/
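/*
* A quick worked example (hypothetical numbers): with aRowCount = 250000,
* blockHeight = 100000 and k+p = 125, numPasses = (250000 - 1) / 100000 + 1
* = 3; passes 0 and 1 cover 100000 rows each and pass 2 covers the remaining
* 50000. Each pass holds a bh x (k+p) dense block of doubles, i.e. at most
* 100000 * 125 * 8 bytes = ~100 MB, regardless of how many rows the split
* actually has.
*/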
int lastRowIndex = -1;
for (int pass = 0; pass < numPasses; pass++) {
if (distributedBt) {
btInput =
new SequenceFileDirIterator<>(btLocalPath, true, localFsConfig);
} else {
btInput =
new SequenceFileDirIterator<>(btPath, PathType.GLOB, null, null, true, context.getConfiguration());
}
closeables.addFirst(btInput);
Validate.isTrue(btInput.hasNext(), "Empty B' input!");
int aRowBegin = pass * blockHeight;
int bh = Math.min(blockHeight, aRowCount - aRowBegin);
/*
* check if we need to trim block allocation
*/
if (pass > 0) {
if (bh == blockHeight) {
for (int i = 0; i < kp; i++) {
Arrays.fill(yiCols[i], 0.0);
}
} else {
for (int i = 0; i < kp; i++) {
yiCols[i] = null;
}
for (int i = 0; i < kp; i++) {
yiCols[i] = new double[bh];
}
}
}
while (btInput.hasNext()) {
Pair<IntWritable, VectorWritable> btRec = btInput.next();
int btIndex = btRec.getFirst().get();
Vector btVec = btRec.getSecond().get();
Vector aCol;
if (btIndex >= aCols.length || (aCol = aCols[btIndex]) == null
|| aCol.size() == 0) {
/* 100% zero A column in this split, skip it */
continue;
}
int j = -1;
for (Vector.Element aEl : aCol.nonZeroes()) {
j = aEl.index();
/*
* We compute only the swath of rows between aRowBegin (inclusive) and
* aRowBegin + bh (exclusive). That may look like a deficiency, but it
* should balance itself out: either A is dense, in which case there is
* only one pass and these filter conditions never kick in, or the Y_i
* block does not fit in memory because the A input is much sparser than
* k+p elements per row, in which case we are only looking at very few
* elements without engaging them in any operations, so even then it
* should be fine.
*/
if (j < aRowBegin) {
continue;
}
if (j >= aRowBegin + bh) {
break;
}
/*
* assume btVec is dense
*/
if (xi != null) {
/*
* MAHOUT-817: PCA correction for B'. The whole computation loop is
* duplicated so we don't have to check whether the PCA correction is
* needed at the individual element level. It looks bulkier this way but
* is less wasteful on CPU.
*/
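/*
* Written out per element (a sketch of the same update), with l = btIndex:
*
*   yiCols[s][j - aRowBegin] += A[j][l] * (B'[l][s] - xi[l] * sq[s])
*
* which, summed over l, amounts to Y_i = A * B' - (A * xi) * sq', i.e. the
* PCA correction is folded into the accumulation instead of being applied
* in a separate pass.
*/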
for (int s = 0; s < kp; s++) {
// code defensively against shortened xi
double xii = xi.size() > btIndex ? xi.get(btIndex) : 0.0;
yiCols[s][j - aRowBegin] +=
aEl.get() * (btVec.getQuick(s) - xii * sq.get(s));
}
} else {
/*
* no PCA correction
*/
for (int s = 0; s < kp; s++) {
yiCols[s][j - aRowBegin] += aEl.get() * btVec.getQuick(s);
}
}
}
if (lastRowIndex < j) {
lastRowIndex = j;
}
}
/*
* The Y_i block for this pass is now fully accumulated; emit it.
*/
dbw.setBlock(yiCols);
outKey.setTaskItemOrdinal(pass);
context.write(outKey, dbw);
closeables.remove(btInput);
btInput.close();
}
} finally {
IOUtils.close(closeables);
}
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
int k = Integer.parseInt(conf.get(QRFirstStep.PROP_K));
int p = Integer.parseInt(conf.get(QRFirstStep.PROP_P));
kp = k + p;
outKey = new SplitPartitionedWritable(context);
blockHeight = conf.getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, -1);
distributedBt = conf.get(PROP_BT_BROADCAST) != null;
if (distributedBt) {
btLocalPath = HadoopUtil.getCachedFiles(conf);
localFsConfig = new Configuration();
localFsConfig.set("fs.default.name", "file:///");
}
/*
* PCA-related corrections (MAHOUT-817)
*/
String xiPathStr = conf.get(PROP_XI_PATH);
if (xiPathStr != null) {
xi = SSVDHelper.loadAndSumUpVectors(new Path(xiPathStr), conf);
sq =
SSVDHelper.loadAndSumUpVectors(new Path(conf.get(PROP_SQ_PATH)), conf);
}
}
}
/**
* QR first step pushed down to the reducer.
*/
public static class QRReducer
extends Reducer<SplitPartitionedWritable, DenseBlockWritable, SplitPartitionedWritable, VectorWritable> {
/*
* HACK: the partition number format is copied from Hadoop. This may stop
* working if it gets out of sync with a newer Hadoop version, but
* unfortunately the rules for forming output file names are not
* sufficiently exposed, so we need to hack them if we want to write the
* same split output from either the mapper or the reducer. Alternatively,
* we could replace this with our own output file naming management
* entirely and bypass MultipleOutputs altogether.
*/
private static final NumberFormat NUMBER_FORMAT =
NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
private final Deque<Closeable> closeables = Lists.newLinkedList();
protected int blockHeight;
protected int lastTaskId = -1;
protected OutputCollector<Writable, DenseBlockWritable> qhatCollector;
protected OutputCollector<Writable, VectorWritable> rhatCollector;
protected QRFirstStep qr;
protected Vector yiRow;
protected Vector sb;
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
blockHeight = conf.getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, -1);
String sbPathStr = conf.get(PROP_SB_PATH);
/*
* PCA-related corrections (MAHOUT-817)
*/
if (sbPathStr != null) {
sb = SSVDHelper.loadAndSumUpVectors(new Path(sbPathStr), conf);
}
}
protected void setupBlock(Context context, SplitPartitionedWritable spw)
throws InterruptedException, IOException {
IOUtils.close(closeables);
qhatCollector =
createOutputCollector(QJob.OUTPUT_QHAT,
spw,
context,
DenseBlockWritable.class);
rhatCollector =
createOutputCollector(QJob.OUTPUT_RHAT,
spw,
context,
VectorWritable.class);
qr =
new QRFirstStep(context.getConfiguration(),
qhatCollector,
rhatCollector);
closeables.addFirst(qr);
lastTaskId = spw.getTaskId();
}
@Override
protected void reduce(SplitPartitionedWritable key,
Iterable<DenseBlockWritable> values,
Context context) throws IOException,
InterruptedException {
if (key.getTaskId() != lastTaskId) {
setupBlock(context, key);
}
Iterator<DenseBlockWritable> iter = values.iterator();
DenseBlockWritable dbw = iter.next();
double[][] yiCols = dbw.getBlock();
if (iter.hasNext()) {
throw new IOException("Unexpected extra Y_i block in reducer input.");
}
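/*
* Each mapper emits one dense Y_i block per pass, keyed by the pass number
* (the task item ordinal), so blockBase is the index of the first A row of
* this block within the originating split.
*/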
long blockBase = key.getTaskItemOrdinal() * blockHeight;
int bh = yiCols[0].length;
if (yiRow == null) {
yiRow = new DenseVector(yiCols.length);
}
for (int k = 0; k < bh; k++) {
for (int j = 0; j < yiCols.length; j++) {
yiRow.setQuick(j, yiCols[j][k]);
}
key.setTaskItemOrdinal(blockBase + k);
// pca offset correction if any
if (sb != null) {
yiRow.assign(sb, Functions.MINUS);
}
qr.collect(key, yiRow);
}
}
private Path getSplitFilePath(String name,
SplitPartitionedWritable spw,
Context context) throws InterruptedException,
IOException {
String uniqueFileName = FileOutputFormat.getUniqueFile(context, name, "");
uniqueFileName = uniqueFileName.replaceFirst("-r-", "-m-");
uniqueFileName =
uniqueFileName.replaceFirst("\\d+$",
Matcher.quoteReplacement(NUMBER_FORMAT.format(spw.getTaskId())));
return new Path(FileOutputFormat.getWorkOutputPath(context),
uniqueFileName);
}
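/*
* Illustrative sketch of the renaming above (task numbers are hypothetical):
* getUniqueFile() might return something like "QHat-r-00003" for this
* reducer; the "-r-" marker is rewritten to "-m-" and the trailing digits
* are replaced with the originating map task id, so taskId 12 yields
* "QHat-m-00012" and the output file lines up with the A split that
* produced it.
*/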
/**
* The key doesn't matter here, only the value does; the key always gets
* substituted by the SPW.
*
* @param <K> bogus
*/
private <K, V> OutputCollector<K, V> createOutputCollector(String name,
final SplitPartitionedWritable spw,
Context ctx,
Class<V> valueClass) throws IOException, InterruptedException {
Path outputPath = getSplitFilePath(name, spw, ctx);
final SequenceFile.Writer w =
SequenceFile.createWriter(FileSystem.get(outputPath.toUri(), ctx.getConfiguration()),
ctx.getConfiguration(),
outputPath,
SplitPartitionedWritable.class,
valueClass);
closeables.addFirst(w);
return new OutputCollector<K, V>() {
@Override
public void collect(K key, V val) throws IOException {
w.append(spw, val);
}
};
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
IOUtils.close(closeables);
}
}
public static void run(Configuration conf,
Path[] inputAPaths,
Path inputBtGlob,
Path xiPath,
Path sqPath,
Path sbPath,
Path outputPath,
int aBlockRows,
int minSplitSize,
int k,
int p,
int outerProdBlockHeight,
int numReduceTasks,
boolean broadcastBInput)
throws ClassNotFoundException, InterruptedException, IOException {
JobConf oldApiJob = new JobConf(conf);
Job job = new Job(oldApiJob);
job.setJobName("ABt-job");
job.setJarByClass(ABtDenseOutJob.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, inputAPaths);
if (minSplitSize > 0) {
FileInputFormat.setMinInputSplitSize(job, minSplitSize);
}
FileOutputFormat.setOutputPath(job, outputPath);
SequenceFileOutputFormat.setOutputCompressionType(job,
CompressionType.BLOCK);
job.setMapOutputKeyClass(SplitPartitionedWritable.class);
job.setMapOutputValueClass(DenseBlockWritable.class);
job.setOutputKeyClass(SplitPartitionedWritable.class);
job.setOutputValueClass(VectorWritable.class);
job.setMapperClass(ABtMapper.class);
job.setReducerClass(QRReducer.class);
job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
outerProdBlockHeight);
job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());
/*
* PCA-related options, MAHOUT-817
*/
if (xiPath != null) {
job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
}
job.setNumReduceTasks(numReduceTasks);
// broadcast Bt files if required.
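/*
* When enabled, every part file matching inputBtGlob is added to the
* DistributedCache; ABtMapper.setup() then picks the cached copies up via
* HadoopUtil.getCachedFiles() and streams B' from the task-local file
* system on each pass instead of re-reading it from the cluster FS.
*/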
if (broadcastBInput) {
job.getConfiguration().set(PROP_BT_BROADCAST, "y");
FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
FileStatus[] fstats = fs.globStatus(inputBtGlob);
if (fstats != null) {
for (FileStatus fstat : fstats) {
/*
* The new DistributedCache API is not yet available in our dependencies,
* so we still use the deprecated one.
*/
DistributedCache.addCacheFile(fstat.getPath().toUri(),
job.getConfiguration());
}
}
}
job.submit();
job.waitForCompletion(false);
if (!job.isSuccessful()) {
throw new IOException("ABt job unsuccessful.");
}
}
}