/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop.stochasticsvd;
import java.io.Closeable;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.regex.Matcher;
import com.google.common.collect.Lists;
import org.apache.commons.lang3.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep;
/**
* Computes ABt (A times Bt) products, then the first step of QR, which is
* pushed down to the reducer.
*
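* A rough outline of the data flow (editorial sketch added for clarity; not
* part of the original source):
*
* <pre>
*   Y = A * Bt                 partial row blocks are formed in the mapper and
*                              summed by the combiner / reducer
*   Y_block = Qhat * Rhat      QRFirstStep runs in the reducer on each block,
*                              writing QHat and RHat sequence files
* </pre>
*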
*/
@SuppressWarnings("deprecation")
public final class ABtJob {
public static final String PROP_BT_PATH = "ssvd.Bt.path";
public static final String PROP_BT_BROADCAST = "ssvd.Bt.broadcast";
private ABtJob() {
}
/**
 * So, here, I preload the A block into memory.
 *
 * A sparse matrix seems to be ideal for that, but there are two reasons why I
 * am not using it:
 *
 * - 1) I don't know the full block height, so I may need to reallocate it
 * from time to time. Although this is probably not a showstopper.
 *
 * - 2) I found that RandomAccessSparseVectors seem to take much more memory
 * than SequentialAccessSparseVectors.
 *
 * (See the column-growth sketch after {@code extendAColIfNeeded} below.)
 */
public static class ABtMapper
extends
Mapper<Writable, VectorWritable, SplitPartitionedWritable, SparseRowBlockWritable> {
private SplitPartitionedWritable outKey;
private final Deque<Closeable> closeables = new ArrayDeque<>();
private SequenceFileDirIterator<IntWritable, VectorWritable> btInput;
private Vector[] aCols;
// private Vector[] yiRows;
// private VectorWritable outValue = new VectorWritable();
private int aRowCount;
private int kp;
private int blockHeight;
private SparseRowBlockAccumulator yiCollector;
@Override
protected void map(Writable key, VectorWritable value, Context context)
throws IOException, InterruptedException {
Vector vec = value.get();
int vecSize = vec.size();
if (aCols == null) {
aCols = new Vector[vecSize];
} else if (aCols.length < vecSize) {
aCols = Arrays.copyOf(aCols, vecSize);
}
if (vec.isDense()) {
for (int i = 0; i < vecSize; i++) {
extendAColIfNeeded(i, aRowCount + 1);
aCols[i].setQuick(aRowCount, vec.getQuick(i));
}
} else {
for (Vector.Element vecEl : vec.nonZeroes()) {
int i = vecEl.index();
extendAColIfNeeded(i, aRowCount + 1);
aCols[i].setQuick(aRowCount, vecEl.get());
}
}
aRowCount++;
}
private void extendAColIfNeeded(int col, int rowCount) {
if (aCols[col] == null) {
aCols[col] =
new SequentialAccessSparseVector(rowCount < 10000 ? 10000 : rowCount,
1);
} else if (aCols[col].size() < rowCount) {
Vector newVec =
new SequentialAccessSparseVector(rowCount << 1,
aCols[col].getNumNondefaultElements() << 1);
newVec.viewPart(0, aCols[col].size()).assign(aCols[col]);
aCols[col] = newVec;
}
}
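/*
 * Column-growth sketch (editorial illustration mirroring extendAColIfNeeded
 * above; not part of the original source):
 *
 *   Vector col = new SequentialAccessSparseVector(10000, 1);
 *   // ... later, once the block turns out taller than col.size():
 *   Vector bigger =
 *     new SequentialAccessSparseVector(rowCount << 1,
 *                                      col.getNumNondefaultElements() << 1);
 *   bigger.viewPart(0, col.size()).assign(col);
 *   col = bigger;
 *
 * Doubling both the length and the non-default capacity keeps reallocations
 * logarithmic in the final block height while staying in the more
 * memory-frugal SequentialAccessSparseVector representation.
 */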
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
try {
// yiRows= new Vector[aRowCount];
int lastRowIndex = -1;
while (btInput.hasNext()) {
Pair<IntWritable, VectorWritable> btRec = btInput.next();
int btIndex = btRec.getFirst().get();
Vector btVec = btRec.getSecond().get();
Vector aCol;
// skip Bt rows for which this split of A holds no corresponding column
if (btIndex >= aCols.length || (aCol = aCols[btIndex]) == null) {
continue;
}
int j = -1;
for (Vector.Element aEl : aCol.nonZeroes()) {
j = aEl.index();
// outKey.setTaskItemOrdinal(j);
// outValue.set(btVec.times(aEl.get())); // assign might work better
// // with memory after all.
// context.write(outKey, outValue);
yiCollector.collect((long) j, btVec.times(aEl.get()));
}
if (lastRowIndex < j) {
lastRowIndex = j;
}
}
aCols = null;
// output empty rows if we never output partial products for them
// this happens in sparse matrices when last rows are all zeros
// and is subsequently causing shorter Q matrix row count which we
// probably don't want to repair there but rather here.
Vector yDummy = new SequentialAccessSparseVector(kp);
// outValue.set(yDummy);
for (lastRowIndex += 1; lastRowIndex < aRowCount; lastRowIndex++) {
// outKey.setTaskItemOrdinal(lastRowIndex);
// context.write(outKey, outValue);
yiCollector.collect((long) lastRowIndex, yDummy);
}
} finally {
IOUtils.close(closeables);
}
}
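/*
 * Editorial note: the cleanup above is where the actual product is formed.
 * With aCols[l] holding column l of this split of A and bt_l the l-th row of
 * Bt (length k+p), each output row is accumulated as
 *
 *   y_j = sum over l of ( a[j][l] * bt_l ),   j = 0 .. aRowCount-1,
 *
 * i.e. A * Bt expressed as a sum of column-times-row contributions, which
 * yiCollector groups into SparseRowBlockWritable blocks of blockHeight rows.
 */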
@Override
protected void setup(final Context context) throws IOException,
InterruptedException {
int k =
Integer.parseInt(context.getConfiguration().get(QRFirstStep.PROP_K));
int p =
Integer.parseInt(context.getConfiguration().get(QRFirstStep.PROP_P));
kp = k + p;
outKey = new SplitPartitionedWritable(context);
String propBtPathStr = context.getConfiguration().get(PROP_BT_PATH);
Validate.notNull(propBtPathStr, "Bt input is not set");
Path btPath = new Path(propBtPathStr);
boolean distributedBt =
context.getConfiguration().get(PROP_BT_BROADCAST) != null;
if (distributedBt) {
Path[] btFiles = HadoopUtil.getCachedFiles(context.getConfiguration());
// DEBUG: stdout
//System.out.printf("list of files: " + btFiles);
StringBuilder btLocalPath = new StringBuilder();
for (Path btFile : btFiles) {
if (btLocalPath.length() > 0) {
btLocalPath.append(Path.SEPARATOR_CHAR);
}
btLocalPath.append(btFile);
}
btInput =
new SequenceFileDirIterator<>(new Path(btLocalPath.toString()),
PathType.LIST,
null,
null,
true,
context.getConfiguration());
} else {
btInput =
new SequenceFileDirIterator<>(btPath, PathType.GLOB, null, null, true, context.getConfiguration());
}
// TODO: how do I release all that stuff?
closeables.addFirst(btInput);
OutputCollector<LongWritable, SparseRowBlockWritable> yiBlockCollector =
new OutputCollector<LongWritable, SparseRowBlockWritable>() {
@Override
public void collect(LongWritable blockKey,
SparseRowBlockWritable block) throws IOException {
outKey.setTaskItemOrdinal((int) blockKey.get());
try {
context.write(outKey, block);
} catch (InterruptedException exc) {
throw new IOException("Interrupted", exc);
}
}
};
blockHeight =
context.getConfiguration().getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
-1);
yiCollector =
new SparseRowBlockAccumulator(blockHeight, yiBlockCollector);
closeables.addFirst(yiCollector);
}
}
/**
* QR first step pushed down to reducer.
*
*/
public static class QRReducer
extends
Reducer<SplitPartitionedWritable, SparseRowBlockWritable, SplitPartitionedWritable, VectorWritable> {
// Hack: partition number format copied from Hadoop. This may stop working if
// it gets out of sync with a newer Hadoop version, but unfortunately the rules
// for forming output file names are not sufficiently exposed, so we need to
// hack it if we write the same split output from either mapper or reducer.
// Alternatively, we could probably replace this with our own output file
// naming management completely and bypass MultipleOutputs entirely.
// (See the illustrative filename example after getSplitFilePath() below.)
private static final NumberFormat NUMBER_FORMAT =
NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
private final Deque<Closeable> closeables = Lists.newLinkedList();
protected final SparseRowBlockWritable accum = new SparseRowBlockWritable();
protected int blockHeight;
protected int lastTaskId = -1;
protected OutputCollector<Writable, DenseBlockWritable> qhatCollector;
protected OutputCollector<Writable, VectorWritable> rhatCollector;
protected QRFirstStep qr;
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
blockHeight =
context.getConfiguration().getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
-1);
}
protected void setupBlock(Context context, SplitPartitionedWritable spw)
throws InterruptedException, IOException {
IOUtils.close(closeables);
qhatCollector =
createOutputCollector(QJob.OUTPUT_QHAT,
spw,
context,
DenseBlockWritable.class);
rhatCollector =
createOutputCollector(QJob.OUTPUT_RHAT,
spw,
context,
VectorWritable.class);
qr =
new QRFirstStep(context.getConfiguration(),
qhatCollector,
rhatCollector);
closeables.addFirst(qr);
lastTaskId = spw.getTaskId();
}
@Override
protected void reduce(SplitPartitionedWritable key,
Iterable<SparseRowBlockWritable> values,
Context context) throws IOException,
InterruptedException {
accum.clear();
for (SparseRowBlockWritable bw : values) {
accum.plusBlock(bw);
}
if (key.getTaskId() != lastTaskId) {
setupBlock(context, key);
}
long blockBase = key.getTaskItemOrdinal() * blockHeight;
for (int k = 0; k < accum.getNumRows(); k++) {
Vector yiRow = accum.getRows()[k];
key.setTaskItemOrdinal(blockBase + accum.getRowIndices()[k]);
qr.collect(key, yiRow);
}
}
private Path getSplitFilePath(String name,
SplitPartitionedWritable spw,
Context context) throws InterruptedException,
IOException {
String uniqueFileName = FileOutputFormat.getUniqueFile(context, name, "");
uniqueFileName = uniqueFileName.replaceFirst("-r-", "-m-");
uniqueFileName =
uniqueFileName.replaceFirst("\\d+$",
Matcher.quoteReplacement(NUMBER_FORMAT.format(spw.getTaskId())));
return new Path(FileOutputFormat.getWorkOutputPath(context),
uniqueFileName);
}
/**
* key doesn't matter here, only value does. key always gets substituted by
* SPW.
*/
private <K, V> OutputCollector<K, V> createOutputCollector(String name,
final SplitPartitionedWritable spw,
Context ctx,
Class<? extends Writable> valueClass)
throws IOException, InterruptedException {
Path outputPath = getSplitFilePath(name, spw, ctx);
final SequenceFile.Writer w =
SequenceFile.createWriter(FileSystem.get(outputPath.toUri(), ctx.getConfiguration()),
ctx.getConfiguration(),
outputPath,
SplitPartitionedWritable.class,
valueClass);
closeables.addFirst(w);
return new OutputCollector<K, V>() {
@Override
public void collect(K key, V val) throws IOException {
w.append(spw, val);
}
};
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
IOUtils.close(closeables);
}
}
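/*
 * Illustrative usage sketch (editorial addition; the paths and parameter
 * values below are hypothetical and not taken from this file):
 *
 *   Configuration conf = new Configuration();
 *   Path[] inputA = { new Path("/ssvd/A") };          // rows of A as SequenceFile of VectorWritable
 *   Path btGlob = new Path("/ssvd/Bt-job/part-*");    // Bt rows keyed by A's column index
 *   Path out = new Path("/ssvd/ABt-job");
 *   ABtJob.run(conf, inputA, btGlob, out,
 *              30000,   // aBlockRows
 *              -1,      // minSplitSize (values <= 0 keep the default split size)
 *              100, 15, // k, p
 *              30000,   // outerProdBlockHeight
 *              4,       // numReduceTasks
 *              true);   // broadcastBInput: ship Bt files via DistributedCache
 */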
public static void run(Configuration conf,
Path[] inputAPaths,
Path inputBtGlob,
Path outputPath,
int aBlockRows,
int minSplitSize,
int k,
int p,
int outerProdBlockHeight,
int numReduceTasks,
boolean broadcastBInput)
throws ClassNotFoundException, InterruptedException, IOException {
JobConf oldApiJob = new JobConf(conf);
// MultipleOutputs
// .addNamedOutput(oldApiJob,
// QJob.OUTPUT_QHAT,
// org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
// SplitPartitionedWritable.class,
// DenseBlockWritable.class);
//
// MultipleOutputs
// .addNamedOutput(oldApiJob,
// QJob.OUTPUT_RHAT,
// org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
// SplitPartitionedWritable.class,
// VectorWritable.class);
Job job = new Job(oldApiJob);
job.setJobName("ABt-job");
job.setJarByClass(ABtJob.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, inputAPaths);
if (minSplitSize > 0) {
FileInputFormat.setMinInputSplitSize(job, minSplitSize);
}
FileOutputFormat.setOutputPath(job, outputPath);
SequenceFileOutputFormat.setOutputCompressionType(job,
CompressionType.BLOCK);
job.setMapOutputKeyClass(SplitPartitionedWritable.class);
job.setMapOutputValueClass(SparseRowBlockWritable.class);
job.setOutputKeyClass(SplitPartitionedWritable.class);
job.setOutputValueClass(VectorWritable.class);
job.setMapperClass(ABtMapper.class);
job.setCombinerClass(BtJob.OuterProductCombiner.class);
job.setReducerClass(QRReducer.class);
job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT,
outerProdBlockHeight);
job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());
// The yi row blocks are shuffled to the reducers, which run the QR first
// step, so numReduceTasks controls how that work is spread out.
job.setNumReduceTasks(numReduceTasks);
// broadcast Bt files if required.
if (broadcastBInput) {
job.getConfiguration().set(PROP_BT_BROADCAST, "y");
FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
FileStatus[] fstats = fs.globStatus(inputBtGlob);
if (fstats != null) {
for (FileStatus fstat : fstats) {
/*
* new api is not enabled yet in our dependencies at this time, still
* using deprecated one
*/
DistributedCache.addCacheFile(fstat.getPath().toUri(), conf);
}
}
}
job.submit();
job.waitForCompletion(false);
if (!job.isSuccessful()) {
throw new IOException("ABt job unsuccessful.");
}
}
}