/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop.stochasticsvd;
import org.apache.commons.lang3.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.UpperTriangular;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.function.PlusMult;
import org.apache.mahout.math.hadoop.stochasticsvd.qr.QRLastStep;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
/**
* Bt job. For details, see working notes in MAHOUT-376.
*
* Uses the deprecated Hadoop API wherever the new API has not been updated
* (MAHOUT-593), hence @SuppressWarnings("deprecation").
*
* This job outputs either Bt in its standard output, or upper-triangular
* matrices representing BBt partial sums if that is requested. If the latter
* mode is enabled, BBt outer-product sums are accumulated in an
* upper-triangular accumulator and output at the end of the job, which saves
* both space and a separate BBt job.
*
* This job also outputs Q and Bt and, optionally, BBt. Bt goes to the
* standard job output (part-*); Q and BBt use named multiple outputs.
*
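* A minimal sketch of how this job might be invoked (all paths and sizes
* below are hypothetical placeholders, not values taken from this file):
*
* <pre>{@code
* Configuration conf = new Configuration();
* BtJob.run(conf,
*           new Path[] {new Path("/ssvd/A")},  // A input (hypothetical path)
*           new Path("/ssvd/QJob-out"),        // QJob output path (hypothetical)
*           null,                              // xiPath: null = no PCA mean vector
*           new Path("/ssvd/BtJob-out"),       // this job's output (hypothetical)
*           -1,                                // minSplitSize: <= 0 keeps defaults
*           100,                               // k (hypothetical)
*           15,                                // p (hypothetical)
*           30000,                             // btBlockHeight (hypothetical)
*           2,                                 // numReduceTasks (hypothetical)
*           true,                              // broadcast R-hat via distributed cache
*           IntWritable.class,                 // label (row key) class of A
*           false);                            // outputBBtProducts
* }</pre>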
*/
@SuppressWarnings("deprecation")
public final class BtJob {
public static final String OUTPUT_Q = "Q";
public static final String OUTPUT_BT = "part";
public static final String OUTPUT_BBT = "bbt";
public static final String OUTPUT_SQ = "sq";
public static final String OUTPUT_SB = "sb";
public static final String PROP_QJOB_PATH = "ssvd.QJob.path";
public static final String PROP_OUPTUT_BBT_PRODUCTS =
"ssvd.BtJob.outputBBtProducts";
public static final String PROP_OUTER_PROD_BLOCK_HEIGHT =
"ssvd.outerProdBlockHeight";
public static final String PROP_RHAT_BROADCAST = "ssvd.rhat.broadcast";
public static final String PROP_XI_PATH = "ssvdpca.xi.path";
public static final String PROP_NV = "ssvd.nv";
private BtJob() {
}
public static class BtMapper extends
Mapper<Writable, VectorWritable, LongWritable, SparseRowBlockWritable> {
private QRLastStep qr;
private final Deque<Closeable> closeables = new ArrayDeque<>();
private int blockNum;
private MultipleOutputs outputs;
private final VectorWritable qRowValue = new VectorWritable();
private Vector btRow;
private SparseRowBlockAccumulator btCollector;
private Context mapContext;
private boolean nv;
// pca stuff
private Vector sqAccum;
private boolean computeSq;
/**
* We maintain the A and QtHat inputs partitioned the same way, so we are
* essentially performing a map-side merge of A and QtHat here, except that
* QtHat is stored block-wise rather than row-wise.
*/
@Override
protected void map(Writable key, VectorWritable value, Context context)
throws IOException, InterruptedException {
mapContext = context;
// output Bt outer products
Vector aRow = value.get();
Vector qRow = qr.next();
int kp = qRow.size();
// make sure Qs are inheriting A row labels.
outputQRow(key, qRow, aRow);
// MAHOUT-817
if (computeSq) {
if (sqAccum == null) {
sqAccum = new DenseVector(kp);
}
sqAccum.assign(qRow, Functions.PLUS);
}
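/*
* The loops below compute this A row's contribution to Bt = A' * Q: for each
* element a[i][j] of the row, Bt row j accumulates a[i][j] * qRow. Rows are
* fed to btCollector, which groups them into blocks for the map output.
*/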
if (btRow == null) {
btRow = new DenseVector(kp);
}
if (!aRow.isDense()) {
for (Vector.Element el : aRow.nonZeroes()) {
double mul = el.get();
for (int j = 0; j < kp; j++) {
btRow.setQuick(j, mul * qRow.getQuick(j));
}
btCollector.collect((long) el.index(), btRow);
}
} else {
int n = aRow.size();
for (int i = 0; i < n; i++) {
double mul = aRow.getQuick(i);
for (int j = 0; j < kp; j++) {
btRow.setQuick(j, mul * qRow.getQuick(j));
}
btCollector.collect((long) i, btRow);
}
}
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
Path qJobPath = new Path(conf.get(PROP_QJOB_PATH));
/*
* Actually this is somewhat fragile: getUniqueFile() builds a file name for
* the current task and therefore uses the -m- (map) naming, so it only
* matches QJob's QHat output because we happen to call it from a mapper,
* just as QJob did.
*/
Path qInputPath =
new Path(qJobPath, FileOutputFormat.getUniqueFile(context,
QJob.OUTPUT_QHAT,
""));
blockNum = context.getTaskAttemptID().getTaskID().getId();
SequenceFileValueIterator<DenseBlockWritable> qhatInput =
new SequenceFileValueIterator<>(qInputPath,
true,
conf);
closeables.addFirst(qhatInput);
/*
* read all r files _in order of task ids_, i.e. partitions (aka group
* nums).
*
* Note: if broadcast option is used, this comes from distributed cache
* files rather than hdfs path.
*/
SequenceFileDirValueIterator<VectorWritable> rhatInput;
boolean distributedRHat = conf.get(PROP_RHAT_BROADCAST) != null;
if (distributedRHat) {
Path[] rFiles = HadoopUtil.getCachedFiles(conf);
Validate.notNull(rFiles,
"no RHat files in distributed cache job definition");
//TODO: this probably can be replaced w/ local fs makeQualified
Configuration lconf = new Configuration();
lconf.set("fs.default.name", "file:///");
rhatInput =
new SequenceFileDirValueIterator<>(rFiles,
SSVDHelper.PARTITION_COMPARATOR,
true,
lconf);
} else {
Path rPath = new Path(qJobPath, QJob.OUTPUT_RHAT + "-*");
rhatInput =
new SequenceFileDirValueIterator<>(rPath,
PathType.GLOB,
null,
SSVDHelper.PARTITION_COMPARATOR,
true,
conf);
}
Validate.isTrue(rhatInput.hasNext(), "Empty R-hat input!");
closeables.addFirst(rhatInput);
outputs = new MultipleOutputs(new JobConf(conf));
closeables.addFirst(new IOUtils.MultipleOutputsCloseableAdapter(outputs));
qr = new QRLastStep(qhatInput, rhatInput, blockNum);
closeables.addFirst(qr);
/*
* It so happens that the current QRLastStep implementation preloads the R
* sequence into memory in its constructor, so it is OK to close the R-hat
* input now.
*/
if (!rhatInput.hasNext()) {
closeables.remove(rhatInput);
rhatInput.close();
}
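/*
* btCollector buffers individual Bt rows and flushes them as
* SparseRowBlockWritable blocks of PROP_OUTER_PROD_BLOCK_HEIGHT rows; the
* block key written by btBlockCollector below is what this mapper actually
* emits.
*/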
OutputCollector<LongWritable, SparseRowBlockWritable> btBlockCollector =
new OutputCollector<LongWritable, SparseRowBlockWritable>() {
@Override
public void collect(LongWritable blockKey,
SparseRowBlockWritable block) throws IOException {
try {
mapContext.write(blockKey, block);
} catch (InterruptedException exc) {
throw new IOException("Interrupted.", exc);
}
}
};
btCollector =
new SparseRowBlockAccumulator(conf.getInt(PROP_OUTER_PROD_BLOCK_HEIGHT,
-1), btBlockCollector);
closeables.addFirst(btCollector);
// MAHOUT-817
computeSq = conf.get(PROP_XI_PATH) != null;
// MAHOUT-1067
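// nv: when true, propagate NamedVector row labels from A rows into the Q
// output (see outputQRow()).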
nv = conf.getBoolean(PROP_NV, false);
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
try {
if (sqAccum != null) {
/*
* hack: we output the s_q partial sums with row index -1; the reducer
* detects that key and routes the summed vector to the SQ named output.
*/
SparseRowBlockWritable sbrw = new SparseRowBlockWritable(1);
sbrw.plusRow(0, sqAccum);
LongWritable lw = new LongWritable(-1);
context.write(lw, sbrw);
}
} finally {
IOUtils.close(closeables);
}
}
@SuppressWarnings("unchecked")
private void outputQRow(Writable key, Vector qRow, Vector aRow) throws IOException {
if (nv && (aRow instanceof NamedVector)) {
qRowValue.set(new NamedVector(qRow, ((NamedVector) aRow).getName()));
} else {
qRowValue.set(qRow);
}
outputs.getCollector(OUTPUT_Q, null).collect(key, qRowValue);
}
}
public static class OuterProductCombiner
extends
Reducer<Writable, SparseRowBlockWritable, Writable, SparseRowBlockWritable> {
protected final SparseRowBlockWritable accum = new SparseRowBlockWritable();
protected final Deque<Closeable> closeables = new ArrayDeque<>();
protected int blockHeight;
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
blockHeight =
context.getConfiguration().getInt(PROP_OUTER_PROD_BLOCK_HEIGHT, -1);
}
@Override
protected void reduce(Writable key,
Iterable<SparseRowBlockWritable> values,
Context context) throws IOException,
InterruptedException {
for (SparseRowBlockWritable bw : values) {
accum.plusBlock(bw);
}
context.write(key, accum);
accum.clear();
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
IOUtils.close(closeables);
}
}
public static class OuterProductReducer
extends
Reducer<LongWritable, SparseRowBlockWritable, IntWritable, VectorWritable> {
protected final SparseRowBlockWritable accum = new SparseRowBlockWritable();
protected final Deque<Closeable> closeables = new ArrayDeque<>();
protected int blockHeight;
private boolean outputBBt;
private UpperTriangular mBBt;
private MultipleOutputs outputs;
private final IntWritable btKey = new IntWritable();
private final VectorWritable btValue = new VectorWritable();
// MAHOUT-817
private Vector xi;
private final PlusMult pmult = new PlusMult(0);
private Vector sbAccum;
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
blockHeight = conf.getInt(PROP_OUTER_PROD_BLOCK_HEIGHT, -1);
outputBBt = conf.getBoolean(PROP_OUPTUT_BBT_PRODUCTS, false);
if (outputBBt) {
int k = conf.getInt(QJob.PROP_K, -1);
int p = conf.getInt(QJob.PROP_P, -1);
Validate.isTrue(k > 0, "invalid k parameter");
Validate.isTrue(p >= 0, "invalid p parameter");
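/*
* BBt is a symmetric (k+p) x (k+p) matrix, so only its upper triangle is
* accumulated in memory and emitted once in cleanup().
*/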
mBBt = new UpperTriangular(k + p);
}
String xiPathStr = conf.get(PROP_XI_PATH);
if (xiPathStr != null) {
xi = SSVDHelper.loadAndSumUpVectors(new Path(xiPathStr), conf);
if (xi == null) {
throw new IOException(String.format("unable to load mean path xi from %s.",
xiPathStr));
}
}
if (outputBBt || xi != null) {
outputs = new MultipleOutputs(new JobConf(conf));
closeables.addFirst(new IOUtils.MultipleOutputsCloseableAdapter(outputs));
}
}
@Override
protected void reduce(LongWritable key,
Iterable<SparseRowBlockWritable> values,
Context context) throws IOException,
InterruptedException {
accum.clear();
for (SparseRowBlockWritable bw : values) {
accum.plusBlock(bw);
}
// MAHOUT-817:
if (key.get() == -1L) {
Vector sq = accum.getRows()[0];
@SuppressWarnings("unchecked")
OutputCollector<IntWritable, VectorWritable> sqOut =
outputs.getCollector(OUTPUT_SQ, null);
sqOut.collect(new IntWritable(0), new VectorWritable(sq));
return;
}
/*
* at this point, sum of rows should be in accum, so we just generate
* outer self product of it and add to BBt accumulator.
*/
for (int k = 0; k < accum.getNumRows(); k++) {
Vector btRow = accum.getRows()[k];
btKey.set((int) (key.get() * blockHeight + accum.getRowIndices()[k]));
btValue.set(btRow);
context.write(btKey, btValue);
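/*
* BBt = sum over all Bt rows of (btRow x btRow'), so each emitted Bt row
* adds its self outer product to the upper-triangular accumulator below.
*/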
if (outputBBt) {
int kp = mBBt.numRows();
// accumulate partial BBt sum
for (int i = 0; i < kp; i++) {
double vi = btRow.get(i);
if (vi != 0.0) {
for (int j = i; j < kp; j++) {
double vj = btRow.get(j);
if (vj != 0.0) {
mBBt.setQuick(i, j, mBBt.getQuick(i, j) + vi * vj);
}
}
}
}
}
// MAHOUT-817
if (xi != null) {
// code defensively against shortened xi
int btIndex = btKey.get();
double xii = xi.size() > btIndex ? xi.getQuick(btIndex) : 0.0;
// compute the s_b partial sum: sbAccum += xi[btIndex] * btRow
pmult.setMultiplicator(xii);
if (sbAccum == null) {
sbAccum = new DenseVector(btRow.size());
}
sbAccum.assign(btRow, pmult);
}
}
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
// if BBt output was requested, emit the accumulated BBt partial sum now.
try {
if (outputBBt) {
@SuppressWarnings("unchecked")
OutputCollector<IntWritable, VectorWritable> collector =
outputs.getCollector(OUTPUT_BBT, null);
collector.collect(new IntWritable(),
new VectorWritable(new DenseVector(mBBt.getData())));
}
// MAHOUT-817
if (sbAccum != null) {
@SuppressWarnings("unchecked")
OutputCollector<IntWritable, VectorWritable> collector =
outputs.getCollector(OUTPUT_SB, null);
collector.collect(new IntWritable(), new VectorWritable(sbAccum));
}
} finally {
IOUtils.close(closeables);
}
}
}
public static void run(Configuration conf,
Path[] inputPathA,
Path inputPathQJob,
Path xiPath,
Path outputPath,
int minSplitSize,
int k,
int p,
int btBlockHeight,
int numReduceTasks,
boolean broadcast,
Class<? extends Writable> labelClass,
boolean outputBBtProducts)
throws ClassNotFoundException, InterruptedException, IOException {
JobConf oldApiJob = new JobConf(conf);
MultipleOutputs.addNamedOutput(oldApiJob,
OUTPUT_Q,
org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
labelClass,
VectorWritable.class);
if (outputBBtProducts) {
MultipleOutputs.addNamedOutput(oldApiJob,
OUTPUT_BBT,
org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
IntWritable.class,
VectorWritable.class);
/*
* MAHOUT-1067: if we are asked to output BBt products, then NamedVector
* labels should be propagated to Q as well so that UJob can pick them up
* from there.
*/
oldApiJob.setBoolean(PROP_NV, true);
}
if (xiPath != null) {
// add the PCA-related named outputs as well
MultipleOutputs.addNamedOutput(oldApiJob,
OUTPUT_SQ,
org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
IntWritable.class,
VectorWritable.class);
MultipleOutputs.addNamedOutput(oldApiJob,
OUTPUT_SB,
org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
IntWritable.class,
VectorWritable.class);
}
/*
* HACK: we use the old-API MultipleOutputs since it is not available in the
* new API of either 0.20.2 or 0.20.203, but we wrap it into a new-API Job so
* that we can use the new-API interfaces.
*/
Job job = new Job(oldApiJob);
job.setJobName("Bt-job");
job.setJarByClass(BtJob.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(job, inputPathA);
if (minSplitSize > 0) {
FileInputFormat.setMinInputSplitSize(job, minSplitSize);
}
FileOutputFormat.setOutputPath(job, outputPath);
// WARN: tight hadoop integration here:
job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);
FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
SequenceFileOutputFormat.setOutputCompressionType(job,
CompressionType.BLOCK);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(SparseRowBlockWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(VectorWritable.class);
job.setMapperClass(BtMapper.class);
job.setCombinerClass(OuterProductCombiner.class);
job.setReducerClass(OuterProductReducer.class);
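/*
* Data flow: BtMapper emits partial Bt rows grouped into blocks keyed by
* block number; OuterProductCombiner pre-sums blocks per key;
* OuterProductReducer writes the final Bt rows keyed by absolute row index
* and, optionally, the BBt / PCA partial sums via named outputs.
*/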
job.getConfiguration().setInt(QJob.PROP_K, k);
job.getConfiguration().setInt(QJob.PROP_P, p);
job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS,
outputBBtProducts);
job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);
job.setNumReduceTasks(numReduceTasks);
/*
* PCA-related options, MAHOUT-817
*/
if (xiPath != null) {
job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
}
/*
* We can broadcast the R-hat files since all of them are required by every
* task, but not the Q files, which correspond to splits of A (each split of
* A requires only its particular Q file, a different one each time).
*/
if (broadcast) {
job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");
FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
FileStatus[] fstats =
fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
if (fstats != null) {
for (FileStatus fstat : fstats) {
/*
* the new API is not enabled yet in our dependencies at this time, so we
* still use the deprecated one
*/
DistributedCache.addCacheFile(fstat.getPath().toUri(),
job.getConfiguration());
}
}
}
job.submit();
job.waitForCompletion(false);
if (!job.isSuccessful()) {
throw new IOException("Bt job unsuccessful.");
}
}
}