/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.mahout.math.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
import com.google.common.io.Closeables;
/**
* MatrixColumnMeansJob is a job for calculating the column-wise mean of a
* DistributedRowMatrix. This job can be accessed using
 * DistributedRowMatrix.columnMeans().
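 *
 * <p>A minimal usage sketch; the input and temporary paths below are
 * illustrative placeholders, not values defined by this class:</p>
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Path rowsPath = new Path("/tmp/drm-rows");         // SequenceFile of matrix rows
 * Path tmpPath = new Path("/tmp/drm-column-means");  // scratch output, deleted and rewritten
 * Vector columnMeans = MatrixColumnMeansJob.run(conf, rowsPath, tmpPath);
 * }</pre>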
*/
public final class MatrixColumnMeansJob {
public static final String VECTOR_CLASS =
"DistributedRowMatrix.columnMeans.vector.class";
private MatrixColumnMeansJob() {
}
public static Vector run(Configuration conf,
Path inputPath,
Path outputVectorTmpPath) throws IOException {
return run(conf, inputPath, outputVectorTmpPath, null);
}
/**
* Job for calculating column-wise mean of a DistributedRowMatrix
*
   * @param initialConf
   *          base Hadoop configuration used to launch the job
* @param inputPath
* path to DistributedRowMatrix input
* @param outputVectorTmpPath
* path for temporary files created during job
   * @param vectorClass
   *          name of the desired class for the returned vector, e.g. DenseVector
   *          or RandomAccessSparseVector (may be null, in which case
   *          {@link DenseVector} is used)
* @return Vector containing column-wise mean of DistributedRowMatrix
*/
public static Vector run(Configuration initialConf,
Path inputPath,
Path outputVectorTmpPath,
String vectorClass) throws IOException {
try {
initialConf.set(VECTOR_CLASS,
vectorClass == null ? DenseVector.class.getName()
: vectorClass);
Job job = new Job(initialConf, "MatrixColumnMeansJob");
job.setJarByClass(MatrixColumnMeansJob.class);
      // Clear any stale output from a previous run and force a single reducer
      // so that the complete column-wise sum ends up in one part file.
      outputVectorTmpPath.getFileSystem(job.getConfiguration())
          .delete(outputVectorTmpPath, true);
      job.setNumReduceTasks(1);

      FileInputFormat.addInputPath(job, inputPath);
      FileOutputFormat.setOutputPath(job, outputVectorTmpPath);
      job.setInputFormatClass(SequenceFileInputFormat.class);
      job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setMapperClass(MatrixColumnMeansMapper.class);
job.setReducerClass(MatrixColumnMeansReducer.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(VectorWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(VectorWritable.class);
      if (!job.waitForCompletion(true)) {
        throw new IOException("MatrixColumnMeansJob failed to complete");
      }
Path tmpFile = new Path(outputVectorTmpPath, "part-r-00000");
      SequenceFileValueIterator<VectorWritable> iterator =
          new SequenceFileValueIterator<>(tmpFile, true, initialConf);
try {
if (iterator.hasNext()) {
return iterator.next().get();
} else {
          // No rows were processed: return an empty vector of the configured class.
          return (Vector) Class.forName(initialConf.get(VECTOR_CLASS))
              .getConstructor(int.class).newInstance(0);
}
} finally {
Closeables.close(iterator, true);
}
} catch (IOException ioe) {
throw ioe;
} catch (Throwable thr) {
throw new IOException(thr);
}
}
/**
* Mapper for calculation of column-wise mean.
*/
  public static class MatrixColumnMeansMapper extends
      Mapper<Writable, VectorWritable, NullWritable, VectorWritable> {
private Vector runningSum;
private String vectorClass;
@Override
public void setup(Context context) {
vectorClass = context.getConfiguration().get(VECTOR_CLASS);
}
/**
* The mapper computes a running sum of the vectors the task has seen.
* Element 0 of the running sum vector contains a count of the number of
* vectors that have been seen. The remaining elements contain the
     * column-wise running sum. Nothing is written to the context at this stage.
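     * For example, after seeing the rows {@code [1, 2]} and {@code [3, 4]},
     * the running sum held by the mapper is {@code [2, 4, 6]}: a count of 2
     * followed by the column sums 4 and 6.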
*/
@Override
public void map(Writable r, VectorWritable v, Context context)
throws IOException {
if (runningSum == null) {
/*
* If this is the first vector the mapper has seen, instantiate a new
* vector using the parameter VECTOR_CLASS
*/
        runningSum = ClassUtils.instantiateAs(vectorClass,
                                              Vector.class,
                                              new Class<?>[] { int.class },
                                              new Object[] { v.get().size() + 1 });
runningSum.set(0, 1);
runningSum.viewPart(1, v.get().size()).assign(v.get());
} else {
runningSum.set(0, runningSum.get(0) + 1);
runningSum.viewPart(1, v.get().size()).assign(v.get(), Functions.PLUS);
}
}
/**
* The column-wise sum is written at the cleanup stage. A single reducer is
* forced so null can be used for the key
*/
@Override
public void cleanup(Context context) throws InterruptedException,
IOException {
if (runningSum != null) {
context.write(NullWritable.get(), new VectorWritable(runningSum));
}
}
}
/**
* The reducer adds the partial column-wise sums from each of the mappers to
* compute the total column-wise sum. The total sum is then divided by the
* total count of vectors to determine the column-wise mean.
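   * Continuing the example in the mapper, a combined sum vector of
   * {@code [2, 4, 6]} yields the column-wise mean {@code [2, 3]} after
   * dividing by the count of 2 held in element 0.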
*/
  public static class MatrixColumnMeansReducer extends
      Reducer<NullWritable, VectorWritable, IntWritable, VectorWritable> {
private static final IntWritable ONE = new IntWritable(1);
private String vectorClass;
private Vector outputVector;
private final VectorWritable outputVectorWritable = new VectorWritable();
@Override
public void setup(Context context) {
vectorClass = context.getConfiguration().get(VECTOR_CLASS);
}
@Override
    public void reduce(NullWritable n,
                       Iterable<VectorWritable> vectors,
                       Context context) throws IOException, InterruptedException {
/**
* Add together partial column-wise sums from mappers
*/
for (VectorWritable v : vectors) {
if (outputVector == null) {
outputVector = v.get();
} else {
outputVector.assign(v.get(), Functions.PLUS);
}
}
/**
* Divide total column-wise sum by count of vectors, which corresponds to
* the number of rows in the DistributedRowMatrix
*/
if (outputVector != null) {
outputVectorWritable.set(outputVector.viewPart(1,
outputVector.size() - 1)
.divide(outputVector.get(0)));
context.write(ONE, outputVectorWritable);
} else {
        Vector emptyVector = ClassUtils.instantiateAs(vectorClass,
                                                      Vector.class,
                                                      new Class<?>[] { int.class },
                                                      new Object[] { 0 });
context.write(ONE, new VectorWritable(emptyVector));
}
}
}
}