/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.mahout.math.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
import com.google.common.io.Closeables;
/**
* MatrixColumnMeansJob is a job for calculating the column-wise mean of a
* DistributedRowMatrix. This job can be accessed using
 * DistributedRowMatrix.columnMeans().
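 *
 * <p>A minimal usage sketch; the input and temporary paths below are
 * illustrative placeholders, not values defined by this class:</p>
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Path rowsPath = new Path("/tmp/drm-rows");         // SequenceFile of matrix rows
 * Path tmpPath = new Path("/tmp/drm-column-means");  // scratch output, deleted and rewritten
 * Vector columnMeans = MatrixColumnMeansJob.run(conf, rowsPath, tmpPath);
 * }</pre>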
*/
public final class MatrixColumnMeansJob {
public static final String VECTOR_CLASS =
"DistributedRowMatrix.columnMeans.vector.class";
private MatrixColumnMeansJob() {
}
public static Vector run(Configuration conf,
Path inputPath,
Path outputVectorTmpPath) throws IOException {
return run(conf, inputPath, outputVectorTmpPath, null);
}
/**
* Job for calculating column-wise mean of a DistributedRowMatrix
*
   * @param initialConf
   *          base Hadoop configuration used to launch the job
* @param inputPath
* path to DistributedRowMatrix input
* @param outputVectorTmpPath
* path for temporary files created during job
   * @param vectorClass
   *          name of the desired class for the returned vector, e.g. DenseVector
   *          or RandomAccessSparseVector (may be null, in which case
   *          {@link DenseVector} is used)
* @return Vector containing column-wise mean of DistributedRowMatrix
*/
public static Vector run(Configuration initialConf,
Path inputPath,
Path outputVectorTmpPath,
String vectorClass) throws IOException {
try {
initialConf.set(VECTOR_CLASS,
vectorClass == null ? DenseVector.class.getName()
: vectorClass);
Job job = new Job(initialConf, "MatrixColumnMeansJob");
job.setJarByClass(MatrixColumnMeansJob.class);
      // Clear any stale output from a previous run and force a single reducer
      // so that the complete column-wise sum ends up in one part file.
      outputVectorTmpPath.getFileSystem(job.getConfiguration())
          .delete(outputVectorTmpPath, true);
      job.setNumReduceTasks(1);

      FileInputFormat.addInputPath(job, inputPath);
      FileOutputFormat.setOutputPath(job, outputVectorTmpPath);
      job.setInputFormatClass(SequenceFileInputFormat.class);
      job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setMapperClass(MatrixColumnMeansMapper.class);
job.setReducerClass(MatrixColumnMeansReducer.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(VectorWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(VectorWritable.class);
      if (!job.waitForCompletion(true)) {
        throw new IOException("MatrixColumnMeansJob failed to complete");
      }
Path tmpFile = new Path(outputVectorTmpPath, "part-r-00000");
      SequenceFileValueIterator<VectorWritable> iterator =
          new SequenceFileValueIterator<>(tmpFile, true, initialConf);
try {
if (iterator.hasNext()) {
return iterator.next().get();
} else {
          // No rows were processed: return an empty vector of the configured class.
          return (Vector) Class.forName(initialConf.get(VECTOR_CLASS))
              .getConstructor(int.class).newInstance(0);
}
} finally {
Closeables.close(iterator, true);
}
} catch (IOException ioe) {
throw ioe;
} catch (Throwable thr) {
throw new IOException(thr);
}
}
/**
* Mapper for calculation of column-wise mean.
*/
  public static class MatrixColumnMeansMapper extends
      Mapper<Writable, VectorWritable, NullWritable, VectorWritable> {
private Vector runningSum;
private String vectorClass;
@Override
public void setup(Context context) {
vectorClass = context.getConfiguration().get(VECTOR_CLASS);
}
/**
* The mapper computes a running sum of the vectors the task has seen.
* Element 0 of the running sum vector contains a count of the number of
* vectors that have been seen. The remaining elements contain the
     * column-wise running sum. Nothing is written to the context at this stage.
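     * For example, after seeing the rows {@code [1, 2]} and {@code [3, 4]},
     * the running sum held by the mapper is {@code [2, 4, 6]}: a count of 2
     * followed by the column sums 4 and 6.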
*/
@Override
public void map(Writable r, VectorWritable v, Context context)
throws IOException {
if (runningSum == null) {
/*
* If this is the first vector the mapper has seen, instantiate a new
* vector using the parameter VECTOR_CLASS
*/
        runningSum = ClassUtils.instantiateAs(vectorClass,
                                              Vector.class,
                                              new Class<?>[] { int.class },
                                              new Object[] { v.get().size() + 1 });
runningSum.set(0, 1);
runningSum.viewPart(1, v.get().size()).assign(v.get());
} else {
runningSum.set(0, runningSum.get(0) + 1);
runningSum.viewPart(1, v.get().size()).assign(v.get(), Functions.PLUS);
}
}
/**
* The column-wise sum is written at the cleanup stage. A single reducer is
* forced so null can be used for the key
*/
@Override
public void cleanup(Context context) throws InterruptedException,
IOException {
if (runningSum != null) {
context.write(NullWritable.get(), new VectorWritable(runningSum));
}
}
}
/**
* The reducer adds the partial column-wise sums from each of the mappers to
* compute the total column-wise sum. The total sum is then divided by the
* total count of vectors to determine the column-wise mean.
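   * Continuing the example in the mapper, a combined sum vector of
   * {@code [2, 4, 6]} yields the column-wise mean {@code [2, 3]} after
   * dividing by the count of 2 held in element 0.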
*/
  public static class MatrixColumnMeansReducer extends
      Reducer<NullWritable, VectorWritable, IntWritable, VectorWritable> {
private static final IntWritable ONE = new IntWritable(1);
private String vectorClass;
private Vector outputVector;
private final VectorWritable outputVectorWritable = new VectorWritable();
@Override
public void setup(Context context) {
vectorClass = context.getConfiguration().get(VECTOR_CLASS);
}
@Override
    public void reduce(NullWritable n,
                       Iterable<VectorWritable> vectors,
                       Context context) throws IOException, InterruptedException {
/**
* Add together partial column-wise sums from mappers
*/
for (VectorWritable v : vectors) {
if (outputVector == null) {
outputVector = v.get();
} else {
outputVector.assign(v.get(), Functions.PLUS);
}
}
/**
* Divide total column-wise sum by count of vectors, which corresponds to
* the number of rows in the DistributedRowMatrix
*/
if (outputVector != null) {
outputVectorWritable.set(outputVector.viewPart(1,
outputVector.size() - 1)
.divide(outputVector.get(0)));
context.write(ONE, outputVectorWritable);
} else {
        Vector emptyVector = ClassUtils.instantiateAs(vectorClass,
                                                      Vector.class,
                                                      new Class<?>[] { int.class },
                                                      new Object[] { 0 });
context.write(ONE, new VectorWritable(emptyVector));
}
}
}
}