org.apache.mahout.clustering.spectral.AffinityMatrixInputJob (from mahout-mr, scalable machine learning libraries)
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.spectral;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;

public final class AffinityMatrixInputJob {
  private AffinityMatrixInputJob() {
  }

  /**
   * Initializes and executes the job that reads the raw text documents containing
   * the affinity matrix entries in (x_i, x_j, value) format and writes each row
   * out as a vector in a SequenceFile.
   */
  public static void runJob(Path input, Path output, int rows, int cols)
    throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    HadoopUtil.delete(conf, output);

    conf.setInt(Keys.AFFINITY_DIMENSIONS, rows);
    Job job = new Job(conf, "AffinityMatrixInputJob: " + input + " -> M/R -> " + output);
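
    // The mapper emits (row index, matrix entry) pairs; the reducer assembles each
    // row into a sparse vector and writes it out, keyed by row index, as a SequenceFile.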
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DistributedRowMatrix.MatrixEntryWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(AffinityMatrixInputMapper.class);
    job.setReducerClass(AffinityMatrixInputReducer.class);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setJarByClass(AffinityMatrixInputJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
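
  // Usage sketch (not part of the original source; the paths and the dimension n below
  // are hypothetical): with the raw affinity entries stored as (i, j, value) text lines,
  // a caller could run
  //   AffinityMatrixInputJob.runJob(new Path("in/affinity.txt"), new Path("out/seq"), n, n);
  // which leaves a SequenceFile of (IntWritable row index, VectorWritable row) pairs
  // under "out/seq".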

  /**
   * A transparent wrapper for the above method that handles the tedious tasks
   * of setting and retrieving system Paths. Hands back a fully populated
   * and initialized DistributedRowMatrix.
   */
  public static DistributedRowMatrix runJob(Path input, Path output, int dimensions)
    throws IOException, InterruptedException, ClassNotFoundException {
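    // Stage the SequenceFile output under a semi-unique subdirectory of the output path.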
    Path seqFiles = new Path(output, "seqfiles-" + (System.nanoTime() & 0xFF));
    runJob(input, seqFiles, dimensions, dimensions);
    DistributedRowMatrix a = new DistributedRowMatrix(seqFiles,
        new Path(seqFiles, "seqtmp-" + (System.nanoTime() & 0xFF)),
        dimensions, dimensions);
    a.setConf(new Configuration());
    return a;
  }
}
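
As a rough usage sketch (not part of the Mahout source): the three-argument overload is the typical entry point; it runs the MapReduce job and hands back a DistributedRowMatrix view of the resulting SequenceFiles. The class name, paths, and dimension below are hypothetical, and the input is assumed to already contain the (i, j, value) affinity entries as text.

import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.spectral.AffinityMatrixInputJob;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;

public class AffinityMatrixInputExample {
  public static void main(String[] args) throws Exception {
    Path rawAffinity = new Path("/user/example/affinity.txt");  // hypothetical input path
    Path outputDir = new Path("/user/example/affinity-out");    // hypothetical output path
    int dimensions = 250;                                       // n for an n x n affinity matrix

    // Runs the MapReduce job and wraps the resulting SequenceFiles as a
    // row-distributed matrix keyed by row index.
    DistributedRowMatrix affinity = AffinityMatrixInputJob.runJob(rawAffinity, outputDir, dimensions);
    System.out.println("Affinity matrix rows: " + affinity.numRows());
  }
}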