// org.apache.hama.examples.Kmeans Maven / Gradle / Ivy

// Go to download
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hama.examples;

import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hama.HamaConfiguration;
import org.apache.hama.bsp.BSPJob;
import org.apache.hama.commons.io.VectorWritable;
import org.apache.hama.ml.kmeans.KMeansBSP;

/**
 * Uses the {@link KMeansBSP} class to run a Kmeans Clustering with BSP. You can
 * provide your own input, or generate some random input for benchmarking.
 * 
 * For your own input, you can supply a text file that contains a tab separated
 * sequence of doubles on each line. The first k-vectors are used as the seed
 * centers.
 * 
 * For random input, supply the "-g" flag followed by the number of vectors to
 * generate and the dimension of the vectors.
 * 
 * You must always pass an input path and an output path, as well as the
 * maximum number of iterations the algorithm should run (it will also stop if
 * the centers no longer move) and the number of centers k.
 * 
 * The centers are stored in the given output path under
 * center/center_output.seq. This is a sequence file with
 * {@link VectorWritable} as key and {@link NullWritable} as value. You can read
 * it with the normal FS cat utility, but you have to add the hama-ml jar to the
 * lib directory of Hadoop, so it can find the vector classes.
 * 
 * The assignments from an index (the order of the center in the above sequence
 * file matters!, also starting from 0!) to a vector can be found in the output
 * path as text file.
 * 
 */
public class Kmeans {

  /** Usage line printed whenever the command-line arguments are malformed. */
  private static final String USAGE =
      "USAGE: <INPUT_PATH> <OUTPUT_PATH> <MAXITERATIONS> <K (how many centers)> "
          + "-g [<COUNT> <DIMENSION OF VECTORS>]";

  /**
   * Command-line entry point: prepares the input, submits the KMeans BSP job
   * and prints the first few lines of the result.
   * 
   * Expected arguments:
   * <ul>
   * <li>args[0] - input path (a file, or a directory when "-g" is used)</li>
   * <li>args[1] - output path</li>
   * <li>args[2] - maximum number of iterations</li>
   * <li>args[3] - k, the number of centers</li>
   * <li>args[4..6] - optional: "-g", the number of vectors to generate and
   * their dimension</li>
   * </ul>
   * 
   * Any other argument layout prints the usage line and returns without
   * submitting a job.
   * 
   * @param args the command-line arguments described above.
   * @throws IllegalArgumentException if k is greater than the number of
   *           vectors to generate.
   * @throws Exception if a numeric argument cannot be parsed, or if preparing
   *           the input, running the job or reading the output fails.
   */
  public static void main(String[] args) throws Exception {
    // Exactly 4 arguments (text input) or exactly 7 (random input) are valid,
    // and the 7-argument form must carry the "-g" flag at position 4;
    // otherwise the trailing arguments would be silently ignored.
    boolean generate = args.length == 7;
    if ((args.length != 4 && !generate)
        || (generate && !"-g".equals(args[4]))) {
      System.out.println(USAGE);
      return;
    }
    HamaConfiguration conf = new HamaConfiguration();

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileSystem fs = FileSystem.get(conf);

    // The seed centers live next to the input: beside the file if the input
    // is a single file, inside the directory otherwise.
    Path center;
    if (fs.isFile(in)) {
      center = new Path(in.getParent(), "center/cen.seq");
    } else {
      center = new Path(in, "center/cen.seq");
    }
    Path centerOut = new Path(out, "center/center_output.seq");
    conf.set(KMeansBSP.CENTER_IN_PATH, center.toString());
    conf.set(KMeansBSP.CENTER_OUT_PATH, centerOut.toString());

    int iterations = Integer.parseInt(args[2]);
    conf.setInt(KMeansBSP.MAX_ITERATIONS_KEY, iterations);
    int k = Integer.parseInt(args[3]);

    if (generate) {
      int count = Integer.parseInt(args[5]);
      if (k > count) {
        throw new IllegalArgumentException("K can't be greater than n!");
      }
      int dimension = Integer.parseInt(args[6]);
      System.out.println("N: " + count + " Dimension: " + dimension
          + " Iterations: " + iterations);
      if (!fs.isFile(in)) {
        in = new Path(in, "input.seq");
      }
      // prepare the input, like deleting old versions and creating centers
      KMeansBSP.prepareInput(count, k, dimension, conf, in, center, out, fs);
    } else {
      if (!fs.isFile(in)) {
        System.out.println("Cannot read text input file: " + in.toString());
        return;
      }
      // Set the last argument to TRUE if first column is required to be the key
      in = KMeansBSP.prepareInputText(k, conf, in, center, out, fs, true);
    }

    BSPJob job = KMeansBSP.createJob(conf, in, out, true);

    long startTime = System.currentTimeMillis();
    // just submit the job
    if (job.waitForCompletion(true)) {
      System.out.println("Job Finished in "
          + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    }

    System.out.println("\nHere are a few lines of output:");
    // Typed list: a raw List cannot be iterated as String.
    List<String> results = KMeansBSP.readOutput(conf, out, fs, 4);
    for (String line : results) {
      System.out.println(line);
    }
    System.out.println("...");
  }
}