org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansDriver Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mahout-mr Show documentation
Show all versions of mahout-mr Show documentation
Scalable machine learning libraries
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.streaming.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.ProjectionSearch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
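/**
 * Driver for the StreamingKMeans clustering job. It parses the command-line options, stores them in the
 * job {@link Configuration} and then runs the clustering either sequentially in the local JVM or as a
 * MapReduce job.
 */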
public final class StreamingKMeansDriver extends AbstractJob {
// ---- Streaming KMeans options ----

/**
 * Option name for the number of clusters that the Mappers will use. It should be O(k log n), where k is the
 * number of clusters to get at the end and n is the number of points to cluster. This doesn't need to be
 * exact. It will be adjusted at runtime.
 */
public static final String ESTIMATED_NUM_MAP_CLUSTERS = "estimatedNumMapClusters";
/**
 * Option name for the initial estimated distance cutoff between two points for forming new clusters.
 * The driver's default is {@link #INVALID_DISTANCE_CUTOFF}; in that case no cutoff is written to the
 * configuration (the option's help text says it is then estimated from the data set).
 * @see org.apache.mahout.clustering.streaming.cluster.StreamingKMeans
 */
public static final String ESTIMATED_DISTANCE_CUTOFF = "estimatedDistanceCutoff";

// ---- Ball KMeans options ----

/**
 * After mapping finishes, we get an intermediate set of vectors that represent approximate
 * clusterings of the data from each Mapper. These can be clustered by the Reducer using
 * BallKMeans in memory. This option is the maximum number of iterations in the final
 * BallKMeans algorithm.
 * Defaults to 10.
 */
public static final String MAX_NUM_ITERATIONS = "maxNumIterations";
/**
 * The "ball" aspect of ball k-means means that only the closest points to the centroid will actually be used
 * for updating. The fraction of the points to be used is those points whose distance to the center is within
 * trimFraction * distance to the closest other center.
 * Defaults to 0.9.
 */
public static final String TRIM_FRACTION = "trimFraction";
/**
 * Whether to use k-means++ initialization or random initialization of the seed centroids.
 * Essentially, k-means++ provides better clusters, but takes longer, whereas random initialization takes less
 * time, but produces worse clusters, and tends to fail more often and needs multiple runs to compare to
 * k-means++. If the flag is set, random initialization is used.
 * @see org.apache.mahout.clustering.streaming.cluster.BallKMeans
 */
public static final String RANDOM_INIT = "randomInit";
/**
 * Whether to correct the weights of the centroids after the clustering is done. The weights end up being wrong
 * because of the trimFraction and possible train/test splits. In some cases, especially in a pipeline, having
 * an accurate count of the weights is useful. If the flag is set, the final weights are ignored.
 */
public static final String IGNORE_WEIGHTS = "ignoreWeights";
/**
 * The fraction of points (a value in [0, 1)) that go into the "test" set when evaluating BallKMeans runs in
 * the reducer.
 * Defaults to 0.1.
 */
public static final String TEST_PROBABILITY = "testProbability";
/**
 * The number of BallKMeans runs to use at the end to try to cluster the points.
 * Defaults to 4.
 */
public static final String NUM_BALLKMEANS_RUNS = "numBallKMeansRuns";

// ---- Searcher options ----

/**
 * The Searcher class to use when performing nearest neighbor search in StreamingKMeans.
 * Defaults to ProjectionSearch.
 */
public static final String SEARCHER_CLASS_OPTION = "searcherClass";
/**
 * The number of projections to use when using a projection searcher like ProjectionSearch or
 * FastProjectionSearch. Projection searches work by projecting all the vectors onto a set of
 * basis vectors and searching for the projected query in that totally ordered set. This
 * however can produce false positives (vectors that are closer when projected than they would
 * actually be).
 * So, there must be more than one projection vector in the basis. This option is the number
 * of vectors in a basis.
 * Defaults to 3.
 */
public static final String NUM_PROJECTIONS_OPTION = "numProjections";
/**
 * When using approximate searches (anything that's not BruteSearch),
 * more than just the seemingly closest element must be considered. This option has different
 * meanings depending on the actual Searcher class used but is a measure of how many candidates
 * will be considered.
 * See the ProjectionSearch, FastProjectionSearch, LocalitySensitiveHashSearch classes for more
 * details.
 * Defaults to 2.
 */
public static final String SEARCH_SIZE_OPTION = "searchSize";
/**
 * Whether to run another pass of StreamingKMeans on the reducer's points before BallKMeans. On some data sets
 * with a large number of mappers, the intermediate number of clusters passed to the reducer is too large to
 * fit into memory directly, hence the option to collapse the clusters further with StreamingKMeans.
 */
public static final String REDUCE_STREAMING_KMEANS = "reduceStreamingKMeans";
// Logger shared by all static and instance methods of the driver.
private static final Logger log = LoggerFactory.getLogger(StreamingKMeansDriver.class);
/**
 * Sentinel for "no distance cutoff supplied". It is the default of the {@link #ESTIMATED_DISTANCE_CUTOFF}
 * option and is the only non-positive cutoff accepted when the options are validated.
 */
public static final float INVALID_DISTANCE_CUTOFF = -1;
/**
 * Command-line entry point: registers every option of the job, parses {@code args}, optionally deletes
 * the output path, copies the parsed options into the job configuration and runs the clustering.
 *
 * @param args the raw command-line arguments
 * @return -1 if the arguments could not be parsed; otherwise the status returned by
 *         {@link #run(Configuration, Path, Path)} (0 on success, -1 on failure)
 * @throws Exception if an option value is invalid or the clustering itself fails
 */
@Override
public int run(String[] args) throws Exception {
  // Standard options for any Mahout job.
  addInputOption();
  addOutputOption();
  addOption(DefaultOptionCreator.overwriteOption().create());
  // The number of clusters to create for the data.
  addOption(DefaultOptionCreator.numClustersOption().withDescription(
      "The k in k-Means. Approximately this many clusters will be generated.").create());

  // StreamingKMeans (mapper) options
  // There will be k final clusters, but in the Map phase to get a good approximation of the data, O(k log n)
  // clusters are needed. Since n is the number of data points and not knowable until reading all the vectors,
  // provide a decent estimate.
  // NOTE: the default of 1 can never pass validation (estimatedNumMapClusters must exceed numClusters,
  // which itself must be positive), so in practice this option has to be supplied explicitly.
  addOption(ESTIMATED_NUM_MAP_CLUSTERS, "km", "The estimated number of clusters to use for the "
      + "Map phase of the job when running StreamingKMeans. This should be around k * log(n), "
      + "where k is the final number of clusters and n is the total number of data points to "
      + "cluster.", String.valueOf(1));
  addOption(ESTIMATED_DISTANCE_CUTOFF, "e", "The initial estimated distance cutoff between two "
      + "points for forming new clusters. If no value is given, it's estimated from the data set",
      String.valueOf(INVALID_DISTANCE_CUTOFF));

  // BallKMeans (reducer) options
  addOption(MAX_NUM_ITERATIONS, "mi", "The maximum number of iterations to run for the "
      + "BallKMeans algorithm used by the reducer. If no value is given, defaults to 10.", String.valueOf(10));
  addOption(TRIM_FRACTION, "tf", "The 'ball' aspect of ball k-means means that only the closest points "
      + "to the centroid will actually be used for updating. The fraction of the points to be used is those "
      + "points whose distance to the center is within trimFraction * distance to the closest other center. "
      + "If no value is given, defaults to 0.9.", String.valueOf(0.9));
  addFlag(RANDOM_INIT, "ri", "Whether to use k-means++ initialization or random initialization "
      + "of the seed centroids. Essentially, k-means++ provides better clusters, but takes longer, whereas random "
      + "initialization takes less time, but produces worse clusters, and tends to fail more often and needs "
      + "multiple runs to compare to k-means++. If set, uses the random initialization.");
  addFlag(IGNORE_WEIGHTS, "iw", "Whether to correct the weights of the centroids after the clustering is done. "
      + "The weights end up being wrong because of the trimFraction and possible train/test splits. In some cases, "
      + "especially in a pipeline, having an accurate count of the weights is useful. If set, ignores the final "
      + "weights");
  addOption(TEST_PROBABILITY, "testp", "A double value between 0 and 1 that represents the percentage of "
      + "points to be used for 'testing' different clustering runs in the final BallKMeans "
      + "step. If no value is given, defaults to 0.1", String.valueOf(0.1));
  addOption(NUM_BALLKMEANS_RUNS, "nbkm", "Number of BallKMeans runs to use at the end to try to cluster the "
      + "points. If no value is given, defaults to 4", String.valueOf(4));

  // Nearest neighbor search options
  // The distance measure used for computing the distance between two points. Generally, the
  // SquaredEuclideanDistance is used for clustering problems (it's equivalent to CosineDistance for normalized
  // vectors).
  // WARNING! You can use any metric but most of the literature is for the squared euclidean distance.
  addOption(DefaultOptionCreator.distanceMeasureOption().create());
  // The default searcher should be something more efficient than BruteSearch (ProjectionSearch, ...). See
  // o.a.m.math.neighborhood.*
  addOption(SEARCHER_CLASS_OPTION, "sc", "The type of searcher to be used when performing nearest "
      + "neighbor searches. Defaults to ProjectionSearch.", ProjectionSearch.class.getCanonicalName());
  // In the original paper, the authors used 1 projection vector.
  addOption(NUM_PROJECTIONS_OPTION, "np", "The number of projections considered in estimating the "
      + "distances between vectors. Only used when the searcher requested is either "
      + "ProjectionSearch or FastProjectionSearch. If no value is given, defaults to 3.", String.valueOf(3));
  // The help text states the same default (2) that is actually registered below.
  addOption(SEARCH_SIZE_OPTION, "s", "In more efficient searches (non BruteSearch), "
      + "not all distances are calculated for determining the nearest neighbors. The number of "
      + "elements whose distances from the query vector is actually computed is proportional to "
      + "searchSize. If no value is given, defaults to 2.", String.valueOf(2));
  addFlag(REDUCE_STREAMING_KMEANS, "rskm", "There might be too many intermediate clusters from the mapper "
      + "to fit into memory, so the reducer can run another pass of StreamingKMeans to collapse them down to a "
      + "fewer clusters");
  addOption(DefaultOptionCreator.methodOption().create());

  if (parseArguments(args) == null) {
    return -1;
  }

  Path output = getOutputPath();
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    HadoopUtil.delete(getConf(), output);
  }

  configureOptionsForWorkers();
  // Propagate the job status instead of unconditionally reporting success: the MapReduce path
  // returns -1 when the job does not complete.
  return run(getConf(), getInputPath(), output);
}
/**
 * Converts the parsed command-line options to typed values and hands them to the static
 * {@code configureOptionsForWorkers} overload, which validates them and writes them into the job
 * configuration.
 *
 * @throws ClassNotFoundException if the distance measure or searcher class cannot be loaded
 */
private void configureOptionsForWorkers() throws ClassNotFoundException {
  log.info("Starting to configure options for workers");

  final String executionMethod = getOption(DefaultOptionCreator.METHOD_OPTION);
  final int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));

  // StreamingKMeans settings.
  final int mapClusters = Integer.parseInt(getOption(ESTIMATED_NUM_MAP_CLUSTERS));
  final float distanceCutoff = Float.parseFloat(getOption(ESTIMATED_DISTANCE_CUTOFF));

  // BallKMeans settings.
  final int iterations = Integer.parseInt(getOption(MAX_NUM_ITERATIONS));
  final float trim = Float.parseFloat(getOption(TRIM_FRACTION));
  final boolean useRandomInit = hasOption(RANDOM_INIT);
  final boolean skipWeights = hasOption(IGNORE_WEIGHTS);
  final float testFraction = Float.parseFloat(getOption(TEST_PROBABILITY));
  final int ballKMeansRuns = Integer.parseInt(getOption(NUM_BALLKMEANS_RUNS));

  // Nearest neighbor search settings.
  final String measureName = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
  final String searcherName = getOption(SEARCHER_CLASS_OPTION);

  // BruteSearch needs no tuning parameters; for every other searcher both the search size and
  // the number of projections are read from the options, otherwise they stay at 0.
  final boolean needsSearchTuning = !searcherName.equals(BruteSearch.class.getName());
  final int candidates = needsSearchTuning ? Integer.parseInt(getOption(SEARCH_SIZE_OPTION)) : 0;
  final int projections = needsSearchTuning ? Integer.parseInt(getOption(NUM_PROJECTIONS_OPTION)) : 0;

  final boolean collapseInReducer = hasOption(REDUCE_STREAMING_KMEANS);

  configureOptionsForWorkers(getConf(), k,
      mapClusters, distanceCutoff,
      iterations, trim, useRandomInit, skipWeights, testFraction, ballKMeansRuns,
      measureName, searcherName, candidates, projections,
      executionMethod,
      collapseInReducer);
}
/**
 * Checks the parameters for a StreamingKMeans job and prepares a Configuration with them.
 * All validation, including loading the distance measure and searcher classes, happens before the first
 * value is written, so {@code conf} is left untouched when a parameter is rejected.
 *
 * @param conf the Configuration to populate
 * @param numClusters k, the number of clusters at the end; must be positive
 * @param estimatedNumMapClusters O(k log n), the number of clusters requested from each mapper; must be greater
 *                                than numClusters
 * @param estimatedDistanceCutoff an estimate of the minimum distance that separates two clusters (can be smaller and
 *                                will be increased dynamically); either positive or
 *                                {@link #INVALID_DISTANCE_CUTOFF}, in which case nothing is stored for it
 * @param maxNumIterations the maximum number of iterations of BallKMeans; must be positive
 * @param trimFraction the fraction of the points to be considered in updating a ball k-means; must be positive
 * @param randomInit whether to initialize the ball k-means seeds randomly
 * @param ignoreWeights whether to ignore the invalid final ball k-means weights
 * @param testProbability the percentage of vectors assigned to the test set for selecting the best final centers;
 *                        must be in [0, 1)
 * @param numBallKMeansRuns the number of BallKMeans runs in the reducer that determine the centroids to return
 *                          (clusters are computed for the training set and the error is computed on the test set);
 *                          must be positive
 * @param measureClass string, name of the distance measure class; theory works for Euclidean-like distances
 * @param searcherClass string, name of the searcher that will be used for nearest neighbor search
 * @param searchSize the number of closest neighbors to look at for selecting the closest one in approximate nearest
 *                   neighbor searches; must be positive unless the searcher name contains "Brute"
 * @param numProjections the number of projected vectors to use for faster searching (only useful for ProjectionSearch
 *                       or FastProjectionSearch); must be positive when the searcher name contains "Projection";
 *                       @see org.apache.mahout.math.neighborhood.ProjectionSearch
 * @param method the execution method, stored under {@link DefaultOptionCreator#METHOD_OPTION};
 *               {@link #run(Configuration, Path, Path)} runs sequentially when it equals
 *               {@link DefaultOptionCreator#SEQUENTIAL_METHOD} and as a MapReduce job otherwise
 * @param reduceStreamingKMeans value stored under {@link #REDUCE_STREAMING_KMEANS}
 * @throws IllegalArgumentException if one of the numeric constraints above is violated
 * @throws ClassNotFoundException if measureClass or searcherClass cannot be loaded
 */
public static void configureOptionsForWorkers(Configuration conf,
                                              int numClusters,
                                              /* StreamingKMeans */
                                              int estimatedNumMapClusters, float estimatedDistanceCutoff,
                                              /* BallKMeans */
                                              int maxNumIterations, float trimFraction, boolean randomInit,
                                              boolean ignoreWeights, float testProbability, int numBallKMeansRuns,
                                              /* Searcher */
                                              String measureClass, String searcherClass,
                                              int searchSize, int numProjections,
                                              String method,
                                              boolean reduceStreamingKMeans) throws ClassNotFoundException {
  // Checking preconditions for the parameters.
  Preconditions.checkArgument(numClusters > 0,
      "Invalid number of clusters requested: " + numClusters + ". Must be: numClusters > 0!");

  // StreamingKMeans
  Preconditions.checkArgument(estimatedNumMapClusters > numClusters, "Invalid number of estimated map "
      + "clusters; There must be more than the final number of clusters (k log n vs k)");
  Preconditions.checkArgument(estimatedDistanceCutoff == INVALID_DISTANCE_CUTOFF || estimatedDistanceCutoff > 0,
      "estimatedDistanceCutoff must be equal to -1 or must be greater then 0!");

  // BallKMeans
  Preconditions.checkArgument(maxNumIterations > 0, "Must have at least one BallKMeans iteration");
  Preconditions.checkArgument(trimFraction > 0, "trimFraction must be positive");
  Preconditions.checkArgument(testProbability >= 0 && testProbability < 1, "test probability is not in the "
      + "interval [0, 1)");
  // Zero runs are rejected as well, so the message says "positive" rather than "cannot be negative".
  Preconditions.checkArgument(numBallKMeansRuns > 0, "numBallKMeansRuns must be positive");

  // Searcher
  if (!searcherClass.contains("Brute")) {
    // These tests only make sense when a relevant searcher is being used.
    Preconditions.checkArgument(searchSize > 0, "Invalid searchSize. Must be positive.");
    if (searcherClass.contains("Projection")) {
      Preconditions.checkArgument(numProjections > 0, "Invalid numProjections. Must be positive");
    }
  }

  // Make sure both classes can be loaded before anything is written, so that a bad class name does
  // not leave conf half populated. Throws ClassNotFoundException otherwise.
  Class.forName(measureClass);
  Class.forName(searcherClass);

  // Setting the parameters in the Configuration.
  conf.setInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, numClusters);

  /* StreamingKMeans */
  conf.setInt(ESTIMATED_NUM_MAP_CLUSTERS, estimatedNumMapClusters);
  // The sentinel means "not supplied": leave the key unset in that case.
  if (estimatedDistanceCutoff != INVALID_DISTANCE_CUTOFF) {
    conf.setFloat(ESTIMATED_DISTANCE_CUTOFF, estimatedDistanceCutoff);
  }

  /* BallKMeans */
  conf.setInt(MAX_NUM_ITERATIONS, maxNumIterations);
  conf.setFloat(TRIM_FRACTION, trimFraction);
  conf.setBoolean(RANDOM_INIT, randomInit);
  conf.setBoolean(IGNORE_WEIGHTS, ignoreWeights);
  conf.setFloat(TEST_PROBABILITY, testProbability);
  conf.setInt(NUM_BALLKMEANS_RUNS, numBallKMeansRuns);

  /* Searcher */
  conf.set(DefaultOptionCreator.DISTANCE_MEASURE_OPTION, measureClass);
  conf.set(SEARCHER_CLASS_OPTION, searcherClass);
  conf.setInt(SEARCH_SIZE_OPTION, searchSize);
  conf.setInt(NUM_PROJECTIONS_OPTION, numProjections);

  conf.set(DefaultOptionCreator.METHOD_OPTION, method);
  conf.setBoolean(REDUCE_STREAMING_KMEANS, reduceStreamingKMeans);

  log.info("Parameters are: [k] numClusters {}; "
      + "[SKM] estimatedNumMapClusters {}; estimatedDistanceCutoff {} "
      + "[BKM] maxNumIterations {}; trimFraction {}; randomInit {}; ignoreWeights {}; "
      + "testProbability {}; numBallKMeansRuns {}; "
      + "[S] measureClass {}; searcherClass {}; searcherSize {}; numProjections {}; "
      + "method {}; reduceStreamingKMeans {}", numClusters, estimatedNumMapClusters, estimatedDistanceCutoff,
      maxNumIterations, trimFraction, randomInit, ignoreWeights, testProbability, numBallKMeansRuns,
      measureClass, searcherClass, searchSize, numProjections, method, reduceStreamingKMeans);
}
/**
 * Runs the clustering with the execution method stored in {@code conf} under
 * {@link DefaultOptionCreator#METHOD_OPTION}: sequentially when it equals
 * {@link DefaultOptionCreator#SEQUENTIAL_METHOD}, as a MapReduce job otherwise (including when the
 * key is absent, because MapReduce is the fallback value).
 *
 * @param conf the configuration holding the job parameters and the execution method
 * @param input the directory pathname for input points.
 * @param output the directory pathname for output points.
 * @return 0 on success, -1 on failure.
 */
public static int run(Configuration conf, Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException, ExecutionException {
  log.info("Starting StreamingKMeans clustering for vectors in {}; results are output to {}",
      input.toString(), output.toString());

  String executionMethod = conf.get(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD);
  boolean sequential = executionMethod.equals(DefaultOptionCreator.SEQUENTIAL_METHOD);
  if (!sequential) {
    return runMapReduce(conf, input, output);
  }
  return runSequentially(conf, input, output);
}
private static int runSequentially(Configuration conf, Path input, Path output)
throws IOException, ExecutionException, InterruptedException {
long start = System.currentTimeMillis();
// Run StreamingKMeans step in parallel by spawning 1 thread per input path to process.
ExecutorService pool = Executors.newCachedThreadPool();
List>> intermediateCentroidFutures = new ArrayList<>();
for (FileStatus status : HadoopUtil.listStatus(FileSystem.get(conf), input, PathFilters.logsCRCFilter())) {
intermediateCentroidFutures.add(pool.submit(new StreamingKMeansThread(status.getPath(), conf)));
}
log.info("Finished running Mappers");
// Merge the resulting "mapper" centroids.
List intermediateCentroids = new ArrayList<>();
for (Future> futureIterable : intermediateCentroidFutures) {
for (Centroid centroid : futureIterable.get()) {
intermediateCentroids.add(centroid);
}
}
pool.shutdown();
pool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
log.info("Finished StreamingKMeans");
SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf, new Path(output, "part-r-00000"), IntWritable.class,
CentroidWritable.class);
int numCentroids = 0;
// Run BallKMeans on the intermediate centroids.
for (Vector finalVector : StreamingKMeansReducer.getBestCentroids(intermediateCentroids, conf)) {
Centroid finalCentroid = (Centroid)finalVector;
writer.append(new IntWritable(numCentroids++), new CentroidWritable(finalCentroid));
}
writer.close();
long end = System.currentTimeMillis();
log.info("Finished BallKMeans. Took {}.", (end - start) / 1000.0);
return 0;
}
/**
 * Runs the clustering as a Hadoop job with {@code StreamingKMeansMapper} and
 * {@code StreamingKMeansReducer}, reading and writing sequence files, and blocks until it finishes.
 *
 * @param conf the configuration holding the clustering parameters
 * @param input the directory pathname for input points
 * @param output the directory pathname for output points
 * @return 0 if the job completed successfully, -1 otherwise
 */
public static int runMapReduce(Configuration conf, Path input, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  // Prepare Job for submission.
  Job clusteringJob = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class,
      StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class, SequenceFileOutputFormat.class,
      conf);
  String jobName = HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), clusteringJob,
      StreamingKMeansMapper.class, StreamingKMeansReducer.class);
  clusteringJob.setJobName(jobName);

  // Ship the JAR containing this driver so that the required libraries are available.
  clusteringJob.setJarByClass(StreamingKMeansDriver.class);
  // A single reducer collects all intermediate centroids on one machine, where they are
  // clustered in memory to get the right number of clusters.
  clusteringJob.setNumReduceTasks(1);

  // Run job!
  long startedAt = System.currentTimeMillis();
  boolean succeeded = clusteringJob.waitForCompletion(true);
  if (succeeded) {
    long finishedAt = System.currentTimeMillis();
    log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms", output.toString(),
        finishedAt - startedAt);
    return 0;
  }
  return -1;
}
/**
 * Private no-argument constructor; within this class the only instance is the one that
 * {@code main} creates and passes to the ToolRunner.
 */
private StreamingKMeansDriver() {
  super();
}
/**
 * Program entry point: creates a driver and lets the ToolRunner run it with the given arguments.
 *
 * @param args the raw command-line arguments
 * @throws Exception whatever the ToolRunner or the driver throws
 */
public static void main(String[] args) throws Exception {
  StreamingKMeansDriver driver = new StreamingKMeansDriver();
  ToolRunner.run(driver, args);
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy