org.apache.mahout.clustering.evaluation.ClusterEvaluator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mahout-integration Show documentation
Show all versions of mahout-integration Show documentation
Optional components of Mahout which generally support interaction with third party systems,
formats, APIs, etc.
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.evaluation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class ClusterEvaluator {
private static final Logger log = LoggerFactory.getLogger(ClusterEvaluator.class);
private final Map> representativePoints;
private final List clusters;
private final DistanceMeasure measure;
/**
* For testing only
*
* @param representativePoints
* a Map> of representative points keyed by clusterId
* @param clusters
* a Map of the clusters keyed by clusterId
* @param measure
* an appropriate DistanceMeasure
*/
public ClusterEvaluator(Map> representativePoints, List clusters,
DistanceMeasure measure) {
this.representativePoints = representativePoints;
this.clusters = clusters;
this.measure = measure;
}
/**
* Initialize a new instance from job information
*
* @param conf
* a Configuration with appropriate parameters
* @param clustersIn
* a String path to the input clusters directory
*/
public ClusterEvaluator(Configuration conf, Path clustersIn) {
measure = ClassUtils
.instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
clusters = loadClusters(conf, clustersIn);
}
/**
* Load the clusters from their sequence files
*
* @param clustersIn
* a String pathname to the directory containing input cluster files
* @return a List of the clusters
*/
private static List loadClusters(Configuration conf, Path clustersIn) {
List clusters = new ArrayList<>();
for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable(clustersIn, PathType.LIST,
PathFilters.logsCRCFilter(), conf)) {
Cluster cluster = clusterWritable.getValue();
clusters.add(cluster);
}
return clusters;
}
/**
* Computes the inter-cluster density as defined in "Mahout In Action"
*
* @return the interClusterDensity
*/
public double interClusterDensity() {
double max = Double.NEGATIVE_INFINITY;
double min = Double.POSITIVE_INFINITY;
double sum = 0;
int count = 0;
Map distances = interClusterDistances();
for (Vector row : distances.values()) {
for (Element element : row.nonZeroes()) {
double d = element.get();
min = Math.min(d, min);
max = Math.max(d, max);
sum += d;
count++;
}
}
double density = (sum / count - min) / (max - min);
log.info("Scaled Inter-Cluster Density = {}", density);
return density;
}
/**
* Computes the inter-cluster distances
*
* @return a Map
*/
public Map interClusterDistances() {
Map distances = new TreeMap<>();
for (int i = 0; i < clusters.size(); i++) {
Cluster clusterI = clusters.get(i);
RandomAccessSparseVector row = new RandomAccessSparseVector(Integer.MAX_VALUE);
distances.put(clusterI.getId(), row);
for (int j = i + 1; j < clusters.size(); j++) {
Cluster clusterJ = clusters.get(j);
double d = measure.distance(clusterI.getCenter(), clusterJ.getCenter());
row.set(clusterJ.getId(), d);
}
}
return distances;
}
/**
* Computes the average intra-cluster density as the average of each cluster's intra-cluster density
*
* @return the average intraClusterDensity
*/
public double intraClusterDensity() {
double avgDensity = 0;
int count = 0;
for (Element elem : intraClusterDensities().nonZeroes()) {
double value = elem.get();
if (!Double.isNaN(value)) {
avgDensity += value;
count++;
}
}
avgDensity = clusters.isEmpty() ? 0 : avgDensity / count;
log.info("Average Intra-Cluster Density = {}", avgDensity);
return avgDensity;
}
/**
* Computes the intra-cluster densities for all clusters as the average distance of the representative points from
* each other
*
* @return a Vector of the intraClusterDensity of the representativePoints by clusterId
*/
public Vector intraClusterDensities() {
Vector densities = new RandomAccessSparseVector(Integer.MAX_VALUE);
for (Cluster cluster : clusters) {
int count = 0;
double max = Double.NEGATIVE_INFINITY;
double min = Double.POSITIVE_INFINITY;
double sum = 0;
List repPoints = representativePoints.get(cluster.getId());
for (int i = 0; i < repPoints.size(); i++) {
for (int j = i + 1; j < repPoints.size(); j++) {
Vector v1 = repPoints.get(i).get();
Vector v2 = repPoints.get(j).get();
double d = measure.distance(v1, v2);
min = Math.min(d, min);
max = Math.max(d, max);
sum += d;
count++;
}
}
double density = (sum / count - min) / (max - min);
densities.set(cluster.getId(), density);
log.info("Intra-Cluster Density[{}] = {}", cluster.getId(), density);
}
return densities;
}
}