// pairsICSE15.1000021.CDbwEvaluator.1000021_CDbwEvaluator_s Maven / Gradle / Ivy
// The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.cdbw;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.SquareRootFunction;
public class CDbwEvaluator {
private final Map> representativePoints;
private final Map stDevs = new HashMap();
private final Map clusters;
private final DistanceMeasure measure;
/**
* For testing only
*
* @param representativePoints
* a Map> of representative points keyed by clusterId
* @param clusters
* a Map of the clusters keyed by clusterId
* @param measure
* an appropriate DistanceMeasure
*/
public CDbwEvaluator(Map> representativePoints,
Map clusters,
DistanceMeasure measure) {
this.representativePoints = representativePoints;
this.clusters = clusters;
this.measure = measure;
for (Integer cId : representativePoints.keySet()) {
setStDev(cId);
}
}
/**
* Initialize a new instance from job information
*
* @param conf
* a JobConf with appropriate parameters
* @param clustersIn
* a String path to the input clusters directory
*/
public CDbwEvaluator(Configuration conf, Path clustersIn)
throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException {
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
measure = ccl.loadClass(conf.get(CDbwDriver.DISTANCE_MEASURE_KEY))
.asSubclass(DistanceMeasure.class).newInstance();
representativePoints = CDbwMapper.getRepresentativePoints(conf);
clusters = loadClusters(conf, clustersIn);
for (Integer cId : representativePoints.keySet()) {
setStDev(cId);
}
}
public double getCDbw() {
return intraClusterDensity() * separation();
}
public double intraClusterDensity() {
double avgStd = 0.0;
for (Integer cId : representativePoints.keySet()) {
avgStd += stDevs.get(cId);
}
avgStd /= representativePoints.size();
double sum = 0.0;
for (Map.Entry> entry : representativePoints.entrySet()) {
Integer cId = entry.getKey();
List repI = entry.getValue();
double cSum = 0.0;
for (VectorWritable aRepI : repI) {
double inDensity = intraDensity(clusters.get(cId).getCenter(), aRepI.get(), avgStd);
double std = stDevs.get(cId);
if (std > 0.0) {
cSum += inDensity / std;
}
}
sum += cSum / repI.size();
}
return sum / representativePoints.size();
}
public double interClusterDensity() {
double sum = 0.0;
for (Map.Entry> entry1 : representativePoints.entrySet()) {
Integer cI = entry1.getKey();
List repI = entry1.getValue();
double stDevI = stDevs.get(cI);
for (Map.Entry> entry2 : representativePoints.entrySet()) {
Integer cJ = entry2.getKey();
if (cI.equals(cJ)) {
continue;
}
List repJ = entry2.getValue();
double minDistance = Double.MAX_VALUE;
Vector uIJ = null;
for (VectorWritable aRepI : repI) {
for (VectorWritable aRepJ : repJ) {
Vector vI = aRepI.get();
Vector vJ = aRepJ.get();
double distance = measure.distance(vI, vJ);
if (distance < minDistance) {
minDistance = distance;
uIJ = vI.plus(vJ).divide(2);
}
}
}
double stDevJ = stDevs.get(cJ);
double interDensity = interDensity(uIJ, cI, cJ);
double stdSum = stDevI + stDevJ;
double density = 0.0;
if (stdSum > 0.0) {
density = minDistance * interDensity / stdSum;
}
// Use a logger
//if (false) {
// System.out.println("minDistance[" + cI + "," + cJ + "]=" + minDistance);
// System.out.println("stDev[" + cI + "]=" + stDevI);
// System.out.println("stDev[" + cJ + "]=" + stDevJ);
// System.out.println("interDensity[" + cI + "," + cJ + "]=" + interDensity);
// System.out.println("density[" + cI + "," + cJ + "]=" + density);
// System.out.println();
//}
sum += density;
}
}
//System.out.println("interClusterDensity=" + sum);
return sum;
}
public double separation() {
double minDistance = Double.MAX_VALUE;
for (Map.Entry> entry1 : representativePoints.entrySet()) {
Integer cI = entry1.getKey();
List repI = entry1.getValue();
for (Map.Entry> entry2 : representativePoints.entrySet()) {
if (cI.equals(entry2.getKey())) {
continue;
}
List repJ = entry2.getValue();
for (VectorWritable aRepI : repI) {
for (VectorWritable aRepJ : repJ) {
double distance = measure.distance(aRepI.get(), aRepJ.get());
if (distance < minDistance) {
minDistance = distance;
}
}
}
}
}
return minDistance / (1.0 + interClusterDensity());
}
/**
* Load the clusters from their sequence files
*
* @param clustersIn
* a String pathname to the directory containing input cluster files
* @return a List of the clusters
*/
private static Map loadClusters(Configuration conf, Path clustersIn)
throws InstantiationException, IllegalAccessException, IOException {
Map clusters = new HashMap();
FileSystem fs = clustersIn.getFileSystem(conf);
for (FileStatus part : fs.listStatus(clustersIn)) {
if (!part.getPath().getName().startsWith(".")) {
Path inPart = part.getPath();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPart, conf);
Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
while (reader.next(key, value)) {
Cluster cluster = (Cluster) value;
clusters.put(cluster.getId(), cluster);
value = reader.getValueClass().asSubclass(Writable.class).newInstance();
}
reader.close();
}
}
return clusters;
}
double interDensity(Vector uIJ, int cI, int cJ) {
List repI = representativePoints.get(cI);
List repJ = representativePoints.get(cJ);
double density = 0.0;
double std = (stDevs.get(cI) + stDevs.get(cJ)) / 2.0;
for (VectorWritable vwI : repI) {
if (measure.distance(uIJ, vwI.get()) <= std) {
density++;
}
}
for (VectorWritable vwJ : repJ) {
if (measure.distance(uIJ, vwJ.get()) <= std) {
density++;
}
}
return density / (repI.size() + repJ.size());
}
private void setStDev(int cI) {
List repPts = representativePoints.get(cI);
//if (repPts == null) {
// System.out.println();
//}
int s0 = 0;
Vector s1 = null;
Vector s2 = null;
for (VectorWritable vw : repPts) {
s0++;
Vector v = vw.get();
s1 = s1 == null ? v.clone() : s1.plus(v);
s2 = s2 == null ? v.times(v) : s2.plus(v.times(v));
}
Vector std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
double d = std.zSum() / std.size();
//System.out.println("stDev[" + cI + "]=" + d);
stDevs.put(cI, d);
}
/*
double minRpDistance(Iterable repI, Iterable repJ) {
double minDistance = Double.MAX_VALUE;
for (VectorWritable aRepI : repI) {
for (VectorWritable aRepJ : repJ) {
double distance = measure.distance(aRepI.get(), aRepJ.get());
if (distance < minDistance) {
minDistance = distance;
}
}
}
return minDistance;
}
*/
double intraDensity(Vector clusterCenter, Vector repPoint, double avgStd) {
return measure.distance(clusterCenter, repPoint) <= avgStd ? 1.0 : 0.0;
}
}