// Source listing of org.apache.mahout.clustering.cdbw.CDbwEvaluator, taken from the
// mahout-integration artifact (newest published version at the time of extraction).
// mahout-integration: optional components of Mahout which generally support interaction
// with third-party systems, formats, APIs, etc.
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.cdbw;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.GaussianAccumulator;
import org.apache.mahout.clustering.OnlineGaussianAccumulator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * This class calculates the CDbw metric as defined in
 * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf
 */
public final class CDbwEvaluator {
  
  private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class);
  
  private final Map> representativePoints;
  private final Map stDevs = new HashMap<>();
  private final List clusters;
  private final DistanceMeasure measure;
  private Double interClusterDensity = null;
  // these are symmetric so we only compute half of them
  private Map> minimumDistances = null;
  // these are symmetric too
  private Map> interClusterDensities = null;
  // these are symmetric too
  private Map> closestRepPointIndices = null;
  
  /**
   * For testing only
   * 
   * @param representativePoints
   *          a Map> of representative points keyed by clusterId
   * @param clusters
   *          a Map of the clusters keyed by clusterId
   * @param measure
   *          an appropriate DistanceMeasure
   */
  public CDbwEvaluator(Map> representativePoints, List clusters,
      DistanceMeasure measure) {
    this.representativePoints = representativePoints;
    this.clusters = clusters;
    this.measure = measure;
    for (Integer cId : representativePoints.keySet()) {
      computeStd(cId);
    }
  }
  
  /**
   * Initialize a new instance from job information
   * 
   * @param conf
   *          a Configuration with appropriate parameters
   * @param clustersIn
   *          a String path to the input clusters directory
   */
  public CDbwEvaluator(Configuration conf, Path clustersIn) {
    measure = ClassUtils
        .instantiateAs(conf.get(RepresentativePointsDriver.DISTANCE_MEASURE_KEY), DistanceMeasure.class);
    representativePoints = RepresentativePointsMapper.getRepresentativePoints(conf);
    clusters = loadClusters(conf, clustersIn);
    for (Integer cId : representativePoints.keySet()) {
      computeStd(cId);
    }
  }
  
  /**
   * Load the clusters from their sequence files
   * 
   * @param clustersIn
   *          a String pathname to the directory containing input cluster files
   * @return a List of the clusters
   */
  private static List loadClusters(Configuration conf, Path clustersIn) {
    List clusters = new ArrayList<>();
    for (ClusterWritable clusterWritable : new SequenceFileDirValueIterable(clustersIn, PathType.LIST,
        PathFilters.logsCRCFilter(), conf)) {
      Cluster cluster = clusterWritable.getValue();
      clusters.add(cluster);
    }
    return clusters;
  }
  
  /**
   * Compute the standard deviation of the representative points for the given cluster. Store these in stDevs, indexed
   * by cI
   * 
   * @param cI
   *          a int clusterId.
   */
  private void computeStd(int cI) {
    List repPts = representativePoints.get(cI);
    GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
    for (VectorWritable vw : repPts) {
      accumulator.observe(vw.get(), 1.0);
    }
    accumulator.compute();
    double d = accumulator.getAverageStd();
    stDevs.put(cI, d);
  }
  
  /**
   * Compute the density of points near the midpoint between the two closest points of the clusters (eqn 2) used for
   * inter-cluster density calculation
   * 
   * @param uIJ
   *          the Vector midpoint between the closest representative points of the clusters
   * @param cI
   *          the int clusterId of the i-th cluster
   * @param cJ
   *          the int clusterId of the j-th cluster
   * @param avgStd
   *          the double average standard deviation of the two clusters
   * @return a double
   */
  private double density(Vector uIJ, int cI, int cJ, double avgStd) {
    List repI = representativePoints.get(cI);
    List repJ = representativePoints.get(cJ);
    double sum = 0.0;
    // count the number of representative points of the clusters which are within the
    // average std of the two clusters from the midpoint uIJ (eqn 3)
    for (VectorWritable vwI : repI) {
      if (uIJ != null && measure.distance(uIJ, vwI.get()) <= avgStd) {
        sum++;
      }
    }
    for (VectorWritable vwJ : repJ) {
      if (uIJ != null && measure.distance(uIJ, vwJ.get()) <= avgStd) {
        sum++;
      }
    }
    int nI = repI.size();
    int nJ = repJ.size();
    return sum / (nI + nJ);
  }
  
  /**
   * Compute the CDbw validity metric (eqn 8). The goal of this metric is to reward clusterings which have a high
   * intraClusterDensity and also a high cluster separation.
   * 
   * @return a double
   */
  public double getCDbw() {
    return intraClusterDensity() * separation();
  }
  
  /**
   * The average density within clusters is defined as the percentage of representative points that reside in the
   * neighborhood of the clusters' centers. The goal is the density within clusters to be significantly high. (eqn 5)
   * 
   * @return a double
   */
  public double intraClusterDensity() {
    double avgDensity = 0;
    int count = 0;
    for (Element elem : intraClusterDensities().nonZeroes()) {
      double value = elem.get();
      if (!Double.isNaN(value)) {
        avgDensity += value;
        count++;
      }
    }
    return avgDensity / count;
  }
  
  /**
   * This function evaluates the density of points in the regions between each clusters (eqn 1). The goal is the density
   * in the area between clusters to be significant low.
   * 
   * @return a Map> of the inter-cluster densities
   */
  public Map> interClusterDensities() {
    if (interClusterDensities != null) {
      return interClusterDensities;
    }
    interClusterDensities = new TreeMap<>();
    // find the closest representative points between the clusters
    for (int i = 0; i < clusters.size(); i++) {
      int cI = clusters.get(i).getId();
      Map map = new TreeMap<>();
      interClusterDensities.put(cI, map);
      for (int j = i + 1; j < clusters.size(); j++) {
        int cJ = clusters.get(j).getId();
        double minDistance = minimumDistance(cI, cJ); // the distance between the closest representative points
        Vector uIJ = midpointVector(cI, cJ); // the midpoint between the closest representative points
        double stdSum = stDevs.get(cI) + stDevs.get(cJ);
        double density = density(uIJ, cI, cJ, stdSum / 2);
        double interDensity = minDistance * density / stdSum;
        map.put(cJ, interDensity);
        if (log.isDebugEnabled()) {
          log.debug("minDistance[{},{}]={}", cI, cJ, minDistance);
          log.debug("interDensity[{},{}]={}", cI, cJ, density);
          log.debug("density[{},{}]={}", cI, cJ, interDensity);
        }
      }
    }
    return interClusterDensities;
  }
  
  /**
   * Calculate the separation of clusters (eqn 4) taking into account both the distances between the clusters' closest
   * points and the Inter-cluster density. The goal is the distances between clusters to be high while the
   * representative point density in the areas between them are low.
   * 
   * @return a double
   */
  public double separation() {
    double minDistanceSum = 0;
    Map> distances = minimumDistances();
    for (Map map : distances.values()) {
      for (Double dist : map.values()) {
        if (!Double.isInfinite(dist)) {
          minDistanceSum += dist * 2; // account for other half of calculated triangular minimumDistances matrix
        }
      }
    }
    return minDistanceSum / (1.0 + interClusterDensity());
  }
  
  /**
   * This function evaluates the average density of points in the regions between clusters (eqn 1). The goal is the
   * density in the area between clusters to be significant low.
   * 
   * @return a double
   */
  public double interClusterDensity() {
    if (interClusterDensity != null) {
      return interClusterDensity;
    }
    double sum = 0.0;
    int count = 0;
    Map> distances = interClusterDensities();
    for (Map row : distances.values()) {
      for (Double density : row.values()) {
        if (!Double.isNaN(density)) {
          sum += density;
          count++;
        }
      }
    }
    log.debug("interClusterDensity={}", sum);
    interClusterDensity = sum / count;
    return interClusterDensity;
  }
  
  /**
   * The average density within clusters is defined as the percentage of representative points that reside in the
   * neighborhood of the clusters' centers. The goal is the density within clusters to be significantly high. (eqn 5)
   * 
   * @return a Vector of the intra-densities of each clusterId
   */
  public Vector intraClusterDensities() {
    Vector densities = new RandomAccessSparseVector(Integer.MAX_VALUE);
    // compute the average standard deviation of the clusters
    double stdev = 0.0;
    for (Integer cI : representativePoints.keySet()) {
      stdev += stDevs.get(cI);
    }
    int c = representativePoints.size();
    stdev /= c;
    for (Cluster cluster : clusters) {
      Integer cI = cluster.getId();
      List repPtsI = representativePoints.get(cI);
      int r = repPtsI.size();
      double sumJ = 0.0;
      // compute the term density (eqn 6)
      for (VectorWritable pt : repPtsI) {
        // compute f(x, vIJ) (eqn 7)
        Vector repJ = pt.get();
        double densityIJ = measure.distance(cluster.getCenter(), repJ) <= stdev ? 1.0 : 0.0;
        // accumulate sumJ
        sumJ += densityIJ / stdev;
      }
      densities.set(cI, sumJ / r);
    }
    return densities;
  }
  
  /**
   * Calculate and cache the distances between the clusters' closest representative points. Also cache the indices of
   * the closest representative points used for later use
   * 
   * @return a Map of the closest distances, keyed by clusterId
   */
  private Map> minimumDistances() {
    if (minimumDistances != null) {
      return minimumDistances;
    }
    minimumDistances = new TreeMap<>();
    closestRepPointIndices = new TreeMap<>();
    for (int i = 0; i < clusters.size(); i++) {
      Integer cI = clusters.get(i).getId();
      Map map = new TreeMap<>();
      Map treeMap = new TreeMap<>();
      closestRepPointIndices.put(cI, treeMap);
      minimumDistances.put(cI, map);
      List closRepI = representativePoints.get(cI);
      for (int j = i + 1; j < clusters.size(); j++) {
        // find min{d(closRepI, closRepJ)}
        Integer cJ = clusters.get(j).getId();
        List closRepJ = representativePoints.get(cJ);
        double minDistance = Double.MAX_VALUE;
        int[] midPointIndices = null;
        for (int xI = 0; xI < closRepI.size(); xI++) {
          VectorWritable aRepI = closRepI.get(xI);
          for (int xJ = 0; xJ < closRepJ.size(); xJ++) {
            VectorWritable aRepJ = closRepJ.get(xJ);
            double distance = measure.distance(aRepI.get(), aRepJ.get());
            if (distance < minDistance) {
              minDistance = distance;
              midPointIndices = new int[] {xI, xJ};
            }
          }
        }
        map.put(cJ, minDistance);
        treeMap.put(cJ, midPointIndices);
      }
    }
    return minimumDistances;
  }
  
  private double minimumDistance(int cI, int cJ) {
    Map distances = minimumDistances().get(cI);
    if (distances != null) {
      return distances.get(cJ);
    } else {
      return minimumDistances().get(cJ).get(cI);
    }
  }
  
  private Vector midpointVector(int cI, int cJ) {
    Map distances = minimumDistances().get(cI);
    if (distances != null) {
      int[] ks = closestRepPointIndices.get(cI).get(cJ);
      if (ks == null) {
        return null;
      }
      return representativePoints.get(cI).get(ks[0]).get().plus(representativePoints.get(cJ).get(ks[1]).get())
          .divide(2);
    } else {
      int[] ks = closestRepPointIndices.get(cJ).get(cI);
      if (ks == null) {
        return null;
      }
      return representativePoints.get(cJ).get(ks[1]).get().plus(representativePoints.get(cI).get(ks[0]).get())
          .divide(2);
    }
    
  }
}