All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.mahout.clustering.canopy.CanopyClusterer Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.canopy;

import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;

@Deprecated
public class CanopyClusterer {

  private static final Logger log = LoggerFactory.getLogger(CanopyClusterer.class);

  private int nextCanopyId;

  // the T1 distance threshold
  private double t1;

  // the T2 distance threshold
  private double t2;

  // the T3 distance threshold
  private double t3;

  // the T4 distance threshold
  private double t4;

  // the distance measure
  private DistanceMeasure measure;

  public CanopyClusterer(DistanceMeasure measure, double t1, double t2) {
    this.t1 = t1;
    this.t2 = t2;
    this.t3 = t1;
    this.t4 = t2;
    this.measure = measure;
  }

  public double getT1() {
    return t1;
  }

  public double getT2() {
    return t2;
  }

  public double getT3() {
    return t3;
  }

  public double getT4() {
    return t4;
  }

  /**
   * Used by CanopyReducer to set t1=t3 and t2=t4 configuration values
   */
  public void useT3T4() {
    t1 = t3;
    t2 = t4;
  }

  /**
   * This is the same algorithm as the reference but inverted to iterate over
   * existing canopies instead of the points. Because of this it does not need
   * to actually store the points, instead storing a total points vector and
   * the number of points. From this a centroid can be computed.
   * 

* This method is used by the CanopyMapper, CanopyReducer and CanopyDriver. * * @param point * the point to be added * @param canopies * the List to be appended */ public void addPointToCanopies(Vector point, Collection canopies) { boolean pointStronglyBound = false; for (Canopy canopy : canopies) { double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point); if (dist < t1) { if (log.isDebugEnabled()) { log.debug("Added point: {} to canopy: {}", AbstractCluster.formatVector(point, null), canopy.getIdentifier()); } canopy.observe(point); } pointStronglyBound = pointStronglyBound || dist < t2; } if (!pointStronglyBound) { if (log.isDebugEnabled()) { log.debug("Created new Canopy:{} at center:{}", nextCanopyId, AbstractCluster.formatVector(point, null)); } canopies.add(new Canopy(point, nextCanopyId++, measure)); } } /** * Return if the point is covered by the canopy * * @param point * a point * @return if the point is covered */ public boolean canopyCovers(Canopy canopy, Vector point) { return measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point) < t1; } /** * Iterate through the points, adding new canopies. Return the canopies. * * @param points * a list defining the points to be clustered * @param measure * a DistanceMeasure to use * @param t1 * the T1 distance threshold * @param t2 * the T2 distance threshold * @return the List created */ public static List createCanopies(List points, DistanceMeasure measure, double t1, double t2) { List canopies = Lists.newArrayList(); /** * Reference Implementation: Given a distance metric, one can create * canopies as follows: Start with a list of the data points in any * order, and with two distance thresholds, T1 and T2, where T1 > T2. * (These thresholds can be set by the user, or selected by * cross-validation.) Pick a point on the list and measure its distance * to all other points. Put all points that are within distance * threshold T1 into a canopy. 
Remove from the list all points that are * within distance threshold T2. Repeat until the list is empty. */ int nextCanopyId = 0; while (!points.isEmpty()) { Iterator ptIter = points.iterator(); Vector p1 = ptIter.next(); ptIter.remove(); Canopy canopy = new Canopy(p1, nextCanopyId++, measure); canopies.add(canopy); while (ptIter.hasNext()) { Vector p2 = ptIter.next(); double dist = measure.distance(p1, p2); // Put all points that are within distance threshold T1 into the // canopy if (dist < t1) { canopy.observe(p2); } // Remove from the list all points that are within distance // threshold T2 if (dist < t2) { ptIter.remove(); } } for (Canopy c : canopies) { c.computeParameters(); } } return canopies; } /** * Iterate through the canopies, adding their centroids to a list * * @param canopies * a List * @return the List */ public static List getCenters(Iterable canopies) { List result = Lists.newArrayList(); for (Canopy canopy : canopies) { result.add(canopy.getCenter()); } return result; } /** * Iterate through the canopies, resetting their center to their centroids * * @param canopies * a List */ public static void updateCentroids(Iterable canopies) { for (Canopy canopy : canopies) { canopy.computeParameters(); } } public void setT3(double t3) { this.t3 = t3; } public void setT4(double t4) { this.t4 = t4; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy