weka.clusterers.Canopy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    Canopy.java
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.AttributeStats;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.NormalizableDistance;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SparseInstance;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 
 * Cluster data using the canopy clustering algorithm, which requires just one pass over the data. Can run in either batch or incremental mode. Results are generally not as good when running incrementally as the min/max for each numeric attribute is not known in advance. Has a heuristic (based on attribute std. deviations), that can be used in batch mode, for setting the T2 distance. The T2 distance determines how many canopies (clusters) are formed. When the user specifies a specific number (N) of clusters to generate, the algorithm will return the top N canopies (as determined by T2 density) when N < number of canopies (this applies to both batch and incremental learning); when N > number of canopies, the difference is made up by selecting training instances randomly (this can only be done when batch training). For more information see:

 * 

 * A. McCallum, K. Nigam, L.H. Ungar: Efficient Clustering of High Dimensional Data Sets with Application to Reference Matching. In: Proceedings of the sixth ACM SIGKDD international conference on knowledge discovery and data mining ACM-SIAM symposium on Discrete algorithms, 169-178, 2000.
 * 
 
 * 
 
 * BibTeX:
 * 
 * @inproceedings{McCallum2000,
 *    author = {A. McCallum and K. Nigam and L.H. Ungar},
 *    booktitle = {Proceedings of the sixth ACM SIGKDD international conference on knowledge discovery and data mining ACM-SIAM symposium on Discrete algorithms},
 *    pages = {169-178},
 *    title = {Efficient Clustering of High Dimensional Data Sets with Application to Reference Matching},
 *    year = {2000}
 * }
 * 
 * 
 
 * 
 
 * Valid options are: 

 * 
 * 
 -N <num>
 *  Number of clusters.
 *  (default 2).
 * 
 *  -max-candidates <num>
 *  Maximum number of candidate canopies to retain in memory
 *  at any one time. T2 distance, plus data characteristics,
 *  will determine how many candidate canopies are formed before
 *  periodic and final pruning are performed, which might result
 *  in excess memory consumption. This setting avoids large numbers
 *  of candidate canopies consuming memory. (default = 100)
 * 
 *  -periodic-pruning <num>
 *  How often to prune low density canopies. 
 *  (default = every 10,000 training instances)
 * 
 *  -min-density
 *  Minimum canopy density, below which a canopy will be pruned
 *  during periodic pruning. (default = 2 instances)
 * 
 *  -t2
 *  The T2 distance to use. Values < 0 indicate that
 *  a heuristic based on attribute std. deviation should be used to set this.
 *  Note that this heuristic can only be used when batch training
 *  (default = -1.0)
 * 
 *  -t1
 *  The T1 distance to use. A value < 0 is taken as a
 *  positive multiplier for T2. (default = -1.25)
 * 
 *  -M
 *  Don't replace missing values with mean/mode when running in batch mode.
 * 
 * 
 *  -S <num>
 *  Random number seed.
 *  (default 1)
 * 
 *  -output-debug-info
 *  If set, clusterer is run in debug mode and
 *  may output additional info to the console
 * 
 *  -do-not-check-capabilities
 *  If set, clusterer capabilities are not checked before clusterer is built
 *  (use with caution).
 * 
 
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 11012 $
 */
public class Canopy extends RandomizableClusterer implements
  UpdateableClusterer, NumberOfClustersRequestable, OptionHandler,
  TechnicalInformationHandler {

  /** For serialization */
  private static final long serialVersionUID = 2067574593448223334L;

  /** The canopy centers */
  protected Instances m_canopies;

  /** The T2 density of each canopy */
  protected List m_canopyT2Density;

  // NOTE(review): the next two fields are undocumented in the original and
  // their element types are not visible here (raw List). From the names they
  // presumably hold per-canopy center data and per-canopy counts of missing
  // values for numeric attributes - confirm against the code that fills them.
  protected List m_canopyCenters;
  protected List m_canopyNumMissingForNumerics;

  /**
   * The list of canopies that each canopy is a member of (according to the T1
   * radius, which can overlap). Each bit position in the long values
   * corresponds to one canopy. Outer list order corresponds to the order of the
   * instances that store the actual canopy centers
   */
  protected List m_clusterCanopies;

  /** Default for the user-specified T2 value (initial value of m_userT2) */
  public static final double DEFAULT_T2 = -1.0;

  /** Default for the user-specified T1 value (initial value of m_userT1) */
  public static final double DEFAULT_T1 = -1.25;

  /**
   * User-specified T2. A value &lt; 0 means use the heuristic based on std.
   * dev. to set the t2 radius
   */
  protected double m_userT2 = DEFAULT_T2;

  /**
   * User-specified T1. A value &lt; 0 indicates the multiplier to use for T2
   * when setting T1, otherwise the value is taken as is
   */
  protected double m_userT1 = DEFAULT_T1;

  /** Outer radius (initialised from the user-specified T1 value) */
  protected double m_t1 = m_userT1;

  /** Inner radius (initialised from the user-specified T2 value) */
  protected double m_t2 = m_userT2;

  /**
   * Prune low-density candidate canopies after every x instances have been seen
   */
  protected int m_periodicPruningRate = 10000;

  /**
   * The minimum cluster density (according to T2 distance) allowed. Used when
   * periodically pruning candidate canopies
   */
  protected double m_minClusterDensity = 2;

  /** The maximum number of candidate canopies to hold in memory at any one time */
  protected int m_maxCanopyCandidates = 100;

  /**
   * True if the pruning operation did remove at least one low density canopy
   * the last time it was invoked
   */
  protected boolean m_didPruneLastTime = true;

  /** Number of training instances seen so far */
  protected int m_instanceCount;

  /**
   * Number of clusters requested by the user. The default (-1) is to let the
   * t2 radius determine how many canopies/clusters are formed
   */
  protected int m_numClustersRequested = -1;

  /**
   * If not null, then this is expected to be a filter that can replace missing
   * values immediately (at training and testing time)
   */
  protected Filter m_missingValuesReplacer;

  /**
   * Whether to skip the global replacement of missing values when running in
   * batch mode. The default (false) means missing values are replaced.
   * NOTE(review): presumably controlled by the -M option - confirm in the
   * option-handling code.
   */
  protected boolean m_dontReplaceMissing = false;

  /** The distance function to use */
  protected NormalizableDistance m_distanceFunction = new EuclideanDistance();

  /**
   * Used to pad out number of cluster centers if fewer canopies are generated
   * than the number of requested clusters and we are running in batch mode.
   */
  protected Instances m_trainingData;

  /**
   * Returns a string describing this clusterer.
   * 
   * @return a description of the clusterer suitable for displaying in the
   *         explorer/experimenter gui, followed by the textual form of the
   *         reference returned by {@link #getTechnicalInformation()}
   */
  public String globalInfo() {
    // Each fragment ends with a space so that the concatenated sentence has
    // proper word breaks (the original ran "either" and "batch" together).
    return "Cluster data using the canopy clustering algorithm, which requires just "
      + "one pass over the data. Can run in either "
      + "batch or incremental mode. Results are generally not as good when "
      + "running incrementally as the min/max for each numeric attribute is not "
      + "known in advance. Has a heuristic (based on attribute std. deviations), "
      + "that can be used in batch mode, for setting the T2 distance. The T2 distance "
      + "determines how many canopies (clusters) are formed. When the user specifies "
      + "a specific number (N) of clusters to generate, the algorithm will return the "
      + "top N canopies (as determined by T2 density) when N < number of canopies "
      + "(this applies to both batch and incremental learning); "
      + "when N > number of canopies, the difference is made up by selecting training "
      + "instances randomly (this can only be done when batch training). For more "
      + "information see:\n\n" + getTechnicalInformation().toString();
  }

  /**
   * Returns the bibliographic details of the paper describing the canopy
   * clustering algorithm (McCallum, Nigam and Ungar, 2000).
   * 
   * @return an INPROCEEDINGS entry holding author, title, book title, year
   *         and page range
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result = new TechnicalInformation(Type.INPROCEEDINGS);

    result.setValue(Field.AUTHOR, "A. McCallum and K. Nigam and L.H. Ungar");
    result.setValue(Field.TITLE,
      "Efficient Clustering of High Dimensional Data Sets with Application to "
        + "Reference Matching");
    // NOTE(review): the trailing "ACM-SIAM symposium on Discrete algorithms"
    // looks like a stray fragment from a different venue - verify against the
    // published citation before removing it.
    result.setValue(Field.BOOKTITLE,
      "Proceedings of the sixth ACM SIGKDD international conference on "
        + "knowledge discovery and data mining "
        + "ACM-SIAM symposium on Discrete algorithms");
    result.setValue(Field.YEAR, "2000");
    result.setValue(Field.PAGES, "169-178");

    return result;
  }

  /**
   * Describes which kinds of data this clusterer accepts: starting from the
   * superclass capabilities, everything is switched off and then only the
   * supported capabilities are switched back on.
   * 
   * @return the capabilities of this clusterer
   */
  @Override
  public Capabilities getCapabilities() {
    final Capabilities caps = super.getCapabilities();
    caps.disableAll();

    // No class attribute is required; nominal and numeric attributes are
    // accepted, and values may be missing. Enabled in this fixed order.
    final Capability[] supported = {
      Capability.NO_CLASS,
      Capability.NOMINAL_ATTRIBUTES,
      Capability.NUMERIC_ATTRIBUTES,
      Capability.MISSING_VALUES
    };
    for (final Capability capability : supported) {
      caps.enable(capability);
    }

    return caps;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration