weka.clusterers.EM Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-stable Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other updates.
There is a newer version: 3.8.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    EM.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Random;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.estimators.DiscreteEstimator;
import weka.estimators.Estimator;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 * 
 * Simple EM (expectation maximisation) class.

 * 

 * EM assigns a probability distribution to each instance which indicates the probability of it belonging to each of the clusters. EM can decide how many clusters to create by cross validation, or you may specify apriori how many clusters to generate.

 * 

 * The cross validation performed to determine the number of clusters is done in the following steps:

 * 1. the number of clusters is set to 1

 * 2. the training set is split randomly into 10 folds.

 * 3. EM is performed 10 times using the 10 folds the usual CV way.

 * 4. the loglikelihood is averaged over all 10 results.

 * 5. if loglikelihood has increased the number of clusters is increased by 1 and the program continues at step 2. 

 * 

 * The number of folds is fixed to 10, as long as the number of instances in the training set is not smaller 10. If this is the case the number of folds is set equal to the number of instances.

 * 

 * Missing values are globally replaced with ReplaceMissingValues.
 * 
 * 
 * 
 * 
 * Valid options are: 

 * 
 * 
 *  -N <num>
 *  number of clusters. If omitted or -1 specified, then 
 *  cross validation is used to select the number of clusters.
 * 
 *  -X <num>
 *  Number of folds to use when cross-validating to find the best number of clusters.
 * 
 *  -K <num>
 *  Number of runs of k-means to perform.
 *  (default 10)
 * 
 *  -max <num>
 *  Maximum number of clusters to consider during cross-validation. If omitted or -1 specified, then 
 *  there is no upper limit on the number of clusters.
 * 
 *  -ll-cv <num>
 *  Minimum improvement in cross-validated log likelihood required
 *  to consider increasing the number of clusters.
 *  (default 1e-6)
 * 
 *  -I <num>
 *  max iterations.
 *  (default 100)
 * 
 *  -ll-iter <num>
 *  Minimum improvement in log likelihood required
 *  to perform another iteration of the E and M steps.
 *  (default 1e-6)
 * 
 *  -V
 *  verbose.
 * 
 *  -M <num>
 *  minimum allowable standard deviation for normal density
 *  computation
 *  (default 1e-6)
 * 
 *  -O
 *  Display model in old format (good when there are many clusters)
 * 
 * 
 *  -num-slots <num>
 *  Number of execution slots.
 *  (default 1 - i.e. no parallelism)
 * 
 *  -S <num>
 *  Random number seed.
 *  (default 100)
 * 
 *  -output-debug-info
 *  If set, clusterer is run in debug mode and
 *  may output additional info to the console
 * 
 *  -do-not-check-capabilities
 *  If set, clusterer capabilities are not checked before clusterer is built
 *  (use with caution).
 * 
 * 
 * 
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 11451 $
 */
public class EM extends RandomizableDensityBasedClusterer implements
NumberOfClustersRequestable, WeightedInstancesHandler {

  /** for serialization */
  static final long serialVersionUID = 8348181483812829475L;

  // NOTE(review): the three *Prev fields below mirror the types of m_model,
  // m_modelNormal and m_priors. Presumably they hold a snapshot of the
  // previous model state -- their usage is not visible in this part of the
  // file, so confirm before relying on this.
  private Estimator m_modelPrev[][];
  private double[][][] m_modelNormalPrev;
  private double[] m_priorsPrev;

  /** hold the discrete estimators for each cluster */
  private Estimator m_model[][];

  /** hold the normal estimators for each cluster */
  private double m_modelNormal[][][];

  /** default minimum standard deviation */
  private double m_minStdDev = 1e-6;

  // NOTE(review): presumably a per-attribute counterpart of m_minStdDev
  // (one entry per attribute) -- TODO confirm against the code that fills it.
  private double[] m_minStdDevPerAtt;

  /** hold the weights of each instance for each cluster */
  private double m_weights[][];

  /** the prior probabilities for clusters */
  private double m_priors[];

  /** full training instances */
  private Instances m_theInstances = null;

  /** number of clusters selected by the user or cross validation */
  private int m_num_clusters;

  /**
   * the initial number of clusters requested by the user--- -1 if xval is to be
   * used to find the number of clusters
   */
  private int m_initialNumClusters;

  /** Don't consider more clusters than this under CV (-1 means no upper bound) */
  private int m_upperBoundNumClustersCV = -1;

  /** number of attributes */
  private int m_num_attribs;

  /** number of training instances */
  private int m_num_instances;

  /** maximum iterations to perform */
  private int m_max_iterations;

  /** attribute min values */
  private double[] m_minValues;

  /** attribute max values */
  private double[] m_maxValues;

  /** random number generator */
  private Random m_rr;

  /** Verbose? */
  private boolean m_verbose;

  /** globally replace missing values */
  private ReplaceMissingValues m_replaceMissing;

  /** display model output in old-style format */
  private boolean m_displayModelInOldFormat;

  /** Number of threads to use for E and M steps */
  protected int m_executionSlots = 1;

  /** For parallel execution mode (transient, so not serialized with the model) */
  protected transient ExecutorService m_executorPool;

  /** False once training has completed */
  protected boolean m_training;

  /** The actual number of iterations performed */
  protected int m_iterationsPerformed;

  /** Minimum improvement in log likelihood when iterating */
  protected double m_minLogLikelihoodImprovementIterating = 1e-6;

  /** Minimum improvement to increase number of clusters when cross-validating */
  protected double m_minLogLikelihoodImprovementCV = 1e-6;

  /** The number of folds to use for cross-validation */
  protected int m_cvFolds = 10;

  /** The number of runs of k-means to perform */
  protected int m_NumKMeansRuns = 10;

  /**
   * Returns a string describing this clusterer
   * 
   * @return a description of the evaluator suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    // Assemble the description one paragraph at a time; paragraphs are
    // separated by a blank line ("\n\n").
    final StringBuilder description = new StringBuilder();

    // Headline.
    description.append("Simple EM (expectation maximisation) class.\n\n");

    // What EM produces and how the number of clusters can be chosen.
    description
      .append("EM assigns a probability distribution to each instance which ")
      .append("indicates the probability of it belonging to each of the clusters. ")
      .append("EM can decide how many clusters to create by cross validation, or you ")
      .append("may specify apriori how many clusters to generate.\n\n");

    // The numbered cross-validation procedure, one step per line.
    description
      .append("The cross validation performed to determine the number of clusters ")
      .append("is done in the following steps:\n")
      .append("1. the number of clusters is set to 1\n")
      .append("2. the training set is split randomly into 10 folds.\n")
      .append("3. EM is performed 10 times using the 10 folds the usual CV way.\n")
      .append("4. the loglikelihood is averaged over all 10 results.\n")
      .append("5. if loglikelihood has increased the number of clusters is increased ")
      .append("by 1 and the program continues at step 2. \n\n");

    // Fold-count rule for small training sets.
    description
      .append("The number of folds is fixed to 10, as long as the number of ")
      .append("instances in the training set is not smaller 10. If this is the case ")
      .append("the number of folds is set equal to the number of instances.\n\n");

    // Missing-value handling (last paragraph, no trailing newline).
    description
      .append("Missing values are globally replaced with ReplaceMissingValues.");

    return description.toString();
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration
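  /**
   * Main method: runs this clusterer from the command line.
   * 
   * @param argv should contain the following arguments:
   *          -t training file [-T test file] [-N number of clusters] [-S random
   *          seed]
   */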
  public static void main(String[] argv) {
    // Create a fresh EM instance and pass it, together with the untouched
    // command-line arguments, to runClusterer.
    final EM clusterer = new EM();
    runClusterer(clusterer, argv);
  }
}