weka.clusterers.HierarchicalClusterer
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* HierarchicalClusterer.java
* Copyright (C) 2009-2012 University of Waikato, Hamilton, New Zealand
*/
package weka.clusterers;
import java.io.Serializable;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Locale;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.PriorityQueue;
import java.util.Vector;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DistanceFunction;
import weka.core.Drawable;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
/**
* Hierarchical clustering class. Implements a number
* of classic hierarchical clustering methods.
*
* Valid options are:
*
* <pre>
* -N
*  Number of clusters.
*
* -L [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING]
*  Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted
*  complete, Neighbor Joining).
*
* -A
*  Distance function to use. (default: weka.core.EuclideanDistance)
*
* -P
*  Print hierarchy in Newick format, which can be used for display in other
*  programs.
*
* -D
*  If set, the clusterer is run in debug mode and may output additional info
*  to the console.
*
* -B
*  If set, distance is interpreted as branch length; otherwise it is node
*  height.
* </pre>
*
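* Example usage (a minimal sketch; "data.arff" is a placeholder file name):
*
* <pre>
* Instances data = new Instances(new java.io.BufferedReader(
*     new java.io.FileReader("data.arff")));
* HierarchicalClusterer hc = new HierarchicalClusterer();
* hc.setNumClusters(3);
* hc.setDistanceFunction(new EuclideanDistance());
* hc.buildClusterer(data);
* int cluster = hc.clusterInstance(data.instance(0));
* </pre>
*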
* @author Remco Bouckaert ([email protected], [email protected])
* @author Eibe Frank ([email protected])
* @version $Revision: 15519 $
*/
public class HierarchicalClusterer extends AbstractClusterer implements
OptionHandler, Drawable {
private static final long serialVersionUID = 1L;
/**
* Whether the distance represents node height (if false) or branch length (if
* true).
*/
protected boolean m_bDistanceIsBranchLength = false;
/** training data **/
Instances m_instances;
/** number of clusters desired in clustering **/
int m_nNumClusters = 2;
public void setNumClusters(int nClusters) {
m_nNumClusters = Math.max(1, nClusters);
}
public int getNumClusters() {
return m_nNumClusters;
}
/** distance function used for comparing members of a cluster **/
protected DistanceFunction m_DistanceFunction = new EuclideanDistance();
public DistanceFunction getDistanceFunction() {
return m_DistanceFunction;
}
public void setDistanceFunction(DistanceFunction distanceFunction) {
m_DistanceFunction = distanceFunction;
}
/**
* tuple used by the priority queue for efficient retrieval of the next pair
* of clusters to merge
**/
class Tuple {
public Tuple(double d, int i, int j, int nSize1, int nSize2) {
m_fDist = d;
m_iCluster1 = i;
m_iCluster2 = j;
m_nClusterSize1 = nSize1;
m_nClusterSize2 = nSize2;
}
double m_fDist;
int m_iCluster1;
int m_iCluster2;
int m_nClusterSize1;
int m_nClusterSize2;
}
/** comparator used by the priority queue; orders tuples by increasing distance so the closest pair of clusters is retrieved first **/
class TupleComparator implements Comparator<Tuple> {
@Override
public int compare(Tuple o1, Tuple o2) {
if (o1.m_fDist < o2.m_fDist) {
return -1;
} else if (o1.m_fDist == o2.m_fDist) {
return 0;
}
return 1;
}
}
/** the various link types */
final static int SINGLE = 0;
final static int COMPLETE = 1;
final static int AVERAGE = 2;
final static int MEAN = 3;
final static int CENTROID = 4;
final static int WARD = 5;
final static int ADJCOMPLETE = 6;
final static int NEIGHBOR_JOINING = 7;
public static final Tag[] TAGS_LINK_TYPE = { new Tag(SINGLE, "SINGLE"),
new Tag(COMPLETE, "COMPLETE"), new Tag(AVERAGE, "AVERAGE"),
new Tag(MEAN, "MEAN"), new Tag(CENTROID, "CENTROID"),
new Tag(WARD, "WARD"), new Tag(ADJCOMPLETE, "ADJCOMPLETE"),
new Tag(NEIGHBOR_JOINING, "NEIGHBOR_JOINING") };
/**
* Holds the link type used to calculate the distance between clusters
*/
int m_nLinkType = SINGLE;
boolean m_bPrintNewick = true;
public boolean getPrintNewick() {
return m_bPrintNewick;
}
public void setPrintNewick(boolean bPrintNewick) {
m_bPrintNewick = bPrintNewick;
}
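/**
 * sets the link type used to measure the distance between clusters; the
 * selected tag must come from TAGS_LINK_TYPE, e.g.
 * setLinkType(new SelectedTag(WARD, TAGS_LINK_TYPE))
 **/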
public void setLinkType(SelectedTag newLinkType) {
if (newLinkType.getTags() == TAGS_LINK_TYPE) {
m_nLinkType = newLinkType.getSelectedTag().getID();
}
}
public SelectedTag getLinkType() {
return new SelectedTag(m_nLinkType, TAGS_LINK_TYPE);
}
/** class representing node in cluster hierarchy **/
class Node implements Serializable {
/** ID added to avoid warning */
private static final long serialVersionUID = 7639483515789717908L;
Node m_left;
Node m_right;
Node m_parent;
int m_iLeftInstance;
int m_iRightInstance;
double m_fLeftLength = 0;
double m_fRightLength = 0;
double m_fHeight = 0;
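/**
 * writes the subtree rooted at this node in Newick format, using the given
 * (string) attribute as leaf label, e.g. "(A:0.1,(B:0.05,C:0.05):0.1)"
 **/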
public String toString(int attIndex) {
NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en","US"));
DecimalFormat myFormatter = (DecimalFormat)nf;
myFormatter.applyPattern("#.#####");
if (m_left == null) {
if (m_right == null) {
return "("
+ m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).stringValue(attIndex)
+ ":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "("
+ m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_right.toString(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
} else {
if (m_right == null) {
return "(" + m_left.toString(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).stringValue(attIndex)
+ ":" + myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_left.toString(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_right.toString(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
}
}
public String toString2(int attIndex) {
NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en","US"));
DecimalFormat myFormatter = (DecimalFormat)nf;
myFormatter.applyPattern("#.#####");
if (m_left == null) {
if (m_right == null) {
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex)
+ ":" + myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).value(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_instances.instance(m_iLeftInstance).value(attIndex)
+ ":" + myFormatter.format(m_fLeftLength) + ","
+ m_right.toString2(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
} else {
if (m_right == null) {
return "(" + m_left.toString2(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_instances.instance(m_iRightInstance).value(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
} else {
return "(" + m_left.toString2(attIndex) + ":"
+ myFormatter.format(m_fLeftLength) + ","
+ m_right.toString2(attIndex) + ":"
+ myFormatter.format(m_fRightLength) + ")";
}
}
}
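/**
 * sets the height of this node; the branch lengths to the children are
 * derived from the children's heights
 **/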
void setHeight(double fHeight1, double fHeight2) {
m_fHeight = fHeight1;
if (m_left == null) {
m_fLeftLength = fHeight1;
} else {
m_fLeftLength = fHeight1 - m_left.m_fHeight;
}
if (m_right == null) {
m_fRightLength = fHeight2;
} else {
m_fRightLength = fHeight2 - m_right.m_fHeight;
}
}
void setLength(double fLength1, double fLength2) {
m_fLeftLength = fLength1;
m_fRightLength = fLength2;
m_fHeight = fLength1;
if (m_left != null) {
m_fHeight += m_left.m_fHeight;
}
}
}
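/** roots of the cluster subtrees, one per final cluster **/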
protected Node[] m_clusters;
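/** cluster number assigned to each training instance **/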
int[] m_nClusterNr;
@Override
public void buildClusterer(Instances data) throws Exception {
// System.err.println("Method " + m_nLinkType);
m_instances = data;
int nInstances = m_instances.numInstances();
if (nInstances == 0) {
return;
}
m_DistanceFunction.setInstances(m_instances);
// use array of integer vectors to store cluster indices,
// starting with one cluster per instance
@SuppressWarnings("unchecked")
Vector<Integer>[] nClusterID = new Vector[data.numInstances()];
for (int i = 0; i < data.numInstances(); i++) {
nClusterID[i] = new Vector<Integer>();
nClusterID[i].add(i);
}
// calculate distance matrix
int nClusters = data.numInstances();
// used for keeping track of hierarchy
Node[] clusterNodes = new Node[nInstances];
if (m_nLinkType == NEIGHBOR_JOINING) {
neighborJoining(nClusters, nClusterID, clusterNodes);
} else {
doLinkClustering(nClusters, nClusterID, clusterNodes);
}
// move all cluster memberships into the m_nClusterNr array
// & collect hierarchy
int iCurrent = 0;
m_clusters = new Node[m_nNumClusters];
m_nClusterNr = new int[nInstances];
for (int i = 0; i < nInstances; i++) {
if (nClusterID[i].size() > 0) {
for (int j = 0; j < nClusterID[i].size(); j++) {
m_nClusterNr[nClusterID[i].elementAt(j)] = iCurrent;
}
m_clusters[iCurrent] = clusterNodes[i];
iCurrent++;
}
}
} // buildClusterer
/**
* Uses the neighbor-joining algorithm for clustering. This is roughly based
* on the simple RapidNJ implementation and runs in O(n^3) time; more
* efficient implementations exist, see RapidNJ (or my GPU implementation :-)).
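* In each iteration the pair of active clusters (i, j) minimizing
* d(i,j) - S(i) - S(j) is merged, where S(i) = (sum over k of d(i,k)) / (n-2)
* is the separation of cluster i.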
*
* @param nClusters number of active clusters
* @param nClusterID array of cluster memberships, one vector of instance
*          indices per cluster
* @param clusterNodes array of nodes tracking the cluster hierarchy
*/
void neighborJoining(int nClusters, Vector<Integer>[] nClusterID,
Node[] clusterNodes) {
int n = m_instances.numInstances();
double[][] fDist = new double[nClusters][nClusters];
for (int i = 0; i < nClusters; i++) {
fDist[i][i] = 0;
for (int j = i + 1; j < nClusters; j++) {
fDist[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
fDist[j][i] = fDist[i][j];
}
}
double[] fSeparationSums = new double[n];
double[] fSeparations = new double[n];
int[] nNextActive = new int[n];
// calculate initial separation rows
for (int i = 0; i < n; i++) {
double fSum = 0;
for (int j = 0; j < n; j++) {
fSum += fDist[i][j];
}
fSeparationSums[i] = fSum;
fSeparations[i] = fSum / (nClusters - 2);
nNextActive[i] = i + 1;
}
while (nClusters > 2) {
// find minimum
int iMin1 = -1;
int iMin2 = -1;
double fMin = Double.MAX_VALUE;
if (m_Debug) {
for (int i = 0; i < n; i++) {
if (nClusterID[i].size() > 0) {
double[] fRow = fDist[i];
double fSep1 = fSeparations[i];
for (int j = 0; j < n; j++) {
if (nClusterID[j].size() > 0 && i != j) {
double fSep2 = fSeparations[j];
double fVal = fRow[j] - fSep1 - fSep2;
if (fVal < fMin) {
// new minimum
iMin1 = i;
iMin2 = j;
fMin = fVal;
}
}
}
}
}
} else {
int i = 0;
while (i < n) {
double fSep1 = fSeparations[i];
double[] fRow = fDist[i];
int j = nNextActive[i];
while (j < n) {
double fSep2 = fSeparations[j];
double fVal = fRow[j] - fSep1 - fSep2;
if (fVal < fMin) {
// new minimum
iMin1 = i;
iMin2 = j;
fMin = fVal;
}
j = nNextActive[j];
}
i = nNextActive[i];
}
}
// record distance
double fMinDistance = fDist[iMin1][iMin2];
nClusters--;
double fSep1 = fSeparations[iMin1];
double fSep2 = fSeparations[iMin2];
double fDist1 = (0.5 * fMinDistance) + (0.5 * (fSep1 - fSep2));
double fDist2 = (0.5 * fMinDistance) + (0.5 * (fSep2 - fSep1));
if (nClusters > 2) {
// update separations & distance
double fNewSeparationSum = 0;
double fMutualDistance = fDist[iMin1][iMin2];
double[] fRow1 = fDist[iMin1];
double[] fRow2 = fDist[iMin2];
for (int i = 0; i < n; i++) {
if (i == iMin1 || i == iMin2 || nClusterID[i].size() == 0) {
fRow1[i] = 0;
} else {
double fVal1 = fRow1[i];
double fVal2 = fRow2[i];
double fDistance = (fVal1 + fVal2 - fMutualDistance) / 2.0;
fNewSeparationSum += fDistance;
// update the separationsum of cluster i.
fSeparationSums[i] += (fDistance - fVal1 - fVal2);
fSeparations[i] = fSeparationSums[i] / (nClusters - 2);
fRow1[i] = fDistance;
fDist[i][iMin1] = fDistance;
}
}
fSeparationSums[iMin1] = fNewSeparationSum;
fSeparations[iMin1] = fNewSeparationSum / (nClusters - 2);
fSeparationSums[iMin2] = 0;
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
int iPrev = iMin2;
// since iMin1 < iMin2, there is always an active row before iMin2, so the
// next loop is safe
while (nClusterID[iPrev].size() == 0) {
iPrev--;
}
nNextActive[iPrev] = nNextActive[iMin2];
} else {
merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
break;
}
}
for (int i = 0; i < n; i++) {
if (nClusterID[i].size() > 0) {
for (int j = i + 1; j < n; j++) {
if (nClusterID[j].size() > 0) {
double fDist1 = fDist[i][j];
if (nClusterID[i].size() == 1) {
merge(i, j, fDist1, 0, nClusterID, clusterNodes);
} else if (nClusterID[j].size() == 1) {
merge(i, j, 0, fDist1, nClusterID, clusterNodes);
} else {
merge(i, j, fDist1 / 2.0, fDist1 / 2.0, nClusterID, clusterNodes);
}
break;
}
}
}
}
} // neighborJoining
/**
* Performs clustering using a link method. This implementation uses a
* priority queue, resulting in an O(n^2 log(n)) algorithm.
*
* @param nClusters number of clusters
* @param nClusterID array of cluster memberships, one vector of instance
*          indices per cluster
* @param clusterNodes array of nodes tracking the cluster hierarchy
*/
void doLinkClustering(int nClusters, Vector<Integer>[] nClusterID,
Node[] clusterNodes) {
int nInstances = m_instances.numInstances();
PriorityQueue<Tuple> queue = new PriorityQueue<Tuple>(nClusters * nClusters
/ 2, new TupleComparator());
double[][] fDistance0 = new double[nClusters][nClusters];
double[][] fClusterDistance = null;
if (m_Debug) {
fClusterDistance = new double[nClusters][nClusters];
}
for (int i = 0; i < nClusters; i++) {
fDistance0[i][i] = 0;
for (int j = i + 1; j < nClusters; j++) {
fDistance0[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
fDistance0[j][i] = fDistance0[i][j];
queue.add(new Tuple(fDistance0[i][j], i, j, 1, 1));
if (m_Debug) {
fClusterDistance[i][j] = fDistance0[i][j];
fClusterDistance[j][i] = fDistance0[i][j];
}
}
}
while (nClusters > m_nNumClusters) {
int iMin1 = -1;
int iMin2 = -1;
// find closest two clusters
if (m_Debug) {
/* simple but inefficient implementation */
double fMinDistance = Double.MAX_VALUE;
for (int i = 0; i < nInstances; i++) {
if (nClusterID[i].size() > 0) {
for (int j = i + 1; j < nInstances; j++) {
if (nClusterID[j].size() > 0) {
double fDist = fClusterDistance[i][j];
if (fDist < fMinDistance) {
fMinDistance = fDist;
iMin1 = i;
iMin2 = j;
}
}
}
}
}
merge(iMin1, iMin2, fMinDistance, fMinDistance, nClusterID,
clusterNodes);
} else {
// use priority queue to find next best pair to cluster
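// stale queue entries are skipped lazily: a tuple is accepted only if the
// recorded sizes of both of its clusters still match the current sizes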
Tuple t;
do {
t = queue.poll();
} while (t != null
&& (nClusterID[t.m_iCluster1].size() != t.m_nClusterSize1 || nClusterID[t.m_iCluster2]
.size() != t.m_nClusterSize2));
iMin1 = t.m_iCluster1;
iMin2 = t.m_iCluster2;
merge(iMin1, iMin2, t.m_fDist, t.m_fDist, nClusterID, clusterNodes);
}
// merge clusters
// update distances & queue
for (int i = 0; i < nInstances; i++) {
if (i != iMin1 && nClusterID[i].size() != 0) {
int i1 = Math.min(iMin1, i);
int i2 = Math.max(iMin1, i);
double fDistance = getDistance(fDistance0, nClusterID[i1],
nClusterID[i2]);
if (m_Debug) {
fClusterDistance[i1][i2] = fDistance;
fClusterDistance[i2][i1] = fDistance;
}
queue.add(new Tuple(fDistance, i1, i2, nClusterID[i1].size(),
nClusterID[i2].size()));
}
}
nClusters--;
}
} // doLinkClustering
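/**
 * merges clusters iMin1 and iMin2 (with branch lengths fDist1 and fDist2),
 * moves all instances of cluster iMin2 into cluster iMin1, and records the
 * merge in the cluster hierarchy
 */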
void merge(int iMin1, int iMin2, double fDist1, double fDist2,
Vector<Integer>[] nClusterID, Node[] clusterNodes) {
if (m_Debug) {
System.err.println("Merging " + iMin1 + " " + iMin2 + " " + fDist1 + " "
+ fDist2);
}
if (iMin1 > iMin2) {
int h = iMin1;
iMin1 = iMin2;
iMin2 = h;
double f = fDist1;
fDist1 = fDist2;
fDist2 = f;
}
nClusterID[iMin1].addAll(nClusterID[iMin2]);
nClusterID[iMin2].removeAllElements();
// track hierarchy
Node node = new Node();
if (clusterNodes[iMin1] == null) {
node.m_iLeftInstance = iMin1;
} else {
node.m_left = clusterNodes[iMin1];
clusterNodes[iMin1].m_parent = node;
}
if (clusterNodes[iMin2] == null) {
node.m_iRightInstance = iMin2;
} else {
node.m_right = clusterNodes[iMin2];
clusterNodes[iMin2].m_parent = node;
}
if (m_bDistanceIsBranchLength) {
node.setLength(fDist1, fDist2);
} else {
node.setHeight(fDist1, fDist2);
}
clusterNodes[iMin1] = node;
} // merge
/** calculates the distance between two clusters the first time, when setting up the distance matrix **/
double getDistance0(Vector<Integer> cluster1, Vector<Integer> cluster2) {
double fBestDist = Double.MAX_VALUE;
switch (m_nLinkType) {
case SINGLE:
case NEIGHBOR_JOINING:
case CENTROID:
case COMPLETE:
case ADJCOMPLETE:
case AVERAGE:
case MEAN:
// set up two instances for distance function
Instance instance1 = (Instance) m_instances.instance(
cluster1.elementAt(0)).copy();
Instance instance2 = (Instance) m_instances.instance(
cluster2.elementAt(0)).copy();
fBestDist = m_DistanceFunction.distance(instance1, instance2);
break;
case WARD: {
// finds the distance corresponding to the increase in the error sum of
// squares (ESS) caused by merging the two clusters; the ESS of a cluster
// is calculated over the distances between its members and its centroid:
// d = |C1 u C2| * ESS(C1 u C2) - |C1| * ESS(C1) - |C2| * ESS(C2)
double ESS1 = calcESS(cluster1);
double ESS2 = calcESS(cluster2);
Vector<Integer> merged = new Vector<Integer>();
merged.addAll(cluster1);
merged.addAll(cluster2);
double ESS = calcESS(merged);
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2
* cluster2.size();
}
break;
}
return fBestDist;
} // getDistance0
/**
* calculate the distance between two clusters
*
* @param fDistance matrix of pairwise distances between instances
* @param cluster1 list of indices of instances in the first cluster
* @param cluster2 ditto for the second cluster
* @return distance between clusters based on link type
*/
double getDistance(double[][] fDistance, Vector<Integer> cluster1,
Vector<Integer> cluster2) {
double fBestDist = Double.MAX_VALUE;
switch (m_nLinkType) {
case SINGLE:
// find single link distance aka minimum link, which is the closest
// distance between
// any item in cluster1 and any item in cluster2
fBestDist = Double.MAX_VALUE;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fBestDist > fDist) {
fBestDist = fDist;
}
}
}
break;
case COMPLETE:
case ADJCOMPLETE:
// find complete link distance aka maximum link, which is the largest
// distance between
// any item in cluster1 and any item in cluster2
fBestDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fBestDist < fDist) {
fBestDist = fDist;
}
}
}
if (m_nLinkType == COMPLETE) {
break;
}
// calculate adjustment, which is the largest within cluster distance
double fMaxDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = i + 1; j < cluster1.size(); j++) {
int i2 = cluster1.elementAt(j);
double fDist = fDistance[i1][i2];
if (fMaxDist < fDist) {
fMaxDist = fDist;
}
}
}
for (int i = 0; i < cluster2.size(); i++) {
int i1 = cluster2.elementAt(i);
for (int j = i + 1; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
double fDist = fDistance[i1][i2];
if (fMaxDist < fDist) {
fMaxDist = fDist;
}
}
}
fBestDist -= fMaxDist;
break;
case AVERAGE:
// finds average distance between the elements of the two clusters
fBestDist = 0;
for (int i = 0; i < cluster1.size(); i++) {
int i1 = cluster1.elementAt(i);
for (int j = 0; j < cluster2.size(); j++) {
int i2 = cluster2.elementAt(j);
fBestDist += fDistance[i1][i2];
}
}
fBestDist /= (cluster1.size() * cluster2.size());
break;
case MEAN: {
// calculates the mean distance within the merged cluster (aka group-average
// agglomerative clustering)
Vector<Integer> merged = new Vector<Integer>();
merged.addAll(cluster1);
merged.addAll(cluster2);
fBestDist = 0;
for (int i = 0; i < merged.size(); i++) {
int i1 = merged.elementAt(i);
for (int j = i + 1; j < merged.size(); j++) {
int i2 = merged.elementAt(j);
fBestDist += fDistance[i1][i2];
}
}
int n = merged.size();
fBestDist /= (n * (n - 1.0) / 2.0);
}
break;
case CENTROID:
// finds the distance of the centroids of the clusters
double[] fValues1 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster1.size(); i++) {
Instance instance = m_instances.instance(cluster1.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] += instance.value(j);
}
}
double[] fValues2 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster2.size(); i++) {
Instance instance = m_instances.instance(cluster2.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues2[j] += instance.value(j);
}
}
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] /= cluster1.size();
fValues2[j] /= cluster2.size();
}
fBestDist = m_DistanceFunction.distance(m_instances.instance(0).copy(fValues1),
m_instances.instance(0).copy(fValues2));
break;
case WARD: {
// finds the distance corresponding to the increase in the error sum of
// squares (ESS) caused by merging the two clusters; the ESS of a cluster
// is calculated over the distances between its members and its centroid:
// d = |C1 u C2| * ESS(C1 u C2) - |C1| * ESS(C1) - |C2| * ESS(C2)
double ESS1 = calcESS(cluster1);
double ESS2 = calcESS(cluster2);
Vector<Integer> merged = new Vector<Integer>();
merged.addAll(cluster1);
merged.addAll(cluster2);
double ESS = calcESS(merged);
fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2
* cluster2.size();
}
break;
}
return fBestDist;
} // getDistance
/** calculates the error sum-of-squares for the instances in a cluster wrt the cluster centroid **/
double calcESS(Vector<Integer> cluster) {
double[] fValues1 = new double[m_instances.numAttributes()];
for (int i = 0; i < cluster.size(); i++) {
Instance instance = m_instances.instance(cluster.elementAt(i));
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] += instance.value(j);
}
}
for (int j = 0; j < m_instances.numAttributes(); j++) {
fValues1[j] /= cluster.size();
}
// set up instance for distance function
Instance centroid = m_instances.instance(cluster.elementAt(0)).copy(fValues1);
double fESS = 0;
for (int i = 0; i < cluster.size(); i++) {
Instance instance = m_instances.instance(cluster.elementAt(i));
fESS += m_DistanceFunction.distance(centroid, instance);
}
return fESS / cluster.size();
} // calcESS
/**
 * Instances are assigned a cluster by finding the instance in the training
 * data with the closest distance to the instance to be clustered. The
 * cluster index of that training data point is taken as the cluster index.
 */
@Override
public int clusterInstance(Instance instance) throws Exception {
if (m_instances.numInstances() == 0) {
return 0;
}
double fBestDist = Double.MAX_VALUE;
int iBestInstance = -1;
for (int i = 0; i < m_instances.numInstances(); i++) {
double fDist = m_DistanceFunction.distance(instance,
m_instances.instance(i));
if (fDist < fBestDist) {
fBestDist = fDist;
iBestInstance = i;
}
}
return m_nClusterNr[iBestInstance];
}
/**
 * Creates a distribution in which all clusters have zero probability, except
 * the cluster the instance is assigned to.
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
if (numberOfClusters() == 0) {
double[] p = new double[1];
p[0] = 1;
return p;
}
double[] p = new double[numberOfClusters()];
p[clusterInstance(instance)] = 1.0;
return p;
}
@Override
public Capabilities getCapabilities() {
Capabilities result = new Capabilities(this);
result.disableAll();
result.enable(Capability.NO_CLASS);
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
result.enable(Capability.STRING_ATTRIBUTES);
// other
result.setMinimumNumberInstances(0);
return result;
}
@Override
public int numberOfClusters() throws Exception {
return Math.min(m_nNumClusters, m_instances.numInstances());
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration<Option> listOptions()