weka.clusterers.FarthestFirst
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* FarthestFirst.java
* Copyright (C) 2002-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.clusterers;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
/**
* Cluster data using the FarthestFirst algorithm.
*
* For more information see:
*
* Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.
*
* Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.
*
* Notes:
* - works as a fast simple approximate clusterer
* - modelled after SimpleKMeans, might be a useful initializer for it
*
*
* BibTeX:
*
* @article{Hochbaum1985,
* author = {Hochbaum and Shmoys},
* journal = {Mathematics of Operations Research},
* number = {2},
* pages = {180-184},
* title = {A best possible heuristic for the k-center problem},
* volume = {10},
* year = {1985}
* }
*
* @inproceedings{Dasgupta2002,
* author = {Sanjoy Dasgupta},
* booktitle = {15th Annual Conference on Computational Learning Theory},
* pages = {351-363},
* publisher = {Springer},
* title = {Performance Guarantees for Hierarchical Clustering},
* year = {2002}
* }
*
*
*
* Valid options are:
*
* -N <num>
* number of clusters. (default = 2).
*
* -S <num>
* Random number seed.
* (default 1)
*
*
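* A minimal usage sketch (hypothetical variable names; assumes an already
* loaded weka.core.Instances object called data):
*
* <pre>
* FarthestFirst ff = new FarthestFirst();
* ff.setOptions(new String[] { "-N", "3", "-S", "42" }); // 3 clusters, seed 42
* ff.buildClusterer(data); // pick the cluster centers
* int cluster = ff.clusterInstance(data.instance(0)); // assign an instance
* </pre>
*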
* @author Bernhard Pfahringer ([email protected])
* @version $Revision: 15519 $
* @see RandomizableClusterer
*/
public class FarthestFirst extends RandomizableClusterer implements
TechnicalInformationHandler {
// Todo: rewrite to be fully incremental
// cleanup, like deleting m_instances
/** for serialization */
static final long serialVersionUID = 7499838100631329509L;
/**
* training instances, not necessary to keep, could be replaced by
* m_ClusterCentroids where needed for header info
*/
protected Instances m_instances;
/**
* replace missing values in training instances
*/
protected ReplaceMissingValues m_ReplaceMissingFilter;
/**
* number of clusters to generate
*/
protected int m_NumClusters = 2;
/**
* holds the cluster centroids
*/
protected Instances m_ClusterCentroids;
/**
* attribute min values
*/
private double[] m_Min;
/**
* attribute max values
*/
private double[] m_Max;
/**
* Returns a string describing this clusterer
*
* @return a description of the evaluator suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "Cluster data using the FarthestFirst algorithm.\n\n"
+ "For more information see:\n\n" + getTechnicalInformation().toString()
+ "\n\n" + "Notes:\n"
+ "- works as a fast simple approximate clusterer\n"
+ "- modelled after SimpleKMeans, might be a useful initializer for it";
}
/**
* Returns an instance of a TechnicalInformation object, containing detailed
* information about the technical background of this class, e.g., paper
* reference or book this class is based on.
*
* @return the technical information about this class
*/
@Override
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
TechnicalInformation additional;
result = new TechnicalInformation(Type.ARTICLE);
result.setValue(Field.AUTHOR, "Hochbaum and Shmoys");
result.setValue(Field.YEAR, "1985");
result.setValue(Field.TITLE,
"A best possible heuristic for the k-center problem");
result.setValue(Field.JOURNAL, "Mathematics of Operations Research");
result.setValue(Field.VOLUME, "10");
result.setValue(Field.NUMBER, "2");
result.setValue(Field.PAGES, "180-184");
additional = result.add(Type.INPROCEEDINGS);
additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta");
additional.setValue(Field.TITLE,
"Performance Guarantees for Hierarchical Clustering");
additional.setValue(Field.BOOKTITLE,
"15th Annual Conference on Computational Learning Theory");
additional.setValue(Field.YEAR, "2002");
additional.setValue(Field.PAGES, "351-363");
additional.setValue(Field.PUBLISHER, "Springer");
return result;
}
/**
* Returns default capabilities of the clusterer.
*
* @return the capabilities of this clusterer
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
result.disableAll();
result.enable(Capability.NO_CLASS);
// attributes
result.enable(Capability.NOMINAL_ATTRIBUTES);
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.DATE_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
return result;
}
/**
* Generates a clusterer. Has to initialize all fields of the clusterer that
* are not being set via options.
*
* @param data set of instances serving as training data
* @throws Exception if the clusterer has not been generated successfully
*/
@Override
public void buildClusterer(Instances data) throws Exception {
// can clusterer handle the data?
getCapabilities().testWithFail(data);
// long start = System.currentTimeMillis();
m_ReplaceMissingFilter = new ReplaceMissingValues();
m_ReplaceMissingFilter.setInputFormat(data);
m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);
initMinMax(m_instances);
m_ClusterCentroids = new Instances(m_instances, m_NumClusters);
int n = m_instances.numInstances();
Random r = new Random(getSeed());
boolean[] selected = new boolean[n];
double[] minDistance = new double[n];
for (int i = 0; i < n; i++) {
minDistance[i] = Double.MAX_VALUE;
}
int firstI = r.nextInt(n);
m_ClusterCentroids.add(m_instances.instance(firstI));
selected[firstI] = true;
updateMinDistance(minDistance, selected, m_instances,
m_instances.instance(firstI));
if (m_NumClusters > n) {
m_NumClusters = n;
}
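// greedily add centers: each new center is the unselected instance farthest
// away (largest minimum distance) from the centers chosen so far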
for (int i = 1; i < m_NumClusters; i++) {
int nextI = farthestAway(minDistance, selected);
m_ClusterCentroids.add(m_instances.instance(nextI));
selected[nextI] = true;
updateMinDistance(minDistance, selected, m_instances,
m_instances.instance(nextI));
}
m_instances = new Instances(m_instances, 0);
// long end = System.currentTimeMillis();
// System.out.println("Clustering Time = " + (end-start));
}
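/**
* Updates, for every instance not yet chosen as a center, the minimum
* distance to the set of selected centers after a new center is added.
*
* @param minDistance minimum distance of each instance to its nearest center
* @param selected flags marking instances already chosen as centers
* @param data the training instances
* @param center the newly selected center
*/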
protected void updateMinDistance(double[] minDistance, boolean[] selected,
Instances data, Instance center) {
for (int i = 0; i < selected.length; i++) {
if (!selected[i]) {
double d = distance(center, data.instance(i));
if (d < minDistance[i]) {
minDistance[i] = d;
}
}
}
}
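/**
* Returns the index of the unselected instance that is farthest from all
* centers chosen so far, i.e. the one with the largest minimum distance.
*
* @param minDistance minimum distance of each instance to its nearest center
* @param selected flags marking instances already chosen as centers
* @return the index of the farthest unselected instance, or -1 if none remain
*/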
protected int farthestAway(double[] minDistance, boolean[] selected) {
double maxDistance = -1.0;
int maxI = -1;
for (int i = 0; i < selected.length; i++) {
if (!selected[i]) {
if (maxDistance < minDistance[i]) {
maxDistance = minDistance[i];
maxI = i;
}
}
}
return maxI;
}
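/**
* Initializes the per-attribute minimum and maximum values from the given
* data; these ranges are used to normalize numeric attributes in distance().
*
* @param data the instances to compute the attribute ranges from
*/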
protected void initMinMax(Instances data) {
m_Min = new double[data.numAttributes()];
m_Max = new double[data.numAttributes()];
for (int i = 0; i < data.numAttributes(); i++) {
m_Min[i] = m_Max[i] = Double.NaN;
}
for (int i = 0; i < data.numInstances(); i++) {
updateMinMax(data.instance(i));
}
}
/**
* Updates the minimum and maximum values for all the attributes based on a
* new instance.
*
* @param instance the new instance
*/
private void updateMinMax(Instance instance) {
for (int j = 0; j < instance.numAttributes(); j++) {
if (Double.isNaN(m_Min[j])) {
m_Min[j] = instance.value(j);
m_Max[j] = instance.value(j);
} else {
if (instance.value(j) < m_Min[j]) {
m_Min[j] = instance.value(j);
} else {
if (instance.value(j) > m_Max[j]) {
m_Max[j] = instance.value(j);
}
}
}
}
}
/**
* clusters an instance that has been through the filters
*
* @param instance the instance to assign a cluster to
* @return a cluster number
*/
protected int clusterProcessedInstance(Instance instance) {
double minDist = Double.MAX_VALUE;
int bestCluster = 0;
for (int i = 0; i < m_NumClusters; i++) {
double dist = distance(instance, m_ClusterCentroids.instance(i));
if (dist < minDist) {
minDist = dist;
bestCluster = i;
}
}
return bestCluster;
}
/**
* Classifies a given instance.
*
* @param instance the instance to be assigned to a cluster
* @return the index of the cluster the instance is assigned to
* @throws Exception if instance could not be classified successfully
*/
@Override
public int clusterInstance(Instance instance) throws Exception {
m_ReplaceMissingFilter.input(instance);
m_ReplaceMissingFilter.batchFinished();
Instance inst = m_ReplaceMissingFilter.output();
return clusterProcessedInstance(inst);
}
/**
* Calculates the distance between two instances
*
* @param first the first instance
* @param second the second instance
* @return the distance between the two given instances, between 0 and 1
*/
protected double distance(Instance first, Instance second) {
double distance = 0;
int firstI, secondI;
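// walk both (possibly sparse) instances in parallel over their stored values;
// an attribute absent from one instance's sparse representation is treated as
// having value 0, and the class attribute (if set) is skipped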
for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues();) {
if (p1 >= first.numValues()) {
firstI = m_instances.numAttributes();
} else {
firstI = first.index(p1);
}
if (p2 >= second.numValues()) {
secondI = m_instances.numAttributes();
} else {
secondI = second.index(p2);
}
if (firstI == m_instances.classIndex()) {
p1++;
continue;
}
if (secondI == m_instances.classIndex()) {
p2++;
continue;
}
double diff;
if (firstI == secondI) {
diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
p1++;
p2++;
} else if (firstI > secondI) {
diff = difference(secondI, 0, second.valueSparse(p2));
p2++;
} else {
diff = difference(firstI, first.valueSparse(p1), 0);
p1++;
}
distance += diff * diff;
}
return Math.sqrt(distance / m_instances.numAttributes());
}
/**
* Computes the difference between two values of the given attribute.
*
* @param index the attribute's index
* @param val1 the first value
* @param val2 the second value
* @return the (normalized) difference between the two values
*/
protected double difference(int index, double val1, double val2) {
switch (m_instances.attribute(index).type()) {
case Attribute.NOMINAL:
// If attribute is nominal
if (Utils.isMissingValue(val1) || Utils.isMissingValue(val2)
|| ((int) val1 != (int) val2)) {
return 1;
} else {
return 0;
}
case Attribute.NUMERIC:
// If attribute is numeric
if (Utils.isMissingValue(val1) || Utils.isMissingValue(val2)) {
if (Utils.isMissingValue(val1) && Utils.isMissingValue(val2)) {
return 1;
} else {
double diff;
if (Utils.isMissingValue(val2)) {
diff = norm(val1, index);
} else {
diff = norm(val2, index);
}
if (diff < 0.5) {
diff = 1.0 - diff;
}
return diff;
}
} else {
return norm(val1, index) - norm(val2, index);
}
default:
return 0;
}
}
/**
* Normalizes a given value of a numeric attribute.
*
* @param x the value to be normalized
* @param i the attribute's index
* @return the normalized value
*/
protected double norm(double x, int i) {
if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) {
return 0;
} else {
return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
}
}
/**
* Returns the number of clusters.
*
* @return the number of clusters generated for a training dataset.
* @throws Exception if number of clusters could not be returned successfully
*/
@Override
public int numberOfClusters() throws Exception {
return m_NumClusters;
}
/**
* Get the centroids found by FarthestFirst
*
* @return the centroids found by FarthestFirst
*/
public Instances getClusterCentroids() {
return m_ClusterCentroids;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration<Option> listOptions() {
Vector<Option> result = new Vector<Option>();
result.addElement(new Option("\tnumber of clusters. (default = 2).", "N", 1,
"-N <num>"));
result.addAll(Collections.list(super.listOptions()));
return result.elements();
}