/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.clustering;
import java.util.Arrays;
import smile.clustering.linkage.Linkage;
import smile.clustering.linkage.UPGMALinkage;
import smile.math.Math;
import smile.math.matrix.EigenValueDecomposition;
/**
* Self-Organizing Map. An SOM is an unsupervised learning method to produce
* a low-dimensional (typically two-dimensional) discretized representation
* (called a map) of the input space of the training samples. The model was
* first described as an artificial neural network by Teuvo Kohonen, and is
* sometimes called a Kohonen map.
*
* While it is typical to consider SOMs as related to feed-forward networks where
* the nodes are visualized as being attached, this type of architecture is
* fundamentally different in arrangement and motivation because SOMs use a
* neighborhood function to preserve the topological properties of the input
* space. This makes SOMs useful for visualizing low-dimensional views of
* high-dimensional data, akin to multidimensional scaling.
*
* SOMs belong to a large family of competitive learning and vector
* quantization methods. An SOM consists of components called nodes or neurons.
* Associated with each node is a weight vector of the same dimension as
* the input data vectors and a position in the map space. The usual arrangement
* of nodes is a regular spacing in a hexagonal or rectangular grid. The
* self-organizing map describes a mapping from a higher dimensional input
* space to a lower dimensional map space. During the (iterative) learning,
* each input vector is compared to the weight vector of every neuron. The
* neuron whose weight vector most closely matches the input is known as the
* best matching unit (BMU). The weight vector of the BMU and those of nearby
* neurons are adjusted to be closer to the input vector by a certain step size.
*
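* For illustration, one online training step might look like the sketch
* below. This is a hedged sketch only: this class actually trains in batch
* mode, and the learning rate alpha, the squared map-grid distance
* gridDistance2, and the neighborhood radius are illustrative assumptions,
* not part of this implementation.
*
*     // find the best matching unit (BMU) for input x
*     int bmu = 0;
*     double best = Double.MAX_VALUE;
*     for (int k = 0; k < w.length; k++) {
*         double dist = Math.squaredDistance(w[k], x);
*         if (dist < best) { best = dist; bmu = k; }
*     }
*     // move the BMU and its map neighbors toward x
*     for (int k = 0; k < w.length; k++) {
*         double h = Math.exp(-gridDistance2(bmu, k) / (2 * radius * radius));
*         for (int i = 0; i < x.length; i++) {
*             w[k][i] += alpha * h * (x[i] - w[k][i]);
*         }
*     }
*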
* There are two ways to interpret an SOM. Because in the training phase the
* weights of the whole neighborhood are moved in the same direction, similar
* items tend to excite adjacent neurons. Therefore, an SOM forms a semantic map
* where similar samples are mapped close together and dissimilar ones apart.
* The other way is to think of neuronal weights as pointers to the input space.
* They form a discrete approximation of the distribution of training samples.
* More neurons point to regions with high training sample concentration and
* fewer where the samples are scarce.
*
* SOM may be considered a nonlinear generalization of principal component
* analysis (PCA). It has been shown, using both artificial and real
* geophysical data, that SOM has many advantages over the conventional feature
* extraction methods such as Empirical Orthogonal Functions (EOF) or PCA.
*
* It has been shown that SOMs with a small number of nodes behave in a way
* similar to k-means, whereas larger SOMs rearrange data in a way that is
* fundamentally topological in character and display emergent properties.
* Therefore, large maps are preferable to smaller ones.
* In maps consisting of thousands of nodes, it is possible to perform cluster
* operations on the map itself.
*
* A common way to display SOMs is a heat map of the U-matrix. The U-matrix value
* of a particular node is the minimum/maximum/average distance between the node
* and its closest neighbors. In a rectangular grid for instance, we might
* consider the closest 4 or 8 nodes.
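*
* A minimal usage sketch (hedged: the 10x10 map size, the choice of 4
* clusters, and the variable names are illustrative; data is assumed to be
* a double[][] of samples and x a new sample of the same dimension):
*
*     SOM som = new SOM(data, 10, 10);    // train a 10x10 map
*     double[][] heat = som.umatrix();    // U-matrix for a heat map display
*     int[] labels = som.partition(4);    // cluster samples via 4 neuron groups
*     int cluster = som.predict(x);       // assign a new sample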
*
* References
*
* - Teuvo Kohonen. Self-organizing maps. Springer, 3rd edition, 2000.
*
* @author Haifeng Li
*/
public class SOM implements Clustering {
/**
* Self-Organizing Map Neuron.
*/
public static class Neuron {
/**
* Weight vector.
*/
public double[] w;
/**
* Cluster id of this neuron.
*/
public int cluster;
/**
* The indices of the samples for which this neuron is the best matched unit.
*/
public int[] samples;
/**
* The majority class label among the best matched samples, i.e. y = which.max(ni).
*/
public int y;
/**
* The number of best matched samples in each class.
*/
public int[] ni;
/**
* The distance to neighbors.
*/
public double[] distance;
}
/**
* The width of map.
*/
private int width;
/**
* The height of map.
*/
private int height;
/**
* The dimension of input space.
*/
private int d;
/**
* The SOM map of neurons.
*/
private double[][][] neurons;
/**
* The best matched unit of each sample. This is an n-by-2 matrix, with one
* row per data point. The entries bmu[i][0] and bmu[i][1] are the row index
* and column index of the best matched unit of sample i, respectively.
*/
private int[][] bmu;
/**
* The number of samples in each unit.
*/
private int[][] voronoi;
/**
* The cluster labels of neurons.
*/
private int[] y;
/**
* Unified distance matrix, or u-matrix, is a popular method of displaying
* SOMs. The u-matrix value of a unit is the maximum distance between that
* unit and its neighbors.
*/
private double[][] umatrix;
/**
* Constructor. Learns the SOM of the given data.
* @param data the training data.
* @param size the size of a square map.
*/
public SOM(double[][] data, int size) {
this(data, size, size);
}
/**
* Constructor. Learns the SOM of the given data.
* @param data the training data.
* @param width the width of map.
* @param height the height of map.
*/
public SOM(double[][] data, int width, int height) {
if (height <= 0 || width <= 0 || height * width == 1) {
throw new IllegalArgumentException("Invalid map width = " + width + " or height = " + height);
}
this.width = width;
this.height = height;
this.d = data[0].length;
int n = data.length;
neurons = new double[height][width][d];
bmu = new int[n][2];
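// Training length heuristic: the number of batch iterations scales with
// the ratio of map units to samples (mpd): about 10*mpd rough iterations
// followed by about 40*mpd fine-tuning iterations.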
double mpd = Math.max(0.5, height * width / (double) n);
int roughTrainLen = (int) Math.ceil(10 * mpd);
int fineTrainLen = (int) Math.ceil(40 * mpd);
int trainLen = roughTrainLen + fineTrainLen;
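// Neighborhood radius schedule: decay linearly from initRadius down to
// finalRadius over the rough phase, then on down to 1 over the fine phase.
// The radii are squared up front for use in the Gaussian neighborhood below.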
double initRadius = Math.max(1.0, Math.max(height, width) / 2.0);
double finalRadius = Math.max(1.0, initRadius / 4.0);
double[] radius = new double[trainLen];
for (int i = 0; i < roughTrainLen; i++) {
radius[roughTrainLen - i - 1] = finalRadius + (double) i / (roughTrainLen - 1) * (initRadius - finalRadius);
}
initRadius = finalRadius - 1.0 / (fineTrainLen - 1);
finalRadius = 1;
for (int i = 0; i < fineTrainLen; i++) {
radius[roughTrainLen + fineTrainLen - i - 1] = finalRadius + (double) i / (fineTrainLen - 1) * (initRadius - finalRadius);
}
for (int i = 0; i < trainLen; i++) {
radius[i] = radius[i] * radius[i];
}
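// Linear initialization: place the neurons on a regular grid in the plane
// spanned by the two largest principal components of the data, centered
// at the sample mean.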
double[] mu = new double[d];
for (int i = 0; i < n; i++) {
for (int j = 0; j < d; j++) {
mu[j] += data[i][j];
}
}
for (int i = 0; i < d; i++) {
mu[i] /= n;
}
double[][] D = new double[n][d];
for (int i = 0; i < n; i++) {
for (int j = 0; j < d; j++) {
D[i][j] = data[i][j] - mu[j];
}
}
double[][] V = new double[d][d];
for (int i = 0; i < d; i++) {
for (int j = i; j < d; j++) {
for (int k = 0; k < n; k++) {
V[i][j] += D[k][i] * D[k][j];
}
V[i][j] /= n;
V[j][i] = V[i][j];
}
}
EigenValueDecomposition eigen = Math.eigen(V, 2);
double[] v1 = new double[d];
double[] v2 = new double[d];
for (int i = 0; i < d; i++) {
v1[i] = eigen.getEigenVectors()[i][0];
v2[i] = eigen.getEigenVectors()[i][1];
}
for (int i = 0; i < height; i++) {
double w = (double) i / height - 0.5;
for (int j = 0; j < width; j++) {
double h = (double) j / width - 0.5;
for (int k = 0; k < d; k++) {
neurons[i][j][k] = mu[k] + w * v1[k] + h * v2[k];
}
}
}
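// Batch SOM training. A BBD-tree accelerates the assignment of samples to
// their nearest neurons; for each unit, hit counts the assigned samples
// and sums accumulates their vector sum.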
BBDTree bbd = new BBDTree(data);
double[][] centroids = new double[width * height][];
double[][] sums = new double[width * height][d];
int[] hit = new int[width * height];
int[] sy = new int[n]; // cluster label of samples.
for (int i = 0, k = 0; i < height; i++) {
for (int j = 0; j < width; j++, k++) {
centroids[k] = neurons[i][j];
}
}
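// Each iteration assigns all samples to their nearest neurons, then
// updates every neuron as a Gaussian-weighted average of the sums of the
// samples assigned to nearby units.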
for (int iter = 0; iter < trainLen; iter++) {
int r = (int) Math.round(Math.sqrt(radius[iter]));
double distortion = bbd.clustering(centroids, sums, hit, sy);
System.out.format("SOM distortion after %3d iterations (radius = %d): %.5f\n", iter + 1, r, distortion);
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
double denom = 0.0;
Arrays.fill(neurons[i][j], 0.0);
int minH = Math.max(0, i - 3 * r) + 1;
int maxH = Math.min(height, i + 3 * r) - 1;
int minW = Math.max(0, j - 3 * r) + 1;
int maxW = Math.min(width, j + 3 * r) - 1;
int hits = 0;
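// Grow the update window until it covers at least one unit with assigned
// samples, so that the denominator below is positive.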
while (hits == 0) {
minH = Math.max(0, minH - 1);
maxH = Math.min(height, maxH + 1);
minW = Math.max(0, minW - 1);
maxW = Math.min(width, maxW + 1);
for (int s = minH; s < maxH; s++) {
for (int t = minW; t < maxW; t++) {
int pos = s * width + t;
hits += hit[pos];
}
}
}
for (int s = minH; s < maxH; s++) {
for (int t = minW; t < maxW; t++) {
int pos = s * width + t;
if (hit[pos] > 0) {
int dx = i - s;
int dy = j - t;
double h = Math.exp(-(dx * dx + dy * dy) / (2 * radius[iter]));
denom += h * hit[pos];
for (int k = 0; k < d; k++) {
neurons[i][j][k] += h * sums[pos][k];
}
}
}
}
for (int k = 0; k < d; k++) {
neurons[i][j][k] /= denom;
}
}
}
}
// best matched unit for each sample.
for (int i = 0; i < n; i++) {
bmu[i][0] = sy[i] / width;
bmu[i][1] = sy[i] % width;
}
// count the number of samples in each unit.
voronoi = new int[height][width];
for (int i = 0; i < bmu.length; i++) {
voronoi[bmu[i][0]][bmu[i][1]]++;
}
// calculate u-matrix.
umatrix = new double[height][width];
for (int i = 0; i < height - 1; i++) {
for (int j = 0; j < width - 1; j++) {
double dist = Math.sqrt(Math.squaredDistance(neurons[i][j], neurons[i][j + 1]));
umatrix[i][j] = Math.max(umatrix[i][j], dist);
umatrix[i][j + 1] = Math.max(umatrix[i][j + 1], dist);
dist = Math.sqrt(Math.squaredDistance(neurons[i][j], neurons[i + 1][j]));
umatrix[i][j] = Math.max(umatrix[i][j], dist);
umatrix[i + 1][j] = Math.max(umatrix[i + 1][j], dist);
}
}
for (int i = 0; i < height - 1; i++) {
double dist = Math.sqrt(Math.squaredDistance(neurons[i][width - 1], neurons[i + 1][width - 1]));
umatrix[i][width - 1] = Math.max(umatrix[i][width - 1], dist);
umatrix[i + 1][width - 1] = Math.max(umatrix[i + 1][width - 1], dist);
}
for (int j = 0; j < width - 1; j++) {
double dist = Math.sqrt(Math.squaredDistance(neurons[height - 1][j], neurons[height - 1][j + 1]));
umatrix[height - 1][j] = Math.max(umatrix[height - 1][j], dist);
umatrix[height - 1][j + 1] = Math.max(umatrix[height - 1][j + 1], dist);
}
umatrix[height - 1][width - 1] = Math.max(umatrix[height - 1][width - 2], umatrix[height - 2][width - 1]);
}
/**
* Returns the SOM map grid.
*/
public double[][][] map() {
return neurons;
}
/**
* Returns the U-matrix of the SOM for visualization.
*/
public double[][] umatrix() {
return umatrix;
}
/**
* Returns the best matched unit of each sample.
* @return the best matched unit. This is an n-by-2 matrix, with one row per
* data point. The entries bmu[i][0] and bmu[i][1] are the row index and
* column index of the best matched unit of sample i, respectively.
*/
public int[][] bmu() {
return bmu;
}
/**
* Returns the number of samples in each unit.
*/
public int[][] size() {
return voronoi;
}
/**
* Clusters the neurons into k groups and then assigns the samples in each
* neuron to the corresponding cluster.
* @param k the number of clusters.
* @return the cluster label of samples.
*/
public int[] partition(int k) {
int n = width * height;
double[][] units = new double[n][d];
for (int i = 0, l = 0; i < height; i++) {
for (int j = 0; j < width; j++, l++) {
units[l] = neurons[i][j];
}
}
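// Pairwise distances between neuron weight vectors, stored as a
// lower-triangular proximity matrix for agglomerative clustering (UPGMA).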
double[][] proximity = new double[n][];
for (int i = 0; i < n; i++) {
proximity[i] = new double[i + 1];
for (int j = 0; j < i; j++) {
proximity[i][j] = Math.distance(units[i], units[j]);
}
}
Linkage linkage = new UPGMALinkage(proximity);
HierarchicalClustering hc = new HierarchicalClustering(linkage);
y = hc.partition(k);
int[] cluster = new int[bmu.length];
for (int i = 0; i < cluster.length; i++) {
cluster[i] = y[bmu[i][0] * width + bmu[i][1]];
}
return cluster;
}
/**
* Clusters a new instance to the nearest neuron. For clustering purposes,
* one should build a sufficiently large map to capture the structure of the
* data space. The neurons of the map can then be grouped into a small
* number of clusters. Finally, each sample is assigned to the cluster of
* its nearest neuron.
*
* @param x a new instance.
* @return the cluster label. If the method {@link #partition(int)} has been
* called before, this is the cluster label of the nearest neuron. Otherwise,
* this is the index of the neuron (i * width + j).
*/
@Override
public int predict(double[] x) {
double best = Double.MAX_VALUE;
int ii = -1, jj = -1;
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
double dist = Math.squaredDistance(neurons[i][j], x);
if (dist < best) {
best = dist;
ii = i;
jj = j;
}
}
}
if (y == null) {
return ii * width + jj;
} else {
return y[ii * width + jj];
}
}
}