smile.manifold.IsoMap Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.manifold;
import smile.graph.AdjacencyList;
import smile.graph.Graph;
import smile.graph.Graph.Edge;
import smile.math.distance.EuclideanDistance;
import smile.math.Math;
import smile.math.matrix.EigenValueDecomposition;
import smile.neighbor.CoverTree;
import smile.neighbor.KDTree;
import smile.neighbor.KNNSearch;
import smile.neighbor.Neighbor;
/**
* Isometric feature mapping. Isomap is a widely used low-dimensional embedding methods,
* where geodesic distances on a weighted graph are incorporated with the
* classical multidimensional scaling. Isomap is used for computing a
* quasi-isometric, low-dimensional embedding of a set of high-dimensional
* data points. Isomap is highly efficient and generally applicable to a broad
* range of data sources and dimensionalities.
*
* To be specific, the classical MDS performs low-dimensional embedding based
* on the pairwise distance between data points, which is generally measured
* using straight-line Euclidean distance. Isomap is distinguished by
* its use of the geodesic distance induced by a neighborhood graph
* embedded in the classical scaling. This is done to incorporate manifold
* structure in the resulting embedding. Isomap defines the geodesic distance
* to be the sum of edge weights along the shortest path between two nodes.
* The top n eigenvectors of the geodesic distance matrix, represent the
* coordinates in the new n-dimensional Euclidean space.
*
* The connectivity of each data point in the neighborhood graph is defined
* as its nearest k Euclidean neighbors in the high-dimensional space. This
* step is vulnerable to "short-circuit errors" if k is too large with
* respect to the manifold structure or if noise in the data moves the
* points slightly off the manifold. Even a single short-circuit error
* can alter many entries in the geodesic distance matrix, which in turn
* can lead to a drastically different (and incorrect) low-dimensional
* embedding. Conversely, if k is too small, the neighborhood graph may
* become too sparse to approximate geodesic paths accurately.
*
* This class implements C-Isomap that involves magnifying the regions
* of high density and shrink the regions of low density of data points
* in the manifold. Edge weights that are maximized in Multi-Dimensional
* Scaling(MDS) are modified, with everything else remaining unaffected.
*
* @see LLE
* @see LaplacianEigenmap
*
*
References
*
* - J. B. Tenenbaum, V. de Silva and J. C. Langford A Global Geometric Framework for Nonlinear Dimensionality Reduction. Science 290(5500):2319-2323, 2000.
*
*
* @author Haifeng Li
*/
public class IsoMap {
/**
* The original sample index.
*/
private int[] index;
/**
* Coordinate matrix.
*/
private double[][] coordinates;
/**
* Nearest neighbor graph.
*/
private Graph graph;
/**
* Constructor. C-Isomap algorithm by default.
* @param data the dataset.
* @param d the dimension of the manifold.
* @param k k-nearest neighbor.
*/
public IsoMap(double[][] data, int d, int k) {
this(data, d, k, true);
}
/**
* Constructor.
* @param data the dataset.
* @param d the dimension of the manifold.
* @param k k-nearest neighbor.
* @param CIsomap C-Isomap algorithm if true, otherwise standard algorithm.
*/
public IsoMap(double[][] data, int d, int k, boolean CIsomap) {
int n = data.length;
KNNSearch knn = null;
if (data[0].length < 10) {
knn = new KDTree(data, data);
} else {
knn = new CoverTree(data, new EuclideanDistance());
}
graph = new AdjacencyList(n);
double[] M = new double[n];
for (int i = 0; i < n; i++) {
Neighbor[] neighbors = knn.knn(data[i], k);
for (int j = 0; j < neighbors.length; j++) {
graph.setWeight(i, neighbors[j].index, neighbors[j].distance);
M[i] += neighbors[j].distance;
}
M[i] = Math.sqrt(M[i] / neighbors.length);
}
// C-Isomap
if (CIsomap) {
for (Edge edge : graph.getEdges()) {
edge.weight /= (M[edge.v1] * M[edge.v2]);
}
}
// Use largest connected component.
int[][] cc = graph.dfs();
if (cc.length == 1) {
index = new int[n];
for (int i = 0; i < n; i++) {
index[i] = i;
}
} else {
n = 0;
int component = 0;
for (int i = 0; i < cc.length; i++) {
if (cc[i].length > n) {
component = i;
n = cc[i].length;
}
}
System.out.format("IsoMap: %d connected components, largest one has %d samples.\n", cc.length, n);
index = cc[component];
graph = graph.subgraph(index);
}
double[][] D = graph.dijkstra();
for (int i = 0; i < n; i++) {
for (int j = 0; j < i; j++) {
D[i][j] = -0.5 * D[i][j] * D[i][j];
D[j][i] = D[i][j];
}
}
double[] mean = Math.rowMean(D);
double mu = Math.mean(mean);
double[][] B = new double[n][n];
for (int i = 0; i < n; i++) {
for (int j = 0; j <= i; j++) {
B[i][j] = D[i][j] - mean[i] - mean[j] + mu;
B[j][i] = B[i][j];
}
}
EigenValueDecomposition eigen = Math.eigen(B, d);
coordinates = new double[n][d];
for (int j = 0; j < d; j++) {
if (eigen.getEigenValues()[j] < 0) {
throw new IllegalArgumentException(String.format("Some of the first %d eigenvalues are < 0.", d));
}
double scale = Math.sqrt(eigen.getEigenValues()[j]);
for (int i = 0; i < n; i++) {
coordinates[i][j] = eigen.getEigenVectors()[i][j] * scale;
}
}
}
/**
* Returns the original sample index. Because IsoMap is applied to the largest
* connected component of k-nearest neighbor graph, we record the the original
* indices of samples in the largest component.
*/
public int[] getIndex() {
return index;
}
/**
* Returns the coordinates of projected data.
*/
public double[][] getCoordinates() {
return coordinates;
}
/**
* Returns the nearest neighbor graph.
*/
public Graph getNearestNeighborGraph() {
return graph;
}
}