All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.clustering.DBScan Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.clustering;

import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import smile.neighbor.Neighbor;
import smile.neighbor.RNNSearch;
import smile.neighbor.LinearSearch;
import smile.neighbor.CoverTree;
import smile.math.Math;
import smile.math.distance.Distance;
import smile.math.distance.Metric;

/**
 * Density-Based Spatial Clustering of Applications with Noise.
 * DBScan finds a number of clusters starting from the estimated density
 * distribution of corresponding nodes.
 * 

* DBScan requires two parameters: radius (i.e. neighborhood radius) and the
* number of minimum points required to form a cluster (minPts). It starts
* with an arbitrary starting point that has not been visited. This point's
* neighborhood is retrieved, and if it contains a sufficient number of points,
* a cluster is started. Otherwise, the point is labeled as noise. Note that
* this point might later be found in a sufficiently sized radius-environment
* of a different point and hence be made part of a cluster.

* If a point is found to be part of a cluster, its neighborhood is also
* part of that cluster. Hence, all points that are found within the
* neighborhood are added, as is their own neighborhood. This process
* continues until the cluster is completely found. Then, a new unvisited
* point is retrieved and processed, leading to the discovery of a further
* cluster or noise.

* DBScan visits each point of the database, possibly multiple times (e.g.,
* as candidates to different clusters). For practical considerations, however,
* the time complexity is mostly governed by the number of nearest neighbor
* queries. DBScan executes exactly one such query for each point, and if
* an indexing structure is used that executes such a neighborhood query
* in O(log n), an overall runtime complexity of O(n log n) is obtained.

* DBScan has many advantages:
*
*   - DBScan does not need to know the number of clusters in the data
*     a priori, as opposed to k-means.
*   - DBScan can find arbitrarily shaped clusters. It can even find clusters
*     completely surrounded by (but not connected to) a different cluster.
*     Due to the minPts parameter, the so-called single-link effect
*     (different clusters being connected by a thin line of points) is reduced.
*   - DBScan has a notion of noise.
*   - DBScan requires just two parameters and is mostly insensitive to the
*     ordering of the points in the database. (Only points sitting on the
*     edge of two different clusters might swap cluster membership if the
*     ordering of the points is changed, and the cluster assignment is unique
*     only up to isomorphism.)
* On the other hand, DBScan has the following disadvantages:
*
*   - In high dimensional space, the data are sparse everywhere because of
*     the curse of dimensionality. Therefore, DBScan doesn't work well on
*     high-dimensional data in general.
*   - DBScan does not respond well to data sets with varying densities.

References

*
    *
  1. Martin Ester, Hans-Peter Kriegel, Jorg Sander, Xiaowei Xu (1996). "A density-based algorithm for discovering clusters in large spatial databases with noise". KDD, 1996.
  2. *
  3. Jorg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu. (1998). Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its Applications. 1998.
  4. *
* * @param the type of input object. * * @author Haifeng Li */ public class DBScan extends PartitionClustering { /** * Label for unclassified data samples. */ private static final int UNCLASSIFIED = -1; /** * The minimum number of points required to form a cluster */ private double minPts; /** * The range of neighborhood. */ private double radius; /** * Data structure for neighborhood search. */ private RNNSearch nns; /** * Constructor. Clustering the data. Note that this one could be very * slow because of brute force nearest neighbor search. * @param data the dataset for clustering. * @param distance the distance measure for neighborhood search. * @param minPts the minimum number of neighbors for a core data point. * @param radius the neighborhood radius. */ public DBScan(T[] data, Distance distance, int minPts, double radius) { this(data, new LinearSearch(data, distance), minPts, radius); } /** * Constructor. Clustering the data. Using cover tree for nearest neighbor * search. * @param data the dataset for clustering. * @param distance the distance measure for neighborhood search. * @param minPts the minimum number of neighbors for a core data point. * @param radius the neighborhood radius. */ public DBScan(T[] data, Metric distance, int minPts, double radius) { this(data, new CoverTree(data, distance), minPts, radius); } /** * Clustering the data. * @param data the dataset for clustering. * @param nns the data structure for neighborhood search. * @param minPts the minimum number of neighbors for a core data point. * @param radius the neighborhood radius. 
*/ public DBScan(T[] data, RNNSearch nns, int minPts, double radius) { if (minPts < 1) { throw new IllegalArgumentException("Invalid minPts: " + minPts); } if (radius <= 0.0) { throw new IllegalArgumentException("Invalid radius: " + radius); } this.nns = nns; this.minPts = minPts; this.radius = radius; k = 0; int n = data.length; y = new int[n]; Arrays.fill(y, UNCLASSIFIED); for (int i = 0; i < data.length; i++) { if (y[i] == UNCLASSIFIED) { List> neighbors = new ArrayList>(); nns.range(data[i], radius, neighbors); if (neighbors.size() < minPts) { y[i] = OUTLIER; } else { y[i] = k; for (int j = 0; j < neighbors.size(); j++) { if (y[neighbors.get(j).index] == UNCLASSIFIED) { y[neighbors.get(j).index] = k; Neighbor neighbor = neighbors.get(j); List> secondaryNeighbors = new ArrayList>(); nns.range(neighbor.key, radius, secondaryNeighbors); if (secondaryNeighbors.size() >= minPts) { neighbors.addAll(secondaryNeighbors); } } if (y[neighbors.get(j).index] == OUTLIER) { y[neighbors.get(j).index] = k; } } k++; } } } size = new int[k + 1]; for (int i = 0; i < n; i++) { if (y[i] == OUTLIER) { size[k]++; } else { size[y[i]]++; } } } /** * Returns the parameter of minimum number of neighbors. */ public double getMinPts() { return minPts; } /** * Returns the radius of neighborhood. */ public double getRadius() { return radius; } /** * Cluster a new instance. * @param x a new instance. * @return the cluster label. Note that it may be {@link #OUTLIER}. 
*/ @Override public int predict(T x) { List> neighbors = new ArrayList>(); nns.range(x, radius, neighbors); if (neighbors.size() < minPts) { return OUTLIER; } int[] label = new int[k + 1]; for (Neighbor neighbor : neighbors) { int yi = y[neighbor.index]; if (yi == OUTLIER) yi = k; label[yi]++; } int c = Math.whichMax(label); if (c == k) c = OUTLIER; return c; } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(String.format("DBScan clusters of %d data points:\n", y.length)); for (int i = 0; i < k; i++) { int r = (int) Math.round(1000.0 * size[i] / y.length); sb.append(String.format("%3d\t%5d (%2d.%1d%%)\n", i, size[i], r / 10, r % 10)); } int r = (int) Math.round(1000.0 * size[k] / y.length); sb.append(String.format("Noise\t%5d (%2d.%1d%%)\n", size[k], r / 10, r % 10)); return sb.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy