
smile.clustering.DBSCAN Maven / Gradle / Ivy

/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 */

package smile.clustering;

import java.util.Collections;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import smile.neighbor.Neighbor;
import smile.neighbor.KDTree;
import smile.neighbor.LinearSearch;
import smile.neighbor.RNNSearch;
import smile.math.MathEx;
import smile.math.distance.Distance;

/**
 * Density-Based Spatial Clustering of Applications with Noise.
 * DBSCAN finds a number of clusters starting from the estimated density
 * distribution of corresponding nodes.
 * <p>
 * DBSCAN requires two parameters: radius (i.e. the neighborhood radius) and
 * the minimum number of points required to form a cluster (minPts). It
 * starts with an arbitrary point that has not been visited. This point's
 * neighborhood is retrieved, and if it contains a sufficient number of
 * points, a cluster is started. Otherwise, the point is labeled as noise.
 * Note that this point might later be found in a sufficiently sized
 * radius-environment of a different point and hence be made part of a
 * cluster.
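 * <p>
 * For example, a minimal sketch of fitting the model on 2-dimensional data;
 * the sample values, {@code minPts = 3} and {@code radius = 0.5} are
 * illustrative assumptions rather than recommended defaults, and the labels
 * are read from the inherited {@code y} field used by the constructor below:
 * <pre>{@code
 *     double[][] data = {
 *         {0.0, 0.1}, {0.1, 0.0}, {0.1, 0.1}, {0.0, 0.0},
 *         {5.0, 5.1}, {5.1, 5.0}, {5.1, 5.1}, {5.0, 5.0},
 *         {9.0, 9.0}   // isolated point, likely labeled as noise
 *     };
 *     DBSCAN<double[]> model = DBSCAN.fit(data, 3, 0.5);
 *     int[] labels = model.y;   // one label per point; noise gets OUTLIER
 * }</pre>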
 * <p>
 * If a point is found to be part of a cluster, its neighborhood is also
 * part of that cluster. Hence, all points that are found within the
 * neighborhood are added, as is their own neighborhood. This process
 * continues until the cluster is completely found. Then, a new unvisited
 * point is retrieved and processed, leading to the discovery of a further
 * cluster or noise.
 * <p>
 * DBSCAN visits each point of the database, possibly multiple times (e.g.,
 * as candidates to different clusters). For practical considerations,
 * however, the time complexity is mostly governed by the number of nearest
 * neighbor queries. DBSCAN executes exactly one such query for each point,
 * and if an indexing structure is used that executes such a neighborhood
 * query in O(log n), an overall runtime complexity of O(n log n) is
 * obtained.
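 * <p>
 * The {@code fit} overloads below mirror this trade-off; a sketch, with
 * {@code data} as in the example above, illustrative parameters, and assuming
 * Euclidean distance (via {@code smile.math.distance.EuclideanDistance}) for
 * the linear-scan variant:
 * <pre>{@code
 *     // KD-tree index: one O(log n) range query per point, O(n log n) overall.
 *     DBSCAN<double[]> fast = DBSCAN.fit(data, 10, 0.3);
 *     // Linear scan: one O(n) range query per point, O(n^2) in total.
 *     DBSCAN<double[]> slow = DBSCAN.fit(data,
 *             new smile.math.distance.EuclideanDistance(), 10, 0.3);
 * }</pre>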
 * <p>
 * DBSCAN has many advantages such as
 * <ul>
 * <li> DBSCAN does not need to know the number of clusters in the data
 *      a priori, as opposed to k-means.
 * <li> DBSCAN can find arbitrarily shaped clusters. It can even find clusters
 *      completely surrounded by (but not connected to) a different cluster.
 *      Due to the MinPts parameter, the so-called single-link effect
 *      (different clusters being connected by a thin line of points) is
 *      reduced.
 * <li> DBSCAN has a notion of noise. Outliers are labeled as
 *      Clustering.OUTLIER, which is Integer.MAX_VALUE.
 * <li> DBSCAN requires just two parameters and is mostly insensitive to the
 *      ordering of the points in the database. (Only points sitting on the
 *      edge of two different clusters might swap cluster membership if the
 *      ordering of the points is changed, and the cluster assignment is
 *      unique only up to isomorphism.)
 * </ul>
 * On the other hand, DBSCAN has the disadvantages of
 * <ul>
 * <li> In high dimensional space, the data are sparse everywhere
 *      because of the curse of dimensionality. Therefore, DBSCAN doesn't
 *      work well on high-dimensional data in general.
 * <li> DBSCAN does not respond well to data sets with varying densities.
 * </ul>
 *
 * <h2>References</h2>
 * <ol>
 * <li> Martin Ester, Hans-Peter Kriegel, Jorg Sander, Xiaowei Xu. A density-based
 *      algorithm for discovering clusters in large spatial databases with noise.
 *      KDD, 1996.</li>
 * <li> Jorg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu. Density-Based
 *      Clustering in Spatial Databases: The Algorithm GDBSCAN and Its
 *      Applications. 1998.</li>
 * </ol>
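 * <p>
 * A fitted model can also label a new observation via {@code predict}; a
 * minimal sketch (the query point is illustrative, and the {@code OUTLIER}
 * constant is assumed to be inherited from {@code PartitionClustering}):
 * <pre>{@code
 *     int label = model.predict(new double[] {0.05, 0.05});
 *     if (label == PartitionClustering.OUTLIER) {
 *         // not within the radius of any core point
 *     }
 * }</pre>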
 *
 * @param <T> the type of input object.
 *
 * @author Haifeng Li
 */
public class DBSCAN<T> extends PartitionClustering {
    private static final long serialVersionUID = 2L;

    /**
     * The minimum number of points required to form a cluster.
     */
    public final double minPts;
    /**
     * The neighborhood radius.
     */
    public final double radius;
    /**
     * Data structure for neighborhood search.
     */
    private final RNNSearch<T, T> nns;
    /**
     * The flag if the point is a core point (at least minPts points are within radius).
     */
    private final boolean[] core;

    /**
     * Constructor.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @param nns the data structure for neighborhood search.
     * @param k the number of clusters.
     * @param y the cluster labels.
     * @param core the flag if the point is a core point.
     */
    public DBSCAN(int minPts, double radius, RNNSearch<T, T> nns, int k, int[] y, boolean[] core) {
        super(k, y);
        this.minPts = minPts;
        this.radius = radius;
        this.nns = nns;
        this.core = core;
    }

    /**
     * Clustering the data with KD-tree. DBSCAN is generally applied on
     * low-dimensional data. Therefore, a KD-tree can speed up the nearest
     * neighbor search a lot.
     * @param data the observations.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @return the model.
     */
    public static DBSCAN<double[]> fit(double[][] data, int minPts, double radius) {
        return fit(data, new KDTree<>(data, data), minPts, radius);
    }

    /**
     * Clustering the data.
     * @param data the observations.
     * @param distance the distance function.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @param <T> the data type.
     * @return the model.
     */
    public static <T> DBSCAN<T> fit(T[] data, Distance<T> distance, int minPts, double radius) {
        return fit(data, LinearSearch.of(data, distance), minPts, radius);
    }

    /**
     * Clustering the data.
     * @param data the observations.
     * @param nns the data structure for neighborhood search.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @param <T> the data type.
     * @return the model.
     */
    public static <T> DBSCAN<T> fit(T[] data, RNNSearch<T, T> nns, int minPts, double radius) {
        if (minPts < 1) {
            throw new IllegalArgumentException("Invalid minPts: " + minPts);
        }

        if (radius <= 0.0) {
            throw new IllegalArgumentException("Invalid radius: " + radius);
        }

        // The label for data samples in BFS queue.
        final int QUEUED = -2;
        // The label for unclassified data samples.
        final int UNDEFINED = -1;

        int k = 0;
        int n = data.length;
        boolean[] core = new boolean[n];
        int[] y = new int[n];
        Arrays.fill(y, UNDEFINED);
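
        // Each sample moves through the label states UNDEFINED -> QUEUED
        // (discovered and awaiting expansion) -> cluster id. A sample whose
        // own neighborhood is too sparse is first marked OUTLIER, but may
        // later be relabeled as a border point of a cluster that reaches it.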
        for (int i = 0; i < data.length; i++) {
            if (y[i] == UNDEFINED) {
                List<Neighbor<T, T>> neighbors = new ArrayList<>();
                nns.search(data[i], radius, neighbors);
                if (neighbors.size() < minPts) {
                    y[i] = OUTLIER;
                } else {
                    // Start a new cluster with i as a core point.
                    y[i] = k;
                    core[i] = true;

                    for (Neighbor<T, T> neighbor : neighbors) {
                        if (y[neighbor.index] == UNDEFINED) {
                            y[neighbor.index] = QUEUED;
                        }
                    }

                    // Expand the cluster by BFS over the neighbor list,
                    // which grows as new core points are discovered.
                    for (int j = 0; j < neighbors.size(); j++) {
                        Neighbor<T, T> neighbor = neighbors.get(j);
                        int index = neighbor.index;

                        if (y[index] == OUTLIER) {
                            // A border point: density-reachable but not a core point.
                            y[index] = k;
                        }

                        if (y[index] == UNDEFINED || y[index] == QUEUED) {
                            y[index] = k;

                            List<Neighbor<T, T>> secondaryNeighbors = new ArrayList<>();
                            nns.search(neighbor.key, radius, secondaryNeighbors);

                            if (secondaryNeighbors.size() >= minPts) {
                                core[neighbor.index] = true;
                                for (Neighbor<T, T> sn : secondaryNeighbors) {
                                    int label = y[sn.index];
                                    if (label == UNDEFINED) {
                                        y[sn.index] = QUEUED;
                                    }

                                    if (label == UNDEFINED || label == OUTLIER) {
                                        neighbors.add(sn);
                                    }
                                }
                            }
                        }
                    }

                    k++;
                }
            }
        }

        return new DBSCAN<>(minPts, radius, nns, k, y, core);
    }

    /**
     * Classifies a new observation.
     * @param x a new observation.
     * @return the cluster label. Note that it may be {@link #OUTLIER}.
     */
    public int predict(T x) {
        List<Neighbor<T, T>> neighbors = new ArrayList<>();
        nns.search(x, radius, neighbors);
        Collections.sort(neighbors);

        // Assign the label of the nearest core point within the radius, if any.
        for (Neighbor<T, T> neighbor : neighbors) {
            if (core[neighbor.index]) {
                return y[neighbor.index];
            }
        }

        return OUTLIER;
    }
}