
smile.clustering.DBSCAN Maven / Gradle / Ivy

/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 */

package smile.clustering;

import java.util.Collections;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import smile.neighbor.Neighbor;
import smile.neighbor.KDTree;
import smile.neighbor.LinearSearch;
import smile.neighbor.RNNSearch;
import smile.math.MathEx;
import smile.math.distance.Distance;

/**
 * Density-Based Spatial Clustering of Applications with Noise.
 * DBSCAN finds a number of clusters starting from the estimated density
 * distribution of corresponding nodes.
 * <p>
 * DBSCAN requires two parameters: radius (i.e. the neighborhood radius) and
 * the minimum number of points required to form a cluster (minPts). It
 * starts with an arbitrary point that has not been visited. This point's
 * neighborhood is retrieved, and if it contains a sufficient number of
 * points, a cluster is started. Otherwise, the point is labeled as noise.
 * Note that this point might later be found in a sufficiently sized
 * radius-environment of a different point and hence be made part of a
 * cluster.
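 * <p>
 * For example, a minimal sketch of fitting the model on 2-dimensional data;
 * the sample values, {@code minPts = 3} and {@code radius = 0.5} are
 * illustrative assumptions rather than recommended defaults, and the labels
 * are read from the inherited {@code y} field used by the constructor below:
 * <pre>{@code
 *     double[][] data = {
 *         {0.0, 0.1}, {0.1, 0.0}, {0.1, 0.1}, {0.0, 0.0},
 *         {5.0, 5.1}, {5.1, 5.0}, {5.1, 5.1}, {5.0, 5.0},
 *         {9.0, 9.0}   // isolated point, likely labeled as noise
 *     };
 *     DBSCAN<double[]> model = DBSCAN.fit(data, 3, 0.5);
 *     int[] labels = model.y;   // one label per point; noise gets OUTLIER
 * }</pre>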
 * <p>
 * If a point is found to be part of a cluster, its neighborhood is also
 * part of that cluster. Hence, all points that are found within the
 * neighborhood are added, as is their own neighborhood. This process
 * continues until the cluster is completely found. Then, a new unvisited
 * point is retrieved and processed, leading to the discovery of a further
 * cluster or noise.
 * <p>
 * DBSCAN visits each point of the database, possibly multiple times (e.g.,
 * as candidates to different clusters). For practical considerations,
 * however, the time complexity is mostly governed by the number of nearest
 * neighbor queries. DBSCAN executes exactly one such query for each point,
 * and if an indexing structure is used that executes such a neighborhood
 * query in O(log n), an overall runtime complexity of O(n log n) is
 * obtained.
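 * <p>
 * The {@code fit} overloads below mirror this trade-off; a sketch, with
 * {@code data} as in the example above, illustrative parameters, and assuming
 * Euclidean distance (via {@code smile.math.distance.EuclideanDistance}) for
 * the linear-scan variant:
 * <pre>{@code
 *     // KD-tree index: one O(log n) range query per point, O(n log n) overall.
 *     DBSCAN<double[]> fast = DBSCAN.fit(data, 10, 0.3);
 *     // Linear scan: one O(n) range query per point, O(n^2) in total.
 *     DBSCAN<double[]> slow = DBSCAN.fit(data,
 *             new smile.math.distance.EuclideanDistance(), 10, 0.3);
 * }</pre>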
 * <p>
 * DBSCAN has many advantages such as
 * <ul>
 * <li> DBSCAN does not need to know the number of clusters in the data
 *      a priori, as opposed to k-means.
 * <li> DBSCAN can find arbitrarily shaped clusters. It can even find clusters
 *      completely surrounded by (but not connected to) a different cluster.
 *      Due to the MinPts parameter, the so-called single-link effect
 *      (different clusters being connected by a thin line of points) is
 *      reduced.
 * <li> DBSCAN has a notion of noise. Outliers are labeled as
 *      Clustering.OUTLIER, which is Integer.MAX_VALUE.
 * <li> DBSCAN requires just two parameters and is mostly insensitive to the
 *      ordering of the points in the database. (Only points sitting on the
 *      edge of two different clusters might swap cluster membership if the
 *      ordering of the points is changed, and the cluster assignment is
 *      unique only up to isomorphism.)
 * </ul>
 * On the other hand, DBSCAN has the disadvantages of
 * <ul>
 * <li> In high dimensional space, the data are sparse everywhere
 *      because of the curse of dimensionality. Therefore, DBSCAN doesn't
 *      work well on high-dimensional data in general.
 * <li> DBSCAN does not respond well to data sets with varying densities.
 * </ul>
 *
 * <h2>References</h2>
 * <ol>
 * <li> Martin Ester, Hans-Peter Kriegel, Jorg Sander, Xiaowei Xu. A density-based
 *      algorithm for discovering clusters in large spatial databases with noise.
 *      KDD, 1996.</li>
 * <li> Jorg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu. Density-Based
 *      Clustering in Spatial Databases: The Algorithm GDBSCAN and Its
 *      Applications. 1998.</li>
 * </ol>
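 * <p>
 * A fitted model can also label a new observation via {@code predict}; a
 * minimal sketch (the query point is illustrative, and the {@code OUTLIER}
 * constant is assumed to be inherited from {@code PartitionClustering}):
 * <pre>{@code
 *     int label = model.predict(new double[] {0.05, 0.05});
 *     if (label == PartitionClustering.OUTLIER) {
 *         // not within the radius of any core point
 *     }
 * }</pre>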
 *
 * @param <T> the type of input object.
 *
 * @author Haifeng Li
 */
public class DBSCAN<T> extends PartitionClustering {
    private static final long serialVersionUID = 2L;

    /**
     * The minimum number of points required to form a cluster.
     */
    public final double minPts;
    /**
     * The neighborhood radius.
     */
    public final double radius;
    /**
     * Data structure for neighborhood search.
     */
    private final RNNSearch<T, T> nns;
    /**
     * The flag if the point is a core point (at least minPts points are within radius).
     */
    private final boolean[] core;

    /**
     * Constructor.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @param nns the data structure for neighborhood search.
     * @param k the number of clusters.
     * @param y the cluster labels.
     * @param core the flag if the point is a core point.
     */
    public DBSCAN(int minPts, double radius, RNNSearch<T, T> nns, int k, int[] y, boolean[] core) {
        super(k, y);
        this.minPts = minPts;
        this.radius = radius;
        this.nns = nns;
        this.core = core;
    }

    /**
     * Clustering the data with KD-tree. DBSCAN is generally applied on
     * low-dimensional data. Therefore, a KD-tree can speed up the nearest
     * neighbor search a lot.
     * @param data the observations.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @return the model.
     */
    public static DBSCAN<double[]> fit(double[][] data, int minPts, double radius) {
        return fit(data, new KDTree<>(data, data), minPts, radius);
    }

    /**
     * Clustering the data.
     * @param data the observations.
     * @param distance the distance function.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @param <T> the data type.
     * @return the model.
     */
    public static <T> DBSCAN<T> fit(T[] data, Distance<T> distance, int minPts, double radius) {
        return fit(data, LinearSearch.of(data, distance), minPts, radius);
    }

    /**
     * Clustering the data.
     * @param data the observations.
     * @param nns the data structure for neighborhood search.
     * @param minPts the minimum number of neighbors for a core data point.
     * @param radius the neighborhood radius.
     * @param <T> the data type.
     * @return the model.
     */
    public static <T> DBSCAN<T> fit(T[] data, RNNSearch<T, T> nns, int minPts, double radius) {
        if (minPts < 1) {
            throw new IllegalArgumentException("Invalid minPts: " + minPts);
        }

        if (radius <= 0.0) {
            throw new IllegalArgumentException("Invalid radius: " + radius);
        }

        // The label for data samples in BFS queue.
        final int QUEUED = -2;
        // The label for unclassified data samples.
        final int UNDEFINED = -1;

        int k = 0;
        int n = data.length;
        boolean[] core = new boolean[n];
        int[] y = new int[n];
        Arrays.fill(y, UNDEFINED);
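
        // Each sample moves through the label states UNDEFINED -> QUEUED
        // (discovered and awaiting expansion) -> cluster id. A sample whose
        // own neighborhood is too sparse is first marked OUTLIER, but may
        // later be relabeled as a border point of a cluster that reaches it.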
        for (int i = 0; i < data.length; i++) {
            if (y[i] == UNDEFINED) {
                List<Neighbor<T, T>> neighbors = new ArrayList<>();
                nns.search(data[i], radius, neighbors);
                if (neighbors.size() < minPts) {
                    y[i] = OUTLIER;
                } else {
                    // Start a new cluster with i as a core point.
                    y[i] = k;
                    core[i] = true;

                    for (Neighbor<T, T> neighbor : neighbors) {
                        if (y[neighbor.index] == UNDEFINED) {
                            y[neighbor.index] = QUEUED;
                        }
                    }

                    // Expand the cluster by BFS over the neighbor list,
                    // which grows as new core points are discovered.
                    for (int j = 0; j < neighbors.size(); j++) {
                        Neighbor<T, T> neighbor = neighbors.get(j);
                        int index = neighbor.index;

                        if (y[index] == OUTLIER) {
                            // A border point: density-reachable but not a core point.
                            y[index] = k;
                        }

                        if (y[index] == UNDEFINED || y[index] == QUEUED) {
                            y[index] = k;

                            List<Neighbor<T, T>> secondaryNeighbors = new ArrayList<>();
                            nns.search(neighbor.key, radius, secondaryNeighbors);

                            if (secondaryNeighbors.size() >= minPts) {
                                core[neighbor.index] = true;
                                for (Neighbor<T, T> sn : secondaryNeighbors) {
                                    int label = y[sn.index];
                                    if (label == UNDEFINED) {
                                        y[sn.index] = QUEUED;
                                    }

                                    if (label == UNDEFINED || label == OUTLIER) {
                                        neighbors.add(sn);
                                    }
                                }
                            }
                        }
                    }

                    k++;
                }
            }
        }

        return new DBSCAN<>(minPts, radius, nns, k, y, core);
    }

    /**
     * Classifies a new observation.
     * @param x a new observation.
     * @return the cluster label. Note that it may be {@link #OUTLIER}.
     */
    public int predict(T x) {
        List<Neighbor<T, T>> neighbors = new ArrayList<>();
        nns.search(x, radius, neighbors);
        Collections.sort(neighbors);

        // Assign the label of the nearest core point within the radius, if any.
        for (Neighbor<T, T> neighbor : neighbors) {
            if (core[neighbor.index]) {
                return y[neighbor.index];
            }
        }

        return OUTLIER;
    }
}