smile.clustering.DENCLUE
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.clustering;

import java.util.Arrays;
import java.util.stream.IntStream;
import smile.math.MathEx;

/**
 * DENsity CLUstering. The DENCLUE algorithm employs a cluster model based on
 * kernel density estimation. A cluster is defined by a local maximum of the
 * estimated density function. Observations going to the same local maximum
 * are put into the same cluster.
 * <p>
 * Clearly, DENCLUE doesn't work on data with uniform distribution. In high
 * dimensional space, the data always looks uniformly distributed because
 * of the curse of dimensionality. Therefore, DENCLUE doesn't work well on
 * high-dimensional data in general.
 *
 * <h2>References</h2>
 * <ol>
 * <li>A. Hinneburg and D. A. Keim. A general approach to clustering in large databases with noise. Knowledge and Information Systems, 5(4):387-415, 2003.</li>
 * <li>Alexander Hinneburg and Hans-Henning Gabriel. DENCLUE 2.0: Fast Clustering based on Kernel Density Estimation. IDA, 2007.</li>
 * </ol>
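 *
 * <p>
 * For illustration, a minimal usage sketch. The data values, sigma, the
 * sample count, the tolerance, and minPts below are made-up toy settings,
 * not recommendations:
 * <pre>{@code
 * double[][] data = {
 *     {0.0, 0.1}, {0.2, 0.0}, {0.1, 0.2},   // points near one density mode
 *     {5.0, 5.1}, {5.2, 5.0}, {5.1, 5.2}    // points near another mode
 * };
 * DENCLUE model = DENCLUE.fit(data, 1.0, 4, 1E-2, 2);
 * int label = model.predict(new double[] {0.1, 0.1});
 * }</pre>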
 *
 * @author Haifeng Li
 */
public class DENCLUE extends PartitionClustering {
    private static final long serialVersionUID = 2L;
    private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DENCLUE.class);

    /**
     * The tolerance of the hill-climbing procedure.
     */
    private final double tol;
    /**
     * The smooth parameter in the Gaussian kernel.
     */
    private final double sigma;
    /**
     * The density attractor of each observation.
     */
    public final double[][] attractors;
    /**
     * The radius of each density attractor.
     */
    private final double[] radius;
    /**
     * The samples selected by k-means, used in the iterations of hill climbing.
     */
    private final double[][] samples;

    /**
     * Constructor.
     * @param k the number of clusters.
     * @param attractors the density attractor of each observation.
     * @param radius the radius of each density attractor.
     * @param samples the samples used in the iterations of hill climbing.
     * @param sigma the smooth parameter in the Gaussian kernel. The user can
     *              choose sigma such that the number of density attractors is
     *              constant over a long interval of sigma.
     * @param y the cluster labels.
     * @param tol the tolerance of the hill-climbing procedure.
     */
    public DENCLUE(int k, double[][] attractors, double[] radius, double[][] samples, double sigma, int[] y, double tol) {
        super(k, y);
        this.attractors = attractors;
        this.radius = radius;
        this.samples = samples;
        this.sigma = sigma;
        this.tol = tol;
    }

    /**
     * Clustering data.
     * @param data the input data of which each row is an observation.
     * @param sigma the smooth parameter in the Gaussian kernel. The user can
     *              choose sigma such that the number of density attractors is
     *              constant over a long interval of sigma.
     * @param m the number of selected samples used in the iteration.
     *          This number should be much smaller than the number of
     *          observations to speed up the algorithm. It should also be
     *          large enough to capture sufficient information about the
     *          underlying distribution.
     * @return the model.
     */
    public static DENCLUE fit(double[][] data, double sigma, int m) {
        int n = data.length;
        return fit(data, sigma, m, 1E-2, Math.max(10, n/200));
    }

    /**
     * Clustering data.
     * @param data the input data of which each row is an observation.
     * @param sigma the smooth parameter in the Gaussian kernel. The user can
     *              choose sigma such that the number of density attractors is
     *              constant over a long interval of sigma.
     * @param m the number of selected samples used in the iteration.
     *          This number should be much smaller than the number of
     *          observations to speed up the algorithm. It should also be
     *          large enough to capture sufficient information about the
     *          underlying distribution.
     * @param tol the tolerance of the hill-climbing procedure.
     * @param minPts the minimum number of neighbors for a core attractor.
     * @return the model.
     */
    public static DENCLUE fit(double[][] data, double sigma, int m, double tol, int minPts) {
        if (sigma <= 0.0) {
            throw new IllegalArgumentException("Invalid standard deviation of Gaussian kernel: " + sigma);
        }

        if (m <= 0 || m > data.length) {
            throw new IllegalArgumentException("Invalid number of selected samples: " + m);
        }

        logger.info("Select {} samples by k-means", m);
        KMeans kmeans = KMeans.fit(data, m);
        double[][] samples = kmeans.centroids;

        int n = data.length;
        int d = data[0].length;

        double[][] attractors = new double[n][d];
        double[][] steps = new double[n][2];

        logger.info("Hill-climbing of density function for each observation");
        // forEach is a terminal operation, so the parallel stream executes eagerly.
        IntStream.range(0, n).parallel()
                .forEach(i -> climb(data[i], attractors[i], steps[i], samples, sigma, tol));

        if (Arrays.stream(attractors).flatMapToDouble(Arrays::stream).anyMatch(ai -> !Double.isFinite(ai))) {
            throw new IllegalStateException("Attractors contain NaN/infinity. sigma is likely too small.");
        }

        double[] radius = Arrays.stream(steps).mapToDouble(step -> step[0] + step[1]).toArray();
        double r = MathEx.mean(radius);
        if (!Double.isFinite(r)) {
            throw new IllegalStateException("The average of the last steps of hill-climbing is NaN/infinity. sigma is likely too small.");
        }

        logger.info("Clustering attractors with DBSCAN (radius = {})", r);
        DBSCAN<double[]> dbscan = DBSCAN.fit(attractors, minPts, r);
        return new DENCLUE(dbscan.k, attractors, radius, samples, sigma, dbscan.y, tol);
    }

    /**
     * Classifies a new observation.
     * @param x a new observation.
     * @return the cluster label. Note that it may be {@link #OUTLIER}.
     */
    public int predict(double[] x) {
        int d = attractors[0].length;
        if (x.length != d) {
            throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x.length, d));
        }

        double[] attractor = new double[d];
        double[] step = new double[2];
        climb(x, attractor, step, samples, sigma, tol);

        double r = step[0] + step[1];
        for (int i = 0; i < attractors.length; i++) {
            if (MathEx.distance(attractors[i], attractor) < radius[i] + r) {
                return y[i];
            }
        }

        return OUTLIER;
    }

    /**
     * The hill-climbing is started for each observation, which assigns the
     * observation to a local maximum (attractor).
     * @param x the observation to start with.
     * @param attractor the local maximum on output.
     * @param step the last k step sizes.
     * @param samples the samples used in kernel density estimation.
     * @param sigma the bandwidth of the Gaussian kernel.
     * @param tol the tolerance of the convergence test.
     * @return the local density of the attractor.
     */
    private static double climb(double[] x, double[] attractor, double[] step, double[][] samples, double sigma, double tol) {
        int m = samples.length;
        int d = x.length;
        int k = step.length;

        double p = 1.0;
        double h = Math.pow(2 * Math.PI * sigma, d/2.0);
        double gamma = -0.5 / (sigma * sigma);

        // Don't overwrite the original observation.
        x = x.clone();
        double[] w = new double[m];
        double diff = Double.MAX_VALUE;
        for (int iter = 0; iter < k || diff > tol; iter++) {
            // Gaussian kernel weights of the samples at the current position.
            for (int i = 0; i < m; i++) {
                w[i] = Math.exp(gamma * MathEx.squaredDistance(x, samples[i]));
            }

            // The next position is the weighted mean of the samples.
            Arrays.fill(attractor, 0.0);
            for (int i = 0; i < m; i++) {
                double wi = w[i];
                double[] xi = samples[i];
                for (int j = 0; j < d; j++) {
                    attractor[j] += wi * xi[j];
                }
            }

            double W = MathEx.sum(w);
            for (int j = 0; j < d; j++) {
                attractor[j] /= W;
            }

            // Convergence test on the relative change of the density estimate.
            double prob = W / (m * h);
            diff = Math.abs(prob - p) / p;
            p = prob;

            step[iter % k] = MathEx.distance(attractor, x);
            System.arraycopy(attractor, 0, x, 0, d);
        }

        return p;
    }
}
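
The inner loop of climb() above is the DENCLUE 2.0 fixed-point update: with Gaussian weights w_i = exp(-||x - x_i||^2 / (2*sigma^2)) over the k-means samples x_i, the next position is the weighted mean x' = (sum_i w_i * x_i) / (sum_i w_i), iterated until the relative change of the density estimate drops below tol. The sketch below re-implements a single update step outside of Smile, purely for illustration; the class and method names (MeanShiftDemo, meanShiftStep) are made up for this example and are not part of the library.

import java.util.Arrays;

public class MeanShiftDemo {
    /**
     * One DENCLUE 2.0 update step: move x to the Gaussian-kernel weighted
     * mean of the samples and return the new position.
     */
    static double[] meanShiftStep(double[] x, double[][] samples, double sigma) {
        int d = x.length;
        double gamma = -0.5 / (sigma * sigma);
        double[] next = new double[d];
        double W = 0.0;
        for (double[] xi : samples) {
            // w = exp(-||x - xi||^2 / (2*sigma^2))
            double dist2 = 0.0;
            for (int j = 0; j < d; j++) {
                double delta = x[j] - xi[j];
                dist2 += delta * delta;
            }
            double w = Math.exp(gamma * dist2);
            W += w;
            for (int j = 0; j < d; j++) {
                next[j] += w * xi[j];
            }
        }
        for (int j = 0; j < d; j++) {
            next[j] /= W;
        }
        return next;
    }

    public static void main(String[] args) {
        // Toy samples with a single density mode near (1, 1).
        double[][] samples = {{0.9, 1.1}, {1.1, 0.9}, {1.0, 1.0}};
        double[] x = {3.0, 3.0};
        // A few fixed-point iterations; x moves toward the mode.
        for (int iter = 0; iter < 20; iter++) {
            x = meanShiftStep(x, samples, 1.0);
        }
        System.out.println(Arrays.toString(x));  // approximately [1.0, 1.0]
    }
}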