All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.commons.math3.ml.clustering.DBSCANClusterer Maven / Gradle / Ivy

Go to download

The Apache Commons Math project is a library of lightweight, self-contained mathematics and statistics components addressing the most common practical problems not immediately available in the Java programming language or commons-lang.

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.math3.ml.clustering;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.math3.exception.NotPositiveException;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.ml.distance.DistanceMeasure;
import org.apache.commons.math3.ml.distance.EuclideanDistance;
import org.apache.commons.math3.util.MathUtils;

/**
 * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
 * 

* The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. * a point p is density connected to another point q, if there exists a chain of * points pi, with i = 1 .. n and p1 = p and pn = q, * such that each pair <pi, pi+1> is directly density-reachable. * A point q is directly density-reachable from point p if it is in the ε-neighborhood * of this point. *

* Any point that is not density-reachable from a formed cluster is treated as noise, and * will thus not be present in the result. *

* The algorithm requires two parameters: *

    *
  • eps: the distance that defines the ε-neighborhood of a point *
  • minPoints: the minimum number of density-connected points required to form a cluster *
* * @param type of the points to cluster * @see DBSCAN (wikipedia) * @see * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise * @since 3.2 */ public class DBSCANClusterer extends Clusterer { /** Maximum radius of the neighborhood to be considered. */ private final double eps; /** Minimum number of points needed for a cluster. */ private final int minPts; /** Status of a point during the clustering process. */ private enum PointStatus { /** The point has is considered to be noise. */ NOISE, /** The point is already part of a cluster. */ PART_OF_CLUSTER } /** * Creates a new instance of a DBSCANClusterer. *

* The euclidean distance will be used as default distance measure. * * @param eps maximum radius of the neighborhood to be considered * @param minPts minimum number of points needed for a cluster * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} */ public DBSCANClusterer(final double eps, final int minPts) throws NotPositiveException { this(eps, minPts, new EuclideanDistance()); } /** * Creates a new instance of a DBSCANClusterer. * * @param eps maximum radius of the neighborhood to be considered * @param minPts minimum number of points needed for a cluster * @param measure the distance measure to use * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} */ public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure) throws NotPositiveException { super(measure); if (eps < 0.0d) { throw new NotPositiveException(eps); } if (minPts < 0) { throw new NotPositiveException(minPts); } this.eps = eps; this.minPts = minPts; } /** * Returns the maximum radius of the neighborhood to be considered. * @return maximum radius of the neighborhood */ public double getEps() { return eps; } /** * Returns the minimum number of points needed for a cluster. * @return minimum number of points needed for a cluster */ public int getMinPts() { return minPts; } /** * Performs DBSCAN cluster analysis. 
* * @param points the points to cluster * @return the list of clusters * @throws NullArgumentException if the data points are null */ @Override public List> cluster(final Collection points) throws NullArgumentException { // sanity checks MathUtils.checkNotNull(points); final List> clusters = new ArrayList>(); final Map visited = new HashMap(); for (final T point : points) { if (visited.get(point) != null) { continue; } final List neighbors = getNeighbors(point, points); if (neighbors.size() >= minPts) { // DBSCAN does not care about center points final Cluster cluster = new Cluster(); clusters.add(expandCluster(cluster, point, neighbors, points, visited)); } else { visited.put(point, PointStatus.NOISE); } } return clusters; } /** * Expands the cluster to include density-reachable items. * * @param cluster Cluster to expand * @param point Point to add to cluster * @param neighbors List of neighbors * @param points the data set * @param visited the set of already visited points * @return the expanded cluster */ private Cluster expandCluster(final Cluster cluster, final T point, final List neighbors, final Collection points, final Map visited) { cluster.addPoint(point); visited.put(point, PointStatus.PART_OF_CLUSTER); List seeds = new ArrayList(neighbors); int index = 0; while (index < seeds.size()) { final T current = seeds.get(index); PointStatus pStatus = visited.get(current); // only check non-visited points if (pStatus == null) { final List currentNeighbors = getNeighbors(current, points); if (currentNeighbors.size() >= minPts) { seeds = merge(seeds, currentNeighbors); } } if (pStatus != PointStatus.PART_OF_CLUSTER) { visited.put(current, PointStatus.PART_OF_CLUSTER); cluster.addPoint(current); } index++; } return cluster; } /** * Returns a list of density-reachable neighbors of a {@code point}. 
* * @param point the point to look for * @param points possible neighbors * @return the List of neighbors */ private List getNeighbors(final T point, final Collection points) { final List neighbors = new ArrayList(); for (final T neighbor : points) { if (point != neighbor && distance(neighbor, point) <= eps) { neighbors.add(neighbor); } } return neighbors; } /** * Merges two lists together. * * @param one first list * @param two second list * @return merged lists */ private List merge(final List one, final List two) { final Set oneSet = new HashSet(one); for (T item : two) { if (!oneSet.contains(item)) { one.add(item); } } return one; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy