/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile. If not, see <https://www.gnu.org/licenses/>.
 */

package smile.vq;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Optional;
import smile.math.MathEx;
import smile.util.IntPair;

/**
 * Balanced Iterative Reducing and Clustering using Hierarchies. BIRCH performs
 * hierarchical clustering over particularly large data sets. An advantage of
 * BIRCH is its ability to incrementally and dynamically cluster incoming,
 * multi-dimensional metric data points in an attempt to produce the best
 * quality clustering for a given set of resources (memory and time constraints).
 * <p>
 * BIRCH has several advantages. For example, each clustering decision is made
 * without scanning all data points and currently existing clusters. It
 * exploits the observation that the data space is not usually uniformly
 * occupied and not every data point is equally important. It makes full use
 * of available memory to derive the finest possible sub-clusters while
 * minimizing I/O costs. It is also an incremental method that does not
 * require the whole data set in advance.
 * <p>
 * This implementation produces a clustering in three steps. The first step
 * builds a CF (clustering feature) tree by a single scan of the database.
 * The second step clusters the leaves of the CF tree by hierarchical
 * clustering. Then the user can use the learned model to classify input
 * data in the final step. In total, the database is scanned twice.
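 * <p>
 * A minimal usage sketch of the first and final steps (the branching factor,
 * leaf capacity, and radius threshold below are illustrative values rather
 * than tuned recommendations, and {@code data} stands for a {@code double[][]}
 * of d-dimensional samples):
 * <pre>{@code
 *     // build the CF tree with one pass over the data
 *     BIRCH birch = new BIRCH(data[0].length, 50, 50, 1.5);
 *     for (double[] xi : data) {
 *         birch.update(xi);
 *     }
 *
 *     // the leaf centroids summarize the data and can be fed to a
 *     // conventional clustering algorithm in the second step
 *     double[][] centroids = birch.centroids();
 *
 *     // map an observation to the centroid of its nearest leaf CF
 *     double[] v = birch.quantize(data[0]);
 * }</pre>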

 * <h2>References</h2>
 * <ol>
 * <li> Tian Zhang, Raghu Ramakrishnan, and Miron Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. SIGMOD, 1996.</li>
 * </ol>
 *
 * @see smile.clustering.HierarchicalClustering
 *
 * @author Haifeng Li
 */
public class BIRCH implements VectorQuantizer {
    private static final long serialVersionUID = 2L;

    /**
     * The branching factor of non-leaf nodes.
     */
    public final int B;
    /**
     * The maximum number of CF entries in the leaf nodes.
     */
    public final int L;
    /**
     * The maximum radius of a sub-cluster.
     */
    public final double T;
    /**
     * The dimensionality of data.
     */
    public final int d;
    /**
     * The root of CF tree.
     */
    private Node root;

    /**
     * The Clustering Feature (CF) vector of a cluster is defined as a
     * triple: CF = (N, LS, SS), i.e. the number of observations, their
     * linear sum, and their square sum.
     */
    private class ClusteringFeature implements Serializable {
        /** The number of observations. */
        private int n;
        /** The sum of observations. */
        private final double[] sum = new double[d];
        /** The square sum of observations. */
        private final double[] ss = new double[d];

        /**
         * Constructor.
         *
         * @param x the first observation added to this CF.
         */
        public ClusteringFeature(double[] x) {
            n = 1;
            System.arraycopy(x, 0, sum, 0, d);
            for (int i = 0; i < d; i++) {
                ss[i] = x[i] * x[i];
            }
        }

        /**
         * Constructor.
         *
         * @param clusters sub-clusters.
         */
        public ClusteringFeature(ClusteringFeature... clusters) {
            n = 0;
            for (ClusteringFeature cluster : clusters) {
                n += cluster.n;
                for (int i = 0; i < d; i++) {
                    sum[i] += cluster.sum[i];
                    ss[i] += cluster.ss[i];
                }
            }
        }

        /** Returns the centroid of CF. */
        public double[] centroid() {
            double[] centroid = new double[d];
            for (int i = 0; i < d; i++) {
                centroid[i] = sum[i] / n;
            }
            return centroid;
        }

        /** Returns the radius of CF. */
        public double radius() {
            double r = 0.0;
            for (int i = 0; i < d; i++) {
                double mu = sum[i] / n;
                r += ss[i] / n - mu * mu;
            }
            return Math.sqrt(r);
        }

        /** Returns the radius of CF with an additional observation. */
        public double radius(double[] x) {
            int n1 = n + 1;
            double r = 0.0;
            for (int i = 0; i < d; i++) {
                double mu = (sum[i] + x[i]) / n1;
                r += (ss[i] + x[i] * x[i]) / n1 - mu * mu;
            }
            return Math.sqrt(r);
        }

        /**
         * Adds an observation to the CF without checking the radius
         * threshold or splitting.
         */
        public void update(double[] x) {
            n = n + 1;
            for (int i = 0; i < d; i++) {
                sum[i] += x[i];
                ss[i] += x[i] * x[i];
            }
        }

        /**
         * Adds an observation to the CF. If the radius of the CF with the
         * additional observation would exceed the radius threshold, the CF
         * is not updated and a new CF containing the observation is returned.
         */
        public Optional<ClusteringFeature> add(double[] x) {
            if (radius(x) > T) {
                return Optional.of(new ClusteringFeature(x));
            }

            update(x);
            return Optional.empty();
        }

        /**
         * Returns the distance between x and the CF centroid.
         */
        public double distance(double[] x) {
            double dist = 0.0;
            for (int i = 0; i < d; i++) {
                double diff = sum[i] / n - x[i];
                dist += diff * diff;
            }
            return Math.sqrt(dist);
        }

        /**
         * Returns the distance between CF centroids.
         */
        public double distance(ClusteringFeature o) {
            double dist = 0.0;
            for (int i = 0; i < d; i++) {
                double diff = sum[i] / n - o.sum[i] / o.n;
                dist += diff * diff;
            }
            return Math.sqrt(dist);
        }
    }

    /** The abstract node of CF tree. */
    private abstract class Node implements Serializable {
        /**
         * The clustering feature of observations in the node.
         */
        protected ClusteringFeature cluster;

        /** Constructor. */
        public Node(ClusteringFeature... clusters) {
            cluster = new ClusteringFeature(clusters);
        }

        /** Constructor. */
        public Node(Node... nodes) {
            ClusteringFeature[] clusters = Arrays.stream(nodes).map(node -> node.cluster).toArray(ClusteringFeature[]::new);
            cluster = new ClusteringFeature(clusters);
        }

        /**
         * Returns the leaf CF closest to the given observation.
         */
        public abstract ClusteringFeature nearest(double[] x);

        /**
         * Adds a new observation to the node. If the node is split,
         * returns the new sister node.
         */
        public abstract Optional<Node> add(double[] x);

        /**
         * Calculates the distance between x and the CF center.
         */
        public double distance(double[] x) {
            return cluster.distance(x);
        }

        /** Pair-wise distance. */
        public double[][] pdist(ClusteringFeature[] clusters) {
            int k = clusters.length;
            double[][] dist = new double[k][k];
            for (int i = 0; i < k; i++) {
                for (int j = i + 1; j < k; j++) {
                    dist[i][j] = clusters[i].distance(clusters[j]);
                    dist[j][i] = dist[i][j];
                }
            }
            return dist;
        }

        /** Pair-wise distance. */
        public double[][] pdist(Node[] nodes) {
            ClusteringFeature[] clusters = Arrays.stream(nodes).map(node -> node.cluster).toArray(ClusteringFeature[]::new);
            return pdist(clusters);
        }
    }

    /**
     * The internal node of CF tree. A CF tree is a height balanced tree.
     */
    private class InternalNode extends Node {
        /**
         * The children nodes.
         */
        private final Node[] children;
        /**
         * The number of children.
         */
        private int k;

        /**
         * Constructor.
         */
        public InternalNode(Node... nodes) {
            super(nodes);
            k = nodes.length;
            children = new Node[B];
            System.arraycopy(nodes, 0, children, 0, nodes.length);
        }

        @Override
        public ClusteringFeature nearest(double[] x) {
            int index = 0;
            double nearest = children[0].distance(x);

            // find the closest child node to this data point
            for (int i = 1; i < k; i++) {
                double dist = children[i].distance(x);
                if (dist < nearest) {
                    index = i;
                    nearest = dist;
                }
            }

            return children[index].nearest(x);
        }

        @Override
        public Optional<Node> add(double[] x) {
            int index = 0;
            double nearest = children[0].distance(x);

            // find the closest child node to this data point
            for (int i = 1; i < k; i++) {
                double dist = children[i].distance(x);
                if (dist < nearest) {
                    index = i;
                    nearest = dist;
                }
            }

            Optional<Node> sister = children[index].add(x);
            if (sister.isPresent()) {
                if (k < B) {
                    children[k++] = sister.get();
                } else {
                    return Optional.of(split(sister.get()));
                }
            }

            cluster.update(x);
            return Optional.empty();
        }

        /**
         * Splits the node and returns a new sister node to add into the parent.
         */
        private Node split(Node node) {
            Node[] nodes = new Node[B+1];
            System.arraycopy(children, 0, nodes, 0, B);
            nodes[B] = node;

            double[][] dist = pdist(nodes);
            IntPair farthest = MathEx.whichMax(dist);

            k = 0;
            int n = 0;
            Node[] sister = new Node[B];
            for (int i = 0; i <= B; i++) {
                if (dist[i][farthest.i] < dist[i][farthest.j]) {
                    children[k++] = nodes[i];
                } else {
                    sister[n++] = nodes[i];
                }
            }

            for (int i = k; i < B; i++) {
                this.children[i] = null;
            }

            this.cluster = new ClusteringFeature(Arrays.stream(children).limit(k).map(child -> child.cluster).toArray(ClusteringFeature[]::new));
            return new InternalNode(Arrays.copyOf(sister, n));
        }
    }

    /**
     * The leaf node of CF tree.
     */
    private class Leaf extends Node {
        private final ClusteringFeature[] clusters;
        private int k;

        /**
         * Constructor.
         */
        public Leaf(ClusteringFeature... clusters) {
            super(clusters);
            k = clusters.length;
            this.clusters = new ClusteringFeature[L];
            System.arraycopy(clusters, 0, this.clusters, 0, clusters.length);
        }

        /**
         * Constructor.
         */
        public Leaf(double[] x) {
            this(new ClusteringFeature(x));
        }

        @Override
        public ClusteringFeature nearest(double[] x) {
            int index = 0;
            double nearest = clusters[0].distance(x);

            // find the closest CF entry to this data point
            for (int i = 1; i < k; i++) {
                double dist = clusters[i].distance(x);
                if (dist < nearest) {
                    index = i;
                    nearest = dist;
                }
            }

            return clusters[index];
        }

        @Override
        public Optional<Node> add(double[] x) {
            ClusteringFeature cluster = nearest(x);
            Optional<ClusteringFeature> sister = cluster.add(x);
            if (sister.isPresent()) {
                if (k < L) {
                    clusters[k++] = sister.get();
                } else {
                    return Optional.of(split(sister.get()));
                }
            }

            this.cluster.update(x);
            return Optional.empty();
        }

        /**
         * Splits the node and returns a new sister node.
         */
        private Node split(ClusteringFeature cluster) {
            ClusteringFeature[] clusters = new ClusteringFeature[L+1];
            System.arraycopy(this.clusters, 0, clusters, 0, L);
            clusters[L] = cluster;

            double[][] dist = pdist(clusters);
            IntPair farthest = MathEx.whichMax(dist);

            k = 0;
            int n = 0;
            ClusteringFeature[] sister = new ClusteringFeature[L];
            for (int i = 0; i <= L; i++) {
                if (dist[i][farthest.i] < dist[i][farthest.j]) {
                    this.clusters[k++] = clusters[i];
                } else {
                    sister[n++] = clusters[i];
                }
            }

            for (int i = k; i < L; i++) {
                this.clusters[i] = null;
            }

            this.cluster = new ClusteringFeature(Arrays.copyOf(this.clusters, k));
            return new Leaf(Arrays.copyOf(sister, n));
        }
    }

    /**
     * Constructor.
     * @param d the dimensionality of data.
     * @param B the branching factor of non-leaf nodes, i.e. the maximum
     *          number of children nodes.
     * @param L the maximum number of CF entries in the leaf nodes.
     * @param T the maximum radius of a sub-cluster.
     */
    public BIRCH(int d, int B, int L, double T) {
        this.d = d;
        this.B = B;
        this.L = L;
        this.T = T;
    }

    @Override
    public void update(double[] x) {
        if (root == null) {
            root = new Leaf(x);
        } else {
            Optional<Node> sister = root.add(x);
            sister.ifPresent(child -> root = new InternalNode(root, child));
        }
    }

    @Override
    public double[] quantize(double[] x) {
        ClusteringFeature cluster = root.nearest(x);
        return cluster.centroid();
    }

    /**
     * Returns the cluster centroids of leaf nodes.
     * @return the cluster centroids of leaf nodes.
     */
    public double[][] centroids() {
        ArrayList<double[]> list = new ArrayList<>();
        centroids(root, list);
        return list.toArray(new double[list.size()][]);
    }

    /** Collects the centroids of leaf nodes in the subtree. */
    private void centroids(Node node, ArrayList<double[]> list) {
        if (node instanceof Leaf) {
            Leaf leaf = (Leaf) node;
            for (int i = 0; i < leaf.k; i++) list.add(leaf.clusters[i].centroid());
        } else {
            InternalNode parent = (InternalNode) node;
            for (int i = 0; i < parent.k; i++) {
                centroids(parent.children[i], list);
            }
        }
    }
}



