All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.clustering.NeuralMap Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.clustering;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import smile.clustering.linkage.Linkage;
import smile.clustering.linkage.UPGMALinkage;
import smile.sort.HeapSelect;
import smile.math.Math;
import smile.stat.distribution.GaussianDistribution;

/**
 * NeuralMap is an efficient competitive learning algorithm inspired by growing
 * neural gas and BIRCH. Like growing neural gas, NeuralMap has the ability to
 * add and delete neurons with competitive Hebbian learning. Edges exist between
 * neurons close to each other. Such edges are intended place holders for
 * localized data distribution. Such edges also help to locate distinct clusters
 * (those clusters are not connected by edges). NeuralMap employs Locality-Sensitive
 * Hashing to speedup the learning while BIRCH uses balanced CF trees.
 *
 * @see NeuralGas
 * @see GrowingNeuralGas
 * @see BIRCH
 * 
 * @author Haifeng Li
 */
public class NeuralMap implements Clustering<double[]> {
    
    /**
     * The neurons in the network.
     */
    public static class Neuron {
        /**
         * The number of samples associated with this neuron.
         */
        public int n = 1;
        /**
         * The cluster label.
         */
        public int y = OUTLIER;
        /**
         * Reference vector.
         */
        public final double[] w;
        /**
         * Connected neighbors.
         */
        public final LinkedList<Neuron> neighbors = new LinkedList<Neuron>();

        /**
         * Constructor.
         * @param w the reference vector. The array is stored as is (not copied)
         * and is updated in place as the network learns.
         */
        public Neuron(double[] w) {
            this.w = w;
        }
    }

    /**
     * The object encapsulates the results of nearest neighbor search.
     * Neighbors are ordered by their distance to the query.
     */
    static class Neighbor implements Comparable<Neighbor> {

        /**
         * The neighbor neuron.
         */
        Neuron neuron;
        /**
         * The distance between the query and the neighbor.
         */
        double distance;

        /**
         * Constructor.
         * @param neuron the neighbor neuron.
         * @param distance the distance between the query and the neighbor.
         */
        Neighbor(Neuron neuron, double distance) {
            this.neuron = neuron;
            this.distance = distance;
        }

        @Override
        public int compareTo(Neighbor o) {
            // Double.compare gives a total order (also for NaN and infinities),
            // unlike casting the sign of a subtraction.
            return Double.compare(distance, o.distance);
        }
    }

    /**
     * Locality-Sensitive Hashing (LSH) is an algorithm for solving the
     * (approximate/exact) Nearest Neighbor Search in high dimensional spaces
     * by performing probabilistic dimension reduction of data. The basic idea
     * is to hash the input items so that similar items are mapped to the same
     * buckets with high probability (the number of buckets being much smaller
     * than the universe of possible input items). This class implements a
     * space-efficient LSH algorithm in Euclidean spaces.
     */
    class LSH {

        /**
         * The hash function for data in Euclidean spaces.
         */
        class Hash {

            /**
             * The object in the hash table.
             */
            class Item {

                /**
                 * The bucket id given by the universal bucket hashing.
                 */
                final int bucket;
                /**
                 * The neuron object.
                 */
                final Neuron neuron;

                /**
                 * Constructor.
                 * @param bucket the bucket id given by the universal bucket hashing.
                 * @param neuron the neuron stored in the bucket.
                 */
                Item(int bucket, Neuron neuron) {
                    this.bucket = bucket;
                    this.neuron = neuron;
                }
            }
            /**
             * The random vectors with entries chosen independently from a Gaussian
             * distribution.
             */
            final double[][] a;
            /**
             * Real numbers chosen uniformly from the range [0, w].
             */
            final double[] b;
            /**
             * Hash table. Slots are created lazily on first insertion, so a
             * slot may be null.
             */
            final LinkedList<Item>[] table;

            /**
             * Constructor. Draws the k random projections and offsets and
             * allocates an empty table of H slots.
             */
            @SuppressWarnings("unchecked")
            Hash() {
                a = new double[k][d];
                b = new double[k];

                for (int i = 0; i < k; i++) {
                    for (int j = 0; j < d; j++) {
                        a[i][j] = GaussianDistribution.getInstance().rand();
                    }

                    b[i] = Math.random(0, w);
                }

                // Generic array creation is not allowed; the cast is safe
                // because only LinkedList<Item> instances are ever stored.
                table = (LinkedList<Item>[]) new LinkedList[H];
            }

            /**
             * Returns the raw hash value of given vector x.
             * @param x the vector to be hashed.
             * @param m the m-th hash function to be employed.
             * @return the raw hash value.
             */
            double hash(double[] x, int m) {
                double r = b[m];
                for (int j = 0; j < d; j++) {
                    r += a[m][j] * x[j];
                }
                return r / w;
            }

            /**
             * Apply hash functions on given vector x.
             * @param x the vector to be hashed.
             * @return the bucket of hash table for given vector x, in the
             * range [0, P).
             */
            int hash(double[] x) {
                long r = 0;
                for (int i = 0; i < k; i++) {
                    int hi = (int) Math.floor(hash(x, i));
                    // Widen before multiplying: int * int would silently
                    // overflow. Reducing modulo P at every step keeps the
                    // accumulator well inside the range of long.
                    r = (r + (long) c[i] * hi) % P;
                }

                // |r| < P here, so the narrowing cast is lossless.
                int h = (int) r;
                if (h < 0) {
                    h += P;
                }

                return h;
            }

            /**
             * Insert an item into the hash table.
             * @param neuron the neuron to insert, hashed by its current
             * reference vector.
             */
            void add(Neuron neuron) {
                int bucket = hash(neuron.w);
                int i = bucket % H;

                if (table[i] == null) {
                    table[i] = new LinkedList<Item>();
                }

                table[i].add(new Item(bucket, neuron));
            }
        }
        
        /**
         * Hash functions.
         */
        final Hash[] hash;
        /**
         * The size of hash table.
         */
        final int H;
        /**
         * The number of random projection hash functions.
         */
        final int k;
        /**
         * The hash function is defined as floor((a * x + b) / w). The value
         * of w determines the bucket interval.
         */
        final double w;
        /**
         * The random integer used for universal bucket hashing.
         */
        final int[] c;
        /**
         * The prime number in universal bucket hashing.
         */
        final int P = 2147483647;

        /**
         * Constructor.
         * @param L the number of hash tables.
         * @param k the number of random projection hash functions.
         * @param w the bucket interval.
         */
        LSH(int L, int k, double w) {
            this(L, k, w, 1017881);
        }

        /**
         * Constructor.
         * @param L the number of hash tables.
         * @param k the number of random projection hash functions.
         * @param w the bucket interval.
         * @param H the number of buckets of hash tables.
         */
        LSH(int L, int k, double w, int H) {
            this.k = k;
            this.w = w;
            this.H = H;

            hash = new Hash[L];

            c = new int[k];
            for (int i = 0; i < c.length; i++) {
                c[i] = Math.randomInt(P);
            }

            for (int i = 0; i < L; i++) {
                hash[i] = new Hash();
            }
        }

        /**
         * Insert a neuron to the hash table.
         * @param neuron the neuron to insert into every hash table.
         */
        void add(Neuron neuron) {
            for (int i = 0; i < hash.length; i++) {
                hash[i].add(neuron);
            }
        }

        /**
         * Remove a neuron from the hash table. The neuron is located by
         * hashing its current reference vector, so this must be called
         * before the reference vector is modified. Removing a neuron that
         * is not present is a no-op.
         * @param neuron the neuron to remove from every hash table.
         */
        void remove(Neuron neuron) {
            for (int i = 0; i < hash.length; i++) {
                int bucket = hash[i].hash(neuron.w);

                LinkedList<Hash.Item> bin = hash[i].table[bucket % H];
                if (bin != null) {
                    // Iterator.remove unlinks in place instead of scanning
                    // the list a second time.
                    for (Iterator<Hash.Item> iter = bin.iterator(); iter.hasNext(); ) {
                        Hash.Item e = iter.next();
                        if (e.bucket == bucket && e.neuron == neuron) {
                            iter.remove();
                            break;
                        }
                    }
                }
            }
        }

        /**
         * Returns the nearest neighbor of x.
         * @param x the query vector.
         * @return the nearest neuron found in the buckets x hashes to. If no
         * candidate is found, the returned object has a null neuron and a
         * distance of Double.MAX_VALUE.
         */
        Neighbor nearest(double[] x) {
            Neighbor neighbor = new Neighbor(null, Double.MAX_VALUE);

            for (int i = 0; i < hash.length; i++) {
                int bucket = hash[i].hash(x);
                LinkedList<Hash.Item> bin = hash[i].table[bucket % H];
                if (bin != null) {
                    for (Hash.Item e : bin) {
                        if (e.bucket == bucket) {
                            double distance = Math.distance(x, e.neuron.w);
                            if (distance < neighbor.distance) {
                                neighbor.distance = distance;
                                neighbor.neuron = e.neuron;
                            }
                        }
                    }
                }
            }

            return neighbor;
        }

        /**
         * Returns the k-nearest neighbors of x.
         * @param x the query vector.
         * @param neighbors the output array, used as the backing store of
         * the selection heap.
         * @return the number of candidates that were accepted into the heap.
         * Replacements are counted too, so the value may exceed
         * neighbors.length.
         */
        int knn(double[] x, Neighbor[] neighbors) {
            int hit = 0;
            HeapSelect<Neighbor> heap = new HeapSelect<Neighbor>(neighbors);

            for (int i = 0; i < hash.length; i++) {
                int bucket = hash[i].hash(x);
                LinkedList<Hash.Item> bin = hash[i].table[bucket % H];
                if (bin != null) {
                    for (Hash.Item e : bin) {
                        if (e.bucket == bucket) {
                            // The same neuron may be found through several
                            // hash tables; skip it if already selected.
                            boolean existed = false;
                            for (Neighbor n : neighbors) {
                                if (n != null && e.neuron == n.neuron) {
                                    existed = true;
                                    break;
                                }
                            }

                            if (!existed) {
                                double distance = Math.distance(x, e.neuron.w);
                                Neighbor top = heap.peek();
                                if (top == null || distance < top.distance) {
                                    heap.add(new Neighbor(e.neuron, distance));
                                    hit++;
                                }
                            }
                        }
                    }
                }
            }

            return hit;
        }
    }
    
    /**
     * The dimensionality of signals.
     */
    private final int d;
    /**
     * The distance radius to activate a neuron for a given signal.
     */
    private final double r;
    /**
     * The fraction to update nearest neuron.
     */
    private double epsBest = 0.05;
    /**
     * The fraction to update neighbors of nearest neuron.
     */
    private double epsNeighbor = 0.0006;
    /**
     * Neurons in the neural network.
     */
    private final LSH lsh;
    /**
     * The list of neurons.
     */
    private final List<Neuron> neurons = new ArrayList<Neuron>();

    /**
     * Constructor.
     * @param d the dimensionality of signals.
     * @param r the distance radius to activate a neuron for a given signal.
     * @param epsBest the fraction to update activated neuron.
     * @param epsNeighbor the fraction to update neighbors of activated neuron.
     * @param L the number of hash tables.
     * @param k the number of random projection hash functions.
     */
    public NeuralMap(int d, double r, double epsBest, double epsNeighbor, int L, int k) {
        this.d = d;
        this.r = r;
        this.epsBest = epsBest;
        this.epsNeighbor = epsNeighbor;
        // d must be assigned before the LSH is built: the hash functions
        // read it to size their projection vectors.
        lsh = new LSH(L, k, 4 * r);
    }

    /**
     * Update the network with a new signal.
     * @param x the new signal. It is not modified; a copy is stored when a
     * new neuron is created.
     * @throws IllegalArgumentException if x is null or its length differs
     * from the dimensionality of the network.
     */
    public void update(double[] x) {
        if (x == null || x.length != d) {
            throw new IllegalArgumentException("Invalid input dimension: "
                    + (x == null ? "null" : String.valueOf(x.length)) + ", expected " + d);
        }

        // Find the nearest (s1) and second nearest (s2) neuron to x.
        // With two or more hits the heap keeps the farther of the two at
        // index 0, so top2[1] is the nearest and top2[0] the second nearest.
        Neighbor[] top2 = new Neighbor[2];
        int k = lsh.knn(x, top2);

        double dist = Double.MAX_VALUE;
        Neuron neuron = null;
        if (k == 0) {
            neuron = new Neuron(x.clone());
            lsh.add(neuron);
            neurons.add(neuron);
            return;
        } else if (k == 1) {
            dist = top2[0].distance;

            if (dist <= r) {
                neuron = top2[0].neuron;
                neuron.n++;
                // Take the neuron out of the index while its reference
                // vector (and therefore its hash) changes.
                lsh.remove(neuron);
                for (int i = 0; i < d; i++) {
                    neuron.w[i] += epsBest * (x[i] - neuron.w[i]);
                }
                lsh.add(neuron);

            } else {
                neuron = new Neuron(x.clone());
                lsh.add(neuron);
                neurons.add(neuron);

                Neuron second = top2[0].neuron;
                neuron.neighbors.add(second);
                second.neighbors.add(neuron);
            }
        } else {
            dist = top2[1].distance;
            if (dist <= r) {
                neuron = top2[1].neuron;
                lsh.remove(neuron);
                for (int i = 0; i < d; i++) {
                    neuron.w[i] += epsBest * (x[i] - neuron.w[i]);
                }
                lsh.add(neuron);

                Neuron second = top2[0].neuron;
                // NOTE(review): the single-hit branch increments the count
                // of the activated neuron, whereas this branch increments
                // the second nearest one. Possibly this was meant to be
                // neuron.n++ -- confirm before changing.
                second.n++;

                if (!neuron.neighbors.contains(second)) {
                    neuron.neighbors.add(second);
                    second.neighbors.add(neuron);
                }
            } else {
                neuron = new Neuron(x.clone());
                lsh.add(neuron);
                neurons.add(neuron);

                Neuron second = top2[1].neuron;
                neuron.neighbors.add(second);
                second.neighbors.add(neuron);
            }
        }

        // update the neighbors of activated neuron.
        for (Iterator<Neuron> iter = neuron.neighbors.iterator(); iter.hasNext(); ) {
            Neuron neighbor = iter.next();
            lsh.remove(neighbor);
            for (int i = 0; i < d; i++) {
                neighbor.w[i] += epsNeighbor * (x[i] - neighbor.w[i]);
            }

            // Drop the edge once the two neurons have drifted too far apart.
            if (Math.distance(neuron.w, neighbor.w) > 2 * r) {
                neighbor.neighbors.remove(neuron);
                iter.remove();
            }

            // A neighbor left without any edge is discarded; otherwise it
            // goes back into the index under its new hash.
            if (neighbor.neighbors.size() > 0) {
                lsh.add(neighbor);
            } else {
                neurons.remove(neighbor);
            }
        }

        if (neuron.neighbors.size() == 0) {
            lsh.remove(neuron);
            neurons.remove(neuron);
        }
    }

    /**
     * Returns the set of neurons.
     * @return the live internal list of neurons (not a copy).
     */
    public List<Neuron> neurons() {
        return neurons;
    }
    
    /**
     * Removes neurons with the number of samples less than a given threshold.
     * The neurons without neighbors will also be removed. Removed neurons
     * are also dropped from the nearest neighbor index so that subsequent
     * updates cannot activate them again.
     * @param minPts neurons will be removed if the number of its points is
     * less than minPts.
     * @return the number of neurons after purging.
     */
    public int purge(int minPts) {
        List<Neuron> outliers = new ArrayList<Neuron>();
        for (Neuron neuron : neurons) {
            if (neuron.n < minPts) {
                outliers.add(neuron);
            }
        }
        
        neurons.removeAll(outliers);
        for (Neuron neuron : outliers) {
            lsh.remove(neuron);
        }
        for (Neuron neuron : neurons) {
            neuron.neighbors.removeAll(outliers);
        }
        
        // Second pass: neurons that lost all their edges above.
        outliers.clear();
        for (Neuron neuron : neurons) {
            if (neuron.neighbors.isEmpty()) {
                outliers.add(neuron);
            }
        }
        
        neurons.removeAll(outliers);
        for (Neuron neuron : outliers) {
            lsh.remove(neuron);
        }
        return neurons.size();
    }

    /**
     * Clustering neurons into k clusters.
     * @param k the number of clusters.
     */
    public void partition(int k) {
        partition(k, 0);
    }
    
    /**
     * Clustering neurons into k clusters. Every neuron is first labeled as
     * outlier; the neurons with enough samples are then clustered by
     * hierarchical clustering with UPGMA linkage on the pairwise distances
     * of their reference vectors.
     * @param k the number of clusters.
     * @param minPts a neuron will be treated as outlier if the number of its
     * points is less than minPts.
     * @return the number of non-outlier leaves.
     */
    public int partition(int k, int minPts) {
        List<Neuron> data = new ArrayList<Neuron>();
        for (Neuron neuron : neurons) {
            neuron.y = OUTLIER;
            if (neuron.n >= minPts) {
                data.add(neuron);
            }
        }
        
        // Lower triangular proximity matrix; the diagonal is left at zero.
        double[][] proximity = new double[data.size()][];
        for (int i = 0; i < data.size(); i++) {
            proximity[i] = new double[i + 1];
            for (int j = 0; j < i; j++) {
                proximity[i][j] = Math.distance(data.get(i).w, data.get(j).w);
            }
        }

        Linkage linkage = new UPGMALinkage(proximity);
        HierarchicalClustering hc = new HierarchicalClustering(linkage);
        int[] y = hc.partition(k);

        for (int i = 0; i < data.size(); i++) {
            data.get(i).y = y[i];
        }
        
        return data.size();
    }

    /**
     * Cluster a new instance to the nearest neuron. The method partition()
     * should be called first.
     * @param x a new instance.
     * @return the cluster label of nearest neuron, which is OUTLIER if that
     * neuron was excluded from the last partition. Returns 0 if the network
     * has no neurons.
     */
    @Override
    public int predict(double[] x) {
        double minDist = Double.MAX_VALUE;
        int bestCluster = 0;

        // Exhaustive scan; squared distance is enough for the comparison.
        for (Neuron neuron : neurons) {
            double dist = Math.squaredDistance(x, neuron.w);
            if (dist < minDist) {
                minDist = dist;
                bestCluster = neuron.y;
            }
        }

        return bestCluster;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy