org.integratedmodelling.engine.geospace.districting.algorithms.KMeansAlgorithm
/*******************************************************************************
 *  Copyright (C) 2007, 2014:
 *  
 *    - Ferdinando Villa 
 *    - integratedmodelling.org
 *    - any other authors listed in @author annotations
 *
 *    All rights reserved. This file is part of the k.LAB software suite,
 *    meant to enable modular, collaborative, integrated 
 *    development of interoperable data and model components. For
 *    details, see http://integratedmodelling.org.
 *    
 *    This program is free software; you can redistribute it and/or
 *    modify it under the terms of the Affero General Public License 
 *    Version 3 or any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but without any warranty; without even the implied warranty of
 *    merchantability or fitness for a particular purpose.  See the
 *    Affero General Public License for more details.
 *  
 *     You should have received a copy of the Affero General Public License
 *     along with this program; if not, write to the Free Software
 *     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *     The license is also available at: https://www.gnu.org/licenses/agpl.html
 *******************************************************************************/
package org.integratedmodelling.engine.geospace.districting.algorithms;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;

import org.integratedmodelling.engine.geospace.districting.interfaces.IDistrictingAlgorithm;
import org.integratedmodelling.engine.geospace.districting.utils.DistrictingResults;
import org.integratedmodelling.exceptions.KlabException;

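/**
 * Plain k-means districting. Each point is assigned to the nearest of k centroids
 * (squared Euclidean distance) and the centroids are re-estimated from their members
 * until none of them moves more than a fixed threshold between iterations. Clusters
 * that end up empty are dropped along the way, so the final number of districts may
 * be smaller than the requested initial k.
 *
 * The dataset is laid out as {@code dataset[dimension][point]}: one row per observed
 * variable, one column per spatial unit.
 *
 * A minimal, hypothetical invocation (the dataset shape and the choice of k = 5 are
 * placeholders, not values mandated by the algorithm):
 * <pre>{@code
 * double[][] data = new double[3][1000]; // hypothetical: 3 variables over 1000 units
 * DistrictingResults results = new KMeansAlgorithm().createDistricts(data, 5);
 * }</pre>
 */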
public class KMeansAlgorithm implements IDistrictingAlgorithm {
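    /**
     * Runs k-means on the given dataset, starting from {@code initialK} randomly chosen
     * data points and iterating until every centroid's squared movement falls below the
     * fixed stopping threshold (0.2). Fills a DistrictingResults with the cluster
     * assignments (typeset), per-cluster point counts, centroids, per-cluster standard
     * deviations, iteration count, initial and final k, and the per-dimension variance
     * of the whole dataset.
     */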
    public DistrictingResults createDistricts(double[][] dataset, int initialK) throws KlabException {
        int l = dataset.length;
        int n = dataset[0].length;
        int k = initialK;
        double stoppingThreshold = 0.2;

        DistrictingResults results = new DistrictingResults();

        int[] typeset = null;
        ArrayList<Integer> pointsPerCluster = null;
        ArrayList<Double>[] centroids = selectInitialCentroids(l, n, k, dataset);
        ArrayList<Double>[] prevCentroids = null;
        ArrayList<Double>[] variances = null;
        int iterations = 0;

        System.out.println("Beginning stabilization...");

        while (centroidsAreMoving(l, k, centroids, prevCentroids, stoppingThreshold)) {
            prevCentroids = centroids;
            typeset = collectPointsByCluster(l, n, k, centroids, dataset);
            pointsPerCluster = countPointsPerCluster(n, k, typeset);
            centroids = recomputeCentroids(l, n, k, typeset, pointsPerCluster, dataset);
            pointsPerCluster = removeEmptyClusters(pointsPerCluster);
            k = centroids[0].size();
            iterations++;
        }
        variances = computeVariancesByCluster(l, n, k, centroids, typeset, pointsPerCluster, dataset);

        System.out.println("Completed after " + iterations + " total iterations.");

        results.setTypeset(typeset);
        results.setPointsPerCluster(pointsPerCluster);
        results.setCentroids(centroids);
        results.setStdevs(computeStdevs(l, k, variances));
        results.setIterations(iterations);
        results.setInitialK(initialK);
        results.setFinalK(k);
        results.setDatasetVariance(computeDatasetVariance(l, n, dataset));
        return results;
    }

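    /**
     * Variant with additional tuning parameters (stopping threshold, variance,
     * membership and separation ratios); this implementation ignores them and
     * delegates to the basic k-means run above.
     */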
    public DistrictingResults createDistricts(double[][] dataset, int initialK, double stoppingThreshold,
            double varianceRatio, double membershipRatio, double separationRatio) throws KlabException {
        // Just call the simpler createDistricts and ignore the extra parameters.
        return createDistricts(dataset, initialK);
    }

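    /**
     * Per-dimension mean and population variance of the whole dataset; only the
     * variances are returned, one value per dimension.
     */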
    private double[] computeDatasetVariance(int l, int n, double[][] dataset) {
        double[] datasetMean = new double[l];
        double[] datasetVariance = new double[l];
        for (int i = 0; i < l; i++) {
            for (int j = 0; j < n; j++) {
                datasetMean[i] += dataset[i][j];
            }
            datasetMean[i] /= n;
            for (int j = 0; j < n; j++) {
                datasetVariance[i] += Math.pow(dataset[i][j] - datasetMean[i], 2);
            }
            datasetVariance[i] /= n;
        }
        return datasetVariance;
    }

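    /**
     * Picks k distinct data points at random and uses their coordinates as the
     * initial centroids (Forgy-style initialization); a duplicate draw is re-drawn.
     */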
    private ArrayList<Double>[] selectInitialCentroids(int l, int n, int k, double[][] dataset) {
        ArrayList<Double>[] centroids = new ArrayList[l];
        for (int i = 0; i < l; i++) {
            centroids[i] = new ArrayList<Double>();
        }

        Random randomNumberGenerator = new Random();
        int[] datasetIndeces = new int[k];

        for (int i = 0; i < k; i++) {
            datasetIndeces[i] = randomNumberGenerator.nextInt(n);
            for (int j = 0; j < i; j++) {
                if (datasetIndeces[j] == datasetIndeces[i]) {
                    i--;
                    break;
                }
            }
        }

        for (int i = 0; i < l; i++) {
            for (int j = 0; j < k; j++) {
                centroids[i].add(dataset[i][datasetIndeces[j]]);
            }
        }

        return centroids;
    }

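    /**
     * True while the centroids are still moving: on the first pass (no previous
     * centroids), when the number of clusters has changed, or when any centroid's
     * squared displacement since the previous iteration exceeds the stopping threshold.
     */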
    private boolean centroidsAreMoving(int l, int k, ArrayList<Double>[] centroids,
            ArrayList<Double>[] prevCentroids, double stoppingThreshold) {
        if (centroids == null || prevCentroids == null || centroids[0].size() != prevCentroids[0].size()) {
            return true;
        }
        for (int i = 0; i < k; i++) {
            double centroidMovement = 0;
            for (int j = 0; j < l; j++) {
                centroidMovement += Math.pow(centroids[j].get(i) - prevCentroids[j].get(i), 2);
            }
            if (centroidMovement > stoppingThreshold) {
                return true;
            }
        }
        return false;
    }

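    /**
     * Assigns every point to its nearest centroid by squared Euclidean distance;
     * returns typeset[point] = index of the chosen cluster.
     */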
    private int[] collectPointsByCluster(int l, int n, int k, ArrayList<Double>[] centroids,
            double[][] dataset) {
        int[] typeset = new int[n];

        for (int a = 0; a < n; a++) {
            double minDistance = Double.MAX_VALUE;
            for (int b = 0; b < k; b++) {
                double distance = 0;
                for (int c = 0; c < l; c++) {
                    distance += Math.pow(dataset[c][a] - centroids[c].get(b), 2);
                }

                if (distance < minDistance) {
                    minDistance = distance;
                    typeset[a] = b;
                }
            }
        }

        return typeset;
    }

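    /**
     * Counts how many points were assigned to each of the k clusters.
     */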
    private ArrayList<Integer> countPointsPerCluster(int n, int k, int[] typeset) {
        ArrayList<Integer> pointsPerCluster = new ArrayList<Integer>();
        for (int i = 0; i < k; i++) {
            pointsPerCluster.add(0);
        }
        for (int i = 0; i < n; i++) {
            pointsPerCluster.set(typeset[i], 1 + pointsPerCluster.get(typeset[i]));
        }
        return pointsPerCluster;
    }

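    /**
     * Recomputes every centroid as the per-dimension mean of its member points;
     * clusters that received no points are removed from the centroid lists.
     */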
    private ArrayList<Double>[] recomputeCentroids(int l, int n, int k, int[] typeset,
            ArrayList<Integer> pointsPerCluster, double[][] dataset) {
        ArrayList<Double>[] centroids = new ArrayList[l];
        for (int i = 0; i < l; i++) {
            centroids[i] = new ArrayList<Double>();
        }

        for (int i = 0; i < l; i++) {
            for (int j = 0; j < k; j++) {
                centroids[i].add(0.0);
            }
            for (int j = 0; j < n; j++) {
                centroids[i].set(typeset[j], centroids[i].get(typeset[j]) + dataset[i][j]);
            }
            for (int j = 0; j < k; j++) {
                if (pointsPerCluster.get(j) == 0) {
                    centroids[i].set(j, null);
                } else {
                    centroids[i].set(j, centroids[i].get(j) / pointsPerCluster.get(j));
                }
            }
            Iterator<Double> iter = centroids[i].iterator();
            while (iter.hasNext()) {
                if (iter.next() == null) {
                    iter.remove();
                }
            }
        }

        return centroids;
    }

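    /**
     * Returns a copy of the per-cluster counts with the zero entries removed, so that
     * the counts stay aligned with the pruned centroid lists.
     */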
    private ArrayList<Integer> removeEmptyClusters(ArrayList<Integer> pointsPerCluster) {
        ArrayList<Integer> newPointsPerCluster = new ArrayList<Integer>(pointsPerCluster);
        Iterator<Integer> iter = newPointsPerCluster.iterator();
        while (iter.hasNext()) {
            if (iter.next() == 0) {
                iter.remove();
            }
        }
        return newPointsPerCluster;
    }

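    /**
     * Per-cluster, per-dimension variance of the member points around their centroid.
     */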
    private ArrayList<Double>[] computeVariancesByCluster(int l, int n, int k, ArrayList<Double>[] centroids,
            int[] typeset, ArrayList<Integer> pointsPerCluster, double[][] dataset) {
        ArrayList<Double>[] variances = new ArrayList[l];
        for (int i = 0; i < l; i++) {
            variances[i] = new ArrayList<Double>();
        }
        for (int i = 0; i < l; i++) {
            for (int j = 0; j < k; j++) {
                variances[i].add(0.0);
            }
            for (int j = 0; j < n; j++) {
                variances[i].set(
                        typeset[j],
                        variances[i].get(typeset[j])
                                + Math.pow(dataset[i][j] - centroids[i].get(typeset[j]), 2));
            }
            for (int j = 0; j < k; j++) {
                variances[i].set(j, variances[i].get(j) / pointsPerCluster.get(j));
            }
        }
        return variances;
    }

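    /**
     * Per-cluster, per-dimension standard deviations (square roots of the variances).
     */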
    private ArrayList<Double>[] computeStdevs(int l, int k, ArrayList<Double>[] variances) {
        ArrayList<Double>[] stdevs = new ArrayList[l];
        for (int i = 0; i < l; i++) {
            stdevs[i] = new ArrayList<Double>();
        }
        for (int i = 0; i < l; i++) {
            for (int j = 0; j < k; j++) {
                stdevs[i].add(Math.sqrt(variances[i].get(j)));
            }
        }
        return stdevs;
    }

}



