com.aliasi.test.unit.cluster.KMeansClustererTest Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.
There is a newer version: 4.1.2-JL1.0
Show newest version
package com.aliasi.test.unit.cluster;

import com.aliasi.cluster.Clusterer;
import com.aliasi.cluster.KMeansClusterer;

import com.aliasi.io.LogLevel;
import com.aliasi.io.Reporter;
import com.aliasi.io.Reporters;

import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenFeatureExtractor;

import com.aliasi.util.Distance;
import com.aliasi.util.FeatureExtractor;

import org.junit.Test;

import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;


import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;



public class KMeansClustererTest  {

    static FeatureExtractor FEATURE_EXTRACTOR
        = new TokenFeatureExtractor(IndoEuropeanTokenizerFactory.INSTANCE);

    static Random RANDOM = new Random(42);

    static final String AA = "A A";
    static final String AAA = "A A A";
    static final String BBB = "B B B";
    static final String CCC = "C C C";
    static final String AAB = "A A B";
    static final String BBA = "B B A";

    static Set randomDenseElts(int numElts, int numDims, Random random) {
        Set inputSet = new HashSet(numElts*2);
        for (int i = 0; i < numElts; ++i)
            inputSet.add(randomDenseElt(numDims,random));
        return inputSet;
    }

    static double[] randomDenseElt(int numDims, Random random) {
        double[] xs = new double[numDims];
        for (int i = 0; i < xs.length; ++i)
            xs[i] =  10.0 * random.nextDouble() - 5.0;
        return xs;
    }

    static FeatureExtractor ID_PARSER
        = new FeatureExtractor() {
        public Map features(double[] xs) {
            Map result = new HashMap();
            for (int i = 0; i < xs.length; ++i)
                result.put(Integer.toString(i), Double.valueOf(xs[i]));
            return result;
        }
    };

    // @Test // need to scale down for release and remove main!
    public void testKMeansPlusPlus() {
        int K = 500; // 1000; // 
        int n = 89037; // 167194; // 
        int d = 50;

        int maxEpochs = 200;
        double minImprovement = 0.0001;
        KMeansClusterer clustererKmpp
            = new KMeansClusterer(ID_PARSER,K,maxEpochs,true,minImprovement);

        KMeansClusterer clusterer
            = new KMeansClusterer(ID_PARSER,K,maxEpochs,false,minImprovement);

        Random random = new Random(); // (747L);

        Set eltSet = randomDenseElts(n,d,random);

        Set> clustering
            = clusterer.cluster(eltSet,random,Reporters.silent());
    }


    public static void main(String[] args) {
        new KMeansClustererTest().testKMeansPlusPlus();
    }






    @Test(expected=IllegalArgumentException.class)
    public void testZeroExc1() {
        int numClusters = 0;
        int maxIterations = 10;
        new KMeansClusterer(FEATURE_EXTRACTOR,
                            numClusters,
                            maxIterations, false, 0.0);
    }


    @Test(expected=IllegalArgumentException.class)
    public void testZeroExc2() {
        int numClusters = 10;
        int maxIterations = -1;
        new KMeansClusterer(FEATURE_EXTRACTOR,
                            numClusters,
                            maxIterations,
                            false, 0.0);
    }

    @Test
    public void testOne() {
        int numClusters = 1;
        int maxIterations = 10;
        KMeansClusterer clusterer
            = new KMeansClusterer(FEATURE_EXTRACTOR,
                                  numClusters,
                                  maxIterations,
                                  false,0.0);
        KMeansClusterer clustererPP
            = new KMeansClusterer(FEATURE_EXTRACTOR,
                                  numClusters,
                                  maxIterations,
                                  false,0.0);

        assertCluster(clusterer,clustererPP,
                      new String[] { },
                      new String[][] { });

        assertCluster(clusterer,clustererPP,
                      new String[] { AAA },
                      new String[][] { { AAA } });

        assertCluster(clusterer,clustererPP,
                      new String[] { AAA, BBB },
                      new String[][] { { AAA, BBB } });

        assertCluster(clusterer,clustererPP,
                      new String[] { AAA, BBB, CCC },
                      new String[][] { { AAA, BBB, CCC } });
    }

    @Test
    public void testTwo() {
        int numClusters = 2;
        int maxIterations = 10;
        KMeansClusterer clusterer
            = new KMeansClusterer(FEATURE_EXTRACTOR,
                                  numClusters,
                                  maxIterations,
                                  false,0.0);
        KMeansClusterer clustererPP
            = new KMeansClusterer(FEATURE_EXTRACTOR,
                                  numClusters,
                                  maxIterations,
                                  true,0.0);

        assertCluster(clusterer,clustererPP,
                      new String[] { },
                      new String[][] { });

        assertCluster(clusterer,clustererPP,
                      new String[] { AAA },
                      new String[][] { { AAA } });

        assertCluster(clusterer,clustererPP,
                      new String[] { AAA, BBB },
                      new String[][] { { AAA }, { BBB } });

        // all inits produce same result in 2 iterations
        // { AAA, AAB }, { BBB }
        // { AAA, BBB }, { AAB }
        // { AAB, BBB }, { AAA }
        assertCluster(clusterer,clustererPP,
                      new String[] { AA, AAB, BBB },
                      new String[][] { { AA, AAB }, { BBB } });


    }

    @Test
    public void testRandomRecluster() {
        KMeansClusterer clusterer
            = new KMeansClusterer(FEATURE_EXTRACTOR,100,20,false,0.0);
        for (int t = 0; t < 5; ++t) {
            Random random = new Random();
            long seed = random.nextLong();
            random = new Random(seed);
            int numClusters = 2 + random.nextInt(1);
            Set inputSet = new HashSet();
            Set> initialClustering
                = new HashSet>();
            for (int i = 0; i < numClusters; ++i)
                initialClustering.add(randomInputs(1+random.nextInt(3),random,inputSet));
            Set unclusteredElements 
                = randomInputs(0 + random.nextInt(5),random,inputSet);
            Set> clustering
                = clusterer.recluster(initialClustering,
                                      unclusteredElements,
                                      Reporters.silent());
            assertCovers(clustering,inputSet);
        }
    }

    Set randomInputs(int numInputs, Random random) {
        return randomInputs(numInputs,random,new HashSet());
    }

    Set randomInputs(int numInputs, Random random, Set accum) {
        Set inputSet = new HashSet(numInputs*2);
        for (int n = 0; n < numInputs; ++n) {
            StringBuilder sb = new StringBuilder();
                int length = 5; // random.nextInt(20) + 5;
                int numChars = 2; // random.nextInt(16);
                for (int i = 0; i < length; ++i) {
                    if (i > 0) sb.append(' ');
                    char c = (char) ( ((int)'a') + random.nextInt(numChars));
                    sb.append(c);
                }
                if (!accum.add(sb.toString())) {
                    --n; 
                    continue;
                }
                inputSet.add(sb.toString());
            }
        return inputSet;
    }

    @Test
    public void testRandom() {
        for (int t = 0; t < 100; ++t) {
            Random random = new Random();
            long seed = random.nextLong();
            // seed = -3241003427224084927L;
            random = new Random(seed);
            int numInputs = random.nextInt(10) + 5;
            Set inputSet = randomInputs(numInputs,random);

            int maxIterations = random.nextInt(50) + 50;
            int numClusters = 3; // random.nextInt(5) + 2;
            KMeansClusterer clusterer
                = new KMeansClusterer(FEATURE_EXTRACTOR,
                                      numClusters,
                                      maxIterations,
                                      false,0.0);

            KMeansClusterer clustererPlusPlus
                = new KMeansClusterer(FEATURE_EXTRACTOR,
                                      numClusters,
                                      maxIterations,
                                      true,0.0);

            int expectedNumClusters = Math.min(inputSet.size(),numClusters);

            Set> clustering =
                clusterer.cluster(inputSet,random,null);
            assertCovers(clustering,inputSet);
            // assertEquals("cluster() seed=" + seed + " t=" + t + " input set=" + inputSet
            // + " clustering=" + clustering,
            // expectedNumClusters,clustering.size());

            Set> clustering2 =
                clustererPlusPlus.cluster(inputSet,random,null);

            assertCovers(clustering2,inputSet);
            // assertEquals("seed=" + seed, expectedNumClusters,clustering.size());
        }
    }

    void assertCovers(Set> clustering, Set elts) {
        // System.out.println("clustering=" + clustering);
        Set clusteringElts = new HashSet();
        for (Set cluster : clustering) {
            assertTrue(cluster.size() > 0);
            clusteringElts.addAll(cluster);
        }
        assertEquals(elts,clusteringElts);
    }

    void assertCluster(KMeansClusterer clusterer, KMeansClusterer clustererPP,
                       String[] inputs,
                       String[][] expectedClusters) {

        Set> expectedClustering = toClustering(expectedClusters);

        // lots of random inits, etc.
        for (int i = 0; i < 1000; ++i) {
            Set inputSet = toSet(inputs);

            Set> clustering = clusterer.cluster(inputSet,RANDOM,null);
            assertEquals(expectedClustering,clustering);

            Set> clusteringPP = clustererPP.cluster(inputSet,RANDOM,null);
            assertEquals(expectedClustering,clusteringPP);
        }
    }

    static Set> toClustering(String[][] clusters) {
        Set> clustering
            = new HashSet>();
        for (int i = 0; i < clusters.length; ++i)
            clustering.add(toSet(clusters[i]));
        return clustering;
    }


    static Set toSet(String[] xs) {
        Set result = new HashSet();
        for (int i = 0; i < xs.length; ++i)
            result.add(xs[i]);
        return result;
    }

}