All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencompare.hac.agglomeration.WardLinkage Maven / Gradle / Ivy

/*
 * This file is licensed to You under the "Simplified BSD License".
 * You may not use this software except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.opensource.org/licenses/bsd-license.php
 * 
 * See the COPYRIGHT file distributed with this work for information
 * regarding copyright ownership.
 */
package org.opencompare.hac.agglomeration;


/**
 * The "Ward", "inner squared distance", "sum of squares", "error sum of squares", 
 * or "minimum variance" method.
 * 
 * This method fuses those two clusters that result in the smallest increase 
 * in the total within-group error sum of squares.
 * This quantity is defined as the sum of squared deviation 
 * of each object from the centroid of its own cluster.
 * In contrast to the other methods that use prior criteria,
 * this method is based on a posterior fusion criterion.
 * [The data analysis handbook. By Ildiko E. Frank, Roberto Todeschini]
 *  
 * Used only for Euclidean distance!
 * 
 * The general form of the Lance-Williams matrix-update formula:
 * d[(i,j),k] = ai*d[i,k] + aj*d[j,k] + b*d[i,j] + g*|d[i,k]-d[j,k]|
 *
 * For the "Ward" method:
 * ai = (ci+ck)/(ci+cj+ck)
 * aj = (cj+ck)/(ci+cj+ck)
 * b  = -ck/(ci+cj+ck)
 * g  = 0
 * 
 * Thus:
 * d[(i,j),k] = (ci+ck)/(ci+cj+ck)*d[i,k] + (cj+ck)/(ci+cj+ck)*d[j,k] - ck/(ci+cj+ck)*d[i,j]
 *            = ( (ci+ck)*d[i,k] + (cj+ck)*d[j,k] - ck*d[i,j] ) / (ci+cj+ck)
 * 
 * @author [email protected]
 */
public final class WardLinkage implements AgglomerationMethod {

    /**
     * Applies the Lance-Williams update for the "Ward" method to obtain the
     * dissimilarity between the freshly merged cluster (i,j) and another cluster k:
     *
     * d[(i,j),k] = ( (ci+ck)*d[i,k] + (cj+ck)*d[j,k] - ck*d[i,j] ) / (ci+cj+ck)
     *
     * @param dik dissimilarity d[i,k] between cluster i and cluster k
     * @param djk dissimilarity d[j,k] between cluster j and cluster k
     * @param dij dissimilarity d[i,j] between the two clusters being merged
     * @param ci weight of cluster i in the formula (presumably its cardinality;
     *           confirm against the AgglomerationMethod contract)
     * @param cj weight of cluster j in the formula (presumably its cardinality)
     * @param ck weight of cluster k in the formula (presumably its cardinality)
     * @return the updated dissimilarity d[(i,j),k]
     */
    public double computeDissimilarity(final double dik, final double djk, final double dij, final int ci, final int cj, final int ck) {
        // Add the weights in double arithmetic. Plain int addition wraps around
        // silently once a sum exceeds Integer.MAX_VALUE, which would turn a
        // coefficient or the denominator negative. Every int sum is exactly
        // representable as a double, so results for non-overflowing inputs
        // are unchanged.
        final double weightIK = (double) ci + ck;
        final double weightJK = (double) cj + ck;
        final double weightIJK = weightIK + cj;

        return (weightIK * dik + weightJK * djk - ck * dij) / weightIJK;
    }

    /**
     * Returns the short display name of this agglomeration method.
     *
     * @return the string "Ward"
     */
    @Override
    public String toString() {
        return "Ward";
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy