All downloads are FREE. Search and download functionality uses the official Maven repository.

com.aliasi.cluster.AbstractHierarchicalClusterer Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.cluster;

import com.aliasi.util.Distance;
import com.aliasi.util.Scored;


import java.util.HashSet;
import java.util.Set;

/**
 * An AbstractHierachicalClusterer provides an adapter
 * for clustering for hierarchical clusterers.  The abstract method
 * {@link #hierarchicalCluster(Set)} defines hierarchical
 * clustering for the specified input set, returning a dendrogram.
 * The basic clustering interface {@link #cluster(Set)} is defined
 * by specifying a cutoff in terms of distance.
 *
 * 

Distance measures between elements provide measures of * dissimilarity in that the larger the distance the more dissimilar * the members. Zero values indicate perfect similarity and larger * numbers indicate less similarity. The typical example is a * distance measure of some kind; closer objects are clustered more * readily in these cases. A typical distance metric is Euclidean * distance between vector objects. Other Minkowski metrics are also * common, such as the Manhattan metric, which reduces to Hamming * distance for binary vectors. Edit distance, as implemented in the * {@link com.aliasi.spell} package is another popular dissimilarity * metric for text. Two texts, * text1 and * text2, may be compared by sample * cross-entropy. If Mi is the * result of training a language model on * texti, then a symmetric measure of of * dissimilarity is * M1.crossEntropy(text2) * + * M2.crossEntropy(text1). * Averages, min or max may also be used. * * @author Bob Carpenter * @version 3.8 * @since LingPipe2.0 * @param the type of objects being clustered */ public abstract class AbstractHierarchicalClusterer implements HierarchicalClusterer { private double mMaxDistance; private final Distance mDistance; /** * Construct an abstract hierarchical clusterer with the specified * maximum distance. The distance must be a number greater than or * equal to zero, but it may be positive infinity. * * @param maxDistance Maximum distance between clusters that can * be linked. // * @param minClusters Minimum number of clusters to return. // * @param maxClusters Maximum number of clusters to return. * @throws IllegalArgumentException If the specified distance is not * a non-negative number. */ public AbstractHierarchicalClusterer(double maxDistance, Distance distance) { setMaxDistance(maxDistance); mDistance = distance; } /** * Returns the distance function for this hierarchical clusterer. * * @return The distance function for this hierarchical clusterer. 
*/ public Distance distance() { return mDistance; } /** * Returns the array of clusters derived from performing * clustering with this class's specified maximum distance. * Setting the maximum distance to {@link * Double#POSITIVE_INFINITY} should result in a complete * clustering. * * @param elements Set of objects to cluster. */ public abstract Dendrogram hierarchicalCluster(Set elements); /** * Returns the clustering of the specified elements. The * clustering is determined by splitting a complete hierarchical * clustering at this class's distance bound. Thus the pairwise * distances between the sets in the clustering returned * will all be greater than this clusterer's maximum distance. * * @param elements Elements to cluster. * @return Clustering of elements. */ public Set> cluster(Set elements) { if (elements.isEmpty()) return new HashSet>(); Dendrogram dendrogram = hierarchicalCluster(elements); return dendrogram.partitionDistance(mMaxDistance); } /** * Returns the maximum distance for clusters in a dendrogram. * *@return The maximimum distance score for a dendrogram to * remain after cutting. */ public double getMaxDistance() { return mMaxDistance; } /** * Sets the maximum distance at which two clusters may * be merged. * * @param maxDistance New value for maximum distance. */ public final void setMaxDistance(double maxDistance) { assertValidDistanceBound(maxDistance); mMaxDistance = maxDistance; } static void assertValidDistanceBound(double maxDistance) { if (maxDistance < 0.0 || Double.isNaN(maxDistance)) { String msg = "Max distance must be non-negative number." 
+ " Found maxDistance=" + maxDistance; throw new IllegalArgumentException(msg); } } E[] toElements(Set elementSet) { int len = elementSet.size(); // required for array @SuppressWarnings("unchecked") E[] elements = (E[]) new Object[len]; elementSet.toArray(elements); return elements; } static class PairScore implements Scored { final Dendrogram mDendrogram1; final Dendrogram mDendrogram2; final double mScore; public PairScore(Dendrogram dendrogram1, Dendrogram dendrogram2, double score) { mDendrogram1 = dendrogram1; mDendrogram2 = dendrogram2; mScore = score; } public double score() { return mScore; } @Override public String toString() { return "ps(" + mDendrogram1 + "," + mDendrogram2 + ":" + mScore + ") "; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy