// Source listing of org.carrot2.core.Cluster as published in the carrot2-mini artifact
// (Maven / Gradle / Ivy).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.core;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.carrot2.util.MapUtils;
import org.carrot2.util.StringUtils;
import org.carrot2.util.simplexml.SimpleXmlWrapperValue;
import org.carrot2.util.simplexml.SimpleXmlWrappers;
import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.ElementMap;
import org.simpleframework.xml.Root;
import org.simpleframework.xml.core.Commit;
import org.simpleframework.xml.core.Persist;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.carrot2.shaded.guava.common.base.Function;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;
import org.carrot2.shaded.guava.common.collect.Ordering;
import org.carrot2.shaded.guava.common.collect.Sets;
/**
* A cluster (group) of {@link Document}s. Each cluster has a human-readable label
* consisting of one or more phrases, a list of documents it contains and a list of its
* subclusters. Optionally, additional attributes can be associated with a cluster, e.g.
* {@link #OTHER_TOPICS}. This class is not thread-safe.
*/
@Root(name = "group", strict = false)
@JsonAutoDetect(
creatorVisibility = JsonAutoDetect.Visibility.NONE,
fieldVisibility = JsonAutoDetect.Visibility.NONE,
getterVisibility = JsonAutoDetect.Visibility.NONE,
isGetterVisibility = JsonAutoDetect.Visibility.NONE,
setterVisibility = JsonAutoDetect.Visibility.NONE)
@JsonSerialize()
@JsonInclude(JsonInclude.Include.NON_NULL)
public final class Cluster
{
/**
 * Attribute key indicating that the cluster is an <i>Other Topics</i> cluster. Such a
 * cluster contains documents that remain unclustered at given level of cluster
 * hierarchy.
 * <p>
 * Type of this attribute is {@link Boolean}.
 * </p>
 *
 * @see #setAttribute(String, Object)
 * @see #getAttribute(String)
 */
public static final String OTHER_TOPICS = "other-topics";
/**
 * Default label for the <i>Other Topics</i> cluster.
 */
public static final String OTHER_TOPICS_LABEL = "Other Topics";
/**
 * Attribute key for the score of this cluster. The score indicates the clustering
 * algorithm's beliefs on the quality of this cluster. The exact semantics of the score
 * varies across algorithms.
 * <p>
 * Type of this attribute is {@link Double}.
 * </p>
 *
 * @see #setAttribute(String, Object)
 * @see #getAttribute(String)
 */
public static final String SCORE = "score";
/**
 * Identifier of this cluster; {@code null} until one is assigned.
 *
 * @see #getId()
 */
@Attribute(required = false)
Integer id;

/** Phrases describing this cluster. */
@ElementList(required = false, name = "title", entry = "phrase")
private ArrayList<String> phrases = new ArrayList<String>();

/**
 * A read-only list of phrases exposed in {@link #getPhrases()}. Not final because it
 * is re-created in {@link #afterDeserialization()}.
 */
private List<String> phrasesView = Collections.unmodifiableList(phrases);

/** Subclusters of this cluster. */
@ElementList(required = false, inline = true)
private ArrayList<Cluster> subclusters = new ArrayList<Cluster>();

/**
 * A read-only list of subclusters exposed in {@link #getSubclusters()}. Not final
 * because it is re-created in {@link #afterDeserialization()}.
 */
private List<Cluster> subclustersView = Collections.unmodifiableList(subclusters);

/** Documents contained in this cluster. */
private final ArrayList<Document> documents = new ArrayList<Document>();

/** A read-only list of this cluster's documents exposed in {@link #getDocuments()}. */
private final List<Document> documentsView = Collections.unmodifiableList(documents);

/** Attributes of this cluster. */
private Map<String, Object> attributes = new HashMap<String, Object>();

/** A read-only view of the attributes of this cluster. */
private Map<String, Object> attributesView = Collections.unmodifiableMap(attributes);

/** Cached concatenated label; {@code null} when it needs to be recomputed. */
private String labelCache = null;

/**
 * Cached list of documents from this cluster and subclusters; {@code null} when it
 * needs to be recomputed.
 */
private List<Document> allDocuments;

/** Attributes of this cluster for serialization/ deserialization purposes. */
@ElementMap(entry = "attribute", key = "key", attribute = true, inline = true, required = false)
private HashMap<String, SimpleXmlWrapperValue> otherAttributesForSerialization;

/**
 * List of document ids used for serialization/ deserialization purposes.
 */
@ElementList(required = false, inline = true)
List<DocumentRefid> documentIds;
/**
 * Serialization helper: represents a document inside a cluster by reference, i.e. as a
 * {@code document} element carrying only a {@code refid} attribute.
 */
@Root(name = "document")
static class DocumentRefid
{
    /** Identifier of the referenced document. */
    @Attribute
    String refid;

    /** No-argument constructor; leaves {@link #refid} unset. */
    DocumentRefid()
    {
    }

    /**
     * Creates a reference to the document with the given identifier.
     *
     * @param refid identifier of the referenced document
     */
    DocumentRefid(String refid)
    {
        this.refid = refid;
    }
}
/**
 * Creates an empty {@link Cluster}: no label phrases, no documents, no subclusters and
 * no identifier.
 */
public Cluster()
{
    // Nothing to do, all fields carry their initial values.
}
/**
 * Creates a {@link Cluster} with the provided {@code phrase} to be used as the
 * cluster's label and {@code documents} contained in the cluster.
 *
 * @param phrase the phrase to form the cluster's label
 * @param documents documents contained in the cluster
 */
public Cluster(String phrase, Document... documents)
{
    this.addPhrases(phrase).addDocuments(documents);
}
/**
 * Same as {@link #Cluster(String,Document...)} but additionally sets the cluster's
 * identifier.
 *
 * @param id identifier of the cluster
 * @param phrase the phrase to form the cluster's label
 * @param documents documents contained in the cluster
 */
public Cluster(Integer id, String phrase, Document... documents)
{
    this(phrase, documents);

    // The delegated constructor does not touch the identifier.
    this.id = id;
}
/**
 * Formats this cluster's label. If there is more than one phrase describing this
 * cluster, phrases will be separated by a comma followed by a space, e.g. "Phrase
 * one, Phrase two". To format multi-phrase label in a different way, use
 * {@link #getPhrases()}.
 * <p>
 * The formatted label is cached until phrases are added again.
 * </p>
 *
 * @return formatted label of this cluster
 */
public String getLabel()
{
    String label = labelCache;
    if (label == null)
    {
        label = StringUtils.toString(phrases, ", ");
        labelCache = label;
    }
    return label;
}
/**
 * Returns all phrases describing this cluster. The returned list is unmodifiable.
 *
 * @return phrases describing this cluster
 */
@JsonProperty
public List<String> getPhrases()
{
    return phrasesView;
}
/**
 * Returns all subclusters of this cluster. The returned list is unmodifiable.
 *
 * @return subclusters of this cluster
 */
public List<Cluster> getSubclusters()
{
    return subclustersView;
}
/**
 * For JSON serialization only.
 *
 * @return the subclusters of this cluster, or {@code null} when there are none
 */
@JsonProperty("clusters")
private List<Cluster> getSubclustersForSerialization()
{
    if (subclustersView.isEmpty())
    {
        return null;
    }
    return subclustersView;
}
/**
 * Returns all documents contained directly in this cluster (documents of subclusters
 * are not included). The returned list is unmodifiable.
 *
 * @return documents contained in this cluster
 */
public List<Document> getDocuments()
{
    return documentsView;
}
/**
 * Returns all documents contained in this cluster and (recursively) all documents
 * from this cluster's subclusters. The returned list contains unique documents, i.e.
 * if a document is attached to multiple subclusters of this cluster, the document
 * will appear only once on the list. The documents are enumerated in pre-order, i.e.
 * first come documents returned by {@link #getDocuments()} and then, recursively,
 * documents from each subcluster.
 * <p>
 * The result is cached. The cache is cleared when documents or subclusters are added
 * to <b>this</b> cluster.
 * </p>
 *
 * @return all documents from this cluster and its subclusters
 */
public List<Document> getAllDocuments()
{
    // NOTE(review): modifying a subcluster after this call does not appear to
    // invalidate the cache held by its parent -- confirm against callers.
    if (allDocuments == null)
    {
        // A linked set removes duplicates while preserving the traversal order.
        allDocuments = new ArrayList<Document>(
            collectAllDocuments(this, new LinkedHashSet<Document>()));
    }
    return allDocuments;
}
/**
 * Returns all documents in this cluster ordered according to the provided comparator.
 * See {@link Document} for common comparators. A new list is created on each call.
 *
 * @param comparator the order to apply to the documents
 * @return a sorted copy of {@link #getAllDocuments()}
 */
public List<Document> getAllDocuments(Comparator<Document> comparator)
{
    // Sort a copy so that the cached list keeps its original order.
    final List<Document> sortedDocuments = new ArrayList<Document>(getAllDocuments());
    Collections.sort(sortedDocuments, comparator);
    return sortedDocuments;
}
/**
 * A recursive routine for collecting unique documents from this cluster and
 * subclusters.
 *
 * @param cluster the cluster to start from; {@code null} is ignored
 * @param docs the set the documents are added to
 * @return the {@code docs} set passed in
 */
private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs)
{
    if (cluster == null)
    {
        return docs;
    }

    // Own documents first, then descend into each subcluster.
    docs.addAll(cluster.getDocuments());
    final List<Cluster> subclusters = cluster.getSubclusters();
    for (final Cluster subcluster : subclusters)
    {
        collectAllDocuments(subcluster, docs);
    }
    return docs;
}
/**
 * Appends the given phrases to the description of this cluster and discards the
 * cached label.
 *
 * @param phrases to be added to the description of this cluster
 * @return this cluster for convenience
 */
public Cluster addPhrases(String... phrases)
{
    labelCache = null;
    Collections.addAll(this.phrases, phrases);
    return this;
}
/**
 * Adds phrases to the description of this cluster and discards the cached label.
 *
 * @param phrases to be added to the description of this cluster
 * @return this cluster for convenience
 */
public Cluster addPhrases(Iterable<String> phrases)
{
    labelCache = null;
    for (final String phrase : phrases)
    {
        this.phrases.add(phrase);
    }
    return this;
}
/**
 * Appends the given documents to this cluster and discards the cached list of all
 * documents.
 *
 * @param documents to be added to this cluster
 * @return this cluster for convenience
 */
public Cluster addDocuments(Document... documents)
{
    Collections.addAll(this.documents, documents);
    allDocuments = null;
    return this;
}
/**
 * Adds a single document to this cluster, without the array allocation a vararg call
 * would require.
 *
 * @param document to be added to this cluster
 * @return this cluster for convenience
 * @see #addDocuments(Document...)
 */
public Cluster addDocument(Document document)
{
    documents.add(document);

    // The cached list of all documents is stale now.
    allDocuments = null;
    return this;
}
/**
 * Adds documents to this cluster and discards the cached list of all documents.
 *
 * @param documents to be added to this cluster
 * @return this cluster for convenience
 */
public Cluster addDocuments(Iterable<Document> documents)
{
    for (final Document document : documents)
    {
        this.documents.add(document);
    }
    allDocuments = null;
    return this;
}
/**
 * Appends the given clusters as subclusters of this cluster and discards the cached
 * list of all documents.
 *
 * @param subclusters to be added to this cluster
 * @return this cluster for convenience
 */
public Cluster addSubclusters(Cluster... subclusters)
{
    Collections.addAll(this.subclusters, subclusters);
    allDocuments = null;
    return this;
}
/**
 * Adds a single subcluster to this cluster.
 *
 * @param cluster the subcluster to add
 * @return this cluster for convenience
 * @see #addSubclusters(Cluster...)
 */
public Cluster addSubcluster(Cluster cluster)
{
    subclusters.add(cluster);

    // The cached list of all documents is stale now.
    allDocuments = null;
    return this;
}
/**
 * Adds subclusters to this cluster and discards the cached list of all documents.
 *
 * @param clusters to be added to this cluster
 * @return this cluster for convenience
 */
public Cluster addSubclusters(Iterable<Cluster> clusters)
{
    for (final Cluster cluster : clusters)
    {
        this.subclusters.add(cluster);
    }
    allDocuments = null;
    return this;
}
/**
 * Returns this cluster's {@value #SCORE} attribute.
 *
 * @return the score, or {@code null} if no score has been set
 */
@JsonProperty
@Attribute(required = false)
public Double getScore()
{
    final Double score = getAttribute(SCORE);
    return score;
}
/**
 * Stores the given value under the {@link #SCORE} attribute of this cluster.
 *
 * @param score score to set
 * @return this cluster for convenience
 */
@Attribute(required = false)
public Cluster setScore(Double score)
{
    setAttribute(SCORE, score);
    return this;
}
/**
 * Returns the attribute associated with this cluster under the provided
 * {@code key}. If there is no attribute under the provided {@code key},
 * {@code null} will be returned.
 * <p>
 * The value is cast to the type expected by the caller without a check; a mismatch
 * surfaces as a {@link ClassCastException} at the call site.
 * </p>
 *
 * @param key of the attribute
 * @param <T> type of the attribute value expected by the caller
 * @return attribute value or {@code null}
 */
@SuppressWarnings("unchecked")
public <T> T getAttribute(String key)
{
    return (T) attributes.get(key);
}
/**
 * Associates an attribute with this cluster, replacing any value previously stored
 * under the same key.
 *
 * @param key for the attribute
 * @param value for the attribute
 * @param <T> type of the attribute value
 * @return this cluster for convenience
 */
public <T> Cluster setAttribute(String key, T value)
{
    attributes.put(key, value);
    return this;
}
/**
 * Removes the attribute stored under {@code key}, if any. Removing a key that is not
 * present has no effect.
 *
 * @param key of the attribute to remove
 * @return this cluster for convenience
 */
public Cluster removeAttribute(String key)
{
    this.attributes.remove(key);
    return this;
}
/**
 * Returns all attributes of this cluster. The returned map is unmodifiable.
 *
 * @return all attributes of this cluster
 */
public Map<String, Object> getAttributes()
{
    return attributesView;
}
/**
 * Returns the size of the cluster, i.e. the number of entries in
 * {@link #getAllDocuments()}: unique documents of this cluster and its subclusters.
 *
 * @return size of the cluster
 */
public int size()
{
    final List allDocs = getAllDocuments();
    return allDocs.size();
}
/**
 * For serialization only: exposes {@link #size()} as a {@code size} property.
 *
 * @return size of the cluster
 */
@JsonProperty
@Attribute(required = false)
private int getSize()
{
    final int clusterSize = size();
    return clusterSize;
}
/**
 * Intentionally does nothing. SimpleXML requires both a getter and a setter, but the
 * size is only ever written out, never read back.
 *
 * @param size ignored
 */
@Attribute(required = false)
private void setSize(int size)
{
    // No-op: the size is derived from the documents, see size().
}
/**
 * Internal identifier of this cluster within the {@link ProcessingResult}. This
 * identifier is assigned dynamically after clusters are passed to
 * {@link ProcessingResult}.
 *
 * @return the identifier, or {@code null} if none has been assigned
 * @see ProcessingResult
 */
@JsonProperty
public Integer getId()
{
    return this.id;
}
/**
 * Returns {@code true} if this cluster is the {@link #OTHER_TOPICS} cluster, i.e. the
 * attribute is present and set to {@code true}.
 */
public boolean isOtherTopics()
{
    final Boolean flag = getAttribute(OTHER_TOPICS);
    if (flag == null)
    {
        return false;
    }
    return flag.booleanValue();
}
/**
 * Sets or clears the {@link #OTHER_TOPICS} attribute of this cluster. Marking a
 * cluster as Other Topics also sets its score to {@code 0.0}; clearing the mark leaves
 * the score untouched.
 *
 * @param isOtherTopics if {@code true}, this cluster will be marked as an
 *            Other Topics cluster.
 * @return this cluster for convenience
 */
public Cluster setOtherTopics(boolean isOtherTopics)
{
    if (!isOtherTopics)
    {
        removeAttribute(OTHER_TOPICS);
        return this;
    }

    setAttribute(OTHER_TOPICS, Boolean.TRUE);
    setScore(0.0);
    return this;
}
/**
 * Compares clusters by size as returned by {@link #size()}. Clusters with more
 * documents are larger.
 */
public static final Comparator<Cluster> BY_SIZE_COMPARATOR = Ordering.natural()
    .nullsFirst().onResultOf(new Function<Cluster, Integer>()
    {
        @Override
        public Integer apply(Cluster cluster)
        {
            return cluster.size();
        }
    });
/**
 * Compares clusters by score as returned by {@link #SCORE}. Clusters with larger
 * score are larger; clusters without a score come first.
 */
public static final Comparator<Cluster> BY_SCORE_COMPARATOR = Ordering.natural()
    .nullsFirst().onResultOf(new Function<Cluster, Double>()
    {
        @Override
        public Double apply(Cluster cluster)
        {
            return cluster.getScore();
        }
    });
/**
 * Compares clusters by the natural order of their labels as returned by
 * {@link #getLabel()}.
 */
public static final Comparator<Cluster> BY_LABEL_COMPARATOR = Ordering.natural()
    .nullsFirst().onResultOf(new Function<Cluster, String>()
    {
        @Override
        public String apply(Cluster cluster)
        {
            return cluster.getLabel();
        }
    });
/**
 * Compares clusters first by their size as returned by {@link #size()} and labels as
 * returned by {@link #getLabel()}. In case of equal sizes, natural order of the
 * labels decides.
 * <p>
 * Please note: this is a reversed comparator, so "larger" clusters end up
 * nearer the beginning of the list being sorted (which is usually the order in which
 * the applications want to display clusters).
 * </p>
 */
public static final Comparator<Cluster> BY_REVERSED_SIZE_AND_LABEL_COMPARATOR = Ordering
    .from(Collections.reverseOrder(BY_SIZE_COMPARATOR)).compound(BY_LABEL_COMPARATOR);
/**
 * Compares clusters first by their score as returned by {@link #SCORE} and labels as
 * returned by {@link #getLabel()}. In case of equal scores, natural order of the
 * labels decides.
 * <p>
 * Please note: this is a reversed comparator, so "larger" clusters end up
 * nearer the beginning of the list being sorted (which is usually the order in which
 * the applications want to display clusters).
 * </p>
 */
public static final Comparator<Cluster> BY_REVERSED_SCORE_AND_LABEL_COMPARATOR = Ordering
    .from(Collections.reverseOrder(BY_SCORE_COMPARATOR))
    .compound(BY_LABEL_COMPARATOR);
/**
 * Returns a comparator that compares clusters based on the aggregation of their size
 * and score. If {@code scoreWeight} is 0.0, the order depends only on cluster
 * sizes. If {@code scoreWeight} is 1.0, the order depends only on cluster
 * scores. For {@code scoreWeight} values between 0.0 and 1.0, the higher the
 * {@code scoreWeight}, the more contribution of cluster scores to the order. In
 * case of a tie on the aggregated cluster size and score, clusters are compared by
 * the natural order of their labels.
 * <p>
 * Clusters that have no {@link #SCORE} attribute are treated as if their score was
 * {@code 0.0}.
 * </p>
 * <p>
 * Please note: this is a reversed comparator, so "larger" clusters end up
 * nearer the beginning of the list being sorted (which is usually the order in which
 * the applications want to display clusters).
 * </p>
 *
 * @param scoreWeight contribution of the score to the order, between 0.0 and 1.0
 *            (both inclusive)
 * @return the comparator
 * @throws IllegalArgumentException if {@code scoreWeight} is outside of the
 *             {@code [0.0, 1.0]} range or is not a number
 */
public static Comparator<Cluster> byReversedWeightedScoreAndSizeComparator(
    final double scoreWeight)
{
    // The range test is written in the positive form so that NaN is rejected too.
    if (!(scoreWeight >= 0 && scoreWeight <= 1))
    {
        throw new IllegalArgumentException(
            "Score weight must be between 0.0 (inclusive) and 1.0 (inclusive) ");
    }
    return Ordering.natural().onResultOf(new Function<Cluster, Double>()
    {
        @Override
        public Double apply(Cluster cluster)
        {
            // A missing score would otherwise fail on unboxing.
            final Double score = cluster.getScore();
            final double effectiveScore = (score == null ? 0.0 : score.doubleValue());

            // Negated, so that natural (ascending) ordering puts "larger" first.
            return -Math.pow(cluster.size(), (1 - scoreWeight))
                * Math.pow(effectiveScore, scoreWeight);
        }
    }).compound(BY_LABEL_COMPARATOR);
}
/**
 * A comparator that puts {@link #OTHER_TOPICS} clusters at the end of the list. In
 * other words, to this comparator an {@link #OTHER_TOPICS} topics cluster is "bigger"
 * than a non-{@link #OTHER_TOPICS} cluster.
 * <p>
 * <b>Note:</b> This comparator is designed for use in combination with
 * other comparators, such as {@link #BY_REVERSED_SIZE_AND_LABEL_COMPARATOR}. If you
 * only need to partition a list of clusters into regular and other topic ones, this
 * is better done in linear time without resorting to {@link Collections#sort(List)}.
 * </p>
 */
public static final Comparator<Cluster> OTHER_TOPICS_AT_THE_END = Ordering.natural()
    .onResultOf(new Function<Cluster, Double>()
    {
        @Override
        public Double apply(Cluster cluster)
        {
            return cluster.isOtherTopics() ? 1.0 : -1.0;
        }
    });
/**
 * Assigns sequential identifiers to the provided {@code clusters} (and their
 * sub-clusters). If no cluster has an identifier, consecutive identifiers starting
 * from zero are assigned in the order returned by {@link #flatten(Collection)}. If
 * any cluster already has an identifier, no identifier is changed, but then all
 * clusters must have unique, non-null identifiers.
 *
 * @param clusters Clusters to assign identifiers to.
 * @throws IllegalArgumentException if the provided clusters contain non-unique
 *             identifiers or a mix of null and non-null identifiers.
 */
public static void assignClusterIds(Collection<Cluster> clusters)
{
    synchronized (clusters)
    {
        // Traverse inside the lock, so that the collection is read while guarded.
        final List<Cluster> flattened = flatten(clusters);

        boolean hadIds = false;
        for (final Cluster cluster : flattened)
        {
            if (cluster.id != null)
            {
                hadIds = true;
                break;
            }
        }

        if (!hadIds)
        {
            // No identifiers at all: number the clusters sequentially.
            int id = 0;
            for (final Cluster c : flattened)
            {
                c.id = id++;
            }
            return;
        }

        // Some identifiers were provided: verify that all are present and unique.
        final Set<Integer> ids = new HashSet<Integer>();
        for (final Cluster c : flattened)
        {
            // Check for null first, so that several missing identifiers are not
            // misreported as a "duplicated identifier: null".
            if (c.id == null)
            {
                throw new IllegalArgumentException(
                    "Null cluster identifiers cannot be mixed with existing non-null identifiers.");
            }
            if (!ids.add(c.id))
            {
                throw new IllegalArgumentException(
                    "Cluster identifiers must be unique, duplicated identifier: " + c.id);
            }
        }
    }
}
/**
 * Flattens a hierarchy of clusters into a flat list. Each cluster is immediately
 * followed by its (recursively flattened) subclusters.
 *
 * @param hierarchical top-level clusters of the hierarchy
 * @return a new list with all clusters of the hierarchy
 */
public static List<Cluster> flatten(Collection<Cluster> hierarchical)
{
    return flatten(hierarchical, new ArrayList<Cluster>());
}
/*
 * Recursive descent into subclusters: appends each cluster, followed by its
 * subclusters, to the provided list and returns that list.
 */
private static List<Cluster> flatten(Collection<Cluster> hierarchical, List<Cluster> flat)
{
    for (final Cluster c : hierarchical)
    {
        flat.add(c);
        flatten(c.getSubclusters(), flat);
    }
    return flat;
}
/**
 * Locate the first cluster that has id equal to {@code id}. The search includes
 * all the clusters in the input and their sub-clusters. The first cluster with
 * matching identifier is returned or {@code null} if no such cluster could be
 * found. {@code null} elements of the input are skipped.
 *
 * @param id identifier to look for
 * @param clusters clusters to search in
 * @return the matching cluster or {@code null}
 */
public static Cluster find(int id, Collection<Cluster> clusters)
{
    for (final Cluster c : clusters)
    {
        if (c == null)
        {
            continue;
        }

        if (c.id != null && c.id.intValue() == id)
        {
            return c;
        }

        final List<Cluster> children = c.getSubclusters();
        if (!children.isEmpty())
        {
            final Cluster sub = find(id, children);
            if (sub != null)
            {
                return sub;
            }
        }
    }
    return null;
}
/**
 * Builds an "Other Topics" cluster that groups those documents from
 * {@code allDocuments} that were not referenced in any cluster in
 * {@code clusters}. The cluster is labeled with {@link #OTHER_TOPICS_LABEL}.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @return the "Other Topics" cluster
 */
public static Cluster buildOtherTopics(List<Document> allDocuments,
    List<Cluster> clusters)
{
    return buildOtherTopics(allDocuments, clusters, OTHER_TOPICS_LABEL);
}
/**
 * Builds an "Other Topics" cluster that groups those documents from
 * {@code allDocuments} that were not referenced in any cluster in
 * {@code clusters} (including their subclusters). The returned cluster is marked
 * with {@link #setOtherTopics(boolean)} and may contain no documents at all.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster
 */
public static Cluster buildOtherTopics(List<Document> allDocuments,
    List<Cluster> clusters, String label)
{
    // Everything referenced from any cluster or subcluster.
    final Set<Document> assignedDocuments = new HashSet<Document>();
    for (final Cluster cluster : clusters)
    {
        collectAllDocuments(cluster, assignedDocuments);
    }

    // A linked set keeps the unclustered documents in their input order.
    final Set<Document> unclusteredDocuments = new LinkedHashSet<Document>(allDocuments);
    unclusteredDocuments.removeAll(assignedDocuments);

    final Cluster otherTopics = new Cluster(label);
    otherTopics.addDocuments(unclusteredDocuments);
    otherTopics.setOtherTopics(true);
    return otherTopics;
}
/**
 * If there are unclustered documents, appends the "Other Topics" group, labeled with
 * {@link #OTHER_TOPICS_LABEL}, to the {@code clusters}.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents; may be appended to
 * @return the "Other Topics" cluster, whether or not it was appended
 * @see #buildOtherTopics(List, List)
 */
public static Cluster appendOtherTopics(List<Document> allDocuments,
    List<Cluster> clusters)
{
    return appendOtherTopics(allDocuments, clusters, OTHER_TOPICS_LABEL);
}
/**
 * If there are unclustered documents, appends the "Other Topics" group to the
 * {@code clusters}.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents; may be appended to
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster, whether or not it was appended
 * @see #buildOtherTopics(List, List, String)
 */
public static Cluster appendOtherTopics(List<Document> allDocuments,
    List<Cluster> clusters, String label)
{
    final Cluster otherTopics = buildOtherTopics(allDocuments, clusters, label);

    // An empty "Other Topics" group is returned, but not added to the list.
    if (!otherTopics.getDocuments().isEmpty())
    {
        clusters.add(otherTopics);
    }
    return otherTopics;
}
/**
 * An <strong>extremely dodgy</strong> method that remaps {@link Document} references
 * inside this cluster. This operation is allowed only when the cluster has not been
 * assigned an ID yet (so theoretically before the {@link ProcessingResult} has been
 * published). While there are theoretically other ways to achieve the same result
 * (copying the entire set of clusters) this is the most memory and cpu efficient way.
 * <p>
 * Only documents from this cluster are remapped, subclusters need to be processed
 * separately. Documents that have no entry in the mapping are left unchanged.
 * </p>
 *
 * @param docMapping replacement documents, keyed by the identity of the document
 *            to replace
 * @throws IllegalStateException if this cluster already has an identifier
 */
public void remapDocumentReferences(IdentityHashMap<Document, Document> docMapping)
{
    if (this.id != null)
    {
        throw new IllegalStateException(
            "Document references can only be remapped before the cluster is assigned an identifier.");
    }

    for (int i = documents.size(); --i >= 0;)
    {
        final Document remapped = docMapping.get(documents.get(i));
        if (remapped != null)
        {
            documents.set(i, remapped);
        }
    }

    // Invalidate recursive flattened cache.
    this.allDocuments = null;
}
/**
 * Prepares the serialization-only fields: document references and wrapped attributes.
 * The score is left out of the wrapped attributes.
 */
@Persist
private void beforeSerialization()
{
    documentIds = Lists.transform(documents, new Function<Document, DocumentRefid>()
    {
        @Override
        public DocumentRefid apply(Document document)
        {
            return new DocumentRefid(document.getStringId());
        }
    });

    // Remove score from attributes for serialization
    otherAttributesForSerialization = MapUtils.asHashMap(SimpleXmlWrappers
        .wrap(attributes));
    otherAttributesForSerialization.remove(SCORE);

    // Null instead of an empty map, the element is declared as not required.
    if (otherAttributesForSerialization.isEmpty())
    {
        otherAttributesForSerialization = null;
    }
}
/**
 * Restores the state that is not deserialized directly: unwraps the attributes and
 * re-creates the read-only views over phrases and subclusters.
 *
 * @throws Exception if the attributes cannot be unwrapped
 */
@Commit
private void afterDeserialization() throws Exception
{
    if (null != otherAttributesForSerialization)
    {
        this.attributes.putAll(
            SimpleXmlWrappers.unwrap(otherAttributesForSerialization));
    }

    this.phrasesView = Collections.unmodifiableList(this.phrases);
    this.subclustersView = Collections.unmodifiableList(this.subclusters);

    // Documents will be restored on the ProcessingResult level
}
/**
 * For JSON serialization only.
 *
 * @return string identifiers of the documents contained directly in this cluster
 */
@JsonProperty("documents")
private List<String> getDocumentIds()
{
    return Lists.transform(documents, DOCUMENT_TO_ID);
}
/** Maps a document to its string identifier. */
private static final Function<Document, String> DOCUMENT_TO_ID = new Function<Document, String>()
{
    @Override
    public String apply(Document doc)
    {
        return doc.getStringId();
    }
};
/**
 * For JSON serialization only, exposed as the {@code attributes} property.
 *
 * @return a copy of this cluster's attributes, or {@code null} when there are none
 */
@JsonProperty("attributes")
private Map<String, Object> getOtherAttributes()
{
    if (attributesView.isEmpty())
    {
        return null;
    }
    return new HashMap<String, Object>(attributesView);
}
/**
 * Returns a short description of this cluster: its label, size and the number of
 * direct subclusters.
 */
@Override
public String toString()
{
    final StringBuilder description = new StringBuilder();
    description.append("[Cluster, label: ").append(getLabel());
    description.append(", docs: ").append(size());
    description.append(", subclusters: ").append(getSubclusters().size());
    description.append("]");
    return description.toString();
}
}