// org.carrot2.text.clustering.MultilingualClustering Maven / Gradle / Ivy
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.clustering;
import java.util.*;
import org.carrot2.core.*;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.carrot2.shaded.guava.common.base.Function;
import org.carrot2.shaded.guava.common.base.Predicate;
import org.carrot2.shaded.guava.common.collect.*;
import org.carrot2.shaded.guava.common.collect.Multiset.Entry;
/**
 * A helper for clustering multilingual collections of documents. The helper partitions
 * the input documents by {@link org.carrot2.core.Document#LANGUAGE}, clusters each such monolingual
 * partition separately and then aggregates the partial cluster lists based on the
 * selected {@link LanguageAggregationStrategy}.
 * <p>
 * Instances keep per-call results in the {@link #languageCounts} and
 * {@link #majorityLanguage} output fields; both are reset at the beginning of every
 * {@link #process(List, IMonolingualClusteringAlgorithm)} call.
 */
@Bindable(prefix = "MultilingualClustering")
public class MultilingualClustering
{
    /** {@link Group} name. */
    private static final String MULTILINGUAL_CLUSTERING = "Multilingual clustering";

    /**
     * Defines how monolingual partial clusters will be combined to form final results.
     */
    public enum LanguageAggregationStrategy
    {
        /**
         * Combines clusters created for all languages into one flat list. In this
         * setting, the first level of the clusters hierarchy will contain labels in all
         * input languages.
         */
        FLATTEN_ALL("Flatten clusters from all languages"),

        /**
         * Puts clusters generated for the largest language partition on the first level
         * of the hierarchy. Clusters generated for the other languages are placed in the
         * "Other Languages" cluster appended at the end of the list. The largest
         * partition that actually produced subclusters is chosen; if no partition did,
         * the result is the same as for {@link #FLATTEN_NONE}.
         */
        FLATTEN_MAJOR_LANGUAGE("Flatten clusters from the majority language"),

        /**
         * Puts clusters corresponding to language names, e.g. English, German, Spanish,
         * on the first level of the hierarchy. Each such cluster contains the actual
         * clusters generated for documents in the corresponding language.
         */
        FLATTEN_NONE("Dedicated parent cluster for each language"),

        /**
         * Clusters all documents assuming the language of the majority of documents.
         * For example, if 40 documents are English, 30 documents German and 30 French,
         * all 100 documents will be clustered with English settings. In case of ties,
         * an arbitrary major language will be chosen. Documents with undefined language
         * are not taken into account when choosing the majority language;
         * {@link MultilingualClustering#defaultLanguage} will be used only when none of
         * the documents has a defined language.
         */
        CLUSTER_IN_MAJORITY_LANGUAGE("Cluster all documents assuming the language of the majority");

        /** Human-readable name of the strategy, returned by {@link #toString()}. */
        private final String label;

        private LanguageAggregationStrategy(String label)
        {
            this.label = label;
        }

        @Override
        public String toString()
        {
            return label;
        }
    }

    /**
     * Logger for this class.
     */
    private static final Logger logger = LoggerFactory.getLogger(MultilingualClustering.class);

    /**
     * Language aggregation strategy. Determines how clusters generated for individual
     * languages should be combined to form the final result. Please see
     * {@link org.carrot2.text.clustering.MultilingualClustering.LanguageAggregationStrategy}
     * for the list of available options.
     */
    @Input
    @Processing
    @Attribute
    @Required
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public LanguageAggregationStrategy languageAggregationStrategy = LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE;

    /**
     * Default clustering language. The default language to use for documents with
     * undefined {@link org.carrot2.core.Document#LANGUAGE}.
     */
    @Input
    @Processing
    @Attribute
    @Required
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public LanguageCode defaultLanguage = LanguageCode.ENGLISH;

    /**
     * Document languages. The number of documents in each language, keyed by the
     * language's ISO code. Empty string key means unknown language.
     */
    @Output
    @Processing
    @Attribute
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public Map<String, Integer> languageCounts;

    /**
     * Majority language.
     * If {@link #languageAggregationStrategy} is
     * {@link org.carrot2.text.clustering.MultilingualClustering.LanguageAggregationStrategy#CLUSTER_IN_MAJORITY_LANGUAGE},
     * this attribute will provide the ISO code of the majority language that was used to
     * cluster all the documents. If none of the documents has a defined language, this
     * attribute will be empty and the clustering will be performed in the
     * {@link #defaultLanguage}. For all other strategies this attribute is empty.
     */
    @Output
    @Processing
    @Attribute
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public String majorityLanguage = "";

    /**
     * Clusters the provided documents according to the current
     * {@link #languageAggregationStrategy}.
     * <p>
     * As a side effect, {@link #languageCounts} and {@link #majorityLanguage} are
     * replaced with values computed for this call. When all documents fall into a single
     * language partition, the result is a flat list of clusters regardless of whether
     * {@link LanguageAggregationStrategy#FLATTEN_ALL},
     * {@link LanguageAggregationStrategy#FLATTEN_MAJOR_LANGUAGE} or
     * {@link LanguageAggregationStrategy#FLATTEN_NONE} was requested.
     *
     * @param documents documents to cluster; an empty list yields an empty result
     * @param algorithm the monolingual algorithm invoked for each language partition
     *        (or once for all documents with
     *        {@link LanguageAggregationStrategy#CLUSTER_IN_MAJORITY_LANGUAGE})
     * @return the aggregated list of top-level clusters
     */
    public List<Cluster> process(List<Document> documents, IMonolingualClusteringAlgorithm algorithm)
    {
        // Both output attributes describe the current call only. Resetting them here
        // prevents a reused instance from reporting values left over from a previous call.
        languageCounts = Maps.newHashMap();
        majorityLanguage = "";

        if (documents.isEmpty())
        {
            return Lists.newArrayList();
        }

        if (languageAggregationStrategy == LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE)
        {
            return clusterInMajorityLanguage(documents, algorithm);
        }

        // Clusters documents in each language separately,
        // creates a map of top-level Cluster instances named after the language code.
        final Map<LanguageCode, Cluster> clustersByLanguage = clusterByLanguage(documents, algorithm);
        final List<Cluster> clusters = Lists.newArrayList(clustersByLanguage.values());

        // A single language partition is always flattened, there is nothing to aggregate.
        if (clusters.size() == 1
            || languageAggregationStrategy == LanguageAggregationStrategy.FLATTEN_ALL)
        {
            return flattenAll(documents, clusters);
        }

        // Biggest language partitions first.
        Collections.sort(clusters, Collections.reverseOrder(Cluster.BY_SIZE_COMPARATOR));

        if (languageAggregationStrategy == LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE)
        {
            return flattenMajorLanguage(clusters);
        }

        // FLATTEN_NONE: one parent cluster per language.
        return clusters;
    }

    /**
     * Mixes the subclusters of all language clusters into one flat list and groups the
     * remaining documents under one common Other Topics cluster.
     *
     * @param documents all input documents, passed on to
     *        {@link Cluster#appendOtherTopics}
     * @param languageClusters one parent cluster per language partition
     */
    private List<Cluster> flattenAll(List<Document> documents, List<Cluster> languageClusters)
    {
        final List<Cluster> flattenedClusters = Lists.newArrayList();
        for (Cluster languageCluster : languageClusters)
        {
            for (Cluster subcluster : languageCluster.getSubclusters())
            {
                // Per-language Other Topics clusters are dropped here, one common
                // Other Topics cluster is appended below instead.
                if (!subcluster.isOtherTopics())
                {
                    flattenedClusters.add(subcluster);
                }
            }
        }

        // If there's more than one language, sort clusters by their number of
        // documents, irrespectively of their original score. We don't know how
        // to normalize the score between languages (independent clustering)
        // and larger clusters are typically more intuitive (better?) than smaller clusters.
        if (languageClusters.size() > 1)
        {
            Collections.sort(flattenedClusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
        }

        Cluster.appendOtherTopics(documents, flattenedClusters);
        return flattenedClusters;
    }

    /**
     * Flattens the first (biggest) language cluster that has some nontrivial subclusters
     * and moves the remaining language clusters under an "Other Languages" cluster
     * appended at the end. If no language cluster has subclusters, the input list is
     * returned unchanged, which is equivalent to
     * {@link LanguageAggregationStrategy#FLATTEN_NONE}.
     *
     * @param languageClusters language clusters ordered from the biggest to the
     *        smallest; the flattened cluster is removed from this list
     */
    private List<Cluster> flattenMajorLanguage(List<Cluster> languageClusters)
    {
        final Cluster majorLanguageCluster = removeFirstWithSubclusters(languageClusters);
        if (majorLanguageCluster == null)
        {
            return languageClusters;
        }

        final List<Cluster> flattenedClusters = Lists.newArrayList();
        flattenedClusters.addAll(majorLanguageCluster.getSubclusters());

        final Cluster otherLanguages = new Cluster("Other Languages");
        otherLanguages.addSubclusters(languageClusters);
        flattenedClusters.add(otherLanguages);
        return flattenedClusters;
    }

    /**
     * Removes and returns the first cluster on the list that has at least one
     * subcluster, or returns <code>null</code> (leaving the list untouched) if there is
     * no such cluster.
     */
    private static Cluster removeFirstWithSubclusters(List<Cluster> clusters)
    {
        final Iterator<Cluster> iterator = clusters.iterator();
        while (iterator.hasNext())
        {
            final Cluster candidate = iterator.next();
            if (!candidate.getSubclusters().isEmpty())
            {
                iterator.remove();
                return candidate;
            }
        }
        return null;
    }

    /**
     * Clusters documents in each language separately.
     * <p>
     * Also fills {@link #languageCounts} with the number of documents per language.
     *
     * @return one parent cluster per language partition, keyed by the partition's
     *         language; documents with undefined language are kept under the
     *         <code>null</code> key in a cluster labelled "Unknown Language"
     */
    private Map<LanguageCode, Cluster> clusterByLanguage(List<Document> documents,
        IMonolingualClusteringAlgorithm algorithm)
    {
        // Partition by language first. As Multimaps.index() does not handle null
        // keys, we'd need to index by LanguageCode string and have a dedicated empty
        // string for the null language.
        final ImmutableListMultimap<String, Document> documentsByLanguage =
            Multimaps.index(documents, new Function<Document, String>()
            {
                public String apply(Document document)
                {
                    final LanguageCode language = document.getLanguage();
                    return language != null ? language.name() : "";
                }
            });

        // For each language, perform clustering. Please note that implementations of
        // IMonolingualClusteringAlgorithm.cluster() are not guaranteed to be thread-safe
        // and hence the method must NOT be called concurrently.
        final Map<LanguageCode, Cluster> clusters = Maps.newHashMap();
        for (String language : documentsByLanguage.keySet())
        {
            final ImmutableList<Document> languageDocuments = documentsByLanguage.get(language);
            final LanguageCode languageCode = language.equals("") ? null : LanguageCode.valueOf(language);

            final Cluster languageCluster = new Cluster(
                languageCode != null ? languageCode.toString() : "Unknown Language");
            languageCounts.put(languageCode != null ? languageCode.getIsoCode() : "",
                languageDocuments.size());

            // Perform clustering, documents with undefined language are processed
            // with the default language settings.
            final LanguageCode currentLanguage = languageCode != null ? languageCode : defaultLanguage;
            logger.debug("Performing monolingual clustering in: {}", currentLanguage);
            final List<Cluster> clustersForLanguage = algorithm.process(
                languageDocuments, currentLanguage);

            final boolean trivialResult = clustersForLanguage.isEmpty()
                || (clustersForLanguage.size() == 1 && clustersForLanguage.get(0).isOtherTopics());
            if (trivialResult)
            {
                // No real clusters for this language: attach the documents directly
                // to the language cluster.
                languageCluster.addDocuments(languageDocuments);
            }
            else
            {
                languageCluster.addSubclusters(clustersForLanguage);
            }

            clusters.put(languageCode, languageCluster);
        }

        return clusters;
    }

    /**
     * Clusters all documents in one pass, using the most frequent defined language
     * among the documents, or {@link #defaultLanguage} if no document has a defined
     * language.
     * <p>
     * Also fills {@link #languageCounts} and, when a majority language was found,
     * sets {@link #majorityLanguage} to its ISO code.
     */
    private List<Cluster> clusterInMajorityLanguage(List<Document> documents,
        IMonolingualClusteringAlgorithm algorithm)
    {
        final Multiset<LanguageCode> counts = HashMultiset.create();
        for (Document document : documents)
        {
            counts.add(document.getLanguage());
        }

        LanguageCode clusteringLanguage = defaultLanguage;
        int maxCount = 0;
        for (Entry<LanguageCode> entry : counts.entrySet())
        {
            final LanguageCode language = entry.getElement();
            final int count = entry.getCount();

            // Undefined language never becomes the majority language; strict
            // comparison means the first of the tied languages encountered wins.
            if (language != null && count > maxCount)
            {
                maxCount = count;
                clusteringLanguage = language;
                this.majorityLanguage = language.getIsoCode();
            }

            languageCounts.put(language != null ? language.getIsoCode() : "", count);
        }

        logger.debug("Performing clustering in majority language: {}", clusteringLanguage);
        final List<Cluster> clusters = algorithm.process(documents, clusteringLanguage);
        Cluster.appendOtherTopics(documents, clusters);
        return clusters;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy