// org.carrot2.text.clustering.MultilingualClustering Maven / Gradle / Ivy
//
// Go to download

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.clustering;

import java.util.*;

import org.carrot2.core.*;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.carrot2.shaded.guava.common.base.Function;
import org.carrot2.shaded.guava.common.base.Predicate;
import org.carrot2.shaded.guava.common.collect.*;
import org.carrot2.shaded.guava.common.collect.Multiset.Entry;

/**
 * A helper for clustering multilingual collections of documents. The helper partitions
 * the input documents by {@link org.carrot2.core.Document#LANGUAGE}, clusters each such monolingual
 * partition separately and then aggregates the partial cluster lists based on the
 * selected {@link LanguageAggregationStrategy}.
 */
@Bindable(prefix = "MultilingualClustering")
public class MultilingualClustering
{
    /**
     * Name of the attribute {@link Group} to which the attributes declared in this
     * class are assigned.
     */
    private static final String MULTILINGUAL_CLUSTERING = "Multilingual clustering";

    /**
     * Defines how monolingual partial clusters will be combined to form final results.
     * Each constant carries a human-readable label, which is what {@link #toString()}
     * returns.
     */
    public enum LanguageAggregationStrategy
    {
        /**
         * Combines clusters created for all languages into one flat list. In this
         * setting, the first level of the clusters hierarchy will contain labels in all
         * input languages.
         */
        FLATTEN_ALL("Flatten clusters from all languages"),

        /**
         * Puts clusters generated for the largest language partition on the first level
         * of the hierarchy. Clusters generated for the other languages are placed in the
         * "Other Languages" cluster appended at the end of the list.
         */
        FLATTEN_MAJOR_LANGUAGE("Flatten clusters from the majority language"),

        /**
         * Puts clusters corresponding to language names, e.g. English, German, Spanish,
         * on the first level of the hierarchy. Each such cluster contains the actual
         * clusters generated for documents in the corresponding language.
         */
        FLATTEN_NONE("Dedicated parent cluster for each language"),

        /**
         * Clusters all documents assuming the language of the majority of documents.
         * For example, if 40 documents are English, 30 documents German and 30 French,
         * all 100 documents will be clustered with English settings. In case of ties,
         * an arbitrary major language will be chosen. Documents with undefined language
         * do not take part in choosing the majority;
         * {@link MultilingualClustering#defaultLanguage} is used only when no document
         * has a defined language.
         */
        CLUSTER_IN_MAJORITY_LANGUAGE("Cluster all documents assuming the language of the majority");

        /** Human-readable description of the strategy, fixed at construction time. */
        private final String label;

        /**
         * @param label human-readable description returned from {@link #toString()}
         */
        private LanguageAggregationStrategy(String label)
        {
            this.label = label;
        }

        /**
         * Returns the human-readable label of this strategy rather than the constant's
         * name.
         */
        @Override
        public String toString()
        {
            return label;
        }
    }

    /**
     * Logger for this class.
     */
    private static final Logger logger = LoggerFactory.getLogger(MultilingualClustering.class);

    /**
     * Language aggregation strategy. Determines how clusters generated for individual
     * languages should be combined to form the final result. Please see
     * {@link org.carrot2.text.clustering.MultilingualClustering.LanguageAggregationStrategy}
     * for the list of available options.
     */
    @Input
    @Processing
    @Attribute
    @Required
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public LanguageAggregationStrategy languageAggregationStrategy = LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE;

    /**
     * Default clustering language. The default language to use for documents with
     * undefined {@link org.carrot2.core.Document#LANGUAGE}.
     */
    @Input
    @Processing
    @Attribute
    @Required
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public LanguageCode defaultLanguage = LanguageCode.ENGLISH;

    // Keys are the values of LanguageCode#getIsoCode() ("" for documents without a
    // language); values are the numbers of documents counted for that key.
    /**
     * Document languages. The number of documents in each language. Empty string key means
     * unknown language.
     */
    @Output
    @Processing
    @Attribute
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public Map<String, Integer> languageCounts;

    /**
     * Majority language.
     * If {@link #languageAggregationStrategy} is
     * {@link org.carrot2.text.clustering.MultilingualClustering.LanguageAggregationStrategy#CLUSTER_IN_MAJORITY_LANGUAGE},
     * this attribute will provide the majority language that was used to cluster all the documents.
     * If the majority of the documents have undefined language, this attribute will be
     * empty and the clustering will be performed in the {@link #defaultLanguage}.
     */
    @Output
    @Processing
    @Attribute
    @Group(MULTILINGUAL_CLUSTERING)
    @Level(AttributeLevel.MEDIUM)
    public String majorityLanguage = "";
    
    /**
     * Clusters the provided documents, taking the language of each document into
     * account, and combines the partial results as requested by
     * {@link #languageAggregationStrategy}.
     * <p>
     * Before any clustering takes place, {@link #languageCounts} is replaced with a
     * fresh map and {@link #majorityLanguage} is reset to an empty string, so that an
     * instance used more than once never exposes values computed for an earlier list
     * of documents.
     *
     * @param documents documents to cluster; when empty, the algorithm is not invoked
     * @param algorithm monolingual algorithm invoked once per language partition, or
     *        once for all documents when the strategy is
     *        {@link LanguageAggregationStrategy#CLUSTER_IN_MAJORITY_LANGUAGE}
     * @return aggregated clusters; a new empty list when <code>documents</code> is empty
     */
    public List<Cluster> process(List<Document> documents, IMonolingualClusteringAlgorithm algorithm)
    {
        languageCounts = Maps.newHashMap();
        majorityLanguage = "";

        if (documents.isEmpty())
        {
            return Lists.newArrayList();
        }

        if (LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE.equals(languageAggregationStrategy))
        {
            return clusterInMajorityLanguage(documents, algorithm);
        }

        // Cluster documents in each language separately; every value of the map is a
        // top-level cluster representing one language.
        final Map<LanguageCode, Cluster> clustersByLanguage = clusterByLanguage(documents, algorithm);
        final List<Cluster> languageClusters = Lists.newArrayList(clustersByLanguage.values());

        // With a single language there is nothing to aggregate, so the result is
        // flattened regardless of the selected strategy.
        if (languageClusters.size() == 1
            || LanguageAggregationStrategy.FLATTEN_ALL.equals(languageAggregationStrategy))
        {
            return flattenAll(documents, languageClusters);
        }

        // Both remaining strategies list languages starting from the largest one.
        Collections.sort(languageClusters, Collections.reverseOrder(Cluster.BY_SIZE_COMPARATOR));

        if (LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE.equals(languageAggregationStrategy))
        {
            return flattenMajorLanguage(languageClusters);
        }

        // FLATTEN_NONE: one parent cluster per language.
        return languageClusters;
    }

    /**
     * Merges the clusters created for all languages into one flat list. Subclusters
     * flagged as "other topics" are skipped; <code>Cluster.appendOtherTopics</code> is
     * then invoked with the complete document list and the flattened clusters.
     *
     * @param documents all input documents
     * @param languageClusters per-language parent clusters
     * @return the flattened list of clusters
     */
    private static List<Cluster> flattenAll(List<Document> documents, List<Cluster> languageClusters)
    {
        final List<Cluster> flattened = Lists.newArrayList();
        for (Cluster languageCluster : languageClusters)
        {
            for (Cluster subcluster : languageCluster.getSubclusters())
            {
                if (!subcluster.isOtherTopics())
                {
                    flattened.add(subcluster);
                }
            }
        }

        // If there's more than one language, sort clusters by their number of
        // documents, irrespective of their original score. Scores produced by
        // independent clustering runs cannot be normalized between languages and
        // larger clusters are typically more intuitive than smaller ones.
        if (languageClusters.size() > 1)
        {
            Collections.sort(flattened, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
        }

        Cluster.appendOtherTopics(documents, flattened);
        return flattened;
    }

    /**
     * Flattens the first language cluster (in the order given) that has any subclusters
     * and moves the remaining language clusters under an "Other Languages" cluster
     * appended at the end. When no language cluster has subclusters, the input list is
     * returned unchanged, which is equivalent to
     * {@link LanguageAggregationStrategy#FLATTEN_NONE}.
     *
     * @param languageClusters per-language parent clusters, largest first; the
     *        flattened cluster is removed from this list
     * @return the resulting top-level clusters
     */
    private static List<Cluster> flattenMajorLanguage(List<Cluster> languageClusters)
    {
        Cluster majorLanguageCluster = null;
        for (final Iterator<Cluster> it = languageClusters.iterator(); it.hasNext();)
        {
            final Cluster candidate = it.next();
            if (!candidate.getSubclusters().isEmpty())
            {
                // Everything left in the list goes to "Other Languages".
                it.remove();
                majorLanguageCluster = candidate;
                break;
            }
        }

        if (majorLanguageCluster == null)
        {
            return languageClusters;
        }

        final List<Cluster> flattened = Lists.newArrayList(majorLanguageCluster.getSubclusters());

        final Cluster otherLanguages = new Cluster("Other Languages");
        otherLanguages.addSubclusters(languageClusters);
        flattened.add(otherLanguages);
        return flattened;
    }

    /**
     * Clusters documents in each language separately.
     * <p>
     * Documents are partitioned by {@link Document#getLanguage()}; documents without a
     * language form a partition of their own, which is clustered using
     * {@link #defaultLanguage}. For every partition a parent cluster is created, labeled
     * with the language code's <code>toString()</code> value or "Unknown Language". The
     * parent receives the clusters produced by the algorithm, or the partition's
     * documents directly when the algorithm returned no clusters or only a single
     * "other topics" cluster. The size of each partition is recorded in
     * {@link #languageCounts}.
     *
     * @param documents documents to partition and cluster
     * @param algorithm monolingual clustering algorithm, invoked sequentially
     * @return parent clusters keyed by language; the <code>null</code> key holds the
     *         cluster for documents with undefined language
     */
    private Map<LanguageCode, Cluster> clusterByLanguage(List<Document> documents,
        IMonolingualClusteringAlgorithm algorithm)
    {
        // Partition by language first. As Multimaps.index() does not handle null
        // keys, documents are indexed by the LanguageCode name, with an empty
        // string standing for the null language.
        final ImmutableListMultimap<String, Document> documentsByLanguage =
            Multimaps.index(documents, new Function<Document, String>()
            {
                @Override
                public String apply(Document document)
                {
                    final LanguageCode language = document.getLanguage();
                    return language != null ? language.name() : "";
                }
            });

        // For each language, perform clustering. Please note that implementations of
        // IMonolingualClusteringAlgorithm.process() are not guaranteed to be thread-safe
        // and hence the method must NOT be called concurrently.
        final Map<LanguageCode, Cluster> clusters = Maps.newHashMap();
        for (String language : documentsByLanguage.keySet())
        {
            final ImmutableList<Document> languageDocuments = documentsByLanguage.get(language);
            final LanguageCode languageCode = language.isEmpty() ? null : LanguageCode.valueOf(language);
            final Cluster languageCluster = new Cluster(
                languageCode != null ? languageCode.toString() : "Unknown Language");

            languageCounts.put(languageCode != null ? languageCode.getIsoCode() : "",
                languageDocuments.size());

            // Perform clustering
            final LanguageCode currentLanguage = languageCode != null ? languageCode : defaultLanguage;
            logger.debug("Performing monolingual clustering in: {}", currentLanguage);
            final List<Cluster> clustersForLanguage = algorithm.process(
                languageDocuments, currentLanguage);

            // No meaningful clusters: attach the documents to the language cluster itself.
            if (clustersForLanguage.isEmpty()
                || (clustersForLanguage.size() == 1 && clustersForLanguage.get(0).isOtherTopics()))
            {
                languageCluster.addDocuments(languageDocuments);
            }
            else
            {
                languageCluster.addSubclusters(clustersForLanguage);
            }

            clusters.put(languageCode, languageCluster);
        }

        return clusters;
    }
    
    /**
     * Clusters all documents in one pass, using the language that occurs most often
     * among the documents.
     * <p>
     * Documents with undefined language are counted in {@link #languageCounts} (under
     * the empty string key) but are never selected as the majority;
     * {@link #defaultLanguage} is used only when no document has a defined language.
     * Ties are resolved in favor of the language encountered first while iterating the
     * counts, which is an arbitrary order. {@link #majorityLanguage} is always assigned:
     * the ISO code of the selected language, or an empty string when the default
     * language had to be used.
     *
     * @param documents documents to cluster
     * @param algorithm monolingual clustering algorithm, invoked once for all documents
     * @return clusters returned by the algorithm, after
     *         <code>Cluster.appendOtherTopics</code> has been applied to them
     */
    private List<Cluster> clusterInMajorityLanguage(List<Document> documents,
        IMonolingualClusteringAlgorithm algorithm)
    {
        final Multiset<LanguageCode> counts = HashMultiset.create();
        for (Document document : documents)
        {
            counts.add(document.getLanguage());
        }

        LanguageCode dominantLanguage = null;
        int maxCount = 0;
        for (Entry<LanguageCode> entry : counts.entrySet())
        {
            final LanguageCode language = entry.getElement();
            final int count = entry.getCount();

            if (language != null && count > maxCount)
            {
                maxCount = count;
                dominantLanguage = language;
            }

            languageCounts.put(language != null ? language.getIsoCode() : "", count);
        }

        // Assign the output attribute unconditionally so that it never keeps a value
        // computed for a different list of documents.
        this.majorityLanguage = dominantLanguage != null ? dominantLanguage.getIsoCode() : "";

        final LanguageCode clusteringLanguage =
            dominantLanguage != null ? dominantLanguage : defaultLanguage;

        logger.debug("Performing clustering in majority language: {}", clusteringLanguage);
        final List<Cluster> clusters = algorithm.process(documents, clusteringLanguage);
        Cluster.appendOtherTopics(documents, clusters);
        return clusters;
    }
}