// org.carrot2.clustering.synthetic.ByUrlClusteringAlgorithm (from the carrot2-mini artifact)
/*
* Carrot2 project.
*
* Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.clustering.synthetic;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.lang3.ArrayUtils;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.CommonAttributes;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.shaded.guava.common.collect.LinkedHashMultimap;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Multimap;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Output;
/**
* Hierarchically clusters documents according to their content URLs.
* {@link Document#CONTENT_URL} property will be used to obtain a document's URL.
*
* Groups at the top level of the hierarchy will correspond to the last segments of the
* URLs, usually domain suffixes, such as ".com" or ".co.uk". Subgroups will be created
 * based on further segments of the URLs, very often subdomains of those domains, e.g. "yahoo.com",
* "bbc.co.uk" and then e.g. "mail.yahoo.com", "news.yahoo.com". The "www" segment of the
* URLs will be ignored.
*
* Clusters will be ordered by size (number of documents) descendingly; in case of equal
* sizes, alphabetically by URL, see {@link Cluster#BY_REVERSED_SIZE_AND_LABEL_COMPARATOR}.
*/
@Bindable(inherit = CommonAttributes.class)
@Label("By URL Clustering")
public class ByUrlClusteringAlgorithm extends ProcessingComponentBase implements
    IClusteringAlgorithm
{
    /**
     * URL segments ignored during clustering: the ubiquitous "www" prefix carries no
     * grouping information.
     */
    private static final Set<String> STOP_URL_PARTS = Collections.singleton("www");

    /**
     * Documents to cluster.
     */
    @Processing
    @Input
    @Internal
    @Attribute(key = AttributeNames.DOCUMENTS, inherit = true)
    public List<Document> documents;

    /**
     * Clusters created by the algorithm.
     */
    @Processing
    @Output
    @Internal
    @Attribute(key = AttributeNames.CLUSTERS, inherit = true)
    public List<Cluster> clusters = null;

    /**
     * Performs by URL clustering.
     */
    @Override
    public void process() throws ProcessingException
    {
        // Copy to an array for fast random access (the input may be a linked list).
        final Document [] documentArray = this.documents
            .toArray(new Document [this.documents.size()]);

        // Reversed host-name segments per document; null entries mark documents
        // without a usable URL.
        final String [][] urlParts = buildUrlParts(documentArray);

        // Initially, all documents take part in the clustering.
        final List<Integer> documentIndexes = new ArrayList<Integer>(documentArray.length);
        for (int i = 0; i < documentArray.length; i++)
        {
            documentIndexes.add(i);
        }

        this.clusters = createClusters(documentArray, documentIndexes, urlParts, 0, "");

        // No groups could be formed at all: file everything under "Other Sites".
        if (clusters.isEmpty())
        {
            Cluster.appendOtherTopics(documents, clusters, "Other Sites");
        }
    }

    /**
     * The actual, recursive, clustering routine.
     *
     * @param documents all documents being clustered
     * @param documentIndexes indexes (into <code>documents</code>) of the documents to
     *            cluster at this recursion level
     * @param urlParts reversed URL segments per document, parallel to
     *            <code>documents</code>; entries may be null
     * @param level index of the URL segment examined at this recursion depth
     * @param labelSuffix label accumulated from shallower levels, e.g. "yahoo.com"
     * @return clusters created at this level, or an empty list when no segment is
     *         shared by more than one document
     */
    private List<Cluster> createClusters(Document [] documents,
        Collection<Integer> documentIndexes, String [][] urlParts, int level,
        String labelSuffix)
    {
        // Group document indexes by the URL segment at the current level, preserving
        // first-seen order of the segments.
        final Multimap<String, Integer> urlPartToDocumentIndex = LinkedHashMultimap.create();
        for (final Integer documentIndex : documentIndexes)
        {
            final String [] urlPartsForDocument = urlParts[documentIndex.intValue()];
            if (urlPartsForDocument != null && urlPartsForDocument.length > level
                && !STOP_URL_PARTS.contains(urlPartsForDocument[level]))
            {
                urlPartToDocumentIndex.put(urlPartsForDocument[level], documentIndex);
            }
        }

        final Set<Integer> documentsInClusters = new LinkedHashSet<Integer>();
        final List<Cluster> clusters = new ArrayList<Cluster>();
        for (final String urlPart : urlPartToDocumentIndex.keySet())
        {
            final Collection<Integer> indexes = urlPartToDocumentIndex.get(urlPart);

            // Only segments shared by at least two documents form a cluster.
            if (indexes.size() > 1)
            {
                final Cluster cluster = new Cluster();
                String clusterLabel = urlPart
                    + (labelSuffix.length() > 0 ? "." + labelSuffix : "");

                final List<Cluster> subclusters = createClusters(documents, indexes,
                    urlParts, level + 1, clusterLabel);
                if (subclusters.size() > 1)
                {
                    cluster.addSubclusters(subclusters);
                }
                else
                {
                    // only one subcluster -- move the label one level up
                    if (subclusters.size() == 1)
                    {
                        final Cluster subcluster = subclusters.get(0);
                        clusterLabel = subcluster.getPhrases().get(0);
                        cluster.addDocuments(subcluster.getDocuments());
                        cluster.addSubclusters(subcluster.getSubclusters());
                    }
                    else
                    {
                        // Leaf level: attach the documents directly.
                        for (final Integer documentIndex : indexes)
                        {
                            cluster.addDocuments(documents[documentIndex.intValue()]);
                        }
                    }
                }

                cluster.addPhrases(clusterLabel);
                clusters.add(cluster);
                documentsInClusters.addAll(indexes);
            }
        }

        if (documentsInClusters.isEmpty())
        {
            return Lists.newArrayList();
        }

        // Sort clusters by size (descending), then alphabetically by label.
        Collections.sort(clusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);

        // Documents not assigned to any cluster at this level go to "Other Sites".
        final ArrayList<Document> documentsInCluster = Lists
            .newArrayListWithExpectedSize(documentIndexes.size());
        for (Integer documentIndex : documentIndexes)
        {
            documentsInCluster.add(documents[documentIndex]);
        }
        Cluster.appendOtherTopics(documentsInCluster, clusters, "Other Sites");

        return clusters;
    }

    /**
     * For each document builds an array of the dot-separated segments of its host name,
     * in reversed order (e.g. "mail.yahoo.com" becomes <code>["com", "yahoo", "mail"]</code>).
     * The entry is null for documents without a {@link Document#CONTENT_URL} or whose
     * URL ends right after the protocol separator.
     */
    final String [][] buildUrlParts(final Document [] documents)
    {
        final String [][] urlParts = new String [documents.length] [];
        for (int i = 0; i < documents.length; i++)
        {
            final String url = documents[i].getField(Document.CONTENT_URL);
            if (url == null)
            {
                continue;
            }

            // Skip the protocol prefix ("http://"), if any.
            int colonSlashSlashIndex = url.indexOf("://");
            if (colonSlashSlashIndex < 0)
            {
                colonSlashSlashIndex = 0;
            }
            else if (colonSlashSlashIndex + 3 >= url.length())
            {
                // The URL ends right after "://" -- nothing to cluster on.
                continue;
            }
            else
            {
                colonSlashSlashIndex += 3;
            }

            // Find the end of the host part. colonSlashSlashIndex already points at the
            // first host character, so search from it directly -- the previous
            // "+ 3" offset here double-counted the protocol skip and missed the path
            // separator for host names shorter than three characters.
            int slashIndex = url.indexOf('/', colonSlashSlashIndex);
            if (slashIndex < 0)
            {
                slashIndex = url.length();
            }

            // Locale-independent lower-casing: host names must not be subject to
            // locale-specific case rules (e.g. Turkish dotless i).
            final String urlMainPart = url.substring(colonSlashSlashIndex, slashIndex)
                .toLowerCase(Locale.ROOT);
            final String [] splitUrl = urlMainPart.split("\\.");
            ArrayUtils.reverse(splitUrl);
            urlParts[i] = splitUrl;
        }

        return urlParts;
    }
}