org.carrot2.text.preprocessing.DocumentAssigner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).
There is a newer version: 3.16.3
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.util.ArrayList;

import org.carrot2.core.attribute.Processing;
import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.text.preprocessing.PreprocessingContext.AllLabels;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.IntArrayList;
import org.carrot2.shaded.guava.common.collect.Lists;

/**
 * Assigns documents to label candidates. For each label candidate from
 * {@link AllLabels#featureIndex} a {@link BitSet} with the assigned documents is
 * constructed. The assignment algorithm is rather simple: in order to be assigned to a
 * label, a document must contain at least one occurrence of each non-stop word from the
 * label.
 * <p>
 * This class saves the following results to the {@link PreprocessingContext}:
 * <ul>
 * <li>{@link AllLabels#documentIndices}</li>
 * </ul>
 * <p>
 * This class requires that {@link Tokenizer}, {@link CaseNormalizer},
 * {@link StopListMarker}, {@link PhraseExtractor} and {@link LabelFilterProcessor} be
 * invoked first.
 */
@Bindable(prefix = "DocumentAssigner")
public class DocumentAssigner
{
    /**
     * Only exact phrase assignments. Assign only documents that contain the label in its
     * original form, including the order of words. Enabling this option will cause fewer
     * documents to be put in clusters, which results in higher precision of assignment,
     * but also a larger "Other Topics" group. Disabling this option will cause more
     * documents to be put in clusters, which will make the "Other Topics" cluster
     * smaller, but also lower the precision of cluster-document assignments.
     */
    @Input
    @Processing
    @Attribute
    @Label("Exact phrase assignment")
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.PREPROCESSING)
    public boolean exactPhraseAssignment = false;

    /**
     * Determines the minimum number of documents in each cluster.
     */
    @Input
    @Processing
    @Attribute
    @IntRange(min = 1, max = 100)
    @Label("Minimum cluster size")
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.PREPROCESSING)
    public int minClusterSize = 2;

    /**
     * Assigns documents to label candidates and stores the result in
     * {@link AllLabels#documentIndices}.
     * <p>
     * A feature index lower than the number of words denotes a single-word label; such a
     * label receives every document listed for the word's stem. Any other feature index
     * denotes a phrase (phrase index = feature index - number of words). A phrase label
     * receives either the documents listed for the phrase itself (when
     * {@link #exactPhraseAssignment} is enabled) or the documents listed for the stems of
     * all of the phrase's words that are not common words.
     * <p>
     * When {@link #minClusterSize} is greater than 1, labels with fewer assigned
     * documents are removed from both {@link AllLabels#featureIndex} and
     * {@link AllLabels#documentIndices}, and the first phrase index is refreshed through
     * {@link LabelFilterProcessor#updateFirstPhraseIndex(PreprocessingContext)}.
     *
     * @param context the preprocessing context to read label candidates from and to
     *            write the document assignments to
     */
    public void assign(PreprocessingContext context)
    {
        final int [] labelsFeatureIndex = context.allLabels.featureIndex;
        final int [][] stemsTfByDocument = context.allStems.tfByDocument;
        final int [] wordsStemIndex = context.allWords.stemIndex;
        final short [] wordsTypes = context.allWords.type;
        final int [][] phrasesTfByDocument = context.allPhrases.tfByDocument;
        final int [][] phrasesWordIndices = context.allPhrases.wordIndices;
        final int wordCount = wordsStemIndex.length;
        final int documentCount = context.documents.size();

        final BitSet [] labelsDocumentIndices = new BitSet [labelsFeatureIndex.length];

        for (int i = 0; i < labelsFeatureIndex.length; i++)
        {
            final BitSet documentIndices = new BitSet(documentCount);

            final int featureIndex = labelsFeatureIndex[i];
            if (featureIndex < wordCount)
            {
                // Single-word label: take the documents of the word's stem.
                addTfByDocumentToBitSet(documentIndices,
                    stemsTfByDocument[wordsStemIndex[featureIndex]]);
            }
            else
            {
                // Phrase label: phrases are indexed after all the words.
                final int phraseIndex = featureIndex - wordCount;
                if (exactPhraseAssignment)
                {
                    addTfByDocumentToBitSet(documentIndices,
                        phrasesTfByDocument[phraseIndex]);
                }
                else
                {
                    addDocumentsContainingAllWords(documentIndices,
                        phrasesWordIndices[phraseIndex], wordsTypes, wordsStemIndex,
                        stemsTfByDocument, documentCount);
                }
            }

            labelsDocumentIndices[i] = documentIndices;
        }

        // Filter out labels that do not meet the minimum cluster size
        if (minClusterSize > 1)
        {
            final IntArrayList newFeatureIndex = new IntArrayList(
                labelsFeatureIndex.length);
            // The list must be typed: on a raw ArrayList, toArray(T[]) yields Object[],
            // which cannot be assigned to the BitSet[] field below.
            final ArrayList<BitSet> newDocumentIndices = Lists
                .newArrayListWithExpectedSize(labelsFeatureIndex.length);

            for (int i = 0; i < labelsFeatureIndex.length; i++)
            {
                if (labelsDocumentIndices[i].cardinality() >= minClusterSize)
                {
                    newFeatureIndex.add(labelsFeatureIndex[i]);
                    newDocumentIndices.add(labelsDocumentIndices[i]);
                }
            }
            context.allLabels.documentIndices = newDocumentIndices
                .toArray(new BitSet [newDocumentIndices.size()]);
            context.allLabels.featureIndex = newFeatureIndex.toArray();
            LabelFilterProcessor.updateFirstPhraseIndex(context);
        }
        else
        {
            context.allLabels.documentIndices = labelsDocumentIndices;
        }
    }

    /**
     * Fills <code>documentIndices</code> with the documents listed for the stems of all
     * words of a phrase that are not common words. The documents of the first such word
     * are added to the set; the documents of every further such word are intersected
     * with it. If every word of the phrase is a common word, the set is left untouched.
     *
     * @param documentIndices the set to fill, expected to be empty on entry
     * @param wordIndices indices of the words the phrase consists of
     * @param wordsTypes token type of each word, indexed by word index
     * @param wordsStemIndex stem index of each word, indexed by word index
     * @param stemsTfByDocument per-stem document data, indexed by stem index
     * @param documentCount number of documents, used to size temporary sets
     */
    private static void addDocumentsContainingAllWords(final BitSet documentIndices,
        final int [] wordIndices, final short [] wordsTypes,
        final int [] wordsStemIndex, final int [][] stemsTfByDocument,
        final int documentCount)
    {
        boolean firstAdded = false;

        for (int j = 0; j < wordIndices.length; j++)
        {
            final int wordIndex = wordIndices[j];
            if (TokenTypeUtils.isCommon(wordsTypes[wordIndex]))
            {
                continue;
            }

            final int [] tfByDocument = stemsTfByDocument[wordsStemIndex[wordIndex]];
            if (!firstAdded)
            {
                addTfByDocumentToBitSet(documentIndices, tfByDocument);
                firstAdded = true;
            }
            else
            {
                final BitSet temp = new BitSet(documentCount);
                addTfByDocumentToBitSet(temp, tfByDocument);
                // BitSet.and == set intersection
                documentIndices.and(temp);
            }
        }
    }

    /**
     * Sets in <code>documentIndices</code> the bit for every value found at an even
     * position of <code>tfByDocument</code>. The array is read in pairs and only the
     * first element of each pair is used, as a document index; the second element is
     * skipped (presumably the term frequency in that document -- confirm against
     * {@link PreprocessingContext}).
     *
     * @param documentIndices the set to add document indices to
     * @param tfByDocument the paired array to read document indices from
     */
    private static void addTfByDocumentToBitSet(final BitSet documentIndices,
        final int [] tfByDocument)
    {
        for (int j = 0; j < tfByDocument.length / 2; j++)
        {
            documentIndices.set(tfByDocument[j * 2]);
        }
    }
}