// org.carrot2.clustering.lingo.SimpleLabelAssigner (Maven / Gradle / Ivy)
//
// Go to download

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.clustering.lingo;

import java.util.Arrays;

import org.carrot2.mahout.math.function.Functions;
import org.carrot2.mahout.math.matrix.DoubleMatrix2D;
import org.carrot2.matrix.MatrixUtils;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.util.attribute.Bindable;

import com.carrotsearch.hppc.IntIntHashMap;

/**
 * A fast, straightforward label assigner. Every base vector independently receives the
 * candidate label (single word or phrase) with the highest base vector--label term vector
 * cosine similarity. Nothing prevents two base vectors from picking the same label, so
 * once duplicates are removed the number of distinct labels may be lower than the
 * number of base vectors supplied.
 *
 * @see UniqueLabelAssigner
 */
@Bindable
public class SimpleLabelAssigner implements ILabelAssigner
{
    /**
     * Picks one label per column of <code>stemCos</code> and stores the outcome in
     * <code>context.clusterLabelFeatureIndex</code> and
     * <code>context.clusterLabelScore</code>.
     *
     * @param context processing context; its <code>preprocessingContext</code> is read
     *        and its <code>clusterLabelFeatureIndex</code> and
     *        <code>clusterLabelScore</code> fields are overwritten
     * @param stemCos single-word similarity matrix; its column count determines how
     *        many labels are produced
     * @param filteredRowToStemIndex translates the row picked from
     *        <code>stemCos</code> into an index of
     *        <code>allStems.mostFrequentOriginalWordIndex</code>
     * @param phraseCos phrase similarity matrix, or <code>null</code> in which case
     *        only single words are considered
     */
    public void assignLabels(LingoProcessingContext context, DoubleMatrix2D stemCos,
        IntIntHashMap filteredRowToStemIndex, DoubleMatrix2D phraseCos)
    {
        final PreprocessingContext preprocessing = context.preprocessingContext;
        final int phraseOffset = preprocessing.allLabels.firstPhraseIndex;
        final int [] labelFeatures = preprocessing.allLabels.featureIndex;
        final int [] stemToWord = preprocessing.allStems.mostFrequentOriginalWordIndex;
        final int vectorCount = stemCos.columns();

        // Best single-word candidate for each column.
        // NOTE(review): Functions.ABS presumably makes the comparison use absolute
        // values -- confirm against MatrixUtils.maxInColumns.
        final int [] bestStemRow = new int [vectorCount];
        final double [] bestStemScore = new double [vectorCount];
        MatrixUtils.maxInColumns(stemCos, bestStemRow, bestStemScore, Functions.ABS);

        // Best phrase candidate for each column; -1 marks "no phrase available" and
        // stays in place when there is no phrase matrix at all.
        final int [] bestPhraseRow = new int [vectorCount];
        final double [] bestPhraseScore = new double [vectorCount];
        Arrays.fill(bestPhraseRow, -1);
        if (phraseCos != null)
        {
            MatrixUtils.maxInColumns(phraseCos, bestPhraseRow, bestPhraseScore,
                Functions.ABS);
        }

        // Decide, column by column, whether the phrase or the single word wins.
        final int [] chosenFeature = new int [vectorCount];
        final double [] chosenScore = new double [vectorCount];
        for (int v = 0; v < vectorCount; v++)
        {
            final int phraseRow = bestPhraseRow[v];
            final int stem = filteredRowToStemIndex.get(bestStemRow[v]);
            final double wordScore = bestStemScore[v];
            final double phraseScore = bestPhraseScore[v];

            // A phrase must score strictly higher; ties go to the single word.
            final boolean phraseWins = phraseRow >= 0 && phraseScore > wordScore;
            if (phraseWins)
            {
                chosenFeature[v] = labelFeatures[phraseOffset + phraseRow];
                chosenScore[v] = phraseScore;
            }
            else
            {
                chosenFeature[v] = stemToWord[stem];
                chosenScore[v] = wordScore;
            }
        }

        context.clusterLabelFeatureIndex = chosenFeature;
        context.clusterLabelScore = chosenScore;
    }
}