// org.carrot2.clustering.lingo.SimpleLabelAssigner Maven / Gradle / Ivy
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.clustering.lingo;
import java.util.Arrays;
import org.carrot2.mahout.math.function.Functions;
import org.carrot2.mahout.math.matrix.DoubleMatrix2D;
import org.carrot2.matrix.MatrixUtils;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.util.attribute.Bindable;
import com.carrotsearch.hppc.IntIntHashMap;
/**
 * A straightforward, fast {@link ILabelAssigner}. Every base vector is handled
 * independently: the label whose term vector has the highest cosine similarity to that
 * base vector wins. Because no uniqueness is enforced, several base vectors may end up
 * with the same label, so after duplicates are removed there can be fewer labels than
 * there were base vectors.
 *
 * @see UniqueLabelAssigner
 */
@Bindable
public class SimpleLabelAssigner implements ILabelAssigner
{
    /**
     * Picks one label (a single word or a phrase) per column of <code>stemCos</code> and
     * stores the chosen feature indices and their scores in
     * <code>context.clusterLabelFeatureIndex</code> and
     * <code>context.clusterLabelScore</code>.
     *
     * @param context processing context; its <code>preprocessingContext</code> is read
     *        and its cluster label fields are overwritten
     * @param stemCos stem similarity matrix; its column count determines how many labels
     *        are produced
     * @param filteredRowToStemIndex translates a row number of <code>stemCos</code> into
     *        a stem index
     * @param phraseCos phrase similarity matrix, or <code>null</code> in which case only
     *        single-word labels are considered
     */
    public void assignLabels(LingoProcessingContext context, DoubleMatrix2D stemCos,
        IntIntHashMap filteredRowToStemIndex, DoubleMatrix2D phraseCos)
    {
        final PreprocessingContext preprocessing = context.preprocessingContext;
        final int [] labelFeatures = preprocessing.allLabels.featureIndex;
        final int phraseOffset = preprocessing.allLabels.firstPhraseIndex;
        final int [] stemToWord = preprocessing.allStems.mostFrequentOriginalWordIndex;

        final int clusterCount = stemCos.columns();

        // Best single-word candidate for each column.
        // NOTE(review): with Functions.ABS this presumably ranks rows by absolute
        // value -- confirm against MatrixUtils.maxInColumns.
        final int [] bestStemRows = new int [clusterCount];
        final double [] bestStemScores = new double [clusterCount];
        MatrixUtils.maxInColumns(stemCos, bestStemRows, bestStemScores, Functions.ABS);

        // Best phrase candidate for each column; -1 marks "no phrase available" and
        // survives wherever the phrase matrix is absent.
        final int [] bestPhraseRows = new int [clusterCount];
        final double [] bestPhraseScores = new double [clusterCount];
        Arrays.fill(bestPhraseRows, -1);
        if (null != phraseCos)
        {
            MatrixUtils.maxInColumns(phraseCos, bestPhraseRows, bestPhraseScores,
                Functions.ABS);
        }

        // For every base vector keep whichever candidate scored higher; a phrase must
        // score strictly higher than the word to be preferred.
        final int [] chosenFeatures = new int [clusterCount];
        final double [] chosenScores = new double [clusterCount];
        for (int column = 0; column < clusterCount; column++)
        {
            final int stem = filteredRowToStemIndex.get(bestStemRows[column]);
            final int phraseRow = bestPhraseRows[column];
            final double wordScore = bestStemScores[column];
            final double phraseScore = bestPhraseScores[column];

            final boolean phraseWins = phraseRow >= 0 && phraseScore > wordScore;

            chosenFeatures[column] = phraseWins
                ? labelFeatures[phraseOffset + phraseRow]
                : stemToWord[stem];
            chosenScores[column] = phraseWins ? phraseScore : wordScore;
        }

        // Publish the results only once all columns have been processed.
        context.clusterLabelFeatureIndex = chosenFeatures;
        context.clusterLabelScore = chosenScores;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy