org.carrot2.text.preprocessing.LabelFormatter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset
(core algorithms and infrastructure, no document sources).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.attribute.Bindable;
/**
* Formats cluster labels for final rendering.
*/
@Bindable
public class LabelFormatter
{
/**
* Formats a cluster label for final rendering.
*/
public String format(PreprocessingContext context, int featureIndex)
{
final char [][] wordsImage = context.allWords.image;
final int [][] phrasesWordIndices = context.allPhrases.wordIndices;
final int wordCount = wordsImage.length;
final StringBuilder label = new StringBuilder();
if (featureIndex < wordCount)
{
final char [] image = wordsImage[featureIndex];
appendFormatted(label, image, true, false);
}
else
{
final boolean insertSpace = context.language.getLanguageCode().usesSpaceDelimiters();
final int [] wordIndices = phrasesWordIndices[featureIndex - wordCount];
final short [] termTypes = context.allWords.type;
for (int i = 0; i < wordIndices.length; i++)
{
if (insertSpace && i > 0) label.append(' ');
final int wordIndex = wordIndices[i];
appendFormatted(label, wordsImage[wordIndex], i == 0,
TokenTypeUtils.isCommon(termTypes[wordIndex]));
}
}
return label.toString();
}
/**
* A method for formatting cluster labels that is not coupled to
* {@link PreprocessingContext#allLabels} and can be used in algorithms that do not
* use the full preprocessing pipeline.
*
* @param image images of the words making the label.
* @param stopWord determines whether the corresponding word of the label is a stop
* word
* @param joinWithSpace if true
, label tokens will be joined with a space
* character, if false
, no extra characters will be inserted
* between label tokens.
*/
public static String format(char [][] image, boolean [] stopWord,
boolean joinWithSpace)
{
final StringBuilder label = new StringBuilder();
if (image.length == 1)
{
appendFormatted(label, image[0], true, stopWord[0]);
}
else
{
for (int i = 0; i < image.length; i++)
{
appendFormatted(label, image[i], i == 0, stopWord[i]);
if (joinWithSpace && i < image.length - 1)
{
label.append(' ');
}
}
}
return label.toString();
}
/**
* Appends a segment of the label to the buffer, capitalized or lower-cased depending
* on the position and content.
*/
private static void appendFormatted(final StringBuilder label, final char [] image,
boolean isFirst, boolean isCommon)
{
if (CharArrayUtils.hasCapitalizedLetters(image))
{
label.append(image);
}
else if (isFirst)
{
label.append(CharArrayUtils.toCapitalizedCopy(image));
}
else
{
if (isCommon)
{
label.append(CharArrayUtils.toLowerCaseCopy(image));
}
else
{
label.append(CharArrayUtils.toCapitalizedCopy(image));
}
}
}
}