All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.carrot2.text.preprocessing.LabelFormatter Maven / Gradle / Ivy

Go to download

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

There is a newer version: 3.16.3
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.attribute.Bindable;

/**
 * Formats cluster labels for final rendering.
 */
@Bindable
public class LabelFormatter
{
    /**
     * Formats a cluster label for final rendering.
     */
    public String format(PreprocessingContext context, int featureIndex)
    {
        final char [][] wordsImage = context.allWords.image;
        final int [][] phrasesWordIndices = context.allPhrases.wordIndices;
        final int wordCount = wordsImage.length;

        final StringBuilder label = new StringBuilder();
        if (featureIndex < wordCount)
        {
            final char [] image = wordsImage[featureIndex];
            appendFormatted(label, image, true, false);
        }
        else
        {
            final boolean insertSpace = context.language.getLanguageCode().usesSpaceDelimiters();
            final int [] wordIndices = phrasesWordIndices[featureIndex - wordCount];
            final short [] termTypes = context.allWords.type;
            for (int i = 0; i < wordIndices.length; i++)
            {
                if (insertSpace && i > 0) label.append(' ');

                final int wordIndex = wordIndices[i];
                appendFormatted(label, wordsImage[wordIndex], i == 0,
                    TokenTypeUtils.isCommon(termTypes[wordIndex]));
            }
        }

        return label.toString();
    }

    /**
     * A method for formatting cluster labels that is not coupled to
     * {@link PreprocessingContext#allLabels} and can be used in algorithms that do not
     * use the full preprocessing pipeline.
     * 
     * @param image images of the words making the label.
     * @param stopWord determines whether the corresponding word of the label is a stop
     *            word
     * @param joinWithSpace if true, label tokens will be joined with a space
     *            character, if false, no extra characters will be inserted
     *            between label tokens.
     */
    public static String format(char [][] image, boolean [] stopWord,
        boolean joinWithSpace)
    {
        final StringBuilder label = new StringBuilder();
        if (image.length == 1)
        {
            appendFormatted(label, image[0], true, stopWord[0]);
        }
        else
        {
            for (int i = 0; i < image.length; i++)
            {
                appendFormatted(label, image[i], i == 0, stopWord[i]);
                if (joinWithSpace && i < image.length - 1)
                {
                    label.append(' ');
                }
            }
        }

        return label.toString();
    }

    /**
     * Appends a segment of the label to the buffer, capitalized or lower-cased depending
     * on the position and content.
     */
    private static void appendFormatted(final StringBuilder label, final char [] image,
        boolean isFirst, boolean isCommon)
    {
        if (CharArrayUtils.hasCapitalizedLetters(image))
        {
            label.append(image);
        }
        else if (isFirst)
        {
            label.append(CharArrayUtils.toCapitalizedCopy(image));
        }
        else
        {
            if (isCommon)
            {
                label.append(CharArrayUtils.toLowerCaseCopy(image));
            }
            else
            {
                label.append(CharArrayUtils.toCapitalizedCopy(image));
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy