/*
* Carrot2 project.
*
* Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import java.util.Arrays;
import java.util.List;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.IntRange;
import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntStack;
import com.carrotsearch.hppc.ShortArrayList;
import com.carrotsearch.hppc.sorting.IndirectSort;
import com.google.common.collect.Lists;
/**
 * Performs case normalization and calculates a number of frequency statistics for words.
 * The aim of case normalization is to find the most frequently appearing variants of
 * words in terms of case. For example, if in the input documents <i>MacOS</i> appears 20
 * times, <i>Macos</i> 5 times and <i>macos</i> 2 times, the case normalizer will select
 * <i>MacOS</i> to represent all variants and assign the aggregated term frequency of 27
 * to it.
 * <p>
 * This class saves the following results to the {@link PreprocessingContext}:
 * <ul>
 * <li>{@link AllTokens#wordIndex}</li>
 * <li>{@link AllWords#image}</li>
 * <li>{@link AllWords#tf}</li>
 * <li>{@link AllWords#tfByDocument}</li>
 * </ul>
 * <p>
 * This class requires that {@link Tokenizer} be invoked first.
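 * <p>
 * A minimal usage sketch (hypothetical wiring; assumes a {@link PreprocessingContext}
 * that has already been populated by {@link Tokenizer}):
 *
 * <pre>
 * PreprocessingContext context = ...; // built and tokenized elsewhere
 * CaseNormalizer caseNormalizer = new CaseNormalizer();
 * caseNormalizer.dfThreshold = 2; // drop words occurring in fewer than 2 documents
 * caseNormalizer.normalize(context);
 * // context.allWords.image now holds one image per case-normalized word
 * </pre>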
*/
@Bindable(prefix = "CaseNormalizer")
public final class CaseNormalizer
{
    /**
     * Word Document Frequency threshold. Words appearing in fewer than
     * <code>dfThreshold</code> documents will be ignored.
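     * <p>
     * For example, with <code>dfThreshold = 2</code> a word that appears in only a
     * single document is dropped from {@link AllWords}.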
*/
@Processing
@Input
@Attribute
@IntRange(min = 1, max = 100)
@Label("Word document frequency threshold")
@Level(AttributeLevel.ADVANCED)
@Group(DefaultGroups.PREPROCESSING)
public int dfThreshold = 1;
    /**
     * Performs normalization and saves the results to the <code>context</code>.
*/
public void normalize(PreprocessingContext context)
{
// Local references to already existing arrays
final char [][] tokenImages = context.allTokens.image;
final short [] tokenTypesArray = context.allTokens.type;
final int [] documentIndexesArray = context.allTokens.documentIndex;
final byte [] tokensFieldIndex = context.allTokens.fieldIndex;
final int tokenCount = tokenImages.length;
// Sort token images
final int [] tokenImagesOrder = IndirectSort.mergesort(tokenImages, 0,
tokenImages.length, CharArrayComparators.NORMALIZING_CHAR_ARRAY_COMPARATOR);
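        // tokenImagesOrder is a permutation of token indices: following it visits
        // the images in case-normalized lexicographic order rather than storage order.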
// Create holders for new arrays
        final List<char []> normalizedWordImages = Lists.newArrayList();
        final IntArrayList normalizedWordTf = new IntArrayList();
        final List<int []> wordTfByDocumentList = Lists.newArrayList();
final ByteArrayList fieldIndexList = new ByteArrayList();
final ShortArrayList types = new ShortArrayList();
final int [] wordIndexes = new int [tokenCount];
Arrays.fill(wordIndexes, -1);
        // Initial values for counters: tf tracks the current case variant, totalTf
        // aggregates all case variants of the current image, and maxTf /
        // maxTfVariantIndex remember the most frequent variant seen so far
int tf = 1;
int maxTf = 1;
int maxTfVariantIndex = tokenImagesOrder[0];
int totalTf = 1;
int variantStartIndex = 0;
        // A bit set tracking the document fields in which the current word appears
final BitSet fieldIndices = new BitSet(context.allFields.name.length);
// A stack for pushing information about the term's documents.
final IntStack wordDocuments = new IntStack();
if (documentIndexesArray[tokenImagesOrder[0]] >= 0)
{
wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
}
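        // Because the sort puts all case variants of a word in one contiguous run,
        // a single forward sweep comparing each image with its successor is enough
        // to detect both case-variant boundaries and whole-word boundaries.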
// Go through the ordered token images
for (int i = 0; i < tokenImagesOrder.length - 1; i++)
{
final char [] image = tokenImages[tokenImagesOrder[i]];
final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]];
final int tokenType = tokenTypesArray[tokenImagesOrder[i]];
final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];
// Reached the end of non-null tokens?
if (image == null)
{
break;
}
// Check if we want to index this token at all
if (isNotIndexed(tokenType))
{
variantStartIndex = i + 1;
maxTfVariantIndex = tokenImagesOrder[i + 1];
resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
fieldIndices, wordDocuments, i);
continue;
}
fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);
// Now check if image case is changing
final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
.compare(image, nextImage) == 0;
if (sameCase)
{
// Case has not changed, just increase counters
tf++;
totalTf++;
wordDocuments.push(documentIndex);
continue;
}
// Case (or even token image) has changed. Update most frequent case
// variant
if (maxTf < tf)
{
maxTf = tf;
maxTfVariantIndex = tokenImagesOrder[i];
tf = 1;
}
final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR
.compare(image, nextImage) == 0;
// Check if token image has changed
if (sameImage)
{
totalTf++;
wordDocuments.push(documentIndex);
}
else
{
// The image has changed completely.
// Before we start processing the new image, we need to
// see if we want to store the previous image, and if so
            // we need to add some data about it to the arrays.
            // wordDocuments may hold duplicate entries pushed for the same document,
            // so its size only overestimates the document frequency; this cheap check
            // lets us skip the exact (deduplicated) count below whenever possible.
if (wordDocuments.size() >= dfThreshold)
{
                    // Flatten the list of documents this term occurred in into a
                    // sparse encoding of (documentIndex, frequency) pairs, so the
                    // document frequency is half the encoded array's length.
final int [] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);
final int df = (sparseEncoding.length >> 1);
if (df >= dfThreshold)
{
wordTfByDocumentList.add(sparseEncoding);
// Add the word to the word list
normalizedWordImages.add(tokenImages[maxTfVariantIndex]);
types.add(tokenTypesArray[maxTfVariantIndex]);
normalizedWordTf.add(totalTf);
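                        // The cast keeps only the lowest 8 bits of the first bit set
                        // word, so only the first 8 document fields are recorded here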
fieldIndexList.add((byte) fieldIndices.bits[0]);
// Add this word's index in AllWords to all its instances
// in the AllTokens multiarray
for (int j = variantStartIndex; j < i + 1; j++)
{
wordIndexes[tokenImagesOrder[j]] = normalizedWordImages.size() - 1;
}
}
}
// Reinitialize counters
totalTf = 1;
tf = 1;
maxTf = 1;
maxTfVariantIndex = tokenImagesOrder[i + 1];
variantStartIndex = i + 1;
// Re-initialize int set used for document frequency calculation
resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
fieldIndices, wordDocuments, i);
}
}
        // Save the results to the context: the token-to-word mapping and the
        // per-word images, frequencies, field indices and token types
context.allTokens.wordIndex = wordIndexes;
context.allWords.image = normalizedWordImages
.toArray(new char [normalizedWordImages.size()] []);
context.allWords.tf = normalizedWordTf.toArray();
context.allWords.tfByDocument =
wordTfByDocumentList.toArray(new int [wordTfByDocumentList.size()] []);
context.allWords.fieldIndices = fieldIndexList.toArray();
context.allWords.type = types.toArray();
}
/**
     * Initializes the counters for a new token image.
*/
private void resetForNewTokenImage(final int [] documentIndexesArray,
final int [] tokenImagesOrder,
final BitSet fieldIndices, IntStack wordDocuments, int i)
{
fieldIndices.clear();
wordDocuments.clear();
if (documentIndexesArray[tokenImagesOrder[i + 1]] >= 0)
{
wordDocuments.push(documentIndexesArray[tokenImagesOrder[i + 1]]);
}
}
/**
* Determines whether we should include the token in AllWords.
*/
private boolean isNotIndexed(final int tokenType)
{
return tokenType == ITokenizer.TT_PUNCTUATION
|| tokenType == ITokenizer.TT_FULL_URL
|| (tokenType & ITokenizer.TF_SEPARATOR_SENTENCE) != 0;
}
}