All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.carrot2.text.preprocessing.CaseNormalizer Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.util.Arrays;
import java.util.List;

import org.carrot2.core.attribute.Processing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.IntRange;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntStack;
import com.carrotsearch.hppc.ShortArrayList;
import com.carrotsearch.hppc.sorting.IndirectSort;
import org.carrot2.shaded.guava.common.collect.Lists;

/**
 * Performs case normalization and calculates a number of frequency statistics for words.
 * The aim of case normalization is to find the most frequently appearing variants of
 * words in terms of case. For example, if in the input documents MacOS appears 20
 * times, Macos 5 times and macos 2 times, case normalizer will select
 * MacOS to represent all variants and assign the aggregated term frequency of 27
 * to it.
 * 

* This class saves the following results to the {@link PreprocessingContext}: *

    *
  • {@link AllTokens#wordIndex}
  • *
  • {@link AllWords#image}
  • *
  • {@link AllWords#tf}
  • *
  • {@link AllWords#tfByDocument}
  • *
*

* This class requires that {@link Tokenizer} be invoked first. */ @Bindable(prefix = "CaseNormalizer") public final class CaseNormalizer { /** * Word Document Frequency threshold. Words appearing in fewer than * dfThreshold documents will be ignored. */ @Processing @Input @Attribute @IntRange(min = 1, max = 100) @Label("Word document frequency threshold") @Level(AttributeLevel.ADVANCED) @Group(DefaultGroups.PREPROCESSING) public int dfThreshold = 1; /** * Performs normalization and saves the results to the context. */ public void normalize(PreprocessingContext context) { // Local references to already existing arrays final char [][] tokenImages = context.allTokens.image; final short [] tokenTypesArray = context.allTokens.type; final int [] documentIndexesArray = context.allTokens.documentIndex; final byte [] tokensFieldIndex = context.allTokens.fieldIndex; final int tokenCount = tokenImages.length; // Sort token images final int [] tokenImagesOrder = IndirectSort.mergesort(tokenImages, 0, tokenImages.length, CharArrayComparators.NORMALIZING_CHAR_ARRAY_COMPARATOR); // Create holders for new arrays final List normalizedWordImages = Lists.newArrayList(); final IntArrayList normalizedWordTf = new IntArrayList(); final List wordTfByDocumentList = Lists.newArrayList(); final ByteArrayList fieldIndexList = new ByteArrayList(); final ShortArrayList types = new ShortArrayList(); final int [] wordIndexes = new int [tokenCount]; Arrays.fill(wordIndexes, -1); // Initial values for counters int tf = 1; int maxTf = 1; int maxTfVariantIndex = tokenImagesOrder[0]; int totalTf = 1; int variantStartIndex = 0; // A byte set for word fields tracking final BitSet fieldIndices = new BitSet(context.allFields.name.length); // A stack for pushing information about the term's documents. final IntStack wordDocuments = new IntStack(); if (documentIndexesArray[tokenImagesOrder[0]] >= 0) { wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]); } // Go through the ordered token images for (int i = 0; i < tokenImagesOrder.length - 1; i++) { final char [] image = tokenImages[tokenImagesOrder[i]]; final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]]; final int tokenType = tokenTypesArray[tokenImagesOrder[i]]; final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]]; // Reached the end of non-null tokens? if (image == null) { break; } // Check if we want to index this token at all if (isNotIndexed(tokenType)) { variantStartIndex = i + 1; maxTfVariantIndex = tokenImagesOrder[i + 1]; resetForNewTokenImage(documentIndexesArray, tokenImagesOrder, fieldIndices, wordDocuments, i); continue; } fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]); // Now check if image case is changing final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR .compare(image, nextImage) == 0; if (sameCase) { // Case has not changed, just increase counters tf++; totalTf++; wordDocuments.push(documentIndex); continue; } // Case (or even token image) has changed. Update most frequent case // variant if (maxTf < tf) { maxTf = tf; maxTfVariantIndex = tokenImagesOrder[i]; tf = 1; } final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR .compare(image, nextImage) == 0; // Check if token image has changed if (sameImage) { totalTf++; wordDocuments.push(documentIndex); } else { // The image has changed completely. // Before we start processing the new image, we need to // see if we want to store the previous image, and if so // we need add some data about it to the arrays // wordDocuments.size() may contain duplicate entries from the same document, // but this check is faster than deduping, so we do it first. if (wordDocuments.size() >= dfThreshold) { // Flatten the list of documents this term occurred in. final int [] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments); final int df = (sparseEncoding.length >> 1); if (df >= dfThreshold) { wordTfByDocumentList.add(sparseEncoding); // Add the word to the word list normalizedWordImages.add(tokenImages[maxTfVariantIndex]); types.add(tokenTypesArray[maxTfVariantIndex]); normalizedWordTf.add(totalTf); fieldIndexList.add((byte) fieldIndices.bits[0]); // Add this word's index in AllWords to all its instances // in the AllTokens multiarray for (int j = variantStartIndex; j < i + 1; j++) { wordIndexes[tokenImagesOrder[j]] = normalizedWordImages.size() - 1; } } } // Reinitialize counters totalTf = 1; tf = 1; maxTf = 1; maxTfVariantIndex = tokenImagesOrder[i + 1]; variantStartIndex = i + 1; // Re-initialize int set used for document frequency calculation resetForNewTokenImage(documentIndexesArray, tokenImagesOrder, fieldIndices, wordDocuments, i); } } // Mapping from allTokens context.allTokens.wordIndex = wordIndexes; context.allWords.image = normalizedWordImages .toArray(new char [normalizedWordImages.size()] []); context.allWords.tf = normalizedWordTf.toArray(); context.allWords.tfByDocument = wordTfByDocumentList.toArray(new int [wordTfByDocumentList.size()] []); context.allWords.fieldIndices = fieldIndexList.toArray(); context.allWords.type = types.toArray(); } /** * Initializes the counters for the a token image. */ private void resetForNewTokenImage(final int [] documentIndexesArray, final int [] tokenImagesOrder, final BitSet fieldIndices, IntStack wordDocuments, int i) { fieldIndices.clear(); wordDocuments.clear(); if (documentIndexesArray[tokenImagesOrder[i + 1]] >= 0) { wordDocuments.push(documentIndexesArray[tokenImagesOrder[i + 1]]); } } /** * Determines whether we should include the token in AllWords. */ private boolean isNotIndexed(final int tokenType) { return tokenType == ITokenizer.TT_PUNCTUATION || tokenType == ITokenizer.TT_FULL_URL || (tokenType & ITokenizer.TF_SEPARATOR_SENTENCE) != 0; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy