// org.carrot2.text.preprocessing.CaseNormalizer (Maven / Gradle / Ivy)
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import java.util.Arrays;
import java.util.List;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.IntRange;
import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntStack;
import com.carrotsearch.hppc.ShortArrayList;
import com.carrotsearch.hppc.sorting.IndirectSort;
import org.carrot2.shaded.guava.common.collect.Lists;
/**
 * Performs case normalization and calculates a number of frequency statistics for words.
 * The aim of case normalization is to find the most frequently appearing variants of
 * words in terms of case. For example, if in the input documents <i>MacOS</i> appears 20
 * times, <i>Macos</i> 5 times and <i>macos</i> 2 times, case normalizer will select
 * <i>MacOS</i> to represent all variants and assign the aggregated term frequency of 27
 * to it.
 * <p>
 * This class saves the following results to the {@link PreprocessingContext}:
 * <ul>
 * <li>{@link AllTokens#wordIndex}</li>
 * <li>{@link AllWords#image}</li>
 * <li>{@link AllWords#tf}</li>
 * <li>{@link AllWords#tfByDocument}</li>
 * <li>{@link AllWords#fieldIndices}</li>
 * <li>{@link AllWords#type}</li>
 * </ul>
 * <p>
 * This class requires that {@link Tokenizer} be invoked first.
 */
@Bindable(prefix = "CaseNormalizer")
public final class CaseNormalizer
{
    /**
     * Word Document Frequency threshold. Words appearing in fewer than
     * <code>dfThreshold</code> documents will be ignored.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 1, max = 100)
    @Label("Word document frequency threshold")
    @Level(AttributeLevel.ADVANCED)
    @Group(DefaultGroups.PREPROCESSING)
    public int dfThreshold = 1;

    /**
     * Performs normalization and saves the results to the <code>context</code>.
     * <p>
     * Tokens are visited in the order produced by sorting their images; the algorithm
     * relies on that order placing identical images, and then case variants of the same
     * word, next to each other, with <code>null</code> images at the end (assumption
     * about the comparator, which is defined elsewhere). For every word that passes
     * {@link #dfThreshold} the most frequent case variant is selected as the word's
     * image and one entry is appended to each of the <code>allWords</code> arrays.
     * Tokens that do not map to any word keep <code>-1</code> in
     * {@link AllTokens#wordIndex}.
     *
     * @param context preprocessing context whose <code>allTokens</code> arrays are read
     *        and whose <code>allTokens.wordIndex</code> and <code>allWords</code> arrays
     *        are written
     */
    public void normalize(PreprocessingContext context)
    {
        // Local references to already existing arrays
        final char [][] tokenImages = context.allTokens.image;
        final short [] tokenTypesArray = context.allTokens.type;
        final int [] documentIndexesArray = context.allTokens.documentIndex;
        final byte [] tokensFieldIndex = context.allTokens.fieldIndex;
        final int tokenCount = tokenImages.length;

        // Nothing to normalize: publish empty results instead of failing on
        // tokenImagesOrder[0] below.
        if (tokenCount == 0)
        {
            context.allTokens.wordIndex = new int [0];
            context.allWords.image = new char [0] [];
            context.allWords.tf = new int [0];
            context.allWords.tfByDocument = new int [0] [];
            context.allWords.fieldIndices = new byte [0];
            context.allWords.type = new short [0];
            return;
        }

        // Sort token images
        final int [] tokenImagesOrder = IndirectSort.mergesort(tokenImages, 0,
            tokenImages.length, CharArrayComparators.NORMALIZING_CHAR_ARRAY_COMPARATOR);

        // Create holders for new arrays. The element types must be spelled out: with raw
        // lists toArray(T[]) is typed Object[] and the assignments at the end of this
        // method would not compile.
        final List<char []> normalizedWordImages = Lists.newArrayList();
        final IntArrayList normalizedWordTf = new IntArrayList();
        final List<int []> wordTfByDocumentList = Lists.newArrayList();
        final ByteArrayList fieldIndexList = new ByteArrayList();
        final ShortArrayList types = new ShortArrayList();

        // -1 marks tokens that are not assigned to any word
        final int [] wordIndexes = new int [tokenCount];
        Arrays.fill(wordIndexes, -1);

        // Initial values for counters
        int tf = 1; // occurrences of the current case variant
        int maxTf = 1; // occurrences of the most frequent case variant seen so far
        int maxTfVariantIndex = tokenImagesOrder[0]; // token index of that variant
        int totalTf = 1; // occurrences of the current word, all case variants
        int variantStartIndex = 0; // sorted position of the current word's first token

        // A bit set for word fields tracking
        final BitSet fieldIndices = new BitSet(context.allFields.name.length);

        // A stack for pushing information about the term's documents.
        final IntStack wordDocuments = new IntStack();
        if (documentIndexesArray[tokenImagesOrder[0]] >= 0)
        {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
        }

        // Go through the ordered token images. Each iteration compares the token at
        // sorted position i with its successor, hence the loop stops one short of the end.
        for (int i = 0; i < tokenImagesOrder.length - 1; i++)
        {
            final char [] image = tokenImages[tokenImagesOrder[i]];
            final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]];
            final int tokenType = tokenTypesArray[tokenImagesOrder[i]];
            // Note: this is the document of the NEXT token, the one about to be counted
            final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];

            // Reached the end of non-null tokens?
            if (image == null)
            {
                break;
            }

            // Check if we want to index this token at all
            if (isNotIndexed(tokenType))
            {
                // Skip the token and start tracking afresh from its successor
                variantStartIndex = i + 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];
                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
                continue;
            }

            fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);

            // Now check if image case is changing
            final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;
            if (sameCase)
            {
                // Case has not changed, just increase counters
                tf++;
                totalTf++;
                wordDocuments.push(documentIndex);
                continue;
            }

            // Case (or even token image) has changed. Update most frequent case
            // variant
            if (maxTf < tf)
            {
                maxTf = tf;
                maxTfVariantIndex = tokenImagesOrder[i];
            }

            // The next token starts a new case variant, so its counter must restart
            // regardless of whether the variant just finished was a new maximum.
            // Otherwise the finished variant's count leaks into the following one and a
            // less frequent variant may be selected.
            tf = 1;

            final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;

            // Check if token image has changed
            if (sameImage)
            {
                // Same word, different case: the next token still counts for this word
                totalTf++;
                wordDocuments.push(documentIndex);
            }
            else
            {
                // The image has changed completely.
                // Before we start processing the new image, we need to
                // see if we want to store the previous image, and if so
                // we need add some data about it to the arrays

                // wordDocuments.size() may contain duplicate entries from the same document,
                // but this check is faster than deduping, so we do it first.
                if (wordDocuments.size() >= dfThreshold)
                {
                    // Flatten the list of documents this term occurred in.
                    final int [] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);

                    // Half the encoding length is used as the document frequency
                    // (presumably the encoding holds two ints per distinct document;
                    // confirm against SparseArray).
                    final int df = (sparseEncoding.length >> 1);
                    if (df >= dfThreshold)
                    {
                        wordTfByDocumentList.add(sparseEncoding);

                        // Add the word to the word list
                        normalizedWordImages.add(tokenImages[maxTfVariantIndex]);
                        types.add(tokenTypesArray[maxTfVariantIndex]);
                        normalizedWordTf.add(totalTf);

                        // Only the lowest 8 bits of the first bit set word are kept,
                        // so field indices above 7 are not represented here.
                        fieldIndexList.add((byte) fieldIndices.bits[0]);

                        // Add this word's index in AllWords to all its instances
                        // in the AllTokens multiarray
                        for (int j = variantStartIndex; j < i + 1; j++)
                        {
                            wordIndexes[tokenImagesOrder[j]] = normalizedWordImages.size() - 1;
                        }
                    }
                }

                // Reinitialize counters
                totalTf = 1;
                tf = 1;
                maxTf = 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];
                variantStartIndex = i + 1;

                // Re-initialize int set used for document frequency calculation
                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
            }
        }

        // Mapping from allTokens
        context.allTokens.wordIndex = wordIndexes;

        context.allWords.image = normalizedWordImages
            .toArray(new char [normalizedWordImages.size()] []);
        context.allWords.tf = normalizedWordTf.toArray();
        context.allWords.tfByDocument =
            wordTfByDocumentList.toArray(new int [wordTfByDocumentList.size()] []);
        context.allWords.fieldIndices = fieldIndexList.toArray();
        context.allWords.type = types.toArray();
    }

    /**
     * Initializes the per-word trackers for the token image that follows sorted
     * position <code>i</code>: clears the field bit set and the document stack, then
     * seeds the stack with the document index of the token at position
     * <code>i + 1</code> if that index is non-negative.
     *
     * @param documentIndexesArray document index of every token
     * @param tokenImagesOrder token indices in sorted image order
     * @param fieldIndices field bit set to clear
     * @param wordDocuments document stack to clear and re-seed
     * @param i sorted position of the token that has just been processed
     */
    private void resetForNewTokenImage(final int [] documentIndexesArray,
        final int [] tokenImagesOrder,
        final BitSet fieldIndices, IntStack wordDocuments, int i)
    {
        fieldIndices.clear();
        wordDocuments.clear();
        if (documentIndexesArray[tokenImagesOrder[i + 1]] >= 0)
        {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[i + 1]]);
        }
    }

    /**
     * Determines whether the token should be left out of <code>AllWords</code>.
     *
     * @param tokenType type of the token, as stored in <code>allTokens.type</code>
     * @return <code>true</code> if the type equals {@link ITokenizer#TT_PUNCTUATION} or
     *         {@link ITokenizer#TT_FULL_URL}, or has the
     *         {@link ITokenizer#TF_SEPARATOR_SENTENCE} flag set
     */
    private boolean isNotIndexed(final int tokenType)
    {
        return tokenType == ITokenizer.TT_PUNCTUATION
            || tokenType == ITokenizer.TT_FULL_URL
            || (tokenType & ITokenizer.TF_SEPARATOR_SENTENCE) != 0;
    }
}