
org.carrot2.text.preprocessing.LanguageModelStemmer


Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.util.ArrayList;
import java.util.Set;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllStems;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.attribute.Bindable;

import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.sorting.IndirectSort;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

/**
 * Applies stemming to words and calculates a number of frequency statistics for stems.
 * <p>
 * This class saves the following results to the {@link PreprocessingContext}:
 * <ul>
 * <li>{@link AllWords#stemIndex}</li>
 * <li>{@link AllStems#image}</li>
 * <li>{@link AllStems#mostFrequentOriginalWordIndex}</li>
 * <li>{@link AllStems#tf}</li>
 * <li>{@link AllStems#tfByDocument}</li>
 * <li>{@link AllWords#type} is populated with {@link ITokenizer#TF_QUERY_WORD}</li>
 * </ul>
 * <p>
 * This class requires that {@link Tokenizer} and {@link CaseNormalizer} be invoked first.
 */
@Bindable(prefix = "LanguageModelStemmer")
public final class LanguageModelStemmer
{
    /**
     * Performs stemming and saves the results to the context.
     */
    public void stem(PreprocessingContext context)
    {
        final IStemmer stemmer = context.language.getStemmer();

        final char [][] wordImages = context.allWords.image;
        final char [][] stemImages = new char [wordImages.length] [];

        final MutableCharArray mutableCharArray = new MutableCharArray(
            CharArrayUtils.EMPTY_ARRAY);
        char [] buffer = new char [128];

        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];

            final boolean different = CharArrayUtils.toLowerCase(word, buffer);
            mutableCharArray.reset(buffer, 0, word.length);

            final CharSequence stemmed = stemmer.stem(mutableCharArray);
            if (stemmed != null)
            {
                mutableCharArray.reset(stemmed);
                stemImages[i] = context.intern(mutableCharArray);
            }
            else
            {
                // We need to put the original word here, otherwise, we wouldn't be able
                // to compute frequencies for stems.
                if (different) stemImages[i] = context.intern(mutableCharArray);
                else stemImages[i] = word;
            }
        }

        addStemStatistics(context, stemImages,
            prepareQueryWords(context.query, stemmer));
    }

    /**
     * Adds frequency statistics to the stems.
     */
    private void addStemStatistics(PreprocessingContext context,
        char [][] wordStemImages, Set<MutableCharArray> queryStems)
    {
        final int [] stemImagesOrder = IndirectSort.mergesort(wordStemImages, 0,
            wordStemImages.length, CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR);

        // Local array references
        final int [] wordTfArray = context.allWords.tf;
        final int [][] wordTfByDocumentArray = context.allWords.tfByDocument;
        final byte [] wordsFieldIndices = context.allWords.fieldIndices;
        final short [] wordsType = context.allWords.type;

        final int allWordsCount = wordTfArray.length;

        // Pointers from AllWords to AllStems
        final int [] stemIndexesArray = new int [allWordsCount];

        if (stemImagesOrder.length == 0)
        {
            context.allStems.image = new char [0] [];
            context.allStems.mostFrequentOriginalWordIndex = new int [0];
            context.allStems.tf = new int [0];
            context.allStems.tfByDocument = new int [0] [];
            context.allStems.fieldIndices = new byte [0];

            context.allWords.stemIndex = new int [context.allWords.image.length];
            return;
        }

        // Lists to accommodate the results
        final ArrayList<char []> stemImages = new ArrayList<char []>(allWordsCount);
        final IntArrayList stemTf = new IntArrayList(allWordsCount);
        final IntArrayList stemMostFrequentWordIndexes = new IntArrayList(allWordsCount);
        final ArrayList<int []> stemTfByDocumentList = new ArrayList<int []>(allWordsCount);
        final ByteArrayList fieldIndexList = new ByteArrayList();

        // Counters
        int totalTf = wordTfArray[stemImagesOrder[0]];
        int mostFrequentWordFrequency = wordTfArray[stemImagesOrder[0]];
        int mostFrequentWordIndex = stemImagesOrder[0];
        int stemIndex = 0;

        // A list of document-term-frequency pairs, by document, for all words
        // with identical stems.
        final ArrayList<int []> stemTfsByDocument = Lists.newArrayList();
        stemTfsByDocument.add(wordTfByDocumentArray[stemImagesOrder[0]]);

        byte fieldIndices = 0;
        fieldIndices |= wordsFieldIndices[0];

        // For locating query words
        final MutableCharArray buffer = new MutableCharArray(
            wordStemImages[stemImagesOrder[0]]);
        boolean inQuery = queryStems.contains(buffer);

        // Go through all words in the order of stem images
        for (int i = 0; i < stemImagesOrder.length - 1; i++)
        {
            final int orderIndex = stemImagesOrder[i];
            final char [] stem = wordStemImages[orderIndex];
            final int nextInOrderIndex = stemImagesOrder[i + 1];
            final char [] nextStem = wordStemImages[nextInOrderIndex];

            stemIndexesArray[orderIndex] = stemIndex;
            if (inQuery)
            {
                wordsType[orderIndex] |= ITokenizer.TF_QUERY_WORD;
            }

            // Now check if token image is changing
            final boolean sameStem = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(stem, nextStem) == 0;

            if (sameStem)
            {
                totalTf += wordTfArray[nextInOrderIndex];
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];
                if (mostFrequentWordFrequency < wordTfArray[nextInOrderIndex])
                {
                    mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                    mostFrequentWordIndex = nextInOrderIndex;
                }
            }
            else
            {
                stemImages.add(stem);
                stemTf.add(totalTf);
                stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
                storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
                fieldIndexList.add(fieldIndices);

                stemIndex++;
                totalTf = wordTfArray[nextInOrderIndex];
                mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                mostFrequentWordIndex = nextInOrderIndex;

                fieldIndices = 0;
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];

                stemTfsByDocument.clear();
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);

                buffer.reset(wordStemImages[nextInOrderIndex]);
                inQuery = queryStems.contains(buffer);
            }
        }

        // Store tf for the last stem in the array
        stemImages.add(wordStemImages[stemImagesOrder[stemImagesOrder.length - 1]]);
        stemTf.add(totalTf);
        stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
        stemIndexesArray[stemImagesOrder[stemImagesOrder.length - 1]] = stemIndex;
        storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
        fieldIndexList.add(fieldIndices);

        if (inQuery)
        {
            wordsType[stemImagesOrder[stemImagesOrder.length - 1]] |= ITokenizer.TF_QUERY_WORD;
        }

        // Convert lists to arrays and store them in allStems
        context.allStems.image = stemImages.toArray(new char [stemImages.size()] []);
        context.allStems.mostFrequentOriginalWordIndex = stemMostFrequentWordIndexes
            .toArray();
        context.allStems.tf = stemTf.toArray();
        context.allStems.tfByDocument = stemTfByDocumentList
            .toArray(new int [stemTfByDocumentList.size()] []);
        context.allStems.fieldIndices = fieldIndexList.toArray();

        // References in allWords
        context.allWords.stemIndex = stemIndexesArray;
    }

    /**
     * Stores the per-document term frequencies collected for a single stem:
     * copies the only array over when the stem covers one word, merges the
     * sparse arrays otherwise.
     */
    private void storeTfByDocument(
        ArrayList<int []> target, ArrayList<int []> source)
    {
        assert source.size() > 0 : "Empty source document list?";

        if (source.size() == 1)
        {
            // Just copy the reference over if a single list is available.
            target.add(source.get(0));
        }
        else
        {
            // Merge sparse representations if more than one.
            target.add(SparseArray.mergeSparseArrays(source));
        }
    }

    private Set<MutableCharArray> prepareQueryWords(String query, IStemmer stemmer)
    {
        final Set<MutableCharArray> queryWords = Sets.newHashSet();

        if (query != null)
        {
            final String [] split = query.toLowerCase().split("\\s");
            for (int i = 0; i < split.length; i++)
            {
                final CharSequence stem = stemmer.stem(split[i]);
                if (stem != null)
                {
                    queryWords.add(new MutableCharArray(stem));
                }
                else
                {
                    queryWords.add(new MutableCharArray(split[i]));
                }
            }
        }

        return queryWords;
    }
}
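
The class Javadoc requires that Tokenizer and CaseNormalizer run before the stemmer. The sketch below shows that ordering, assuming a fully populated PreprocessingContext is obtained elsewhere and assuming the 3.x component signatures tokenize(context) and normalize(context); the wrapper class and the helper method preprocess are illustrative, not part of the Carrot2 API.

import org.carrot2.text.preprocessing.CaseNormalizer;
import org.carrot2.text.preprocessing.LanguageModelStemmer;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.Tokenizer;

final class PreprocessingOrderSketch
{
    /**
     * Hypothetical helper: runs the three preprocessing steps in the order
     * LanguageModelStemmer's contract requires.
     */
    static void preprocess(PreprocessingContext context)
    {
        new Tokenizer().tokenize(context);        // fills allWords images and token types
        new CaseNormalizer().normalize(context);  // merges case variants, computes tf
        new LanguageModelStemmer().stem(context); // fills allWords.stemIndex and allStems
    }
}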
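
addStemStatistics groups words that share a stem by indirectly sorting word indices on their stem images and then sweeping the sorted order once, so all statistics are aggregated without a hash map. The toy program below demonstrates just that sort-then-sweep technique on plain strings; it is an illustration of the idea, not Carrot2 code, and all names in it are made up.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

final class SortThenSweepSketch
{
    public static void main(String [] args)
    {
        // Toy stand-ins for wordStemImages and wordTfArray.
        final String [] stems = { "fish", "run", "fish", "run", "cat" };
        final int [] tf = { 2, 1, 3, 4, 5 };

        // Indirect sort: order word indices by stem image, the role
        // IndirectSort.mergesort plays in addStemStatistics.
        final Integer [] order = { 0, 1, 2, 3, 4 };
        Arrays.sort(order, Comparator.comparing(i -> stems[i]));

        // Linear sweep: identical stems are now adjacent, so one pass
        // suffices to total their term frequencies.
        final List<String> images = new ArrayList<>();
        final List<Integer> totals = new ArrayList<>();
        int total = tf[order[0]];
        for (int i = 0; i < order.length - 1; i++)
        {
            if (stems[order[i]].equals(stems[order[i + 1]]))
            {
                total += tf[order[i + 1]];
            }
            else
            {
                images.add(stems[order[i]]);
                totals.add(total);
                total = tf[order[i + 1]];
            }
        }
        // Close the last group, mirroring the "last stem" block in the listing.
        images.add(stems[order[order.length - 1]]);
        totals.add(total);

        System.out.println(images + " " + totals); // [cat, fish, run] [5, 5, 5]
    }
}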



