org.carrot2.text.preprocessing.PreprocessingContext Maven / Gradle / Ivy

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;

import org.carrot2.core.Document;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.LanguageModel;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.text.util.TabularOutput;

import com.carrotsearch.hppc.*;

/**
 * Document preprocessing context provides low-level (usually integer-coded) data
 * structures useful for further processing.
 *
 * <p>
 * Figure (diagram not reproduced here): Internals of PreprocessingContext.
 */
public final class PreprocessingContext
{
    /** Uninitialized structure constant. */
    private static final String UNINITIALIZED = "[uninitialized]\n";

    /** Query used to perform processing, may be null. */
    public final String query;

    /** A list of documents to process. */
    public final List<Document> documents;

    /** Language model to be used. */
    public final LanguageModel language;

    /**
     * Token interning cache. Token images are interned to save memory and allow
     * reference comparisons.
     */
    private ObjectOpenHashSet<MutableCharArray> tokenCache = ObjectOpenHashSet.newInstance();

    /**
     * Creates a preprocessing context for the provided documents and with the provided
     * languageModel.
     */
    public PreprocessingContext(LanguageModel languageModel, List<Document> documents,
        String query)
    {
        this.query = query;
        this.documents = documents;
        this.language = languageModel;
    }

    /**
     * Information about all tokens of the input {@link PreprocessingContext#documents}.
     * Each element of each of the arrays corresponds to one individual token from the
     * input or a synthetic separator inserted between documents, fields and sentences.
     * The last element of this array is a special terminator entry.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllTokens
    {
        /**
         * Token image as it appears in the input. On positions where {@link #type} is
         * equal to one of {@link ITokenizer#TF_TERMINATOR},
         * {@link ITokenizer#TF_SEPARATOR_DOCUMENT} or
         * {@link ITokenizer#TF_SEPARATOR_FIELD}, the image is null.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public char [][] image;

        /**
         * Token's {@link ITokenizer} bit flags.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public short [] type;

        /**
         * Document field the token came from. The index points to arrays in
         * {@link AllFields}, equal to -1 for document and field separators.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public byte [] fieldIndex;

        /**
         * Index of the document this token came from, points to elements of
         * {@link PreprocessingContext#documents}. Equal to -1 for document separators.
         * <p>
         * This array is produced by {@link Tokenizer}.
         * <p>
         * This array is accessed in {@link CaseNormalizer} and {@link PhraseExtractor}
         * to compute by-document statistics, e.g. tf by document, which are then needed
         * to build a VSM or assign documents to labels. An alternative to this
         * representation would be creating an AllDocuments holder, keeping there an
         * array of start token indexes for each document, and then refactoring the
         * model building code to do a binary search to determine the document index for
         * a given token index. This is likely to be a significant performance hit
         * because the model building code accesses the documentIndex array pretty much
         * randomly (in the suffix order), so we'd be doing twice-the-number-of-tokens
         * binary searches. Unless there's some other data structure that can help us
         * here.
         */
        public int [] documentIndex;
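        // A hedged sketch (not part of the original source) of the linear scan the
        // note above describes: tallying occurrences of a word per document in one
        // pass over the token arrays. someWordIndex is a hypothetical word of
        // interest; IntIntOpenHashMap comes from HPPC, already a dependency here.
        //
        //   IntIntOpenHashMap tfForWord = new IntIntOpenHashMap();
        //   for (int t = 0; t < wordIndex.length; t++) {
        //       if (wordIndex[t] == someWordIndex && documentIndex[t] >= 0) {
        //           tfForWord.putOrAdd(documentIndex[t], 1, 1); // docIndex -> running tf
        //       }
        //   }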

        /**
         * A pointer to {@link AllWords} arrays for this token. Equal to -1 for
         * document, field and {@link ITokenizer#TT_PUNCTUATION} tokens (including
         * sentence separators).
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public int [] wordIndex;

        /**
         * The suffix order of tokens. Suffixes starting with a separator come at the
         * end of the array.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [] suffixOrder;

        /**
         * The Longest Common Prefix for the adjacent suffix-sorted token sequences.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [] lcp;

        /** For debugging purposes. */
        @Override
        public String toString()
        {
            if (image == null)
            {
                return UNINITIALIZED;
            }

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);
            t.addColumn("#");
            t.addColumn("token").alignLeft();
            t.addColumn("type");
            t.addColumn("fieldIndex");
            t.addColumn("=>field").alignLeft();
            t.addColumn("docIdx");
            t.addColumn("wordIdx");
            t.addColumn("=>word").alignLeft();

            for (int i = 0; i < image.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    image[i] == null ? "" : new String(image[i]),
                    type[i],
                    fieldIndex[i],
                    fieldIndex[i] >= 0 ? allFields.name[fieldIndex[i]] : null,
                    documentIndex[i],
                    wordIndex[i],
                    wordIndex[i] >= 0 ? new String(allWords.image[wordIndex[i]]) : null);
            }

            if (suffixOrder != null)
            {
                t = new TabularOutput(sw);
                t.addColumn("#");
                t.addColumn("sa");
                t.addColumn("lcp");
                t.addColumn("=>words").alignLeft();

                sw.append("\n");
                final StringBuilder suffixImage = new StringBuilder();
                for (int i = 0; i < suffixOrder.length; i++, t.nextRow())
                {
                    t.rowData(i, suffixOrder[i], lcp[i]);

                    int windowLength = 5;
                    for (int j = suffixOrder[i],
                        max = Math.min(suffixOrder[i] + windowLength, wordIndex.length); j < max;)
                    {
                        suffixImage.append(
                            wordIndex[j] >= 0 ? new String(allWords.image[wordIndex[j]]) : "|")
                            .append(" ");
                        if (++j == max && j != wordIndex.length) suffixImage.append(" [...]");
                    }
                    t.rowData(suffixImage.toString());
                    suffixImage.setLength(0);
                }
                sw.append("\n");
            }

            t.flush();
            sw.append("\n");
            return sw.toString();
        }
    }

    /**
     * Information about all tokens of the input {@link PreprocessingContext#documents}.
     */
    public final AllTokens allTokens = new AllTokens();
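    // Illustrative note (an assumption, not in the original source): PhraseExtractor
    // uses allTokens.suffixOrder and allTokens.lcp together to find recurring phrases.
    // If two adjacent entries of suffixOrder share lcp >= 2, the two-word sequence at
    // the start of both suffixes occurs at least twice in the input. For example, with
    // tokens "a b c | a b d", the suffixes starting at "a b c" and "a b d" sort next
    // to each other and share an lcp of 2, exposing the repeated bigram "a b".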

    /**
     * Information about all fields processed for the input
     * {@link PreprocessingContext#documents}.
     */
    public static class AllFields
    {
        /**
         * Name of the document field. Entries of {@link AllTokens#fieldIndex} point to
         * this array.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public String [] name;

        /** For debugging purposes. */
        @Override
        public String toString()
        {
            if (name == null)
            {
                return UNINITIALIZED;
            }

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);
            t.addColumn("#");
            t.addColumn("name").format("%-10s").alignLeft();

            int i = 0;
            for (String n : name)
            {
                t.rowData(i++, n).nextRow();
            }

            t.flush();
            sw.append("\n");
            return sw.toString();
        }
    }

    /**
     * Information about all fields processed for the input
     * {@link PreprocessingContext#documents}.
     */
    public final AllFields allFields = new AllFields();

    /**
     * Information about all unique words found in the input
     * {@link PreprocessingContext#documents}. An entry in each parallel array
     * corresponds to one conflated form of a word. For example, "data" and "DATA" will
     * most likely become a single entry in the words table. However, different
     * grammatical forms of a single lemma (like "computer" and "computers") will have
     * different entries in the words table. See {@link AllStems} for
     * inflection-conflated versions.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllWords
    {
        /**
         * The most frequently appearing variant of the word with respect to case. E.g.
         * if a token "MacOS" appeared 12 times in the input and "macos" appeared 3
         * times, the image will be equal to "MacOS".
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public char [][] image;

        /**
         * Token type of this word, copied from {@link AllTokens#type}. Additional flags
         * are set for each word by {@link CaseNormalizer} and
         * {@link LanguageModelStemmer}.
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         * This array is modified by {@link LanguageModelStemmer}.
         *
         * @see ITokenizer
         */
        public short [] type;

        /**
         * Term Frequency of the word, aggregated across all variants with respect to
         * case. Frequencies for each variant separately are not available.
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public int [] tf;

        /**
         * Term Frequency of the word for each document. The length of this array is
         * equal to the number of documents this word appeared in (Document Frequency)
         * multiplied by 2. Elements at even indices contain document indices pointing
         * to {@link PreprocessingContext#documents}, elements at odd indices contain
         * the frequency of the word in that document. For example, an array with 4
         * values: [2, 15, 138, 7] means that the word appeared 15 times in the document
         * at index 2 and 7 times in the document at index 138.
         * <p>
         * This array is produced by {@link CaseNormalizer}. The order of documents in
         * this array is not defined.
         */
        public int [][] tfByDocument;
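        // A hedged decoding sketch (not in the original source) for the pair encoding
        // documented above; w is a hypothetical index into these arrays:
        //
        //   int [] pairs = tfByDocument[w];   // e.g. [2, 15, 138, 7]
        //   for (int k = 0; k < pairs.length; k += 2) {
        //       int doc = pairs[k];           // index into PreprocessingContext.documents
        //       int tfInDoc = pairs[k + 1];   // occurrences of word w in that document
        //   }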

        /**
         * A pointer to the {@link AllStems} arrays for this word.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public int [] stemIndex;

        /**
         * Bit-packed indices of all fields in which this word appears at least once.
         * Indexes (positions) of the set bits are pointers to the {@link AllFields}
         * arrays. Fast conversion between the bit-packed representation and a byte []
         * with index values is done by {@link #toFieldIndexes(byte)}.
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public byte [] fieldIndices;
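        // A hedged sketch (not in the original source) of unpacking the bit mask with
        // PreprocessingContext#toFieldIndexes(byte); w is a hypothetical word index:
        //
        //   for (int field : toFieldIndexes(fieldIndices[w])) {
        //       String fieldName = allFields.name[field];   // e.g. "title"
        //   }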

        /** For debugging purposes. */
        @Override
        public String toString()
        {
            if (image == null)
            {
                return UNINITIALIZED;
            }

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);
            t.addColumn("#");
            t.addColumn("image").alignLeft();
            t.addColumn("type");
            t.addColumn("tf");
            t.addColumn("tfByDocument").alignLeft();
            t.addColumn("fieldIndices");
            if (stemIndex != null)
            {
                t.addColumn("stemIndex");
                t.addColumn("=>stem").alignLeft();
            }

            for (int i = 0; i < image.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    image[i] == null ? "" : new String(image[i]),
                    type[i],
                    tf[i],
                    SparseArray.sparseToString(tfByDocument[i]));

                t.rowData(Arrays.toString(toFieldIndexes(fieldIndices[i])).replace(" ", ""));

                if (stemIndex != null)
                {
                    t.rowData(stemIndex[i]);
                    t.rowData(new String(allStems.image[stemIndex[i]]));
                }
            }

            t.flush();
            sw.append("\n");
            return sw.toString();
        }
    }

    /**
     * Information about all unique words found in the input
     * {@link PreprocessingContext#documents}.
     */
    public final AllWords allWords = new AllWords();

    /**
     * Information about all unique stems found in the input
     * {@link PreprocessingContext#documents}. Each entry in each array corresponds to
     * one base form different words can be transformed to by the {@link IStemmer} used
     * while processing. E.g. the English "mining" and "mine" will be aggregated to one
     * entry in the arrays, while they will have separate entries in {@link AllWords}.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllStems
    {
        /**
         * Stem image as produced by the {@link IStemmer}, may not correspond to any
         * correct word.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public char [][] image;

        /**
         * Pointer to the {@link AllWords} arrays, to the most frequent original form of
         * the stem. Pointers to the less frequent variants are not available.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public int [] mostFrequentOriginalWordIndex;

        /**
         * Term frequency of the stem, i.e. the sum of all {@link AllWords#tf} values
         * for which the {@link AllWords#stemIndex} points to this stem.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public int [] tf;

        /**
         * Term frequency of the stem for each document. For the encoding of this array,
         * see {@link AllWords#tfByDocument}.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}. The order of
         * documents in this array is not defined.
         */
        public int [][] tfByDocument;

        /**
         * Bit-packed indices of all fields in which this stem appears at least once.
         * Indexes (positions) of the set bits are pointers to the {@link AllFields}
         * arrays. Fast conversion between the bit-packed representation and a byte []
         * with index values is done by {@link #toFieldIndexes(byte)}.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public byte [] fieldIndices;

        /** For debugging purposes. */
        @Override
        public String toString()
        {
            if (image == null)
            {
                return UNINITIALIZED;
            }

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);
            t.addColumn("#");
            t.addColumn("stem");
            t.addColumn("mostFrqWord");
            t.addColumn("=>mostFrqWord").alignLeft();
            t.addColumn("tf");
            t.addColumn("tfByDocument").alignLeft();
            t.addColumn("fieldIndices");

            for (int i = 0; i < image.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    image[i] == null ? "" : new String(image[i]),
                    mostFrequentOriginalWordIndex[i],
                    new String(allWords.image[mostFrequentOriginalWordIndex[i]]),
                    tf[i],
                    SparseArray.sparseToString(tfByDocument[i]),
                    Arrays.toString(toFieldIndexes(fieldIndices[i])).replace(" ", ""));
            }

            t.flush();
            sw.append("\n");
            return sw.toString();
        }
    }

    /**
     * Information about all unique stems found in the input
     * {@link PreprocessingContext#documents}.
     */
    public final AllStems allStems = new AllStems();
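    // Worked example (an assumption, not from the original source): if AllWords holds
    // "mine" (tf = 3) and "mining" (tf = 4), both with stemIndex pointing at the same
    // stem s, then allStems.tf[s] == 7 and allStems.mostFrequentOriginalWordIndex[s]
    // points at "mining", the more frequent surface form. Equivalently:
    //
    //   int sum = 0;
    //   for (int w = 0; w < allWords.stemIndex.length; w++) {
    //       if (allWords.stemIndex[w] == s) sum += allWords.tf[w];
    //   }
    //   // sum == allStems.tf[s]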

    /**
     * Information about all frequently appearing sequences of words found in the input
     * {@link PreprocessingContext#documents}. Each entry in each array corresponds to
     * one sequence.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllPhrases
    {
        /**
         * Pointers to {@link AllWords} for each word in the phrase sequence.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [][] wordIndices;

        /**
         * Term frequency of the phrase.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [] tf;

        /**
         * Term frequency of the phrase for each document. The encoding of this array is
         * the same as in {@link AllWords#tfByDocument}: consecutive pairs of document
         * index and frequency.
         * <p>
         * This array is produced by {@link PhraseExtractor}. The order of documents in
         * this array is not defined.
         */
        public int [][] tfByDocument;

        /** For debugging purposes. */
        @Override
        public String toString()
        {
            if (wordIndices == null)
            {
                return UNINITIALIZED;
            }

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);
            t.addColumn("#");
            t.addColumn("wordIndices");
            t.addColumn("=>words").alignLeft();
            t.addColumn("tf");
            t.addColumn("tfByDocument").alignLeft();

            for (int i = 0; i < wordIndices.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    Arrays.toString(wordIndices[i]).replace(" ", ""),
                    getPhrase(i),
                    tf[i],
                    SparseArray.sparseToString(tfByDocument[i]));
            }

            t.flush();
            sw.append("\n");
            return sw.toString();
        }

        /** Returns space-separated words that constitute this phrase. */
        public CharSequence getPhrase(int index)
        {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < wordIndices[index].length; i++)
            {
                if (i > 0) sb.append(" ");
                sb.append(new String(allWords.image[wordIndices[index][i]]));
            }
            return sb;
        }

        /**
         * Returns the length of all arrays in this {@link AllPhrases}.
         */
        public int size()
        {
            return wordIndices.length;
        }
    }

    /**
     * Information about all frequently appearing sequences of words found in the input
     * {@link PreprocessingContext#documents}.
     */
    public AllPhrases allPhrases = new AllPhrases();

    /**
     * Information about words and phrases that might be good cluster label candidates.
     * Each entry in each array corresponds to one label candidate.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllLabels
    {
        /**
         * Feature index of the label candidate. Features whose values are less than the
         * size of the {@link AllWords} arrays are single-word features and point to
         * entries in {@link AllWords}. Features whose values are greater than or equal
         * to the size of {@link AllWords}, after subtracting the size of
         * {@link AllWords}, point to {@link AllPhrases}.
         * <p>
         * This array is produced by {@link LabelFilterProcessor}.
         */
        public int [] featureIndex;

        /**
         * Indices of documents assigned to the label candidate.
         * <p>
         * This array is produced by {@link DocumentAssigner}.
         */
        public BitSet [] documentIndices;
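        // A hedged decoding sketch (not in the original source) of the featureIndex
        // convention documented above; it mirrors what getLabel(int) below does.
        // labelIndex is a hypothetical index into these arrays:
        //
        //   int feature = featureIndex[labelIndex];
        //   if (feature < allWords.image.length) {
        //       String label = new String(allWords.image[feature]);        // one word
        //   } else {
        //       CharSequence label =
        //           allPhrases.getPhrase(feature - allWords.image.length); // a phrase
        //   }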

        /**
         * The first index in {@link #featureIndex} which points to {@link AllPhrases},
         * or -1 if there are no phrases in {@link #featureIndex}.
         * <p>
         * This value is set by {@link LabelFilterProcessor}.
         *
         * @see #featureIndex
         */
        public int firstPhraseIndex;

        /** For debugging purposes. */
        @Override
        public String toString()
        {
            if (featureIndex == null) return UNINITIALIZED;

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);
            t.addColumn("#");
            t.addColumn("featureIdx");
            t.addColumn("=>feature").alignLeft();
            t.addColumn("documentIdx").alignLeft();

            for (int i = 0; i < featureIndex.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    featureIndex[i],
                    getLabel(i),
                    documentIndices != null
                        ? documentIndices[i].toString().replace(" ", "") : "");
            }

            t.flush();
            sw.append("\n");
            return sw.toString();
        }

        private CharSequence getLabel(int index)
        {
            final int wordsSize = allWords.image.length;
            if (featureIndex[index] < wordsSize)
                return new String(allWords.image[featureIndex[index]]);
            else
                return allPhrases.getPhrase(featureIndex[index] - wordsSize);
        }
    }

    /**
     * Information about words and phrases that might be good cluster label candidates.
     */
    public final AllLabels allLabels = new AllLabels();

    /**
     * Returns true if this context contains any words.
     */
    public boolean hasWords()
    {
        return allWords.image.length > 0;
    }

    /**
     * Returns true if this context contains any label candidates.
     */
    public boolean hasLabels()
    {
        return allLabels.featureIndex != null && allLabels.featureIndex.length > 0;
    }

    @Override
    public String toString()
    {
        return "PreprocessingContext 0x" + Integer.toHexString(this.hashCode()) + "\n"
            + "== Fields:\n" + this.allFields.toString()
            + "== Tokens:\n" + this.allTokens.toString()
            + "== Words:\n" + this.allWords.toString()
            + "== Stems:\n" + this.allStems.toString()
            + "== Phrases:\n" + this.allPhrases.toString()
            + "== Labels:\n" + this.allLabels.toString();
    }

    /**
     * Static conversion between selected bits and an array of indexes of these bits.
     */
    private final static int [][] bitsCache;
    static
    {
        bitsCache = new int [0x100] [];
        for (int i = 0; i < 0x100; i++)
        {
            bitsCache[i] = new int [Integer.bitCount(i & 0xFF)];
            for (int v = 0, bit = 0, j = i & 0xff; j != 0; j >>>= 1, bit++)
            {
                if ((j & 0x1) != 0) bitsCache[i][v++] = bit;
            }
        }
    }

    /**
     * Converts the selected bits in a byte to an array of indexes.
     */
    public static int [] toFieldIndexes(byte b)
    {
        return bitsCache[b & 0xff];
    }

    /*
     * These should really be package-private, shouldn't they? We'd need to move classes
     * under pipeline here for accessibility.
     */

    /**
     * This method should be invoked after all preprocessing contributors have been
     * executed to release temporary data structures.
     */
    public void preprocessingFinished()
    {
        this.tokenCache = null;
    }

    /**
     * Returns a unique char buffer representing a given character sequence.
     */
    public char [] intern(MutableCharArray chs)
    {
        if (tokenCache.contains(chs))
        {
            return tokenCache.lkey().getBuffer();
        }
        else
        {
            final char [] tokenImage = new char [chs.length()];
            System.arraycopy(chs.getBuffer(), chs.getStart(), tokenImage, 0, chs.length());
            tokenCache.add(new MutableCharArray(tokenImage));
            return tokenImage;
        }
    }
}