// org.carrot2.text.preprocessing.PreprocessingContext Maven / Gradle / Ivy
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;
import org.carrot2.core.Document;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.LanguageModel;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.text.util.TabularOutput;
import com.carrotsearch.hppc.*;
/**
* Document preprocessing context provides low-level (usually integer-coded) data
* structures useful for further processing.
*
*
*/
public final class PreprocessingContext
{
/**
* Uninitialized structure constant: the text returned by the {@code toString()} methods
* of the holder classes below when their arrays have not been filled in yet.
*/
private static final String UNINITIALIZED = "[uninitialized]\n";
/** Query used to perform processing, may be {@code null}. */
public final String query;
/**
* A list of documents to process.
* NOTE(review): declared as a raw {@code List}; the elements are presumably
* {@link Document} instances (that class is imported by this file) -- confirm.
*/
public final List documents;
/** Language model to be used. */
public final LanguageModel language;
/**
* Token interning cache. Token images are interned to save memory and allow reference
* comparisons. The reference is set to {@code null} by {@link #preprocessingFinished()}.
* NOTE(review): declared as a raw {@code ObjectHashSet}; {@link #intern(MutableCharArray)}
* only ever adds {@link MutableCharArray} instances to it.
*/
private ObjectHashSet tokenCache = new ObjectHashSet<>();
/**
 * Creates a preprocessing context for the provided {@code documents} and with
 * the provided {@code languageModel}.
 *
 * @param languageModel language model, stored in {@link #language}
 * @param documents list of documents to process, stored in {@link #documents}
 * @param query query used to perform processing, stored in {@link #query};
 *        may be {@code null}
 */
public PreprocessingContext(LanguageModel languageModel, List documents,
    String query)
{
    this.language = languageModel;
    this.documents = documents;
    this.query = query;
}
/**
 * Information about all tokens of the input {@link PreprocessingContext#documents}.
 * Each element of each of the arrays corresponds to one individual token from the
 * input or a synthetic separator inserted between documents, fields and sentences.
 * Last element of this array is a special terminator entry.
 *
 * All arrays in this class have the same length and values across different arrays
 * correspond to each other for the same index.
 */
public class AllTokens
{
    /**
     * Token image as it appears in the input. On positions where {@link #type} is
     * equal to one of {@link ITokenizer#TF_TERMINATOR},
     * {@link ITokenizer#TF_SEPARATOR_DOCUMENT} or
     * {@link ITokenizer#TF_SEPARATOR_FIELD}, image is {@code null}.
     *
     * This array is produced by {@link Tokenizer}.
     */
    public char [][] image;

    /**
     * Token's {@link ITokenizer} bit flags.
     *
     * This array is produced by {@link Tokenizer}.
     */
    public short [] type;

    /**
     * Document field the token came from. The index points to arrays in
     * {@link AllFields}, equal to {@code -1} for document and field separators.
     *
     * This array is produced by {@link Tokenizer}.
     */
    public byte [] fieldIndex;

    /**
     * Index of the document this token came from, points to elements of
     * {@link PreprocessingContext#documents}. Equal to {@code -1} for document
     * separators.
     *
     * This array is produced by {@link Tokenizer}.
     *
     * This array is accessed in {@link CaseNormalizer} and {@link PhraseExtractor}
     * to compute by-document statistics, e.g. tf-by document, which are then needed
     * to build a VSM or assign documents to labels. An alternative to this representation
     * would be creating an {@code AllDocuments} holder and keep there an array
     * of start token indexes for each document and then refactor the model building code
     * to do a binary search to determine the document index given token index. This is
     * likely to be a significant performance hit because model building code accesses
     * the documentIndex array pretty much randomly (in the suffix order), so we'd be
     * doing twice-the-number-of-tokens binary searches. Unless there's some other
     * data structure that can help us here.
     */
    public int [] documentIndex;

    /**
     * A pointer to {@link AllWords} arrays for this token. Equal to {@code -1}
     * for document, field and {@link ITokenizer#TT_PUNCTUATION} tokens (including
     * sentence separators).
     *
     * This array is produced by {@link CaseNormalizer}.
     */
    public int [] wordIndex;

    /**
     * The suffix order of tokens. Suffixes starting with a separator come at the end
     * of the array.
     *
     * This array is produced by {@link PhraseExtractor}.
     */
    public int [] suffixOrder;

    /**
     * The Longest Common Prefix for the adjacent suffix-sorted token sequences.
     *
     * This array is produced by {@link PhraseExtractor}.
     */
    public int [] lcp;

    /**
     * For debugging purposes. Renders the token table and, when {@link #suffixOrder}
     * is available, a second table with the suffix order.
     *
     * @return {@code "[uninitialized]\n"} when {@link #image} is {@code null},
     *         the rendered table(s) otherwise
     */
    @Override
    public String toString()
    {
        if (image == null)
        {
            return UNINITIALIZED;
        }

        StringWriter sw = new StringWriter();
        TabularOutput t = new TabularOutput(sw);
        t.flushEvery(Integer.MAX_VALUE);
        t.addColumn("#");
        t.addColumn("token").alignLeft();
        t.addColumn("type");
        t.addColumn("fieldIndex");
        t.addColumn("=>field").alignLeft();
        t.addColumn("docIdx");
        t.addColumn("wordIdx");
        t.addColumn("=>word").alignLeft();

        // wordIndex is filled in by a later stage than image, so it may still be null
        // here; the word columns are rendered as null in that case instead of failing.
        final boolean hasWordIndex = wordIndex != null;
        for (int i = 0; i < image.length; i++, t.nextRow())
        {
            final int word = hasWordIndex ? wordIndex[i] : -1;
            t.rowData(
                i,
                image[i] == null ? "" : new String(image[i]),
                type[i],
                fieldIndex[i],
                fieldIndex[i] >= 0 ? allFields.name[fieldIndex[i]] : null,
                documentIndex[i],
                hasWordIndex ? Integer.valueOf(word) : null,
                word >= 0 ? new String(allWords.image[word]) : null);
        }

        if (suffixOrder != null)
        {
            // The token table is configured to flush only on demand (flushEvery above),
            // so presumably nothing has been written yet. Flush it before the reference
            // is replaced, otherwise its rows never reach the writer.
            t.flush();

            t = new TabularOutput(sw);
            t.addColumn("#");
            t.addColumn("sa");
            t.addColumn("lcp");
            t.addColumn("=>words").alignLeft();
            sw.append("\n");

            // Number of tokens of each suffix shown in the "=>words" column.
            final int windowLength = 5;
            final StringBuilder suffixImage = new StringBuilder();
            for (int i = 0; i < suffixOrder.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    suffixOrder[i],
                    lcp[i]);

                for (int j = suffixOrder[i], max = Math.min(suffixOrder[i] + windowLength, wordIndex.length); j < max;)
                {
                    suffixImage.append(
                        wordIndex[j] >= 0 ? new String(allWords.image[wordIndex[j]]) : "|").append(" ");
                    // Mark the suffix as truncated unless the window reached the last token.
                    if (++j == max && j != wordIndex.length)
                    {
                        suffixImage.append(" [...]");
                    }
                }
                t.rowData(suffixImage.toString());
                suffixImage.setLength(0);
            }
            sw.append("\n");
        }

        t.flush();
        sw.append("\n");
        return sw.toString();
    }
}
/**
* Information about all tokens of the input {@link PreprocessingContext#documents}.
* The arrays of this holder are declared without initializers, so they are
* {@code null} until they are assigned.
*/
public final AllTokens allTokens = new AllTokens();
/**
 * Information about all fields processed for the input
 * {@link PreprocessingContext#documents}.
 */
public static class AllFields
{
    /**
     * Name of the document field. Entries of {@link AllTokens#fieldIndex} point to
     * this array.
     *
     * This array is produced by {@link Tokenizer}.
     */
    public String [] name;

    /**
     * For debugging purposes. Renders one row per field: its index and its name.
     *
     * @return {@code "[uninitialized]\n"} when {@link #name} is {@code null},
     *         the rendered table otherwise
     */
    @Override
    public String toString()
    {
        final String [] names = name;
        if (names == null)
        {
            return UNINITIALIZED;
        }

        final StringWriter buffer = new StringWriter();
        final TabularOutput table = new TabularOutput(buffer);
        table.flushEvery(Integer.MAX_VALUE);
        table.addColumn("#");
        table.addColumn("name").format("%-10s").alignLeft();

        for (int row = 0; row < names.length; row++)
        {
            table.rowData(row, names[row]).nextRow();
        }

        table.flush();
        buffer.append("\n");
        return buffer.toString();
    }
}
/**
* Information about all fields processed for the input
* {@link PreprocessingContext#documents}. {@link AllFields#name} is {@code null}
* until it is assigned.
*/
public final AllFields allFields = new AllFields();
/**
 * Information about all unique words found in the input
 * {@link PreprocessingContext#documents}. An entry in each parallel array corresponds to one
 * conflated form of a word. For example, <em>data</em> and <em>DATA</em> will most likely
 * become a single entry in the words table. However, different grammatical forms of a single
 * lemma (like <em>computer</em> and <em>computers</em>) will have different entries in the
 * words table. See {@link AllStems} for inflection-conflated versions.
 *
 * All arrays in this class have the same length and values across different arrays
 * correspond to each other for the same index.
 */
public class AllWords
{
    /**
     * The most frequently appearing variant of the word with respect to case. E.g. if
     * a token <em>MacOS</em> appeared 12 times in the input and <em>macos</em>
     * appeared 3 times, the image will be equal to <em>MacOS</em>.
     *
     * This array is produced by {@link CaseNormalizer}.
     */
    public char [][] image;

    /**
     * Token type of this word copied from {@link AllTokens#type}. Additional
     * flags are set for each word by
     * {@link CaseNormalizer} and {@link LanguageModelStemmer}.
     *
     * This array is produced by {@link CaseNormalizer}.
     * This array is modified by {@link LanguageModelStemmer}.
     *
     * @see ITokenizer
     */
    public short [] type;

    /**
     * Term Frequency of the word, aggregated across all variants with respect to
     * case. Frequencies for each variant separately are not available.
     *
     * This array is produced by {@link CaseNormalizer}.
     */
    public int [] tf;

    /**
     * Term Frequency of the word for each document. The length of this array is equal
     * to the number of documents this word appeared in (Document Frequency)
     * multiplied by 2. Elements at even indices contain document indices pointing to
     * {@link PreprocessingContext#documents}, elements at odd indices contain the
     * frequency of the word in the document. For example, an array with 4 values:
     * {@code [2, 15, 138, 7]} means that the word appeared 15 times in document
     * at index 2 and 7 times in document at index 138.
     *
     * This array is produced by {@link CaseNormalizer}. The order of documents in this
     * array is not defined.
     */
    public int [][] tfByDocument;

    /**
     * A pointer to the {@link AllStems} arrays for this word.
     *
     * This array is produced by {@link LanguageModelStemmer}.
     */
    public int [] stemIndex;

    /**
     * Bit-packed indices of all fields in which this word appears at least once.
     * Indexes (positions) of selected bits are pointers to the
     * {@link AllFields} arrays. Fast conversion between the bit-packed representation
     * and an array with index values is done by
     * {@link PreprocessingContext#toFieldIndexes(byte)}.
     *
     * This array is produced by {@link CaseNormalizer}.
     */
    public byte [] fieldIndices;

    /**
     * For debugging purposes. Renders one row per word; the stem columns are added
     * only when {@link #stemIndex} has been assigned.
     *
     * @return {@code "[uninitialized]\n"} when {@link #image} is {@code null},
     *         the rendered table otherwise
     */
    @Override
    public String toString()
    {
        if (image == null)
        {
            return UNINITIALIZED;
        }

        final StringWriter buffer = new StringWriter();
        final TabularOutput table = new TabularOutput(buffer);
        table.flushEvery(Integer.MAX_VALUE);
        table.addColumn("#");
        table.addColumn("image").alignLeft();
        table.addColumn("type");
        table.addColumn("tf");
        table.addColumn("tfByDocument").alignLeft();
        table.addColumn("fieldIndices");

        final boolean stemmed = stemIndex != null;
        if (stemmed)
        {
            table.addColumn("stemIndex");
            table.addColumn("=>stem").alignLeft();
        }

        for (int row = 0; row < image.length; row++)
        {
            final String wordImage = image[row] == null ? "" : new String(image[row]);
            table.rowData(
                row,
                wordImage,
                type[row],
                tf[row],
                SparseArray.sparseToString(tfByDocument[row]));

            final int [] fields = toFieldIndexes(fieldIndices[row]);
            table.rowData(Arrays.toString(fields).replace(" ", ""));

            if (stemmed)
            {
                final int stem = stemIndex[row];
                table.rowData(stem);
                table.rowData(new String(allStems.image[stem]));
            }

            table.nextRow();
        }

        table.flush();
        buffer.append("\n");
        return buffer.toString();
    }
}
/**
* Information about all unique words found in the input
* {@link PreprocessingContext#documents}. The arrays of this holder are {@code null}
* until they are assigned.
*/
public final AllWords allWords = new AllWords();
/**
 * Information about all unique stems found in the input
 * {@link PreprocessingContext#documents}. Each entry in each array corresponds to one
 * base form different words can be transformed to by the {@link IStemmer} used while
 * processing. E.g. the English <em>mining</em> and <em>mine</em> will be aggregated
 * to one entry in the arrays, while they will have separate entries in
 * {@link AllWords}.
 *
 * All arrays in this class have the same length and values across different arrays
 * correspond to each other for the same index.
 */
public class AllStems
{
    /**
     * Stem image as produced by the {@link IStemmer}, may not correspond to any
     * correct word.
     *
     * This array is produced by {@link LanguageModelStemmer}.
     */
    public char [][] image;

    /**
     * Pointer to the {@link AllWords} arrays, to the most frequent original form of
     * the stem. Pointers to the less frequent variants are not available.
     *
     * This array is produced by {@link LanguageModelStemmer}.
     */
    public int [] mostFrequentOriginalWordIndex;

    /**
     * Term frequency of the stem, i.e. the sum of all {@link AllWords#tf} values
     * for which the {@link AllWords#stemIndex} points to this stem.
     *
     * This array is produced by {@link LanguageModelStemmer}.
     */
    public int [] tf;

    /**
     * Term frequency of the stem for each document. For the encoding of this array,
     * see {@link AllWords#tfByDocument}.
     *
     * This array is produced by {@link LanguageModelStemmer}. The order of documents in this
     * array is not defined.
     */
    public int [][] tfByDocument;

    /**
     * Bit-packed indices of all fields in which this entry appears at least once.
     * Indexes (positions) of selected bits are pointers to the
     * {@link AllFields} arrays. Fast conversion between the bit-packed representation
     * and an array with index values is done by
     * {@link PreprocessingContext#toFieldIndexes(byte)}.
     *
     * This array is produced by {@link LanguageModelStemmer}.
     */
    public byte [] fieldIndices;

    /**
     * For debugging purposes. Renders one row per stem, including the image of its
     * most frequent original word looked up in {@link PreprocessingContext#allWords}.
     *
     * @return {@code "[uninitialized]\n"} when {@link #image} is {@code null},
     *         the rendered table otherwise
     */
    @Override
    public String toString()
    {
        if (image == null)
        {
            return UNINITIALIZED;
        }

        final StringWriter buffer = new StringWriter();
        final TabularOutput table = new TabularOutput(buffer);
        table.flushEvery(Integer.MAX_VALUE);
        table.addColumn("#");
        table.addColumn("stem");
        table.addColumn("mostFrqWord");
        table.addColumn("=>mostFrqWord").alignLeft();
        table.addColumn("tf");
        table.addColumn("tfByDocument").alignLeft();
        table.addColumn("fieldIndices");

        for (int row = 0; row < image.length; row++)
        {
            final String stemImage = image[row] == null ? "" : new String(image[row]);
            final int wordPointer = mostFrequentOriginalWordIndex[row];
            final String wordImage = new String(allWords.image[wordPointer]);

            table.rowData(
                row,
                stemImage,
                wordPointer,
                wordImage,
                tf[row],
                SparseArray.sparseToString(tfByDocument[row]),
                Arrays.toString(toFieldIndexes(fieldIndices[row])).replace(" ", ""));
            table.nextRow();
        }

        table.flush();
        buffer.append("\n");
        return buffer.toString();
    }
}
/**
* Information about all unique stems found in the input
* {@link PreprocessingContext#documents}. The arrays of this holder are {@code null}
* until they are assigned.
*/
public final AllStems allStems = new AllStems();
/**
 * Information about all frequently appearing sequences of words found in the input
 * {@link PreprocessingContext#documents}. Each entry in each array corresponds to one
 * sequence.
 *
 * All arrays in this class have the same length and values across different arrays
 * correspond to each other for the same index.
 */
public class AllPhrases
{
    /**
     * Pointers to {@link AllWords} for each word in the phrase sequence.
     *
     * This array is produced by {@link PhraseExtractor}.
     */
    public int [][] wordIndices;

    /**
     * Term frequency of the phrase.
     *
     * This array is produced by {@link PhraseExtractor}.
     */
    public int [] tf;

    /**
     * Term frequency of the phrase for each document. The encoding of this
     * array is similar to {@link AllWords#tfByDocument}: consecutive pairs of:
     * document index, frequency.
     *
     * This array is produced by {@link PhraseExtractor}. The order of documents in this
     * array is not defined.
     */
    public int [][] tfByDocument;

    /**
     * For debugging purposes. Renders one row per phrase: its word pointers, the
     * phrase text, its term frequency and its per-document frequencies.
     *
     * @return {@code "[uninitialized]\n"} when {@link #wordIndices} is {@code null},
     *         the rendered table otherwise
     */
    @Override
    public String toString()
    {
        if (wordIndices == null)
        {
            return UNINITIALIZED;
        }

        final StringWriter buffer = new StringWriter();
        final TabularOutput table = new TabularOutput(buffer);
        table.flushEvery(Integer.MAX_VALUE);
        table.addColumn("#");
        table.addColumn("wordIndices");
        table.addColumn("=>words").alignLeft();
        table.addColumn("tf");
        table.addColumn("tfByDocument").alignLeft();

        for (int row = 0; row < wordIndices.length; row++)
        {
            final String pointers = Arrays.toString(wordIndices[row]).replace(" ", "");
            final CharSequence words = getPhrase(row);

            table.rowData(
                row,
                pointers,
                words,
                tf[row],
                SparseArray.sparseToString(tfByDocument[row]));
            table.nextRow();
        }

        table.flush();
        buffer.append("\n");
        return buffer.toString();
    }

    /**
     * Returns space-separated words that constitute this phrase.
     *
     * @param index index of the phrase in {@link #wordIndices}
     * @return images of the phrase's words taken from
     *         {@link PreprocessingContext#allWords}, joined with single spaces
     */
    public CharSequence getPhrase(int index)
    {
        final int [] words = wordIndices[index];
        final StringBuilder phrase = new StringBuilder();
        for (int position = 0; position < words.length; position++)
        {
            if (position != 0)
            {
                phrase.append(" ");
            }
            phrase.append(allWords.image[words[position]]);
        }
        return phrase;
    }

    /**
     * Returns length of all arrays in this {@link AllPhrases}.
     *
     * @return the length of {@link #wordIndices}
     */
    public int size()
    {
        return wordIndices.length;
    }
}
/**
* Information about all frequently appearing sequences of words found in the input
* {@link PreprocessingContext#documents}. The arrays of this holder are {@code null}
* until they are assigned.
* NOTE(review): unlike the other holder fields of this class, this one is not
* declared {@code final} -- confirm whether reassignment is intended.
*/
public AllPhrases allPhrases = new AllPhrases();
/**
 * Information about words and phrases that might be good cluster label candidates.
 * Each entry in each array corresponds to one label candidate.
 *
 * All arrays in this class have the same length and values across different arrays
 * correspond to each other for the same index.
 */
public class AllLabels
{
    /**
     * Feature index of the label candidate. Features whose values are less than the
     * size of {@link AllWords} arrays are single word features and point to entries
     * in {@link AllWords}. Features whose values are larger or equal to the size of
     * {@link AllWords}, after subtracting the size of {@link AllWords}, point to
     * {@link AllPhrases}.
     *
     * This array is produced by {@link LabelFilterProcessor}.
     */
    public int [] featureIndex;

    /**
     * Indices of documents assigned to the label candidate.
     *
     * This array is produced by {@link DocumentAssigner}.
     */
    public BitSet [] documentIndices;

    /**
     * The first index in {@link #featureIndex} which
     * points to {@link AllPhrases}, or {@code -1} if there are no phrases
     * in {@link #featureIndex}.
     *
     * This value is set by {@link LabelFilterProcessor}.
     *
     * @see #featureIndex
     */
    public int firstPhraseIndex;

    /**
     * For debugging purposes. Renders one row per label candidate: its feature
     * index, its text and, when {@link #documentIndices} is available, the assigned
     * documents.
     *
     * @return {@code "[uninitialized]\n"} when {@link #featureIndex} is
     *         {@code null}, the rendered table otherwise
     */
    @Override
    public String toString()
    {
        if (featureIndex == null)
        {
            return UNINITIALIZED;
        }

        StringWriter sw = new StringWriter();
        TabularOutput t = new TabularOutput(sw);
        t.flushEvery(Integer.MAX_VALUE);
        t.addColumn("#");
        t.addColumn("featureIdx");
        t.addColumn("=>feature").alignLeft();
        t.addColumn("documentIdx").alignLeft();

        for (int i = 0; i < featureIndex.length; i++, t.nextRow())
        {
            t.rowData(
                i,
                featureIndex[i],
                getLabel(i),
                documentIndices != null ? documentIndices[i].toString().replace(" ", "") : "");
        }

        t.flush();
        sw.append("\n");
        // The rendered text lives in the writer, not in the table object; the other
        // holders in this file return the writer's content in the same way.
        return sw.toString();
    }

    /**
     * Returns the text of the label candidate at {@code index}: a word image when
     * the feature index is below the number of words, the phrase text otherwise.
     */
    private CharSequence getLabel(int index)
    {
        final int wordsSize = allWords.image.length;
        final int feature = featureIndex[index];
        if (feature < wordsSize)
        {
            return new String(allWords.image[feature]);
        }
        return allPhrases.getPhrase(feature - wordsSize);
    }
}
/**
* Information about words and phrases that might be good cluster label candidates.
* {@link AllLabels#featureIndex} is {@code null} until it is assigned; see
* {@link #hasLabels()}.
*/
public final AllLabels allLabels = new AllLabels();
/**
 * Returns {@code true} if this context contains any words.
 *
 * @return {@code true} when {@link AllWords#image} has been assigned and is not
 *         empty; {@code false} when it is still {@code null} or has no entries
 */
public boolean hasWords()
{
    // The image array stays null until it is produced, so guard against it the same
    // way hasLabels() guards featureIndex instead of throwing a NullPointerException.
    return allWords.image != null && allWords.image.length > 0;
}
/**
 * Returns {@code true} if this context contains any label candidates.
 *
 * @return {@code true} when {@link AllLabels#featureIndex} is non-{@code null}
 *         and has at least one element
 */
public boolean hasLabels()
{
    final int [] features = allLabels.featureIndex;
    if (features == null)
    {
        return false;
    }
    return features.length > 0;
}
/**
 * Renders a debugging dump of this context: a header line with the hash code
 * followed by the fields, tokens, words, stems, phrases and labels sections, each
 * produced by the corresponding holder's {@code toString()}.
 */
@Override
public String toString()
{
    final StringBuilder dump = new StringBuilder();
    dump.append("PreprocessingContext 0x")
        .append(Integer.toHexString(this.hashCode()))
        .append("\n");
    dump.append("== Fields:\n").append(this.allFields.toString());
    dump.append("== Tokens:\n").append(this.allTokens.toString());
    dump.append("== Words:\n").append(this.allWords.toString());
    dump.append("== Stems:\n").append(this.allStems.toString());
    dump.append("== Phrases:\n").append(this.allPhrases.toString());
    dump.append("== Labels:\n").append(this.allLabels.toString());
    return dump.toString();
}
/**
 * Static conversion between selected bits and an array of indexes of these bits.
 * Entry {@code v} (0..255) holds the positions of the bits set in {@code v},
 * lowest bit first.
 */
private static final int [][] bitsCache;
static
{
    final int [][] table = new int [0x100][];
    for (int value = 0; value < table.length; value++)
    {
        final int [] positions = new int [Integer.bitCount(value)];
        int filled = 0;
        for (int bit = 0; bit < Byte.SIZE; bit++)
        {
            if (((value >>> bit) & 0x1) == 1)
            {
                positions[filled++] = bit;
            }
        }
        table[value] = positions;
    }
    bitsCache = table;
}

/**
 * Convert the selected bits in a byte to an array of indexes.
 *
 * @param b bit-packed value; it is treated as unsigned (0..255)
 * @return the precomputed array of positions of the bits set in {@code b}, in
 *         ascending order. The same array instance is returned for equal inputs,
 *         so callers should not modify it.
 */
public static int [] toFieldIndexes(byte b)
{
    final int unsigned = b & 0xff;
    return bitsCache[unsigned];
}
/*
* These should really be package-private, shouldn't they? We'd need to move the classes
* under pipeline.* here for accessibility.
*/
/**
 * This method should be invoked after all preprocessing contributors have been executed
 * to release temporary data structures. It drops the reference to the token interning
 * cache, so {@link #intern(MutableCharArray)} must not be called afterwards.
 */
public void preprocessingFinished()
{
    tokenCache = null;
}
/**
 * Return a unique char buffer representing a given character sequence.
 *
 * If the token cache already holds an entry matching {@code chs}, the buffer of that
 * entry is returned. Otherwise the {@code chs.length()} characters starting at
 * {@code chs.getStart()} are copied into a new array, which is added to the cache
 * and returned.
 *
 * @param chs the character sequence to intern
 * @return the interned character buffer
 * @throws NullPointerException if called after {@link #preprocessingFinished()},
 *         which sets the token cache to {@code null}
 */
public char [] intern(MutableCharArray chs)
{
    final int slot = tokenCache.indexOf(chs);
    if (tokenCache.indexExists(slot))
    {
        // tokenCache is declared with a raw type, so indexGet() is typed as Object
        // here. Only MutableCharArray instances are ever added (see below), hence
        // the explicit cast.
        return ((MutableCharArray) tokenCache.indexGet(slot)).getBuffer();
    }

    final char [] tokenImage = new char [chs.length()];
    System.arraycopy(chs.getBuffer(), chs.getStart(), tokenImage, 0, chs.length());
    tokenCache.add(new MutableCharArray(tokenImage));
    return tokenImage;
}
}