org.carrot2.text.preprocessing.PhraseExtractor Maven / Gradle / Ivy
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import java.util.Collections;
import java.util.List;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.preprocessing.PreprocessingContext.AllPhrases;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.util.IntMapUtils;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.cursors.IntIntCursor;
import org.carrot2.shaded.guava.common.collect.Lists;
/**
* Extracts frequent phrases from the provided document. A frequent phrase is a sequence
* of words that appears in the documents more than once. This phrase extractor aggregates
* different inflection variants of phrase words into one phrase, returning the most
* frequent variant. For example, if phrase computing science appears 2 times and
* computer sciences appears 4 times, the latter will be returned with aggregated
* frequency of 6.
*
* This class saves the following results to the {@link PreprocessingContext}:
*
* - {@link AllPhrases#wordIndices}
* - {@link AllPhrases#tf}
* - {@link AllPhrases#tfByDocument}
* - {@link AllTokens#suffixOrder}
* - {@link AllTokens#lcp}
*
*
* This class requires that {@link Tokenizer}, {@link CaseNormalizer} and
* {@link LanguageModelStemmer} be invoked first.
*/
@Bindable(prefix = "PhraseExtractor")
public class PhraseExtractor
{
/** Internal minimum phrase length, we may want to make it an attribute at some point */
private static final int MIN_PHRASE_LENGTH = 2;
/** Internal maximum phrase length, we may want to make it an attribute at some point */
static final int MAX_PHRASE_LENGTH = 8;
/**
* Phrase Document Frequency threshold. Phrases appearing in fewer than
* dfThreshold
documents will be ignored.
*/
@Processing
@Input
@Attribute
@IntRange(min = 1, max = 100)
@Label("Phrase document frequency threshold")
@Level(AttributeLevel.ADVANCED)
@Group(DefaultGroups.PHRASE_EXTRACTION)
public int dfThreshold = 1;
/**
* Suffix sorter to be used by this phrase extractor. When the suffix sorter gets some
* attributes, we'll need to make this field public.
*/
private SuffixSorter suffixSorter = new SuffixSorter();
/**
* Performs phrase extraction and saves the results to the provided
* context
.
*/
public void extractPhrases(PreprocessingContext context)
{
// Perform suffix sorting first
suffixSorter.suffixSort(context);
final int [] suffixArray = context.allTokens.suffixOrder;
final int [] lcpArray = context.allTokens.lcp;
final int [] wordIndexesArray = context.allTokens.wordIndex;
final int [] documentIndexArray = context.allTokens.documentIndex;
final int [] stemIndexes = context.allWords.stemIndex;
// Find all subphrases
List rcs = discoverRcs(suffixArray, lcpArray, documentIndexArray);
List phraseWordIndexes = Lists.newArrayList();
IntArrayList phraseTf = new IntArrayList();
List phraseTfByDocumentList = Lists.newArrayList();
if (rcs.size() > 0)
{
// Determine most frequent originals and create the final phrase
// array. Also merge the phrase tf by document maps into flat
// arrays.
Collections.sort(rcs, new SubstringComparator(wordIndexesArray, stemIndexes));
int totalPhraseTf = rcs.get(0).frequency;
Substring mostFrequentOriginal = rcs.get(0);
IntIntHashMap phraseTfByDocument = new IntIntHashMap();
phraseTfByDocument.putAll(mostFrequentOriginal.tfByDocument);
// Don't change the rcs list type from ArrayList or we'll
// run into O(n^2) iteration cost :)
for (int i = 0; i < rcs.size() - 1; i++)
{
final Substring substring = rcs.get(i);
final Substring nextSubstring = rcs.get(i + 1);
if (substring
.isEquivalentTo(nextSubstring, wordIndexesArray, stemIndexes))
{
totalPhraseTf += nextSubstring.frequency;
addAllWithOffset(phraseTfByDocument, nextSubstring.tfByDocument, -1);
if (mostFrequentOriginal.frequency < nextSubstring.frequency)
{
mostFrequentOriginal = nextSubstring;
}
}
else
{
int [] wordIndexes = new int [(mostFrequentOriginal.to - mostFrequentOriginal.from)];
for (int j = 0; j < wordIndexes.length; j++)
{
wordIndexes[j] = wordIndexesArray[mostFrequentOriginal.from + j];
}
phraseWordIndexes.add(wordIndexes);
phraseTf.add(totalPhraseTf);
phraseTfByDocumentList.add(IntMapUtils.flatten(phraseTfByDocument));
totalPhraseTf = nextSubstring.frequency;
mostFrequentOriginal = nextSubstring;
phraseTfByDocument.clear();
phraseTfByDocument.putAll(nextSubstring.tfByDocument);
}
}
// Add the last substring
final Substring substring = rcs.get(rcs.size() - 1);
int [] wordIndexes = new int [(substring.to - substring.from)];
for (int j = 0; j < wordIndexes.length; j++)
{
wordIndexes[j] = wordIndexesArray[mostFrequentOriginal.from + j];
}
phraseWordIndexes.add(wordIndexes);
phraseTf.add(totalPhraseTf);
phraseTfByDocumentList.add(IntMapUtils.flatten(phraseTfByDocument));
}
// Store the results to allPhrases
context.allPhrases.wordIndices = phraseWordIndexes
.toArray(new int [phraseWordIndexes.size()] []);
context.allPhrases.tf = phraseTf.toArray();
context.allPhrases.tfByDocument = phraseTfByDocumentList
.toArray(new int [phraseTfByDocumentList.size()] []);
}
/**
* Discovers Right Complete Substrings in the given LCP Suffix Array.
*/
private List discoverRcs(int [] suffixArray, int [] lcpArray,
int [] documentIndexArray)
{
Substring [] rcsStack;
int sp;
int i;
rcsStack = new Substring [lcpArray.length];
sp = -1;
i = 1;
final List result = Lists.newArrayList();
while (i < lcpArray.length - 1)
{
final int currentSuffixIndex = suffixArray[i];
final int currentDocumentIndex = documentIndexArray[currentSuffixIndex];
final int currentLcp = Math.min(MAX_PHRASE_LENGTH, lcpArray[i]);
if (sp < 0)
{
if (currentLcp >= MIN_PHRASE_LENGTH)
{
// Push to the stack phrases of length 2..currentLcp. Only the
// topmost phrase will get its frequencies incremented, the other
// ones will "inherit" the counts when the topmost phrase is
// popped off the stack.
final int length = currentLcp;
for (int j = length - 2; j >= 0; j--)
{
sp++;
// Set initial tf = 2 for the topmost phrase. For the other phrases,
// set tf = 1. During popping of the topmost phrase, the phrase lying
// "below" on the stack, will get its tf increased by the tf of
// the phrase being popped, minus 1.
rcsStack[sp] = new Substring(i, currentSuffixIndex,
currentSuffixIndex + currentLcp - j, (j == 0 ? 2 : 1));
// By document tf. Again, topmost phrase gets tf = 2, the other
// ones get tf = 1. This time, we need to track from which document's
// tf we need to set off the "minus 1", hence the documentIndexToOffset field.
rcsStack[sp].tfByDocument = new IntIntHashMap();
rcsStack[sp].tfByDocument.put(
documentIndexArray[suffixArray[i - 1]], 1);
if (j == 0)
{
rcsStack[sp].tfByDocument.putOrAdd(currentDocumentIndex,
1, 1);
}
else
{
rcsStack[sp].documentIndexToOffset = documentIndexArray[suffixArray[i - 1]];
}
}
}
i++;
}
else
{
Substring r = rcsStack[sp];
if ((r.to - r.from) < currentLcp)
{
Substring r1 = rcsStack[sp];
// The phrase we're about to add is an extension of the topmost
// phrase on the stack. The new phrase will contribute to the
// topmost phrase's tf, so we need to track the document index
// from which we'd set off the "minus 1".
r1.documentIndexToOffset = documentIndexArray[suffixArray[i - 1]];
// Add the intermediate phrases too (which makes
// the algorithm no longer linear btw)
int length = currentLcp - (r1.to - r1.from);
for (int j = length - 1; j >= 0; j--)
{
if (currentLcp - j >= MIN_PHRASE_LENGTH)
{
sp++;
rcsStack[sp] = new Substring(i, currentSuffixIndex,
currentSuffixIndex + currentLcp - j, (j == 0 ? 2 : 1));
rcsStack[sp].tfByDocument = new IntIntHashMap();
rcsStack[sp].tfByDocument.put(
documentIndexArray[suffixArray[i - 1]], 1);
if (j == 0)
{
rcsStack[sp].tfByDocument.putOrAdd(currentDocumentIndex,
1, 1);
}
else
{
rcsStack[sp].documentIndexToOffset = documentIndexArray[suffixArray[i - 1]];
}
}
}
i++;
}
else
{
Substring r1 = rcsStack[sp];
if ((r1.to - r1.from) == currentLcp)
{
// Increase the frequency of the generalized phrase
rcsStack[sp].frequency += 1;
rcsStack[sp].tfByDocument.putOrAdd(currentDocumentIndex, 1, 1);
i++;
}
else
{
Substring s;
// Pop generalized phrases off the stack
do
{
if (rcsStack[sp].tfByDocument.size() >= dfThreshold)
{
// Add the generalized phrase to the result
result.add(rcsStack[sp]);
}
s = rcsStack[sp];
sp--;
// As we update only the frequency of the stack's
// topmost substring we need to propagate the
// accumulated frequencies to the shorter
// substrings
if (sp >= 0)
{
// The "minus 1" mentioned above.
rcsStack[sp].frequency += s.frequency - 1;
addAllWithOffset(rcsStack[sp].tfByDocument,
s.tfByDocument, rcsStack[sp].documentIndexToOffset);
}
}
while (sp >= 0
&& (rcsStack[sp].to - rcsStack[sp].from) > currentLcp);
}
}
}
}
return result;
}
private static void addAllWithOffset(IntIntHashMap dest, IntIntHashMap src,
int documentIndexToOffset)
{
for (IntIntCursor c : src)
{
final int key = c.key;
final int value = c.value + (key != documentIndexToOffset ? 0 : -1);
dest.putOrAdd(key, value, value);
}
}
}