/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.nlp.keyword;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import smile.nlp.NGram;
import smile.nlp.Trie;
import smile.nlp.collocation.AprioriPhraseExtractor;
import smile.nlp.stemmer.PorterStemmer;
import smile.nlp.tokenizer.SimpleParagraphSplitter;
import smile.nlp.tokenizer.SimpleSentenceSplitter;
import smile.nlp.tokenizer.SimpleTokenizer;
import smile.sort.QuickSort;

/**
 * Keyword extraction from a single document using word co-occurrence statistical
 * information. The algorithm was proposed by Y. Matsuo and M. Ishizuka. It
 * consists of six steps:
 * <ol>
 * <li> Stem words by the Porter algorithm and extract phrases based on the
 * APRIORI algorithm (up to 4 words, with a frequency of more than 3 times).
 * Discard stop words.
 * <li> Select the top frequent terms, up to 30% of the running terms.
 * <li> Cluster frequent terms. Two terms are in the same cluster if either
 * their Jensen-Shannon divergence or their mutual information is above the
 * threshold (0.95 * log 2 and log 2, respectively).
 * <li> Calculate the expected co-occurrence probability of each cluster.
 * <li> Calculate the refined χ² values, removing the contribution of the maximal term.
 * <li> Output the given number of terms with the largest refined χ² values.
 * </ol>
 *
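 * A hypothetical usage sketch ({@code text} stands for any input document):
 * <pre>{@code
 *     CooccurrenceKeywordExtractor extractor = new CooccurrenceKeywordExtractor();
 *     ArrayList<NGram> keywords = extractor.extract(text, 10);
 * }</pre>
 *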
* @author Haifeng Li
*/
public class CooccurrenceKeywordExtractor {
/**
* Returns the top 10 keywords.
* @param text A single document.
* @return The top 10 keywords.
*/
    public ArrayList<NGram> extract(String text) {
return extract(text, 10);
}
    /**
     * Returns a given number of top keywords.
     * @param text A single document.
     * @param maxNumKeywords The maximum number of keywords to return.
     * @return The top keywords.
     */
    public ArrayList<NGram> extract(String text, int maxNumKeywords) {
        ArrayList<String[]> sentences = new ArrayList<>();
SimpleTokenizer tokenizer = new SimpleTokenizer();
PorterStemmer stemmer = new PorterStemmer();
// Split text into sentences. Stem words by Porter algorithm.
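        // ntotal counts the word tokens in the whole document. It is used later
        // to normalize the expected co-occurrence probabilities.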
int ntotal = 0;
for (String paragraph : SimpleParagraphSplitter.getInstance().split(text)) {
for (String s : SimpleSentenceSplitter.getInstance().split(paragraph)) {
String[] sentence = tokenizer.split(s);
for (int i = 0; i < sentence.length; i++) {
sentence[i] = stemmer.stripPluralParticiple(sentence[i]).toLowerCase();
}
sentences.add(sentence);
ntotal += sentence.length;
}
}
// Extract phrases by Apriori-like algorithm.
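        // Keep phrases of up to 4 words occurring more than 3 times, per the paper.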
int maxNGramSize = 4;
        ArrayList<NGram> terms = new ArrayList<>();
AprioriPhraseExtractor phraseExtractor = new AprioriPhraseExtractor();
        for (ArrayList<NGram> ngrams : phraseExtractor.extract(sentences, maxNGramSize, 4)) {
for (NGram ngram : ngrams) {
terms.add(ngram);
}
}
Collections.sort(terms);
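        // NGram's natural ordering is by frequency, so after sorting the most
        // frequent terms sit at the end of the list.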
        // Select up to the 30% most frequent terms.
int n = 3 * terms.size() / 10;
NGram[] freqTerms = new NGram[n];
for (int i = 0, start = terms.size() - n; i < n; i++) {
freqTerms[i] = terms.get(start + i);
}
// Trie for phrase matching.
        Trie<String, Integer> trie = new Trie<>();
for (int i = 0; i < n; i++) {
trie.put(freqTerms[i].words, i);
}
// Build co-occurrence table
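        // nw[i] accumulates, over every sentence containing frequent term i, the
        // number of distinct frequent terms in that sentence; table[i][j] counts
        // how often terms i and j appear in the same sentence.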
int[] nw = new int[n];
int[][] table = new int[n][n];
for (String[] sentence : sentences) {
            Set<Integer> phrases = new HashSet<>();
for (int j = 1; j <= maxNGramSize; j++) {
for (int i = 0; i <= sentence.length - j; i++) {
String[] phrase = Arrays.copyOfRange(sentence, i, i+j);
Integer index = trie.get(phrase);
if (index != null) {
phrases.add(index);
}
}
}
for (int i : phrases) {
nw[i] += phrases.size();
for (int j : phrases) {
if (i != j) {
table[i][j]++;
}
}
}
}
// Clustering frequent terms.
int[] cluster = new int[n];
for (int i = 0; i < cluster.length; i++) {
cluster[i] = i;
}
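        // Initially every term is its own cluster; a term j is merged into the
        // cluster of an earlier term i when their co-occurrence is strong enough.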
//double log2 = Math.log(2.0);
for (int i = 0; i < n; i++) {
for (int j = i + 1; j < n; j++) {
// Mutual information
if (table[i][j] > 0) {
// This doesn't work as ntotal is usually large and thus the mutual information
// is way larger than the threshold log2 given in the paper.
//double mutual = Math.log((double) ntotal * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq));
                    // Instead, use the (squared) geometric mean of the co-occurrence
                    // probabilities. It works well in practice for clustering terms
                    // like "digital computer" and "computer".
double mutual = (double) table[i][j] * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq);
if (mutual >= 0.25) {
cluster[j] = cluster[i];
} /*else {
                        double js = 0.0; // Jensen-Shannon divergence
for (int k = 0; k < n; k++) {
double p1 = (double) table[i][k] / freqTerms[i].freq;
double p2 = (double) table[j][k] / freqTerms[j].freq;
                            // The formula in the paper is not correct as p is not a real probability.
if (p1 > 0 && p2 > 0) {
js += -(p1+p2) * Math.log((p1+p2)/2.0) + p1 * Math.log(p1) + p2 * Math.log(p2);
}
}
js /= 2.0;
if (js > log2) {
cluster[j] = cluster[i];
}
}*/
}
}
}
// Calculate expected probability
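        // pc[c] sums the co-occurrence counts of all terms assigned to cluster c
        // and divides by the total token count, giving the expected probability
        // that a term co-occurs with cluster c.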
double[] pc = new double[n];
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
pc[cluster[j]] += table[i][j];
}
}
for (int i = 0; i < n; i++) {
pc[i] /= ntotal;
}
// Calculate chi-square scores.
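        // For each term w, chi2(w) = sum over clusters c of
        // (freq(w, c) - nw * pc)^2 / (nw * pc), where freq(w, c) is the observed
        // co-occurrence of w with cluster c and nw * pc is the expected count.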
double[] score = new double[n];
for (int i = 0; i < n; i++) {
double max = Double.NEGATIVE_INFINITY;
for (int j = 0; j < n; j++) {
if (cluster[j] != j) {
continue;
}
double fwc = 0.0;
for (int k = 0; k < n; k++) {
if (cluster[k] == j)
fwc += table[i][k];
}
double expected = nw[i] * pc[j];
double d = (fwc - expected);
double chisq = d * d / expected;
score[i] += chisq;
if (chisq > max) max = chisq;
}
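            // The paper's refinement subtracts the largest single cluster
            // contribution to penalize terms dominated by one association;
            // that adjustment is left disabled below.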
//score[i] -= max;
}
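        // QuickSort.sort sorts the scores in ascending order in place and returns
        // the permutation of original indices, so the highest scores come last.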
int[] index = QuickSort.sort(score);
        ArrayList<NGram> keywords = new ArrayList<>();
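        // Walk terms from the highest score downward. A term is dropped if a
        // higher-scoring term in the same cluster is at least as long; otherwise
        // the shorter cluster-mate already in the list is replaced.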
for (int i = n; i-- > 0; ) {
boolean add = true;
// filter out components of phrases, e.g. "digital" in "digital computer".
for (int j = i+1; j < n; j++) {
if (cluster[index[j]] == cluster[index[i]]) {
if (freqTerms[index[j]].words.length >= freqTerms[index[i]].words.length) {
add = false;
break;
} else {
keywords.remove(freqTerms[index[j]]);
add = true;
}
}
}
if (add) {
keywords.add(freqTerms[index[i]]);
if (keywords.size() >= maxNumKeywords) break;
}
}
return keywords;
}
}