/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.nlp.keyword;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import smile.nlp.NGram;
import smile.nlp.Trie;
import smile.nlp.collocation.AprioriPhraseExtractor;
import smile.nlp.stemmer.PorterStemmer;
import smile.nlp.tokenizer.SimpleParagraphSplitter;
import smile.nlp.tokenizer.SimpleSentenceSplitter;
import smile.nlp.tokenizer.SimpleTokenizer;
import smile.sort.QuickSort;

/**
 * Keyword extraction from a single document using word co-occurrence statistical information.
 * The algorithm was proposed by Y. Matsuo and M. Ishizuka. It consists of six steps:
 * 
 * <ol>
 * <li> Stem words by Porter algorithm and extract phrases based on the APRIORI
 *      algorithm (up to 4 words with frequency more than 3 times). Discard stop words.
 * <li> Select the top frequent terms up to 30% of running terms.
 * <li> Cluster frequent terms. Two terms are in the same cluster if either their
 *      Jensen-Shannon divergence or mutual information is above the threshold
 *      (0.95 * log 2 and log 2, respectively).
 * <li> Calculate the expected co-occurrence probability of each cluster.
 * <li> Calculate the refined χ² values that remove the maximal term.
 * <li> Output a given number of terms with the largest refined χ² values.
 * </ol>
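 * <p>
 * For step 5, the score of a term w against the clusters, as computed below, is
 * χ²(w) = Σ_c (freq(w, c) − n_w * p_c)² / (n_w * p_c), where freq(w, c) is the
 * co-occurrence count of w with the terms of cluster c, n_w is the total number
 * of terms co-occurring with w, and p_c is the expected probability of c.
 * <p>
 * A minimal usage sketch (here {@code text} is assumed to hold the document as
 * a plain string):
 * <pre>{@code
 * CooccurrenceKeywordExtractor extractor = new CooccurrenceKeywordExtractor();
 * ArrayList<NGram> top10 = extractor.extract(text);     // default: top 10 keywords
 * ArrayList<NGram> top25 = extractor.extract(text, 25); // explicit keyword count
 * }</pre>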
 *
 * @author Haifeng Li
 */
public class CooccurrenceKeywordExtractor {
    /**
     * Returns the top 10 keywords.
     * @param text A single document.
     * @return The top 10 keywords.
     */
    public ArrayList<NGram> extract(String text) {
        return extract(text, 10);
    }

    /**
     * Returns a given number of top keywords.
     * @param text A single document.
     * @param maxNumKeywords The maximum number of keywords to return.
     * @return The top keywords.
     */
    public ArrayList<NGram> extract(String text, int maxNumKeywords) {
        ArrayList<String[]> sentences = new ArrayList<>();
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        PorterStemmer stemmer = new PorterStemmer();

        // Split text into sentences. Stem words by Porter algorithm.
        int ntotal = 0;
        for (String paragraph : SimpleParagraphSplitter.getInstance().split(text)) {
            for (String s : SimpleSentenceSplitter.getInstance().split(paragraph)) {
                String[] sentence = tokenizer.split(s);
                for (int i = 0; i < sentence.length; i++) {
                    sentence[i] = stemmer.stripPluralParticiple(sentence[i]).toLowerCase();
                }

                sentences.add(sentence);
                ntotal += sentence.length;
            }
        }

        // Extract phrases by Apriori-like algorithm.
        int maxNGramSize = 4;
        ArrayList<NGram> terms = new ArrayList<>();
        AprioriPhraseExtractor phraseExtractor = new AprioriPhraseExtractor();
        for (ArrayList<NGram> ngrams : phraseExtractor.extract(sentences, maxNGramSize, 4)) {
            for (NGram ngram : ngrams) {
                terms.add(ngram);
            }
        }

        // Sort terms in ascending order of frequency so that the most
        // frequent terms end up at the end of the list.
        Collections.sort(terms);

        // Select up to 30% most frequent terms.
        int n = 3 * terms.size() / 10;
        NGram[] freqTerms = new NGram[n];
        for (int i = 0, start = terms.size() - n; i < n; i++) {
            freqTerms[i] = terms.get(start + i);
        }

        // Trie for phrase matching.
        Trie<String, Integer> trie = new Trie<>();
        for (int i = 0; i < n; i++) {
            trie.put(freqTerms[i].words, i);
        }

        // Build the co-occurrence table.
        int[] nw = new int[n];
        int[][] table = new int[n][n];
        for (String[] sentence : sentences) {
            Set<Integer> phrases = new HashSet<>();
            for (int j = 1; j <= maxNGramSize; j++) {
                for (int i = 0; i <= sentence.length - j; i++) {
                    String[] phrase = Arrays.copyOfRange(sentence, i, i + j);
                    Integer index = trie.get(phrase);
                    if (index != null) {
                        phrases.add(index);
                    }
                }
            }

            for (int i : phrases) {
                nw[i] += phrases.size();
                for (int j : phrases) {
                    if (i != j) {
                        table[i][j]++;
                    }
                }
            }
        }

        // Cluster frequent terms.
        int[] cluster = new int[n];
        for (int i = 0; i < cluster.length; i++) {
            cluster[i] = i;
        }

        //double log2 = Math.log(2.0);
        for (int i = 0; i < n; i++) {
            for (int j = i + 1; j < n; j++) {
                // Mutual information
                if (table[i][j] > 0) {
                    // This doesn't work as ntotal is usually large and thus the mutual information
                    // is way larger than the threshold log2 given in the paper.
                    //double mutual = Math.log((double) ntotal * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq));

                    // Here we just use the (squared) geometric average of co-occurrence probability.
                    // A value >= 0.25 means the co-occurrence count is at least half the geometric
                    // mean of the two term frequencies. It works well for clustering things like
                    // "digital computer" and "computer" in practice.
                    double mutual = (double) table[i][j] * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq);
                    if (mutual >= 0.25) {
                        cluster[j] = cluster[i];
                    }
                    /*else {
                        // Jensen-Shannon divergence
                        double js = 0.0;
                        for (int k = 0; k < n; k++) {
                            double p1 = (double) table[i][k] / freqTerms[i].freq;
                            double p2 = (double) table[j][k] / freqTerms[j].freq;

                            // The formula in the paper is not correct as p is not a real probability.
                            if (p1 > 0 && p2 > 0) {
                                js += -(p1 + p2) * Math.log((p1 + p2) / 2.0) + p1 * Math.log(p1) + p2 * Math.log(p2);
                            }
                        }

                        js /= 2.0;
                        if (js > log2) {
                            cluster[j] = cluster[i];
                        }
                    }*/
                }
            }
        }

        // Calculate the expected co-occurrence probability of each cluster.
        double[] pc = new double[n];
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                pc[cluster[j]] += table[i][j];
            }
        }

        for (int i = 0; i < n; i++) {
            pc[i] /= ntotal;
        }

        // Calculate the chi-square scores.
        double[] score = new double[n];
        for (int i = 0; i < n; i++) {
            double max = Double.NEGATIVE_INFINITY;
            for (int j = 0; j < n; j++) {
                if (cluster[j] != j) {
                    continue;
                }

                double fwc = 0.0;
                for (int k = 0; k < n; k++) {
                    if (cluster[k] == j) fwc += table[i][k];
                }

                double expected = nw[i] * pc[j];
                double d = (fwc - expected);
                double chisq = d * d / expected;
                score[i] += chisq;
                if (chisq > max) max = chisq;
            }

            // The paper's refinement subtracts the largest single-cluster
            // contribution. It is disabled in this implementation.
            //score[i] -= max;
        }

        // QuickSort.sort returns the permutation that puts the scores in
        // ascending order, so iterate from the end for the largest scores.
        int[] index = QuickSort.sort(score);
        ArrayList<NGram> keywords = new ArrayList<>();
        for (int i = n; i-- > 0; ) {
            boolean add = true;

            // Filter out components of phrases, e.g. "digital" in "digital computer".
            for (int j = i + 1; j < n; j++) {
                if (cluster[index[j]] == cluster[index[i]]) {
                    if (freqTerms[index[j]].words.length >= freqTerms[index[i]].words.length) {
                        add = false;
                        break;
                    } else {
                        keywords.remove(freqTerms[index[j]]);
                        add = true;
                    }
                }
            }

            if (add) {
                keywords.add(freqTerms[index[i]]);
                if (keywords.size() >= maxNumKeywords) break;
            }
        }

        return keywords;
    }
}