/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.cluster;
import com.aliasi.corpus.ObjectHandler;
import com.aliasi.symbol.SymbolTable;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.stats.Statistics;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.FeatureExtractor;
import com.aliasi.util.Math;
import com.aliasi.util.Iterators;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.ObjectToDoubleMap;
import com.aliasi.util.Strings;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
* A LatentDirichletAllocation object represents a latent
* Dirichlet allocation (LDA) model. LDA provides a Bayesian model of
* document generation in which each document is generated by
* a mixture of topical multinomials. An LDA model specifies the
* number of topics, a Dirichlet prior over topic mixtures for a
* document, and a discrete distribution over words for each topic.
*
* A document is generated from an LDA model by first selecting a
* multinomial over topics given the Dirichlet prior. Then for each
* token in the document, a topic is generated from the
* document-specific topic distribution, and then a word is generated
* from the discrete distribution for that topic. Note that document
* length is not generated by this model; a fully generative model
* would need a way of generating lengths (e.g. a Poisson
* distribution) or terminating documents (e.g. a distinguished
* end-of-document symbol).
*
*
* An LDA model may be estimated from an unlabeled training corpus
* (collection of documents) using a second Dirichlet prior, this time
* over word distributions in a topic. This class provides a static
* inference method that produces (collapsed Gibbs) samples from the
* posterior distribution of topic assignments to words, any one of
* which may be used to construct an LDA model instance.
*
*
* An LDA model can be used to infer the topic mixture of unseen
* text documents, which can be used to compare documents by topical
* similarity. A fixed LDA model may also be used to estimate the
* likelihood of a word occurring in a document given the other words
* in the document. A collection of LDA models may be used for fully
* Bayesian reasoning at the corpus (collection of documents) level.
*
*
* LDA may be applied to arbitrary multinomial data. To apply it
* to text, a tokenizer factory converts a text document to bag of
* words and a symbol table converts these tokens to integer outcomes.
* The static method {@link
* #tokenizeDocuments(CharSequence[],TokenizerFactory,SymbolTable,int)}
* is available to carry out the text to multinomial conversion with
* pruning of low counts.
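*
* For orientation, the following minimal sketch (not part of the
* original documentation) shows the whole pipeline; the choice of
* IndoEuropeanTokenizerFactory, MapSymbolTable, and all numeric
* settings below are illustrative assumptions only:
*
* CharSequence[] texts = ...;                      // raw documents
* SymbolTable symbolTable = new MapSymbolTable();  // fresh symbol table
* int[][] docWords = LatentDirichletAllocation
*     .tokenizeDocuments(texts,
*                        IndoEuropeanTokenizerFactory.INSTANCE,
*                        symbolTable,
*                        5);                       // min token count
* LatentDirichletAllocation.GibbsSample sample
*     = LatentDirichletAllocation
*         .gibbsSampler(docWords,
*                       (short) 50,                // numTopics
*                       0.1,                       // docTopicPrior
*                       0.01,                      // topicWordPrior
*                       1000,                      // burninEpochs
*                       250,                       // sampleLag
*                       5,                         // numSamples
*                       new Random(42L),
*                       null);                     // no sample handler
* LatentDirichletAllocation lda = sample.lda();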
*
*
*
* LDA Generative Model
*
* LDA operates over a fixed vocabulary of discrete outcomes, which
* we call words for convenience, and represent as a set of integers:
*
*
* words = { 0, 1, ..., numWords-1 }
*
* A corpus is a ragged array of documents, each document consisting
* of an array of words:
*
*
* int[][] words = { words[0], words[1], ..., words[numDocs-1] }
*
* A given document words[i], i < numDocs,
* is itself represented as a bag of words, each word being
* represented as an integer:
*
*
* int[] words[i] = { words[i][0], words[i][1], ..., words[i][words[i].length-1] }
*
* The documents do not all need to be the same length, so the
* two-dimensional array words is ragged.
*
* A particular LDA model is defined over a fixed number of topics,
* also represented as integers:
*
*
* topics = { 0, 1, ..., numTopics-1 }
*
* For a given topic topic < numTopics, LDA specifies a
* discrete distribution φ[topic] over words:
*
*
* φ[topic][word] >= 0.0
*
* Σword < numWords φ[topic][word] = 1.0
*
* In an LDA model, each document in the corpus is generated from a
* document-specific mixture of topics θ[doc]. The
* distribution θ[doc] is a discrete distribution over
* topics:
*
*
* θ[doc][topic] >= 0.0
*
* Σtopic < numTopics θ[doc][topic] = 1.0
*
* Under LDA's generative model for documents, a document-specific
* topic mixture θ[doc] is generated from a uniform
* Dirichlet distribution with concentration parameter
* α. The Dirichlet is the conjugate prior for the
* multinomial in which α acts as a prior count
* assigned to a topic in a document. Typically, LDA is run with a
* fairly diffuse prior with concentration α < 1,
* leading to skewed posterior topic distributions.
*
*
* Given a topic distribution θ[doc] for a
* document, tokens are generated (conditionally) independently. For
* each token in the document, a topic
* topics[doc][token] is generated according to the
* topic distribution θ[doc], then the word instance
* words[doc][token] is
* generated given the topic using the topic-specific distribution
* over tokens φ[topics[doc][token]].
*
*
* For estimation purposes, LDA places a uniform Dirichlet prior
* with concentration parameter β on each of the
* topic distributions φ[topic]. The first step in
* modeling a corpus is to generate the topic distributions
* φ from a Dirichlet parameterized by β.
*
*
* In sampling notation, the LDA generative model is expressed as
* follows:
*
*
* φ[topic] ~ Dirichlet(β)
* θ[doc] ~ Dirichlet(α)
* topics[doc][token] ~ Discrete(θ[doc])
* words[doc][token] ~ Discrete(φ[topic[doc][token]])
*
* A generative Bayesian model is based on the joint probability
* of all observed and unobserved variables, given the priors. Given a
* text corpus, only the words are observed. The unobserved variables
* include the assignment of a topic to each word, the assignment of a
* topic distribution to each document, and the assignment of word
* distributions to each topic. We let
* topic[doc][token], doc < numDocs, token < docLength[doc],
* be the topic assigned to the specified token
* in the specified document. This leaves two Dirichlet priors, one
* parameterized by α for topics in documents, and
* one parameterized by β for words in topics.
* These priors are treated as hyperparameters for purposes of
* estimation; cross-validation may be used to provide so-called empirical
* Bayes estimates for the priors.
*
*
* The full LDA probability model for a corpus follows the generative
* process outlined above. First, topic distributions are chosen at the
* corpus level for each topic given their Dirichlet prior, and then the
* remaining variables are generated given these topic distributions:
*
*
* p(words, topics, θ, φ | α, β)
* = Πtopic < numTopics
* p(φ[topic] | β)
* * p(words, topics, θ | α, φ)
*
* Note that because the multinomial parameters for
* φ[topic] are continuous,
* p(φ[topic] | β) represents a density, not a discrete
* distribution. Thus it does not make sense to talk about the
* probability of a given multinomial φ[topic]; non-zero
* results only arise from integrals over measurable subsets of the
* multinomial simplex. It is possible to sample from a density, so
* the generative model is well founded.
*
* A document is generated by first generating its topic distribution
* given the Dirichlet prior, and then generating its topics and words:
*
*
* p(words, topics, θ | α, φ)
* = Πdoc < numDocs
* p(θ[doc] | α)
* * p(words[doc], topics[doc] | θ[doc], φ)
*
* The topic and word are generated from the multinomials
* θ[doc] and the topic distributions
* φ using the chain rule, first generating the topic
* given the document's topic distribution and then generating the
* word given the topic's word distribution.
*
*
* p(words[doc], topics[doc] | θ[doc], φ)
* = Πtoken < words[doc].length
* p(topics[doc][token] | θ[doc])
* * p(words[doc][token] | φ[topics[doc][token]])
*
* Given the topic and document multinomials, this distribution
* is discrete, and thus may be evaluated. It may also be marginalized
* by summation:
*
*
* p(words[doc] | θ[doc], φ)
* = Π token < words[doc].length
* Σtopic < numTopics p(topic | θ[doc]) * p(words[doc][token] | φ[topic])
*
* Conditional probabilities are computed in the usual way by
* marginalizing other variables through integration. Unfortunately,
* this simple mathematical operation is often intractable
* computationally.
*
*
*
* Estimating LDA with Collapsed Gibbs Sampling
*
* This class uses a collapsed form of Gibbs sampling over the
* posterior distribution of topic assignments given the documents
* and Dirichlet priors:
*
*
* p(topics | words, α, β)
*
* This distribution may be derived from the joint
* distribution by marginalizing (also known as "collapsing"
* or "integrating out") the contributions of the
* document-topic and topic-word distributions.
*
The Gibbs sampler used to estimate LDA models produces samples
* that consist of a topic assignment to every token in the corpus.
* The conjugacy of the Dirichlet prior for multinomials makes the
* sampling straightforward.
*
*
An initial sample is produced by randomly assigning topics to
* tokens. Then, the sampler works iteratively through the corpus,
* one token at a time. At each token, it samples a new topic assignment
* to that token given all the topic assignments to other tokens in the
* corpus:
*
*
* p(topics[doc][token] | words, topics')
*
* The notation topics' represents the set of topic
* assignments other than to topics[doc][token]. This
* collapsed posterior conditional is estimated directly:
*
*
* p(topics[doc][token] = topic | words, topics')
* = (count'(doc,topic) + α) / (docLength[doc]-1 + numTopics*α)
* * (count'(topic,word) + β) / (count'(topic) + numWords*β)
*
* The counts are all defined relative to topics'; that
* is, the current topic assignment for the token being sampled is not
* considered in the counts. Note that the two factors are estimates
* of θ[doc] and φ[topic] with all
* data other than the assignment to the current token. Note how the
* prior concentrations arise as additive smoothing factors in these
* estimates, a result of the Dirichlet's conjugacy to the
* multinomial. For the purposes of sampling, the document-length
* normalization in the denominator of the first term is not necessary,
* as it remains constant across topics.
*
* The posterior Dirichlet distributions may be computed using just
* the counts. For instance, the posterior distribution for topics in
* documents is estimated as:
*
*
* p(θ[doc] | α, β, words)
* = Dirichlet(count(doc,0)+α, count(doc,1)+α, ..., count(doc,numTopics-1)+α)
*
* The sampling distribution is defined from the maximum a posteriori
* (MAP) estimates of the multinomial distribution over topics in a document:
*
*
* θ*[doc] = ARGMAXθ[doc] p(θ[doc] | α, β, words)
*
* which we know from the Dirichlet distribution is:
*
*
* θ*[doc][topic]
* = (count(doc,topic) + α) / (docLength[doc] + numTopics*α)
*
* By the same reasoning, the MAP word distribution in topics is:
*
*
* φ*[topic][word]
* = (count(topic,word) + β) / (count(topic) + numWords*β)
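*
* As an illustration (not part of the original documentation), the
* unnormalized sampling weight for one candidate topic can be read
* directly off the counts; the variable names below are hypothetical,
* and the primed counts are assumed to already exclude the token
* currently being resampled:
*
* // count'(doc,topic), count'(topic,word), count'(topic), with the
* // current token's assignment removed
* double weight
*     = (docTopicCountPrime + alpha)              // topic in document
*     * (topicWordCountPrime + beta)              // word in topic
*     / (topicTotalCountPrime + numWords * beta); // topic normalizer
* // the document-length denominator is constant across topics, so it
* // may be dropped; the sampler renormalizes over all topics anyway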
*
* A complete Gibbs sample is represented as an instance of {@link
* LatentDirichletAllocation.GibbsSample}, which provides access to
* the topic assignment to every token, as well as methods to compute
* θ* and φ*
* as defined above. A sample also maintains the original priors and
* word counts. Just the estimates of the topic-word distributions
* φ[topic] and the prior topic concentration
* α are sufficient to define an LDA model. Note
* that the imputed values of θ*[doc]
* used during estimation are part of a sample, but are not part of
* the LDA model itself. The LDA model contains enough information to
* estimate θ* for an arbitrary
* document, as described in the next section.
*
*
The Gibbs sampling algorithm starts with a random assignment of
* topics to words, then simply iterates through the tokens in turn,
* sampling topics according to the distribution defined above. After
* each run through the entire corpus, a callback is made to a handler
* for the samples. This setup may be configured for
* an initial burnin period, essentially just discarding the first
* batch of samples. Then it may be configured to sample only periodically
* thereafter to avoid correlations between samples.
*
*
* LDA as Multi-Topic Classifier
*
* An LDA model consists of a topic distribution Dirichlet prior
* α and a word distribution
* φ[topic] for each topic. Given an LDA model and a
* new document words = { words[0], ..., words[length-1] }
* consisting of a sequence of words, the posterior distribution over
* topic weights is given by:
*
*
* p(θ | words, α, φ)
*
* Although this distribution is not solvable analytically, it is easy
* to estimate using a simplified form of the LDA estimator's Gibbs
* sampler. The conditional distribution of a topic assignment
* topics[token] to a single token given an assignment
* topics' to all other tokens is given by:
*
*
* p(topic[token] | topics', words, α, φ)
* ∝ p(topic[token], words[token] | topics', α, φ)
* = p(topic[token] | topics', α) * p(words[token] | φ[topic[token]])
* = (count(topic[token]) + α) / (words.length - 1 + numTopics * α)
* * p(words[token] | φ[topic[token]])
*
* This leads to a straightforward sampler over posterior topic
* assignments, from which we may directly compute the Dirichlet posterior over
* topic distributions or a MAP topic distribution.
*
* This class provides a method to sample these topic assignments,
* which may then be used to form Dirichlet distributions or MAP point
* estimates of θ* for the document
* words.
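*
* A minimal sketch (the tokenizer factory, symbol table, model
* variable lda, and sampling settings are illustrative assumptions)
* of inferring a topic distribution for an unseen document:
*
* int[] tokens = LatentDirichletAllocation
*     .tokenizeDocument(text,
*                       IndoEuropeanTokenizerFactory.INSTANCE,
*                       symbolTable);
* double[] topicDistrib
*     = lda.bayesTopicEstimate(tokens,
*                              100,              // numSamples
*                              100,              // burnin
*                              1,                // sampleLag
*                              new Random(42L));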
*
* LDA as a Conditional Language Model
*
* An LDA model may be used to estimate the likelihood of a word
* given a previous bag of words:
*
*
* p(word | words, α, φ)
* = ∫p(word | θ, φ) p(θ | words, α, φ) dθ
*
* This integral is easily evaluated using sampling over the topic
* distributions p(θ | words, α, φ) and
* averaging the word probability determined by each sample.
* The word probability for a sample θ is defined by:
*
*
* p(word | θ, φ)
* = Σtopic < numTopics p(topic | θ) * p(word | φ[topic])
*
* Although this approach could theoretically be applied to generate
* the probability of a document one word at a time, the cost would
* be prohibitive, as there are quadratically many samples required
* because samples for the n-th word consist of topic
* assignments to the previous n-1 words.
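*
* A sketch (illustrative only; thetaSamples, word, and lda are
* hypothetical variables) of averaging the word probability over
* sampled topic distributions for the conditioning document:
*
* double sum = 0.0;
* for (double[] theta : thetaSamples) {   // samples of p(θ | words, α, φ)
*     double p = 0.0;
*     for (int topic = 0; topic < lda.numTopics(); ++topic)
*         p += theta[topic] * lda.wordProbability(topic, word);
*     sum += p;
* }
* double wordProbEstimate = sum / thetaSamples.length;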
*
*
*
* Bayesian Calculations and Exchangeability
*
* An LDA model may be used for a variety of statistical
* calculations. For instance, it may be used to determine the
* distribution of topics to words, and using these distributions, may
* determine word similarity. Similarly, document similarity may be
* determined by the topic distributions in a document.
*
*
Point estimates are derived using a single LDA model. For
* Bayesian calculation, multiple samples are taken to produce
* multiple LDA models. The results of a calculation on these
* models are then averaged to produce a Bayesian estimate of the
* quantity of interest. The sampling methodology is effectively
* numerically computing the integral over the posterior.
*
*
Bayesian calculations over multiple samples are complicated by
* the exchangeability of topics in the LDA model. In particular, there
* is no guarantee that topics are the same between samples, thus it
* is not acceptable to combine samples in topic-level reasoning. For
* instance, it does not make sense to estimate the probability of
* a topic in a document using multiple samples.
*
*
*
* Non-Document Data
*
* The "words" in an LDA model don't necessarily have to
* represent words in documents. LDA is basically a multinomial
* mixture model, and any multinomial outcomes may be modeled with
* LDA. For instance, a document may correspond to a baseball game
* and the words may correspond to the outcomes of at-bats (some
* might occur more than once). LDA has also been used for
* gene expression data, where expression levels from mRNA microarray
* experiments are quantized into a multinomial outcome.
*
* LDA has also been applied to collaborative filtering. Movies
* act as words, with users modeled as documents, the bag of words
* they've seen. Given an LDA model and a user's films, the user's
* topic distribution may be inferred and used to estimate the likelihood
* of seeing unseen films.
*
*
* References
*
* - Wikipedia: Gibbs sampling
* - Wikipedia: Markov chain Monte Carlo
* - Wikipedia: Dirichlet distribution
* - Wikipedia: Latent Dirichlet allocation
* - Steyvers, Mark and Tom Griffiths. 2007. Probabilistic topic models.
*   In Thomas K. Landauer, Danielle S. McNamara, Simon Dennis and Walter Kintsch (eds.),
*   Handbook of Latent Semantic Analysis. Lawrence Erlbaum.
* - Blei, David M., Andrew Y. Ng, and Michael I. Jordan. 2003.
*   Latent Dirichlet allocation. Journal of Machine Learning Research 3:993-1022.
*
*
*
* @author Bob Carpenter
* @version 4.0.0
* @since LingPipe3.3
*/
public class LatentDirichletAllocation implements Serializable {
static final long serialVersionUID = 6313662446710242382L;
private final double mDocTopicPrior;
private final double[][] mTopicWordProbs;
/**
* Construct a latent Dirichlet allocation (LDA) model using the
* specified document-topic prior and topic-word distributions.
* The topic-word probability array topicWordProbs
* represents a collection of discrete distributions
* topicWordProbs[topic] for topics, and thus must
* satisfy:
* satisfy:
*
*
* topicWordProbs[topic][word] >= 0.0
*
* Σword < numWords topicWordProbs[topic][word] = 1.0
*
* Warning: The requirement that each distribution sums to 1.0
* is not checked by the constructor.
*
*
See the class documentation above for an explanation of
* the parameters and what can be done with a model.
*
* @param docTopicPrior The document-topic prior.
* @param topicWordProbs Array of discrete distributions over words,
* indexed by topic.
* @throws IllegalArgumentException If the document-topic prior is
* not finite and positive, or if the topic-word probabilities
* arrays are not all the same length with entries between 0.0 and
* 1.0 inclusive.
*/
public LatentDirichletAllocation(double docTopicPrior,
double[][] topicWordProbs) {
if (docTopicPrior <= 0.0
|| Double.isNaN(docTopicPrior)
|| Double.isInfinite(docTopicPrior)) {
String msg = "Document-topic prior must be finite and positive."
+ " Found docTopicPrior=" + docTopicPrior;
throw new IllegalArgumentException(msg);
}
int numTopics = topicWordProbs.length;
if (numTopics < 1) {
String msg = "Require non-empty topic-word probabilities.";
throw new IllegalArgumentException(msg);
}
int numWords = topicWordProbs[0].length;
for (int topic = 1; topic < numTopics; ++topic) {
if (topicWordProbs[topic].length != numWords) {
String msg = "All topics must have the same number of words."
+ " topicWordProbs[0].length="
+ topicWordProbs[0].length
+ " topicWordProbs[" + topic + "]="
+ topicWordProbs[topic].length;
throw new IllegalArgumentException(msg);
}
}
for (int topic = 0; topic < numTopics; ++topic) {
for (int word = 0; word < numWords; ++word) {
if (topicWordProbs[topic][word] < 0.0
|| topicWordProbs[topic][word] > 1.0) {
String msg = "All probabilities must be between 0.0 and 1.0"
+ " Found topicWordProbs[" + topic + "][" + word + "]="
+ topicWordProbs[topic][word];
throw new IllegalArgumentException(msg);
}
}
}
mDocTopicPrior = docTopicPrior;
mTopicWordProbs = topicWordProbs;
}
/**
* Returns the number of topics in this LDA model.
*
* @return The number of topics in this model.
*/
public int numTopics() {
return mTopicWordProbs.length;
}
/**
* Returns the number of words on which this LDA model
* is based.
*
* @return The number of words in this model.
*/
public int numWords() {
return mTopicWordProbs[0].length;
}
/**
* Returns the concentration value of the uniform Dirichlet prior over
* topic distributions for documents. This value is effectively
* a prior count for topics used for additive smoothing during
* estimation.
*
* @return The prior count of topics in documents.
*/
public double documentTopicPrior() {
return mDocTopicPrior;
}
/**
* Returns the probability of the specified word in the specified
* topic. The values returned should be non-negative and finite,
* and should sum to 1.0 over all words for a specified topic.
*
* @param topic Topic identifier.
* @param word Word identifier.
* @return Probability of the specified word in the specified
* topic.
*/
public double wordProbability(int topic, int word) {
return mTopicWordProbs[topic][word];
}
/**
* Returns an array of the probabilities of words in the
* specified topic. The probabilities are indexed by word
* identifier.
*
*
The returned result is a copy of the underlying data in
* the model so that changing it will not change the model.
*
* @param topic Topic identifier.
* @return Array of probabilities of words in the specified topic.
*/
public double[] wordProbabilities(int topic) {
double[] xs = new double[mTopicWordProbs[topic].length];
for (int i = 0; i < xs.length; ++i)
xs[i] = mTopicWordProbs[topic][i];
return xs;
}
/**
* Returns the specified number of Gibbs samples of topics for the
* specified tokens using the specified number of burnin epochs,
* the specified lag between samples, and the specified
* randomizer. The array returned is an array of samples, each
* sample consisting of a topic assignment to each token in the
* specified list of tokens. The tokens must all be in the appropriate
* range for this model, that is, between 0 (inclusive) and the number
* of words (exclusive).
*
*
See the class documentation for more information on how the
* samples are computed.
*
* @param tokens The tokens making up the document.
* @param numSamples Number of Gibbs samples to return.
* @param burnin The number of samples to take and throw away
* during the burnin period.
* @param sampleLag The interval between samples after burnin.
* @param random The random number generator to use for this sampling
* process.
* @return The selection of topic samples generated by this
* sampler.
* @throws IndexOutOfBoundsException If there are tokens whose
* value is less than zero, or whose value is greater than or equal
* to the number of words in this model.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burnin
* period is negative.
*/
public short[][] sampleTopics(int[] tokens,
int numSamples,
int burnin,
int sampleLag,
Random random) {
if (burnin < 0) {
String msg = "Burnin period must be non-negative."
+ " Found burnin=" + burnin;
throw new IllegalArgumentException(msg);
}
if (numSamples < 1) {
String msg = "Number of samples must be at least 1."
+ " Found numSamples=" + numSamples;
throw new IllegalArgumentException(msg);
}
if (sampleLag < 1) {
String msg = "Sample lag must be at least 1."
+ " Found sampleLag=" + sampleLag;
throw new IllegalArgumentException(msg);
}
double docTopicPrior = documentTopicPrior();
int numTokens = tokens.length;
int numTopics = numTopics();
int[] topicCount = new int[numTopics];
short[][] samples = new short[numSamples][numTokens];
int sample = 0;
short[] currentSample = samples[0];
for (int token = 0; token < numTokens; ++token) {
int randomTopic = random.nextInt(numTopics);
++topicCount[randomTopic];
currentSample[token] = (short) randomTopic;
}
double[] topicDistro = new double[numTopics];
int numEpochs = burnin + sampleLag * (numSamples - 1);
for (int epoch = 0; epoch < numEpochs; ++epoch) {
for (int token = 0; token < numTokens; ++token) {
int word = tokens[token];
int currentTopic = currentSample[token];
--topicCount[currentTopic];
if (topicCount[currentTopic] < 0) {
throw new IllegalArgumentException("Internal error: negative count for topic=" + currentTopic);
}
for (int topic = 0; topic < numTopics; ++topic) {
topicDistro[topic]
= (topicCount[topic] + docTopicPrior)
* wordProbability(topic,word)
+ (topic == 0 ? 0.0 : topicDistro[topic-1]);
}
int sampledTopic = Statistics.sample(topicDistro,random);
++topicCount[sampledTopic];
currentSample[token] = (short) sampledTopic;
}
if ((epoch >= burnin) && (((epoch - burnin) % sampleLag) == 0)) {
short[] pastSample = currentSample;
++sample;
currentSample = samples[sample];
for (int token = 0; token < numTokens; ++token)
currentSample[token] = pastSample[token];
}
}
return samples;
}
/**
* Replaced by method {@code bayesTopicEstimate()} because of
* original misnaming.
*
*
* Warning: Despite its name, this method does not compute a
* maximum a posteriori (MAP) estimate.
*
* @param tokens The tokens making up the document.
* @param numSamples Number of Gibbs samples to return.
* @param burnin The number of samples to take and throw away
* during the burnin period.
* @param sampleLag The interval between samples after burnin.
* @param random The random number generator to use for this sampling
* process.
* @return An estimate of the topic distribution for the document.
* @throws IndexOutOfBoundsException If there are tokens whose
* value is less than zero, or whose value is greater than or equal
* to the number of words in this model.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burnin
* period is negative.
*/
double[] mapTopicEstimate(int[] tokens,
int numSamples,
int burnin,
int sampleLag,
Random random) {
return bayesTopicEstimate(tokens,numSamples,burnin,sampleLag,random);
}
/**
* Return the Bayesian point estimate of the topic distribution
* for a document consisting of the specified tokens, using Gibbs
* sampling with the specified parameters. The Gibbs topic
* samples are simply averaged to produce the Bayesian estimate,
* which minimizes expected square loss.
*
*
See the method {@link #sampleTopics(int[],int,int,int,Random)}
* and the class documentation for more information on the sampling
* procedure.
*
* @param tokens The tokens making up the document.
* @param numSamples Number of Gibbs samples to return.
* @param burnin The number of samples to take and throw away
* during the burnin period.
* @param sampleLag The interval between samples after burnin.
* @param random The random number generator to use for this sampling
* process.
* @return The Bayesian point estimate of the topic distribution for
* the document.
* @throws IndexOutOfBoundsException If there are tokens whose
* value is less than zero, or whose value is greater than or equal
* to the number of words in this model.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burnin
* period is negative.
*/
public double[] bayesTopicEstimate(int[] tokens,
int numSamples,
int burnin,
int sampleLag,
Random random) {
short[][] sampleTopics = sampleTopics(tokens,numSamples,burnin,sampleLag,random);
int numTopics = numTopics();
int[] counts = new int[numTopics];
for (short[] topics : sampleTopics) {
for (int tok = 0; tok < topics.length; ++tok)
++counts[topics[tok]];
}
double totalCount = 0;
for (int topic = 0; topic < numTopics; ++topic)
totalCount += counts[topic];
double[] result = new double[numTopics];
for (int topic = 0; topic < numTopics; ++topic)
result[topic] = counts[topic] / totalCount;
return result;
}
Object writeReplace() {
return new Serializer(this);
}
/**
* Run Gibbs sampling for the specified multinomial data, number
* of topics, priors, search parameters, randomization and
* callback sample handler. Gibbs sampling provides samples from the
* posterior distribution of topic assignments given the corpus
* and prior hyperparameters. A sample is encapsulated as an
* instance of class {@link GibbsSample}. This method will return
* the final sample and also send intermediate samples to an
* optional handler.
*
*
The class documentation above explains Gibbs sampling for
* LDA as used in this method.
*
*
The primary input is an array of documents, where each
* document is represented as an array of integers representing
* the tokens that appear in it. These tokens should be numbered
* contiguously from 0 for space efficiency. The topic assignments
* in the Gibbs sample are aligned as parallel arrays to the array
* of documents.
*
*
The next three parameters are the hyperparameters of the
* model, specifically the number of topics, the prior count
* assigned to topics in a document, and the prior count assigned
* to words in topics. A rule of thumb for the document-topic
* prior is to set it to 5 divided by the number of topics (or
* less if there are very few topics; 0.1 is typically the maximum
* value used). A good general value for the topic-word prior is
* 0.01. Both of these priors will be diffuse and tend to lead to
* skewed posterior distributions.
*
*
The following three parameters specify how the sampling is
* to be done. First, the sampler is "burned in" for a
* number of epochs specified by the burnin parameter. After burn
* in, samples are taken after fixed numbers of documents to avoid
* correlation in the samples; the sampling frequency is specified
* by the sample lag. Finally, the number of samples to be taken
* is specified. For instance, if the burnin is 1000, the sample
* lag is 250, and the number of samples is 5, then samples are
* taken after 1000, 1250, 1500, 1750 and 2000 epochs. If a
* non-null handler object is specified in the method call, its
* handle(GibbsSample) method is called with each of the
* samples produced as above.
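*
* For example, a handler that just reports progress might look like
* this sketch (illustrative only, not part of the original
* documentation):
*
* ObjectHandler<GibbsSample> handler
*     = new ObjectHandler<GibbsSample>() {
*           public void handle(GibbsSample sample) {
*               System.out.println("epoch=" + sample.epoch()
*                                  + " changed topics="
*                                  + sample.numChangedTopics());
*           }
*       };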
*
*
The final sample in the chain of samples is returned as the
* result. Note that this sample will also have been passed to the
* specified handler as the last sample for the handler.
*
*
A random number generator must be supplied as an argument.
* This may just be a new instance of {@link java.util.Random} or
* a custom extension. It is used for all randomization in this
* method.
*
* @param docWords Corpus of documents to be processed.
* @param numTopics Number of latent topics to generate.
* @param docTopicPrior Prior count of topics in a document.
* @param topicWordPrior Prior count of words in a topic.
* @param burninEpochs Number of epochs to run before taking a sample.
* @param sampleLag Frequency between samples.
* @param numSamples Number of samples to take before exiting.
* @param random Random number generator.
* @param handler Handler to which the samples are sent.
* @return The final Gibbs sample.
*/
public static GibbsSample
gibbsSampler(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
int burninEpochs,
int sampleLag,
int numSamples,
Random random,
ObjectHandler<GibbsSample> handler) {
validateInputs(docWords,numTopics,docTopicPrior,topicWordPrior,burninEpochs,sampleLag,numSamples);
int numDocs = docWords.length;
int numWords = max(docWords) + 1;
int numTokens = 0;
for (int doc = 0; doc < numDocs; ++doc)
numTokens += docWords[doc].length;
// should inputs be permuted?
// for (int doc = 0; doc < numDocs; ++doc)
// Arrays.permute(docWords[doc]);
short[][] currentSample = new short[numDocs][];
for (int doc = 0; doc < numDocs; ++doc)
currentSample[doc] = new short[docWords[doc].length];
int[][] docTopicCount = new int[numDocs][numTopics];
int[][] wordTopicCount = new int[numWords][numTopics];
int[] topicTotalCount = new int[numTopics];
for (int doc = 0; doc < numDocs; ++doc) {
for (int tok = 0; tok < docWords[doc].length; ++tok) {
int word = docWords[doc][tok];
int topic = random.nextInt(numTopics);
currentSample[doc][tok] = (short) topic;
++docTopicCount[doc][topic];
++wordTopicCount[word][topic];
++topicTotalCount[topic];
}
}
double numWordsTimesTopicWordPrior = numWords * topicWordPrior;
double[] topicDistro = new double[numTopics];
int numEpochs = burninEpochs + sampleLag * (numSamples - 1);
for (int epoch = 0; epoch <= numEpochs; ++epoch) {
double corpusLog2Prob = 0.0;
int numChangedTopics = 0;
for (int doc = 0; doc < numDocs; ++doc) {
int[] docWordsDoc = docWords[doc];
short[] currentSampleDoc = currentSample[doc];
int[] docTopicCountDoc = docTopicCount[doc];
for (int tok = 0; tok < docWordsDoc.length; ++tok) {
int word = docWordsDoc[tok];
int[] wordTopicCountWord = wordTopicCount[word];
int currentTopic = currentSampleDoc[tok];
if (currentTopic == 0) {
topicDistro[0]
= (docTopicCountDoc[0] - 1.0 + docTopicPrior)
* (wordTopicCountWord[0] - 1.0 + topicWordPrior)
/ (topicTotalCount[0] -1.0 + numWordsTimesTopicWordPrior);
} else {
topicDistro[0]
= (docTopicCountDoc[0] + docTopicPrior)
* (wordTopicCountWord[0] + topicWordPrior)
/ (topicTotalCount[0] + numWordsTimesTopicWordPrior);
for (int topic = 1; topic < currentTopic; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + docTopicPrior)
* (wordTopicCountWord[topic] + topicWordPrior)
/ (topicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
topicDistro[currentTopic]
= (docTopicCountDoc[currentTopic] - 1.0 + docTopicPrior)
* (wordTopicCountWord[currentTopic] - 1.0 + topicWordPrior)
/ (topicTotalCount[currentTopic] -1.0 + numWordsTimesTopicWordPrior)
+ topicDistro[currentTopic-1];
}
for (int topic = currentTopic+1; topic < numTopics; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + docTopicPrior)
* (wordTopicCountWord[topic] + topicWordPrior)
/ (topicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
int sampledTopic = Statistics.sample(topicDistro,random);
// compute probs before updates
if (sampledTopic != currentTopic) {
currentSampleDoc[tok] = (short) sampledTopic;
--docTopicCountDoc[currentTopic];
--wordTopicCountWord[currentTopic];
--topicTotalCount[currentTopic];
++docTopicCountDoc[sampledTopic];
++wordTopicCountWord[sampledTopic];
++topicTotalCount[sampledTopic];
}
if (sampledTopic != currentTopic)
++numChangedTopics;
double topicProbGivenDoc
= docTopicCountDoc[sampledTopic] / (double) docWordsDoc.length;
double wordProbGivenTopic
= wordTopicCountWord[sampledTopic] / (double) topicTotalCount[sampledTopic];
double tokenLog2Prob = Math.log2(topicProbGivenDoc * wordProbGivenTopic);
corpusLog2Prob += tokenLog2Prob;
}
}
// double crossEntropyRate = -corpusLog2Prob / numTokens;
if ((epoch >= burninEpochs)
&& (((epoch - burninEpochs) % sampleLag) == 0)) {
GibbsSample sample
= new GibbsSample(epoch,
currentSample,
docWords,
docTopicPrior,
topicWordPrior,
docTopicCount,
wordTopicCount,
topicTotalCount,
numChangedTopics,
numWords,
numTokens);
if (handler != null)
handler.handle(sample);
if (epoch == numEpochs)
return sample;
}
}
throw new IllegalStateException("unreachable in practice because of return if epoch==numEpochs");
}
/**
* Return an iterator over Gibbs samples for the specified
* document-word corpus, number of topics, priors and randomizer.
* These are the same Gibbs samples as would be produced by the
* method {@link
* #gibbsSampler(int[][],short,double,double,int,int,int,Random,ObjectHandler)}.
* See that method and the class documentation for more details.
*
* @param docWords Corpus of documents to be processed.
* @param numTopics Number of latent topics to generate.
* @param docTopicPrior Prior count of topics in a document.
* @param topicWordPrior Prior count of words in a topic.
* @param random Random number generator.
* @return An iterator over Gibbs samples for the specified corpus and priors.
*/
public static Iterator<GibbsSample>
gibbsSample(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
Random random) {
validateInputs(docWords,numTopics,docTopicPrior,topicWordPrior);
return new SampleIterator(docWords,numTopics,docTopicPrior,topicWordPrior,random);
}
static class SampleIterator extends Iterators.Buffered<GibbsSample> {
private final int[][] mDocWords; // D x W x 4 (shared across threads)
private final short mNumTopics;
private final double mDocTopicPrior;
private final double mTopicWordPrior;
private final Random mRandom;
private final int mNumDocs;
private final int mNumWords;
private final int mNumTokens;
private final short[][] mCurrentSample; // D x W x 2
private final int[][] mDocTopicCount; // D x K x 4
private final int[][] mWordTopicCount; // K x W x 4
private final int[] mTopicTotalCount; // K
private int mNumChangedTopics;
private int mEpoch = 0;
SampleIterator(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
Random random) {
mDocWords = docWords;
mNumTopics = numTopics;
mDocTopicPrior = docTopicPrior;
mTopicWordPrior = topicWordPrior;
mRandom = random;
mNumDocs = mDocWords.length;
mNumWords = max(mDocWords) + 1;
int numTokens = 0;
for (int doc = 0; doc < mNumDocs; ++doc)
numTokens += mDocWords[doc].length;
mNumTokens = numTokens;
mNumChangedTopics = numTokens;
mCurrentSample = new short[mNumDocs][];
for (int doc = 0; doc < mNumDocs; ++doc)
mCurrentSample[doc] = new short[mDocWords[doc].length];
mDocTopicCount = new int[mNumDocs][numTopics];
mWordTopicCount = new int[mNumWords][numTopics];
mTopicTotalCount = new int[numTopics];
// random initialization
for (int doc = 0; doc < mNumDocs; ++doc) {
for (int tok = 0; tok < docWords[doc].length; ++tok) {
int word = docWords[doc][tok];
int topic = mRandom.nextInt(numTopics);
mCurrentSample[doc][tok] = (short) topic;
++mDocTopicCount[doc][topic];
++mWordTopicCount[word][topic];
++mTopicTotalCount[topic];
}
}
}
@Override
protected GibbsSample bufferNext() {
// create existing sample; then compute next one by setting all vars
GibbsSample sample
= new GibbsSample(mEpoch,
mCurrentSample,
mDocWords,
mDocTopicPrior,
mTopicWordPrior,
mDocTopicCount,
mWordTopicCount,
mTopicTotalCount,
mNumChangedTopics,
mNumWords,
mNumTokens);
++mEpoch;
double numWordsTimesTopicWordPrior = mNumWords * mTopicWordPrior;
double[] topicDistro = new double[mNumTopics];
int numChangedTopics = 0;
for (int doc = 0; doc < mNumDocs; ++doc) {
int[] docWordsDoc = mDocWords[doc];
short[] currentSampleDoc = mCurrentSample[doc];
int[] docTopicCountDoc = mDocTopicCount[doc];
for (int tok = 0; tok < docWordsDoc.length; ++tok) {
int word = docWordsDoc[tok];
int[] wordTopicCountWord = mWordTopicCount[word];
int currentTopic = currentSampleDoc[tok];
if (currentTopic == 0) {
topicDistro[0]
= (docTopicCountDoc[0] - 1.0 + mDocTopicPrior)
* (wordTopicCountWord[0] - 1.0 + mTopicWordPrior)
/ (mTopicTotalCount[0] -1.0 + numWordsTimesTopicWordPrior);
} else {
topicDistro[0]
= (docTopicCountDoc[0] + mDocTopicPrior)
* (wordTopicCountWord[0] + mTopicWordPrior)
/ (mTopicTotalCount[0] + numWordsTimesTopicWordPrior);
for (int topic = 1; topic < currentTopic; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + mDocTopicPrior)
* (wordTopicCountWord[topic] + mTopicWordPrior)
/ (mTopicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
topicDistro[currentTopic]
= (docTopicCountDoc[currentTopic] - 1.0 + mDocTopicPrior)
* (wordTopicCountWord[currentTopic] - 1.0 + mTopicWordPrior)
/ (mTopicTotalCount[currentTopic] -1.0 + numWordsTimesTopicWordPrior)
+ topicDistro[currentTopic-1];
}
for (int topic = currentTopic+1; topic < mNumTopics; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + mDocTopicPrior)
* (wordTopicCountWord[topic] + mTopicWordPrior)
/ (mTopicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
int sampledTopic = Statistics.sample(topicDistro,mRandom);
if (sampledTopic != currentTopic) {
currentSampleDoc[tok] = (short) sampledTopic;
--docTopicCountDoc[currentTopic];
--wordTopicCountWord[currentTopic];
--mTopicTotalCount[currentTopic];
++docTopicCountDoc[sampledTopic];
++wordTopicCountWord[sampledTopic];
++mTopicTotalCount[sampledTopic];
++numChangedTopics;
}
}
}
mNumChangedTopics = numChangedTopics;
return sample;
}
}
/**
* Tokenize an array of text documents represented as character
* sequences into a form usable by LDA, using the specified
* tokenizer factory and symbol table. The symbol table should be
* constructed fresh for this application, but may be used after
* this method is called for further token to symbol conversions.
* Only tokens whose count is equal to or larger than the specified
* minimum count are included. Only tokens meeting the minimum
* count are added to the symbol table, thus producing a
* compact set of symbol assignments to tokens for downstream
* processing.
*
* Warning: With some tokenizer factories and/or minimum
* count thresholds, there may be documents with no tokens in
* them.
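*
* A sketch (illustrative only) of dropping such empty documents
* before handing the result to the sampler:
*
* int[][] docWords = LatentDirichletAllocation
*     .tokenizeDocuments(texts, tokenizerFactory, symbolTable, minCount);
* List<int[]> nonEmpty = new ArrayList<int[]>();
* for (int[] doc : docWords)
*     if (doc.length > 0)
*         nonEmpty.add(doc);
* docWords = nonEmpty.toArray(new int[nonEmpty.size()][]);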
*
* @param texts The text corpus.
* @param tokenizerFactory A tokenizer factory for tokenizing the texts.
* @param symbolTable Symbol table used to convert tokens to identifiers.
* @param minCount Minimum count for a token to be included in a
* document's representation.
* @return The tokenized form of the documents, suitable for input to LDA.
*/
public static int[][] tokenizeDocuments(CharSequence[] texts,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
int minCount) {
ObjectToCounterMap<String> tokenCounter = new ObjectToCounterMap<String>();
for (CharSequence text : texts) {
char[] cs = Strings.toCharArray(text);
Tokenizer tokenizer = tokenizerFactory.tokenizer(cs,0,cs.length);
for (String token : tokenizer)
tokenCounter.increment(token);
}
tokenCounter.prune(minCount);
Set<String> tokenSet = tokenCounter.keySet();
for (String token : tokenSet)
symbolTable.getOrAddSymbol(token);
int[][] docTokenId = new int[texts.length][];
for (int i = 0; i < docTokenId.length; ++i) {
docTokenId[i] = tokenizeDocument(texts[i],tokenizerFactory,symbolTable);
}
return docTokenId;
}
/**
* Tokenizes the specified text document using the specified tokenizer
* factory returning only tokens that exist in the symbol table. This
* method is useful within a given LDA model for tokenizing new documents
* into lists of words.
*
* @param text Character sequence to tokenize.
* @param tokenizerFactory Tokenizer factory for tokenization.
* @param symbolTable Symbol table to use for converting tokens
* to symbols.
* @return The array of integer symbols for tokens that exist in
* the symbol table.
*/
public static int[] tokenizeDocument(CharSequence text,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable) {
char[] cs = Strings.toCharArray(text);
Tokenizer tokenizer = tokenizerFactory.tokenizer(cs,0,cs.length);
List<Integer> idList = new ArrayList<Integer>();
for (String token : tokenizer) {
int id = symbolTable.symbolToID(token);
if (id >= 0)
idList.add(id);
}
int[] tokenIds = new int[idList.size()];
for (int i = 0; i < tokenIds.length; ++i)
tokenIds[i] = idList.get(i);
return tokenIds;
}
static int max(int[][] xs) {
int max = 0;
for (int i = 0; i < xs.length; ++i) {
int[] xsI = xs[i];
for (int j = 0; j < xsI.length; ++j) {
if (xsI[j] > max)
max = xsI[j];
}
}
return max;
}
static double relativeDifference(double x, double y) {
return java.lang.Math.abs(x - y)
/ (java.lang.Math.abs(x) + java.lang.Math.abs(y));
}
static void validateInputs(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
int burninEpochs,
int sampleLag,
int numSamples) {
validateInputs(docWords,numTopics,docTopicPrior,topicWordPrior);
if (burninEpochs < 0) {
String msg = "Number of burnin epochs must be non-negative."
+ " Found burninEpochs=" + burninEpochs;
throw new IllegalArgumentException(msg);
}
if (sampleLag < 1) {
String msg = "Sample lag must be positive."
+ " Found sampleLag=" + sampleLag;
throw new IllegalArgumentException(msg);
}
if (numSamples < 1) {
String msg = "Number of samples must be positive."
+ " Found numSamples=" + numSamples;
throw new IllegalArgumentException(msg);
}
}
static void validateInputs(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior) {
for (int doc = 0; doc < docWords.length; ++doc) {
for (int tok = 0; tok < docWords[doc].length; ++tok) {
if (docWords[doc][tok] >= 0) continue;
String msg = "All tokens must have IDs greater than 0."
+ " Found docWords[" + doc + "][" + tok + "]="
+ docWords[doc][tok];
throw new IllegalArgumentException(msg);
}
}
if (numTopics < 1) {
String msg = "Num topics must be positive."
+ " Found numTopics=" + numTopics;
throw new IllegalArgumentException(msg);
}
if (Double.isInfinite(docTopicPrior) || Double.isNaN(docTopicPrior) || docTopicPrior <= 0.0) {
String msg = "Document-topic prior must be finite and positive."
+ " Found docTopicPrior=" + docTopicPrior;
throw new IllegalArgumentException(msg);
}
if (Double.isInfinite(topicWordPrior) || Double.isNaN(topicWordPrior) || topicWordPrior <= 0.0) {
String msg = "Topic-word prior must be finite and positive."
+ " Found topicWordPrior=" + topicWordPrior;
throw new IllegalArgumentException(msg);
}
}
/**
* The LatentDirichletAllocation.GibbsSample class
* encapsulates all of the information related to a single Gibbs
* sample for latent Dirichlet allocation (LDA). A sample
* consists of the assignment of a topic identifier to each
* token in the corpus. Other methods in this class are derived
* from the topic samples, the data being estimated, and
* the LDA parameters such as priors.
*
* Instances of
* this class are created by the sampling method in the containing
* class, {@link LatentDirichletAllocation}. For convenience, the
* sample includes all of the data used to construct the sample,
* as well as the hyperparameters used for sampling.
*
*
As described in the class documentation for the containing
* class {@link LatentDirichletAllocation}, the primary content in
* a Gibbs sample for LDA is the assignment of a single topic to
* each token in the corpus. Cumulative counts for topics in
* documents and words in topics as well as total counts are also
* available; they do not entail any additional computation costs
* as the sampler maintains them as part of the sample.
*
*
The sample also contains meta information about the state
* of the sampling procedure. The epoch at which the sample
* was produced is provided, as well as an indication of how many
* topic assignments changed between this sample and the previous
* sample (note that this is the previous sample in the chain, not
* necessarily the previous sample handled by the LDA handler; the
* handler only gets the samples separated by the specified lag).
*
*
The sample may be used to generate an LDA model. The
* resulting model may then be used for estimation of unseen
* documents. Typically, models derived from several samples are
* used for Bayesian computations, as described in the class
* documentation above.
*
* @author Bob Carpenter
* @version 3.3.0
* @since LingPipe3.3
*/
public static class GibbsSample {
private final int mEpoch;
private final short[][] mTopicSample;
private final int[][] mDocWords;
private final double mDocTopicPrior;
private final double mTopicWordPrior;
private final int[][] mDocTopicCount;
private final int[][] mWordTopicCount;
private final int[] mTopicCount;
private final int mNumChangedTopics;
private final int mNumWords;
private final int mNumTokens;
GibbsSample(int epoch,
short[][] topicSample,
int[][] docWords,
double docTopicPrior,
double topicWordPrior,
int[][] docTopicCount,
int[][] wordTopicCount,
int[] topicCount,
int numChangedTopics,
int numWords,
int numTokens) {
mEpoch = epoch;
mTopicSample = topicSample;
mDocWords = docWords;
mDocTopicPrior = docTopicPrior;
mTopicWordPrior = topicWordPrior;
mDocTopicCount = docTopicCount;
mWordTopicCount = wordTopicCount;
mTopicCount = topicCount;
mNumChangedTopics = numChangedTopics;
mNumWords = numWords;
mNumTokens = numTokens;
}
/**
* Returns the epoch in which this sample was generated.
*
* @return The epoch for this sample.
*/
public int epoch() {
return mEpoch;
}
/**
* Returns the number of documents on which the sample was
* based.
*
* @return The number of documents for the sample.
*/
public int numDocuments() {
return mDocWords.length;
}
/**
* Returns the number of distinct words in the documents on
* which the sample was based.
*
* @return The number of words underlying the model.
*/
public int numWords() {
return mNumWords;
}
/**
* Returns the number of tokens in documents on which the
* sample was based. Each token is an instance of a particular
* word.
*
* @return The number of tokens on which the sample was based.
*/
public int numTokens() {
return mNumTokens;
}
/**
* Returns the number of topics for this sample.
*
* @return The number of topics for this sample.
*/
public int numTopics() {
return mTopicCount.length;
}
/**
* Returns the topic identifier sampled for the specified
* token position in the specified document.
*
* @param doc Identifier for a document.
* @param token Token position in the specified document.
* @return The topic assigned to the specified token in this
* sample.
* @throws IndexOutOfBoundsException If the document
* identifier is not between 0 (inclusive) and the number of
* documents (exclusive), or if the token is not between 0
* (inclusive) and the number of tokens (exclusive) in the
* specified document.
*/
public short topicSample(int doc, int token) {
return mTopicSample[doc][token];
}
/**
* Returns the word identifier for the specified token position
* in the specified document.
*
* @param doc Identifier for a document.
* @param token Token position in the specified document.
* @return The word found at the specified position in the
* specified document.
* @throws IndexOutOfBoundsException If the document
* identifier is not between 0 (inclusive) and the number of
* documents (exclusive), or if the token is not between 0
* (inclusive) and the number of tokens (exclusive) in the
* specified document.
*/
public int word(int doc, int token) {
return mDocWords[doc][token];
}
/**
* Returns the uniform Dirichlet concentration hyperparameter
* α for document distributions over topics
* from which this sample was produced.
*
* @return The document-topic prior.
*/
public double documentTopicPrior() {
return mDocTopicPrior;
}
/**
* Returns the uniform Dirichlet concentration hyperparameter
* β for topic distributions over words from
* which this sample was produced.
*
* @return The topic-word prior.
*/
public double topicWordPrior() {
return mTopicWordPrior;
}
/**
* Returns the number of times the specified topic was
* assigned to the specified document in this sample.
*
* @param doc Identifier for a document.
* @param topic Identifier for a topic.
* @return The count of the topic in the document in this
* sample.
* @throws IndexOutOfBoundsException If the document identifier
* is not between 0 (inclusive) and the number of documents
* (exclusive) or if the topic identifier is not between 0 (inclusive)
* and the number of topics (exclusive).
*/
public int documentTopicCount(int doc, int topic) {
return mDocTopicCount[doc][topic];
}
/**
* Returns the length of the specified document in tokens.
*
* @param doc Identifier for a document.
* @return The length of the specified document in tokens.
* @throws IndexOutOfBoundsException If the document
* identifier is not between 0 (inclusive) and the number of
* documents (exclusive).
*/
public int documentLength(int doc) {
return mDocWords[doc].length;
}
/**
* Returns the number of times tokens for the specified word
* were assigned to the specified topic.
*
* @param topic Identifier for a topic.
* @param word Identifier for a word.
* @return The number of tokens of the specified word assigned
* to the specified topic.
* @throws IndexOutOfBoundsException If the specified topic is
* not between 0 (inclusive) and the number of topics (exclusive),
* or if the word is not between 0 (inclusive) and the number of
* words (exclusive).
*/
public int topicWordCount(int topic, int word) {
return mWordTopicCount[word][topic];
}
/**
* Returns the total number of tokens assigned to the specified
* topic in this sample.
*
* @param topic Identifier for a topic.
* @return The total number of tokens assigned to the
* specified topic.
* @throws IndexOutOfBoundsException If the specified topic is
* not between 0 (inclusive) and the number of topics (exclusive).
*/
public int topicCount(int topic) {
return mTopicCount[topic];
}
/**
* Returns the total number of topic assignments to tokens
* that changed between the last sample and this one. Note
* that this is the last sample in the chain, not the last
* sample necessarily passed to a handler, because handlers
* may not be configured to handle every sample.
*
* @return The number of topic assignments that changed in this
* sample relative to the previous sample.
*/
public int numChangedTopics() {
return mNumChangedTopics;
}
/**
* Returns the probability estimate for the specified word in
* the specified topic in this sample. This value is
* calculated as a maximum a posteriori estimate computed as
* described in the class documentation for {@link
* LatentDirichletAllocation} using the topic assignment
* counts in this sample and the topic-word prior.
*
* @param topic Identifier for a topic.
* @param word Identifier for a word.
* @return The probability of generating the specified word in
* the specified topic.
* @throws IndexOutOfBoundsException If the specified topic is
* not between 0 (inclusive) and the number of topics (exclusive),
* or if the word is not between 0 (inclusive) and the number of
* words (exclusive).
*/
public double topicWordProb(int topic, int word) {
return (topicWordCount(topic,word) + topicWordPrior())
/ (topicCount(topic) + numWords() * topicWordPrior());
}
/**
* Returns the number of times tokens of the specified word
* appeared in the corpus.
*
* @param word Identifier of a word.
* @return The number of tokens of the word in the corpus.
* @throws IndexOutOfBoundsException If the word identifier is
* not between 0 (inclusive) and the number of words
* (exclusive).
*/
public int wordCount(int word) {
int count = 0;
for (int topic = 0; topic < numTopics(); ++topic)
count += topicWordCount(topic,word);
return count;
}
/**
* Returns the estimate of the probability of the topic being
* assigned to a word in the specified document given the
* topic assignments in this sample. This is the maximum a
* posteriori estimate computed from the topic assignments
* as described in the class documentation for {@link
* LatentDirichletAllocation} using the topic assignment
* counts in this sample and the document-topic prior.
*
* @param doc Identifier of a document.
* @param topic Identifier for a topic.
* @return An estimate of the probability of the topic in the
* document.
* @throws IndexOutOfBoundsException If the document identifier
* is not between 0 (inclusive) and the number of documents
* (exclusive) or if the topic identifier is not between 0 (inclusive)
* and the number of topics (exclusive).
*/
public double documentTopicProb(int doc, int topic) {
return (documentTopicCount(doc,topic) + documentTopicPrior())
/ (documentLength(doc) + numTopics() * documentTopicPrior());
}
/**
* Returns an estimate of the log (base 2) likelihood of the
* corpus given the point estimates of topic and document
* multinomials determined from this sample.
*
*
* This likelihood calculation uses the methods
* {@link #documentTopicProb(int,int)} and {@link
* #topicWordProb(int,int)} for estimating likelihoods
* according to the following formula:
*
*
* corpusLog2Probability()
* = Σdoc,i log2 Σtopic p(topic|doc) * p(word[doc][i]|topic)
*
* Note that this is not the complete corpus likelihood,
* which requires integrating over possible topic and document
* multinomials given the priors.
*
* @return The log (base 2) likelihood of the training corpus
* given the document and topic multinomials determined by
* this sample.
*/
public double corpusLog2Probability() {
double corpusLog2Prob = 0.0;
int numDocs = numDocuments();
int numTopics = numTopics();
for (int doc = 0; doc < numDocs; ++doc) {
int docLength = documentLength(doc);
for (int token = 0; token < docLength; ++token) {
int word = word(doc,token);
double wordProb = 0.0;
for (int topic = 0; topic < numTopics; ++topic) {
double wordTopicProbGivenDoc = topicWordProb(topic,word) * documentTopicProb(doc,topic);
wordProb += wordTopicProbGivenDoc;
}
corpusLog2Prob += Math.log2(wordProb);
}
}
return corpusLog2Prob;
}
/**
* Returns a latent Dirichlet allocation model corresponding
* to this sample. The topic-word probabilities are
* calculated according to {@link #topicWordProb(int,int)},
* and the document-topic prior is as specified in the call
* to LDA that produced this sample.
*
* @return The LDA model for this sample.
*/
public LatentDirichletAllocation lda() {
int numTopics = numTopics();
int numWords = numWords();
double topicWordPrior = topicWordPrior();
double[][] topicWordProbs = new double[numTopics][numWords];
for (int topic = 0; topic < numTopics; ++topic) {
double topicCount = topicCount(topic);
double denominator = topicCount + numWords * topicWordPrior;
for (int word = 0; word < numWords; ++word)
topicWordProbs[topic][word]
= (topicWordCount(topic,word) + topicWordPrior)/ denominator;
}
return new LatentDirichletAllocation(mDocTopicPrior,
topicWordProbs);
}
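// Illustrative sketch (the variable names are placeholders, not part
// of the API): given a GibbsSample obtained from sampling, an
// immutable LDA model can be extracted and queried, e.g.
//
//   LatentDirichletAllocation.GibbsSample sample = ...; // from sampling
//   LatentDirichletAllocation model = sample.lda();
//   double p = model.wordProbability(0, wordId);        // p(word | topic 0)
//
// The "..." stands for whatever sampling call produced the sample.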
}
/**
* Return a feature extractor for character sequences based
* on tokenizing with the specified tokenizer and generating
* expected values for topics given the words.
*
* The symbol table is used to convert the tokens to the word
* identifiers used by this LDA model. Symbols not in the symbol
* table or with values out of range of the word probabilities are
* ignored.
*
* The feature names are determined by prefixing the specified
* string to the topic number (starting from zero).
*
* In order to maintain some degree of efficiency, the feature
* extraction process estimates topic expectations for each token
* independently, sums them, and then normalizes the result so
* that the values of all generated features sum to 1.
*
* Given a uniform distribution over topics, the probability of
* a topic given a word may be calculated up to a normalizing
* constant using Bayes's law,
*
*   p(topic|word) = p(word|topic) * p(topic) / p(word)
*                 ∝ p(word|topic) * p(topic)
*                 = p(word|topic)
*
* Given the finite number of topics, it is easy to normalize the
* distribution and compute p(topic|word) from p(word|topic).
* The values p(topic|word) are precomputed at construction time.
*
* The document-topic prior count from this LDA model is added to
* the expected counts for each topic before normalization.
*
* The topic expectations for each word are then summed, the
* resulting feature vector is normalized to sum to 1, and the
* result is returned.
*
* Thus the value of each feature is proportional to the
* expected number of words in that topic plus the document-topic
* prior, divided by the total number of words.
*
* Serialization
*
* The resulting feature extractor may be serialized if its component
* tokenizer factory and symbol table are serializable.
*
* @param tokenizerFactory Tokenizer factory to use for token extraction.
* @param symbolTable Symbol table mapping tokens to word identifiers.
* @param featurePrefix Prefix of the resulting feature names.
* @return A feature extractor over character sequences based on
* this LDA model.
*/
FeatureExtractor<CharSequence>
expectedTopicFeatureExtractor(TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix) {
return new ExpectedTopicFeatureExtractor(this,
tokenizerFactory,
symbolTable,
featurePrefix);
}
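// Illustrative usage sketch (hypothetical variable names; note the
// method is package-private as declared here, so this assumes
// same-package access). Given the model, the tokenizer factory, and
// the symbol table used in training, a text maps to a normalized
// topic-expectation feature vector:
//
//   FeatureExtractor<CharSequence> fe
//       = lda.expectedTopicFeatureExtractor(tokenizerFactory, symbolTable, "topic");
//   Map<String,? extends Number> feats = fe.features("some input text");
//   // keys are "topic0", "topic1", ...; the values sum to 1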
/**
* Return a feature extractor for character sequences based on
* this LDA model and the specified tokenizer factory and symbol
* table, which computes the unbiased Bayesian least squares
* estimate of the character sequence's topic distribution.
*
* The method {@link
* #bayesTopicEstimate(int[],int,int,int,Random)} is used to compute
* the topic distribution for the features.
*
* The feature names are determined by prefixing the specified
* string to the topic number (starting from zero).
*
* Serialization
*
* The resulting feature extractor may be serialized if its component
* tokenizer factory and symbol table are serializable.
*
* @param tokenizerFactory Tokenizer for character sequences.
* @param symbolTable Symbol table for mapping tokens to dimensions.
* @param featurePrefix Prefix of the resulting feature names.
* @param burnIn Number of initial Gibbs samples to discard.
* @param sampleLag The interval between saved samples after burn-in.
* @param numSamples Number of Gibbs samples to return.
* @return Feature extractor with the specified parameters.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burn-in
* period is negative.
*/
FeatureExtractor<CharSequence>
bayesTopicFeatureExtractor(TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix,
int burnIn,
int sampleLag,
int numSamples) {
return new BayesTopicFeatureExtractor(this,
tokenizerFactory,
symbolTable,
featurePrefix,
burnIn,
sampleLag,
numSamples);
}
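// Illustrative sketch (hypothetical parameter values; same same-package
// caveat as above). The sampling controls trade accuracy for speed per
// extracted document, e.g.
//
//   FeatureExtractor<CharSequence> fe
//       = lda.bayesTopicFeatureExtractor(tokenizerFactory, symbolTable,
//                                        "topic", 20, 2, 10);
//
// discards 20 burn-in samples, then averages 10 samples taken 2 apart.
// Larger values give more stable estimates at higher per-document cost.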
static String[] genFeatures(String prefix, int numTopics) {
String[] features = new String[numTopics];
for (int k = 0; k < numTopics; ++k)
features[k] = prefix + k;
return features;
}
static class BayesTopicFeatureExtractor
implements FeatureExtractor<CharSequence>,
Serializable {
static final long serialVersionUID = 8883227852502200365L;
private final LatentDirichletAllocation mLda;
private final TokenizerFactory mTokenizerFactory;
private final SymbolTable mSymbolTable;
private final String[] mFeatures;
private final int mBurnin;
private final int mSampleLag;
private final int mNumSamples;
public BayesTopicFeatureExtractor(LatentDirichletAllocation lda,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix,
int burnin,
int sampleLag,
int numSamples) {
this(lda,
tokenizerFactory,
symbolTable,
genFeatures(featurePrefix,lda.numTopics()),
burnin,
sampleLag,
numSamples);
}
BayesTopicFeatureExtractor(LatentDirichletAllocation lda,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String[] features,
int burnin,
int sampleLag,
int numSamples) {
mLda = lda;
mTokenizerFactory = tokenizerFactory;
mSymbolTable = symbolTable;
mFeatures = features;
mBurnin = burnin;
mSampleLag = sampleLag;
mNumSamples = numSamples;
}
public Map<String,? extends Number> features(CharSequence cSeq) {
int numTopics = mLda.numTopics();
char[] cs = Strings.toCharArray(cSeq);
Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
List<Integer> tokenIdList = new ArrayList<Integer>();
for (String token : tokenizer) {
int symbol = mSymbolTable.symbolToID(token);
if (symbol < 0 || symbol >= mLda.numWords())
continue;
tokenIdList.add(symbol);
}
int[] tokens = new int[tokenIdList.size()];
for (int i = 0; i < tokenIdList.size(); ++i)
tokens[i] = tokenIdList.get(i).intValue();
double[] vals
= mLda.mapTopicEstimate(tokens,
mNumSamples,
mBurnin,
mSampleLag,
new Random());
ObjectToDoubleMap<String> features
= new ObjectToDoubleMap<String>((numTopics*3)/2);
for (int k = 0; k < numTopics; ++k) {
if (vals[k] > 0.0)
features.set(mFeatures[k],vals[k]);
}
return features;
}
Object writeReplace() {
return new Serializer(this);
}
static class Serializer extends AbstractExternalizable {
static final long serialVersionUID = 6719636683732661958L;
final BayesTopicFeatureExtractor mFeatureExtractor;
public Serializer() {
this(null);
}
Serializer(BayesTopicFeatureExtractor featureExtractor) {
mFeatureExtractor = featureExtractor;
}
public void writeExternal(ObjectOutput out) throws IOException {
out.writeObject(mFeatureExtractor.mLda);
out.writeObject(mFeatureExtractor.mTokenizerFactory);
out.writeObject(mFeatureExtractor.mSymbolTable);
writeUTFs(mFeatureExtractor.mFeatures,out);
out.writeInt(mFeatureExtractor.mBurnin);
out.writeInt(mFeatureExtractor.mSampleLag);
out.writeInt(mFeatureExtractor.mNumSamples);
}
public Object read(ObjectInput in) throws IOException, ClassNotFoundException {
@SuppressWarnings("unchecked")
LatentDirichletAllocation lda
= (LatentDirichletAllocation)
in.readObject();
@SuppressWarnings("unchecked")
TokenizerFactory tokenizerFactory
= (TokenizerFactory)
in.readObject();
@SuppressWarnings("unchecked")
SymbolTable symbolTable
= (SymbolTable)
in.readObject();
String[] features = readUTFs(in);
int burnIn = in.readInt();
int sampleLag = in.readInt();
int numSamples = in.readInt();
return new BayesTopicFeatureExtractor(lda,tokenizerFactory,symbolTable,
features,
burnIn,sampleLag,numSamples);
}
}
}
static class ExpectedTopicFeatureExtractor
implements FeatureExtractor<CharSequence>,
Serializable {
static final long serialVersionUID = -7996546432550775177L;
private final double[][] mWordTopicProbs;
private final double mDocTopicPrior;
private final TokenizerFactory mTokenizerFactory;
private final SymbolTable mSymbolTable;
private final String[] mFeatures;
public ExpectedTopicFeatureExtractor(LatentDirichletAllocation lda,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix) {
double[][] wordTopicProbs = new double[lda.numWords()][lda.numTopics()];
for (int word = 0; word < lda.numWords(); ++word)
for (int topic = 0; topic < lda.numTopics(); ++topic)
wordTopicProbs[word][topic] = lda.wordProbability(topic,word);
for (double[] topicProbs : wordTopicProbs) {
double sum = com.aliasi.util.Math.sum(topicProbs);
for (int k = 0; k < topicProbs.length; ++k)
topicProbs[k] /= sum;
}
mWordTopicProbs = wordTopicProbs;
mDocTopicPrior = lda.documentTopicPrior();
mTokenizerFactory = tokenizerFactory;
mSymbolTable = symbolTable;
mFeatures = genFeatures(featurePrefix,lda.numTopics());
}
ExpectedTopicFeatureExtractor(double docTopicPrior,
double[][] wordTopicProbs,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String[] features) {
mWordTopicProbs = wordTopicProbs;
mDocTopicPrior = docTopicPrior;
mTokenizerFactory = tokenizerFactory;
mSymbolTable = symbolTable;
mFeatures = features;
}
public Map<String,? extends Number> features(CharSequence cSeq) {
int numTopics = mWordTopicProbs[0].length;
char[] cs = Strings.toCharArray(cSeq);
Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
double[] vals = new double[numTopics];
Arrays.fill(vals,mDocTopicPrior);
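// vals now holds the document-topic prior as a pseudo-count for each
// topic; per-token topic expectations are accumulated on top of it
// before the final normalization below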
for (String token : tokenizer) {
int symbol = mSymbolTable.symbolToID(token);
if (symbol < 0 || symbol >= mWordTopicProbs.length)
continue;
for (int k = 0; k < numTopics; ++k)
vals[k] += mWordTopicProbs[symbol][k];
}
ObjectToDoubleMap<String> featMap
= new ObjectToDoubleMap<String>((numTopics*3)/2);
double sum = com.aliasi.util.Math.sum(vals);
for (int k = 0; k < numTopics; ++k)
if (vals[k] > 0.0)
featMap.set(mFeatures[k],
vals[k] / sum);
return featMap;
}
Object writeReplace() {
return new Serializer(this);
}
static class Serializer extends AbstractExternalizable {
static final long serialVersionUID = -1472744781627035426L;
final ExpectedTopicFeatureExtractor mFeatures;
public Serializer() {
this(null);
}
public Serializer(ExpectedTopicFeatureExtractor features) {
mFeatures = features;
}
public void writeExternal(ObjectOutput out) throws IOException {
out.writeDouble(mFeatures.mDocTopicPrior);
out.writeInt(mFeatures.mWordTopicProbs.length);
for (int w = 0; w < mFeatures.mWordTopicProbs.length; ++w)
writeDoubles(mFeatures.mWordTopicProbs[w],out);
out.writeObject(mFeatures.mTokenizerFactory);
out.writeObject(mFeatures.mSymbolTable);
writeUTFs(mFeatures.mFeatures,out);
}
public Object read(ObjectInput in) throws IOException, ClassNotFoundException {
double docTopicPrior = in.readDouble();
int numWords = in.readInt();
double[][] wordTopicProbs = new double[numWords][];
for (int w = 0; w < numWords; ++w)
wordTopicProbs[w] = readDoubles(in);
@SuppressWarnings("unchecked")
TokenizerFactory tokenizerFactory
= (TokenizerFactory) in.readObject();
@SuppressWarnings("unchecked")
SymbolTable symbolTable
= (SymbolTable) in.readObject();
String[] features = readUTFs(in);
return new ExpectedTopicFeatureExtractor(docTopicPrior,wordTopicProbs,tokenizerFactory,symbolTable,features);
}
}
}
static class Serializer extends AbstractExternalizable {
static final long serialVersionUID = 4725870665020270825L;
final LatentDirichletAllocation mLda;
public Serializer() {
this(null);
}
public Serializer(LatentDirichletAllocation lda) {
mLda = lda;
}
public Object read(ObjectInput in) throws IOException {
double docTopicPrior = in.readDouble();
int numTopics = in.readInt();
double[][] topicWordProbs = new double[numTopics][];
for (int i = 0; i < topicWordProbs.length; ++i)
topicWordProbs[i] = readDoubles(in);
return new LatentDirichletAllocation(docTopicPrior,topicWordProbs);
}
public void writeExternal(ObjectOutput out) throws IOException {
out.writeDouble(mLda.mDocTopicPrior);
out.writeInt(mLda.mTopicWordProbs.length);
for (double[] topicWordProbs : mLda.mTopicWordProbs)
writeDoubles(topicWordProbs,out);
}
}
}