/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.cluster;
import com.aliasi.corpus.ObjectHandler;
import com.aliasi.symbol.SymbolTable;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.stats.Statistics;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.FeatureExtractor;
import com.aliasi.util.Math;
import com.aliasi.util.Iterators;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.ObjectToDoubleMap;
import com.aliasi.util.Strings;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
* A LatentDirichletAllocation object represents a latent
* Dirichlet allocation (LDA) model. LDA provides a Bayesian model of
* document generation in which each document is generated by
* a mixture of topical multinomials. An LDA model specifies the
* number of topics, a Dirichlet prior over topic mixtures for a
* document, and a discrete distribution over words for each topic.
*
* A document is generated from an LDA model by first selecting a
* multinomial over topics given the Dirichlet prior. Then for each
* token in the document, a topic is generated from the
* document-specific topic distribution, and then a word is generated
* from the discrete distribution for that topic. Note that document
* length is not generated by this model; a fully generative model
* would need a way of generating lengths (e.g. a Poisson
* distribution) or terminating documents (e.g. a distinguished
* end-of-document symbol).
*
*
* An LDA model may be estimated from an unlabeled training corpus
* (collection of documents) using a second Dirichlet prior, this time
* over word distributions in a topic. This class provides a static
* inference method that produces (collapsed Gibbs) samples from the
* posterior distribution of topic assignments to words, any one of
* which may be used to construct an LDA model instance.
*
*
* An LDA model can be used to infer the topic mixture of unseen
* text documents, which can be used to compare documents by topical
* similarity. A fixed LDA model may also be used to estimate the
* likelihood of a word occurring in a document given the other words
* in the document. A collection of LDA models may be used for fully
* Bayesian reasoning at the corpus (collection of documents) level.
*
*
* LDA may be applied to arbitrary multinomial data. To apply it
* to text, a tokenizer factory converts a text document to bag of
* words and a symbol table converts these tokens to integer outcomes.
* The static method {@link
* #tokenizeDocuments(CharSequence[],TokenizerFactory,SymbolTable,int)}
* is available to carry out the text to multinomial conversion with
* pruning of low counts.
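*
* For orientation, the following minimal sketch (not part of the
* original documentation) shows the whole pipeline; the choice of
* IndoEuropeanTokenizerFactory, MapSymbolTable, and all numeric
* settings below are illustrative assumptions only:
*
* CharSequence[] texts = ...;                      // raw documents
* SymbolTable symbolTable = new MapSymbolTable();  // fresh symbol table
* int[][] docWords = LatentDirichletAllocation
*     .tokenizeDocuments(texts,
*                        IndoEuropeanTokenizerFactory.INSTANCE,
*                        symbolTable,
*                        5);                       // min token count
* LatentDirichletAllocation.GibbsSample sample
*     = LatentDirichletAllocation
*         .gibbsSampler(docWords,
*                       (short) 50,                // numTopics
*                       0.1,                       // docTopicPrior
*                       0.01,                      // topicWordPrior
*                       1000,                      // burninEpochs
*                       250,                       // sampleLag
*                       5,                         // numSamples
*                       new Random(42L),
*                       null);                     // no sample handler
* LatentDirichletAllocation lda = sample.lda();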
*
*
*
* LDA Generative Model
*
* LDA operates over a fixed vocabulary of discrete outcomes, which
* we call words for convenience, and represent as a set of integers:
*
*
* words = { 0, 1, ..., numWords-1 }
*
* A corpus is a ragged array of documents, each document consisting
* of an array of words:
*
*
* int[][] words = { words[0], words[1], ..., words[numDocs-1] }
*
* A given document words[i], i < numDocs,
* is itself represented as a bag of words, each word being
* represented as an integer:
*
*
* int[] words[i] = { words[i][0], words[i][1], ..., words[i][words[i].length-1] }
*
* The documents do not all need to be the same length, so the
* two-dimensional array words is ragged.
*
* A particular LDA model is defined over a fixed number of topics,
* also represented as integers:
*
*
* topics = { 0, 1, ..., numTopics-1 }
*
* For a given topic topic < numTopics, LDA specifies a
* discrete distribution φ[topic] over words:
*
*
* φ[topic][word] >= 0.0
*
* Σword < numWords φ[topic][word] = 1.0
*
* In an LDA model, each document in the corpus is generated from a
* document-specific mixture of topics θ[doc]. The
* distribution θ[doc] is a discrete distribution over
* topics:
*
*
* θ[doc][topic] >= 0.0
*
* Σtopic < numTopics θ[doc][topic] = 1.0
*
* Under LDA's generative model for documents, a document-specific
* topic mixture θ[doc] is generated from a uniform
* Dirichlet distribution with concentration parameter
* α. The Dirichlet is the conjugate prior for the
* multinomial in which α acts as a prior count
* assigned to a topic in a document. Typically, LDA is run with a
* fairly diffuse prior with concentration α < 1,
* leading to skewed posterior topic distributions.
*
*
* Given a topic distribution θ[doc] for a
* document, tokens are generated (conditionally) independently. For
* each token in the document, a topic
* topics[doc][token] is generated according to the
* topic distribution θ[doc], then the word instance
* words[doc][token] is
* generated given the topic using the topic-specific distribution
* over tokens φ[topics[doc][token]].
*
*
* For estimation purposes, LDA places a uniform Dirichlet prior
* with concentration parameter β on each of the
* topic distributions φ[topic]. The first step in
* modeling a corpus is to generate the topic distributions
* φ from a Dirichlet parameterized by β.
*
*
* In sampling notation, the LDA generative model is expressed as
* follows:
*
*
* φ[topic] ~ Dirichlet(β)
* θ[doc] ~ Dirichlet(α)
* topics[doc][token] ~ Discrete(θ[doc])
* words[doc][token] ~ Discrete(φ[topic[doc][token]])
*
* A generative Bayesian model is based on the joint probability
* of all observed and unobserved variables, given the priors. Given a
* text corpus, only the words are observed. The unobserved variables
* include the assignment of a topic to each word, the assignment of a
* topic distribution to each document, and the assignment of word
* distributions to each topic. We let
* topic[doc][token], doc < numDocs, token < docLength[doc],
* be the topic assigned to the specified token
* in the specified document. This leaves two Dirichlet priors, one
* parameterized by α for topics in documents, and
* one parameterized by β for words in topics.
* These priors are treated as hyperparameters for purposes of
* estimation; cross-validation may be used to provide so-called empirical
* Bayes estimates for the priors.
*
*
* The full LDA probability model for a corpus follows the generative
* process outlined above. First, topic distributions are chosen at the
* corpus level for each topic given their Dirichlet prior, and then the
* remaining variables are generated given these topic distributions:
*
*
* p(words, topics, θ, φ | α, β)
* = Πtopic < numTopics
* p(φ[topic] | β)
* * p(words, topics, θ | α, φ)
*
* Note that because the multinomial parameters for
* φ[topic] are continuous,
* p(φ[topic] | β) represents a density, not a discrete
* distribution. Thus it does not make sense to talk about the
* probability of a given multinomial φ[topic]; non-zero
* results only arise from integrals over measurable subsets of the
* multinomial simplex. It is possible to sample from a density, so
* the generative model is well founded.
*
* A document is generated by first generating its topic distribution
* given the Dirichlet prior, and then generating its topics and words:
*
*
* p(words, topics, θ | α, φ)
* = Πdoc < numDocs
* p(θ[doc] | α)
* * p(words[doc], topics[doc] | θ[doc], φ)
*
* The topic and word are generated from the multinomials
* θ[doc] and the topic distributions
* φ using the chain rule, first generating the topic
* given the document's topic distribution and then generating the
* word given the topic's word distribution.
*
*
* p(words[doc], topics[doc] | θ[doc], φ)
* = Πtoken < words[doc].length
* p(topics[doc][token] | θ[doc])
* * p(words[doc][token] | φ[topics[doc][token]])
*
* Given the topic and document multinomials, this distribution
* is discrete, and thus may be evaluated. It may also be marginalized
* by summation:
*
*
* p(words[doc] | θ[doc], φ)
* = Π token < words[doc].length
* Σtopic < numTopics p(topic | θ[doc]) * p(words[doc][token] | φ[topic])
*
* Conditional probabilities are computed in the usual way by
* marginalizing other variables through integration. Unfortunately,
* this simple mathematical operation is often intractable
* computationally.
*
*
*
* Estimating LDA with Collapsed Gibbs Sampling
*
* This class uses a collapsed form of Gibbs sampling over the
* posterior distribution of topic assignments given the documents
* and Dirichlet priors:
*
*
* p(topics | words, α, β)
*
* This distribution may be derived from the joint
* distribution by marginalizing (also known as "collapsing"
* or "integrating out") the contributions of the
* document-topic and topic-word distributions.
*
The Gibbs sampler used to estimate LDA models produces samples
* that consist of a topic assignment to every token in the corpus.
* The conjugacy of the Dirichlet prior for multinomials makes the
* sampling straightforward.
*
*
An initial sample is produced by randomly assigning topics to
* tokens. Then, the sampler works iteratively through the corpus,
* one token at a time. At each token, it samples a new topic assignment
* to that token given all the topic assignments to other tokens in the
* corpus:
*
*
* p(topics[doc][token] | words, topics')
*
* The notation topics' represents the set of topic
* assignments other than to topics[doc][token]. This
* collapsed posterior conditional is estimated directly:
*
*
* p(topics[doc][token] = topic | words, topics')
* = (count'(doc,topic) + α) / (docLength[doc]-1 + numTopics*α)
* * (count'(topic,word) + β) / (count'(topic) + numWords*β)
*
* The counts are all defined relative to topics'; that
* is, the current topic assignment for the token being sampled is not
* considered in the counts. Note that the two factors are estimates
* of θ[doc] and φ[topic] with all
* data other than the assignment to the current token. Note how the
* prior concentrations arise as additive smoothing factors in these
* estimates, a result of the Dirichlet's conjugacy to the
* multinomial. For the purposes of sampling, the document-length
* normalization in the denominator of the first term is not necessary,
* as it remains constant across topics.
*
* The posterior Dirichlet distributions may be computed using just
* the counts. For instance, the posterior distribution for topics in
* documents is estimated as:
*
*
* p(θ[doc] | α, β, words)
* = Dirichlet(count(doc,0)+α, count(doc,1)+α, ..., count(doc,numTopics-1)+α)
*
* The sampling distribution is defined from the maximum a posteriori
* (MAP) estimates of the multinomial distribution over topics in a document:
*
*
* θ*[doc] = ARGMAXθ[doc] p(θ[doc] | α, β, words)
*
* which we know from the Dirichlet distribution is:
*
*
* θ*[doc][topic]
* = (count(doc,topic) + α) / (docLength[doc] + numTopics*α)
*
* By the same reasoning, the MAP word distribution in topics is:
*
*
* φ*[topic][word]
* = (count(topic,word) + β) / (count(topic) + numWords*β)
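*
* As an illustration (not part of the original documentation), the
* unnormalized sampling weight for one candidate topic can be read
* directly off the counts; the variable names below are hypothetical,
* and the primed counts are assumed to already exclude the token
* currently being resampled:
*
* // count'(doc,topic), count'(topic,word), count'(topic), with the
* // current token's assignment removed
* double weight
*     = (docTopicCountPrime + alpha)              // topic in document
*     * (topicWordCountPrime + beta)              // word in topic
*     / (topicTotalCountPrime + numWords * beta); // topic normalizer
* // the document-length denominator is constant across topics, so it
* // may be dropped; the sampler renormalizes over all topics anyway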
*
* A complete Gibbs sample is represented as an instance of {@link
* LatentDirichletAllocation.GibbsSample}, which provides access to
* the topic assignment to every token, as well as methods to compute
* θ* and φ*
* as defined above. A sample also maintains the original priors and
* word counts. Just the estimates of the topic-word distributions
* φ[topic] and the prior topic concentration
* α are sufficient to define an LDA model. Note
* that the imputed values of θ*[doc]
* used during estimation are part of a sample, but are not part of
* the LDA model itself. The LDA model contains enough information to
* estimate θ* for an arbitrary
* document, as described in the next section.
*
*
The Gibbs sampling algorithm starts with a random assignment of
* topics to words, then simply iterates through the tokens in turn,
* sampling topics according to the distribution defined above. After
* each run through the entire corpus, a callback is made to a handler
* for the samples. This setup may be configured for
* an initial burnin period, essentially just discarding the first
* batch of samples. Then it may be configured to sample only periodically
* thereafter to avoid correlations between samples.
*
*
* LDA as Multi-Topic Classifier
*
* An LDA model consists of a topic distribution Dirichlet prior
* α and a word distribution
* φ[topic] for each topic. Given an LDA model and a
* new document words = { words[0], ..., words[length-1] }
* consisting of a sequence of words, the posterior distribution over
* topic weights is given by:
*
*
* p(θ | words, α, φ)
*
* Although this distribution is not solvable analytically, it is easy
* to estimate using a simplified form of the LDA estimator's Gibbs
* sampler. The conditional distribution of a topic assignment
* topics[token] to a single token given an assignment
* topics' to all other tokens is given by:
*
*
* p(topic[token] | topics', words, α, φ)
* ∝ p(topic[token], words[token] | topics', α, φ)
* = p(topic[token] | topics', α) * p(words[token] | φ[topic[token]])
* = (count(topic[token]) + α) / (words.length - 1 + numTopics * α)
* * p(words[token] | φ[topic[token]])
*
* This leads to a straightforward sampler over posterior topic
* assignments, from which we may directly compute the Dirichlet posterior over
* topic distributions or a MAP topic distribution.
*
* This class provides a method to sample these topic assignments,
* which may then be used to form Dirichlet distributions or MAP point
* estimates of θ* for the document
* words.
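*
* A minimal sketch (the tokenizer factory, symbol table, model
* variable lda, and sampling settings are illustrative assumptions)
* of inferring a topic distribution for an unseen document:
*
* int[] tokens = LatentDirichletAllocation
*     .tokenizeDocument(text,
*                       IndoEuropeanTokenizerFactory.INSTANCE,
*                       symbolTable);
* double[] topicDistrib
*     = lda.bayesTopicEstimate(tokens,
*                              100,              // numSamples
*                              100,              // burnin
*                              1,                // sampleLag
*                              new Random(42L));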
*
* LDA as a Conditional Language Model
*
* An LDA model may be used to estimate the likelihood of a word
* given a previous bag of words:
*
*
* p(word | words, α, φ)
* = ∫p(word | θ, φ) p(θ | words, α, φ) dθ
*
* This integral is easily evaluated using sampling over the topic
* distributions p(θ | words, α, φ) and
* averaging the word probability determined by each sample.
* The word probability for a sample θ is defined by:
*
*
* p(word | θ, φ)
* = Σtopic < numTopics p(topic | θ) * p(word | φ[topic])
*
* Although this approach could theoretically be applied to generate
* the probability of a document one word at a time, the cost would
* be prohibitive, as there are quadratically many samples required
* because samples for the n-th word consist of topic
* assignments to the previous n-1 words.
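*
* A sketch (illustrative only; thetaSamples, word, and lda are
* hypothetical variables) of averaging the word probability over
* sampled topic distributions for the conditioning document:
*
* double sum = 0.0;
* for (double[] theta : thetaSamples) {   // samples of p(θ | words, α, φ)
*     double p = 0.0;
*     for (int topic = 0; topic < lda.numTopics(); ++topic)
*         p += theta[topic] * lda.wordProbability(topic, word);
*     sum += p;
* }
* double wordProbEstimate = sum / thetaSamples.length;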
*
*
*
* Bayesian Calculations and Exchangeability
*
* An LDA model may be used for a variety of statistical
* calculations. For instance, it may be used to determine the
* distribution of topics to words, and using these distributions, may
* determine word similarity. Similarly, document similarity may be
* determined by the topic distributions in a document.
*
*
Point estimates are derived using a single LDA model. For
* Bayesian calculation, multiple samples are taken to produce
* multiple LDA models. The results of a calculation on these
* models are then averaged to produce a Bayesian estimate of the
* quantity of interest. The sampling methodology is effectively
* numerically computing the integral over the posterior.
*
*
Bayesian calculations over multiple samples are complicated by
* the exchangeability of topics in the LDA model. In particular, there
* is no guarantee that topics are the same between samples, thus it
* is not acceptable to combine samples in topic-level reasoning. For
* instance, it does not make sense to estimate the probability of
* a topic in a document using multiple samples.
*
*
*
* Non-Document Data
*
* The "words" in an LDA model don't necessarily have to
* represent words in documents. LDA is basically a multinomial
* mixture model, and any multinomial outcomes may be modeled with
* LDA. For instance, a document may correspond to a baseball game
* and the words may correspond to the outcomes of at-bats (some
* might occur more than once). LDA has also been used for
* gene expression data, where expression levels from mRNA microarray
* experiments are quantized into a multinomial outcome.
*
* LDA has also been applied to collaborative filtering. Movies
* act as words, with users modeled as documents, the bag of words
* they've seen. Given an LDA model and a user's films, the user's
* topic distribution may be inferred and used to estimate the likelihood
* of seeing unseen films.
*
*
* References
*
* - Wikipedia: Gibbs sampling
* - Wikipedia: Markov chain Monte Carlo
* - Wikipedia: Dirichlet distribution
* - Wikipedia: Latent Dirichlet allocation
* - Steyvers, Mark and Tom Griffiths. 2007. Probabilistic topic models.
*   In Thomas K. Landauer, Danielle S. McNamara, Simon Dennis and Walter Kintsch (eds.),
*   Handbook of Latent Semantic Analysis. Lawrence Erlbaum.
* - Blei, David M., Andrew Y. Ng, and Michael I. Jordan. 2003.
*   Latent Dirichlet allocation. Journal of Machine Learning Research 3:993-1022.
*
*
*
* @author Bob Carpenter
* @version 4.0.0
* @since LingPipe3.3
*/
public class LatentDirichletAllocation implements Serializable {
static final long serialVersionUID = 6313662446710242382L;
private final double mDocTopicPrior;
private final double[][] mTopicWordProbs;
/**
* Construct a latent Dirichlet allocation (LDA) model using the
* specified document-topic prior and topic-word distributions.
* The topic-word probability array topicWordProbs
* represents a collection of discrete distributions
* topicWordProbs[topic] for topics, and thus must
* satisfy:
* satisfy:
*
*
* topicWordProbs[topic][word] >= 0.0
*
* Σword < numWords topicWordProbs[topic][word] = 1.0
*
* Warning: The requirement that each distribution sums to 1.0
* is not checked by the constructor.
*
*
See the class documentation above for an explanation of
* the parameters and what can be done with a model.
*
* @param docTopicPrior The document-topic prior.
* @param topicWordProbs Array of discrete distributions over words,
* indexed by topic.
* @throws IllegalArgumentException If the document-topic prior is
* not finite and positive, or if the topic-word probabilities
* arrays are not all the same length with entries between 0.0 and
* 1.0 inclusive.
*/
public LatentDirichletAllocation(double docTopicPrior,
double[][] topicWordProbs) {
if (docTopicPrior <= 0.0
|| Double.isNaN(docTopicPrior)
|| Double.isInfinite(docTopicPrior)) {
String msg = "Document-topic prior must be finite and positive."
+ " Found docTopicPrior=" + docTopicPrior;
throw new IllegalArgumentException(msg);
}
int numTopics = topicWordProbs.length;
if (numTopics < 1) {
String msg = "Require non-empty topic-word probabilities.";
throw new IllegalArgumentException(msg);
}
int numWords = topicWordProbs[0].length;
for (int topic = 1; topic < numTopics; ++topic) {
if (topicWordProbs[topic].length != numWords) {
String msg = "All topics must have the same number of words."
+ " topicWordProbs[0].length="
+ topicWordProbs[0].length
+ " topicWordProbs[" + topic + "]="
+ topicWordProbs[topic].length;
throw new IllegalArgumentException(msg);
}
}
for (int topic = 0; topic < numTopics; ++topic) {
for (int word = 0; word < numWords; ++word) {
if (topicWordProbs[topic][word] < 0.0
|| topicWordProbs[topic][word] > 1.0) {
String msg = "All probabilities must be between 0.0 and 1.0"
+ " Found topicWordProbs[" + topic + "][" + word + "]="
+ topicWordProbs[topic][word];
throw new IllegalArgumentException(msg);
}
}
}
mDocTopicPrior = docTopicPrior;
mTopicWordProbs = topicWordProbs;
}
/**
* Returns the number of topics in this LDA model.
*
* @return The number of topics in this model.
*/
public int numTopics() {
return mTopicWordProbs.length;
}
/**
* Returns the number of words on which this LDA model
* is based.
*
* @return The number of words in this model.
*/
public int numWords() {
return mTopicWordProbs[0].length;
}
/**
* Returns the concentration value of the uniform Dirichlet prior over
* topic distributions for documents. This value is effectively
* a prior count for topics used for additive smoothing during
* estimation.
*
* @return The prior count of topics in documents.
*/
public double documentTopicPrior() {
return mDocTopicPrior;
}
/**
* Returns the probability of the specified word in the specified
* topic. The values returned should be non-negative and finite,
* and should sum to 1.0 over all words for a specified topic.
*
* @param topic Topic identifier.
* @param word Word identifier.
* @return Probability of the specified word in the specified
* topic.
*/
public double wordProbability(int topic, int word) {
return mTopicWordProbs[topic][word];
}
/**
* Returns an array of the probabilities of words in the
* specified topic. The probabilities are indexed by word
* identifier.
*
*
The returned result is a copy of the underlying data in
* the model so that changing it will not change the model.
*
* @param topic Topic identifier.
* @return Array of probabilities of words in the specified topic.
*/
public double[] wordProbabilities(int topic) {
double[] xs = new double[mTopicWordProbs[topic].length];
for (int i = 0; i < xs.length; ++i)
xs[i] = mTopicWordProbs[topic][i];
return xs;
}
/**
* Returns the specified number of Gibbs samples of topics for the
* specified tokens using the specified number of burnin epochs,
* the specified lag between samples, and the specified
* randomizer. The array returned is an array of samples, each
* sample consisting of a topic assignment to each token in the
* specified list of tokens. The tokens must all be in the appropriate
* range for this model, that is, between 0 (inclusive) and the number
* of words (exclusive).
*
*
See the class documentation for more information on how the
* samples are computed.
*
* @param tokens The tokens making up the document.
* @param numSamples Number of Gibbs samples to return.
* @param burnin The number of samples to take and throw away
* during the burnin period.
* @param sampleLag The interval between samples after burnin.
* @param random The random number generator to use for this sampling
* process.
* @return The selection of topic samples generated by this
* sampler.
* @throws IndexOutOfBoundsException If there are tokens whose
* value is less than zero, or whose value is greater than or equal
* to the number of words in this model.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burnin
* period is negative.
*/
public short[][] sampleTopics(int[] tokens,
int numSamples,
int burnin,
int sampleLag,
Random random) {
if (burnin < 0) {
String msg = "Burnin period must be non-negative."
+ " Found burnin=" + burnin;
throw new IllegalArgumentException(msg);
}
if (numSamples < 1) {
String msg = "Number of samples must be at least 1."
+ " Found numSamples=" + numSamples;
throw new IllegalArgumentException(msg);
}
if (sampleLag < 1) {
String msg = "Sample lag must be at least 1."
+ " Found sampleLag=" + sampleLag;
throw new IllegalArgumentException(msg);
}
double docTopicPrior = documentTopicPrior();
int numTokens = tokens.length;
int numTopics = numTopics();
int[] topicCount = new int[numTopics];
short[][] samples = new short[numSamples][numTokens];
int sample = 0;
short[] currentSample = samples[0];
for (int token = 0; token < numTokens; ++token) {
int randomTopic = random.nextInt(numTopics);
++topicCount[randomTopic];
currentSample[token] = (short) randomTopic;
}
double[] topicDistro = new double[numTopics];
int numEpochs = burnin + sampleLag * (numSamples - 1);
for (int epoch = 0; epoch < numEpochs; ++epoch) {
for (int token = 0; token < numTokens; ++token) {
int word = tokens[token];
int currentTopic = currentSample[token];
--topicCount[currentTopic];
if (topicCount[currentTopic] < 0) {
throw new IllegalArgumentException("Internal error: negative count for topic=" + currentTopic);
}
for (int topic = 0; topic < numTopics; ++topic) {
topicDistro[topic]
= (topicCount[topic] + docTopicPrior)
* wordProbability(topic,word)
+ (topic == 0 ? 0.0 : topicDistro[topic-1]);
}
int sampledTopic = Statistics.sample(topicDistro,random);
++topicCount[sampledTopic];
currentSample[token] = (short) sampledTopic;
}
if ((epoch >= burnin) && (((epoch - burnin) % sampleLag) == 0)) {
short[] pastSample = currentSample;
++sample;
currentSample = samples[sample];
for (int token = 0; token < numTokens; ++token)
currentSample[token] = pastSample[token];
}
}
return samples;
}
/**
* Replaced by method {@code bayesTopicEstimate()} because of
* original misnaming.
*
*
* Warning: Despite its name, this method does not compute a
* maximum a posteriori (MAP) estimate.
*
* @param tokens The tokens making up the document.
* @param numSamples Number of Gibbs samples to return.
* @param burnin The number of samples to take and throw away
* during the burnin period.
* @param sampleLag The interval between samples after burnin.
* @param random The random number generator to use for this sampling
* process.
* @return An estimate of the topic distribution for the document.
* @throws IndexOutOfBoundsException If there are tokens whose
* value is less than zero, or whose value is greater than or equal
* to the number of words in this model.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burnin
* period is negative.
*/
double[] mapTopicEstimate(int[] tokens,
int numSamples,
int burnin,
int sampleLag,
Random random) {
return bayesTopicEstimate(tokens,numSamples,burnin,sampleLag,random);
}
/**
* Return the Bayesian point estimate of the topic distribution
* for a document consisting of the specified tokens, using Gibbs
* sampling with the specified parameters. The Gibbs topic
* samples are simply averaged to produce the Bayesian estimate,
* which minimizes expected square loss.
*
*
See the method {@link #sampleTopics(int[],int,int,int,Random)}
* and the class documentation for more information on the sampling
* procedure.
*
* @param tokens The tokens making up the document.
* @param numSamples Number of Gibbs samples to return.
* @param burnin The number of samples to take and throw away
* during the burnin period.
* @param sampleLag The interval between samples after burnin.
* @param random The random number generator to use for this sampling
* process.
* @return The Bayesian point estimate of the topic distribution for
* the document.
* @throws IndexOutOfBoundsException If there are tokens whose
* value is less than zero, or whose value is greater than or equal
* to the number of words in this model.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burnin
* period is negative.
*/
public double[] bayesTopicEstimate(int[] tokens,
int numSamples,
int burnin,
int sampleLag,
Random random) {
short[][] sampleTopics = sampleTopics(tokens,numSamples,burnin,sampleLag,random);
int numTopics = numTopics();
int[] counts = new int[numTopics];
for (short[] topics : sampleTopics) {
for (int tok = 0; tok < topics.length; ++tok)
++counts[topics[tok]];
}
double totalCount = 0;
for (int topic = 0; topic < numTopics; ++topic)
totalCount += counts[topic];
double[] result = new double[numTopics];
for (int topic = 0; topic < numTopics; ++topic)
result[topic] = counts[topic] / totalCount;
return result;
}
Object writeReplace() {
return new Serializer(this);
}
/**
* Run Gibbs sampling for the specified multinomial data, number
* of topics, priors, search parameters, randomization and
* callback sample handler. Gibbs sampling provides samples from the
* posterior distribution of topic assignments given the corpus
* and prior hyperparameters. A sample is encapsulated as an
* instance of class {@link GibbsSample}. This method will return
* the final sample and also send intermediate samples to an
* optional handler.
*
*
The class documentation above explains Gibbs sampling for
* LDA as used in this method.
*
*
The primary input is an array of documents, where each
* document is represented as an array of integers representing
* the tokens that appear in it. These tokens should be numbered
* contiguously from 0 for space efficiency. The topic assignments
* in the Gibbs sample are aligned as parallel arrays to the array
* of documents.
*
*
The next three parameters are the hyperparameters of the
* model, specifically the number of topics, the prior count
* assigned to topics in a document, and the prior count assigned
* to words in topics. A rule of thumb for the document-topic
* prior is to set it to 5 divided by the number of topics (or
* less if there are very few topics; 0.1 is typically the maximum
* value used). A good general value for the topic-word prior is
* 0.01. Both of these priors will be diffuse and tend to lead to
* skewed posterior distributions.
*
*
The following three parameters specify how the sampling is
* to be done. First, the sampler is "burned in" for a
* number of epochs specified by the burnin parameter. After burn
* in, samples are taken after fixed numbers of documents to avoid
* correlation in the samples; the sampling frequency is specified
* by the sample lag. Finally, the number of samples to be taken
* is specified. For instance, if the burnin is 1000, the sample
* lag is 250, and the number of samples is 5, then samples are
* taken after 1000, 1250, 1500, 1750 and 2000 epochs. If a
* non-null handler object is specified in the method call, its
* handle(GibbsSample) method is called with each of the
* samples produced as above.
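*
* For example, a handler that just reports progress might look like
* this sketch (illustrative only, not part of the original
* documentation):
*
* ObjectHandler<GibbsSample> handler
*     = new ObjectHandler<GibbsSample>() {
*           public void handle(GibbsSample sample) {
*               System.out.println("epoch=" + sample.epoch()
*                                  + " changed topics="
*                                  + sample.numChangedTopics());
*           }
*       };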
*
*
The final sample in the chain of samples is returned as the
* result. Note that this sample will also have been passed to the
* specified handler as the last sample for the handler.
*
*
A random number generator must be supplied as an argument.
* This may just be a new instance of {@link java.util.Random} or
* a custom extension. It is used for all randomization in this
* method.
*
* @param docWords Corpus of documents to be processed.
* @param numTopics Number of latent topics to generate.
* @param docTopicPrior Prior count of topics in a document.
* @param topicWordPrior Prior count of words in a topic.
* @param burninEpochs Number of epochs to run before taking a sample.
* @param sampleLag Frequency between samples.
* @param numSamples Number of samples to take before exiting.
* @param random Random number generator.
* @param handler Handler to which the samples are sent.
* @return The final Gibbs sample.
*/
public static GibbsSample
gibbsSampler(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
int burninEpochs,
int sampleLag,
int numSamples,
Random random,
ObjectHandler<GibbsSample> handler) {
validateInputs(docWords,numTopics,docTopicPrior,topicWordPrior,burninEpochs,sampleLag,numSamples);
int numDocs = docWords.length;
int numWords = max(docWords) + 1;
int numTokens = 0;
for (int doc = 0; doc < numDocs; ++doc)
numTokens += docWords[doc].length;
// should inputs be permuted?
// for (int doc = 0; doc < numDocs; ++doc)
// Arrays.permute(docWords[doc]);
short[][] currentSample = new short[numDocs][];
for (int doc = 0; doc < numDocs; ++doc)
currentSample[doc] = new short[docWords[doc].length];
int[][] docTopicCount = new int[numDocs][numTopics];
int[][] wordTopicCount = new int[numWords][numTopics];
int[] topicTotalCount = new int[numTopics];
for (int doc = 0; doc < numDocs; ++doc) {
for (int tok = 0; tok < docWords[doc].length; ++tok) {
int word = docWords[doc][tok];
int topic = random.nextInt(numTopics);
currentSample[doc][tok] = (short) topic;
++docTopicCount[doc][topic];
++wordTopicCount[word][topic];
++topicTotalCount[topic];
}
}
double numWordsTimesTopicWordPrior = numWords * topicWordPrior;
double[] topicDistro = new double[numTopics];
int numEpochs = burninEpochs + sampleLag * (numSamples - 1);
for (int epoch = 0; epoch <= numEpochs; ++epoch) {
double corpusLog2Prob = 0.0;
int numChangedTopics = 0;
for (int doc = 0; doc < numDocs; ++doc) {
int[] docWordsDoc = docWords[doc];
short[] currentSampleDoc = currentSample[doc];
int[] docTopicCountDoc = docTopicCount[doc];
for (int tok = 0; tok < docWordsDoc.length; ++tok) {
int word = docWordsDoc[tok];
int[] wordTopicCountWord = wordTopicCount[word];
int currentTopic = currentSampleDoc[tok];
if (currentTopic == 0) {
topicDistro[0]
= (docTopicCountDoc[0] - 1.0 + docTopicPrior)
* (wordTopicCountWord[0] - 1.0 + topicWordPrior)
/ (topicTotalCount[0] -1.0 + numWordsTimesTopicWordPrior);
} else {
topicDistro[0]
= (docTopicCountDoc[0] + docTopicPrior)
* (wordTopicCountWord[0] + topicWordPrior)
/ (topicTotalCount[0] + numWordsTimesTopicWordPrior);
for (int topic = 1; topic < currentTopic; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + docTopicPrior)
* (wordTopicCountWord[topic] + topicWordPrior)
/ (topicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
topicDistro[currentTopic]
= (docTopicCountDoc[currentTopic] - 1.0 + docTopicPrior)
* (wordTopicCountWord[currentTopic] - 1.0 + topicWordPrior)
/ (topicTotalCount[currentTopic] -1.0 + numWordsTimesTopicWordPrior)
+ topicDistro[currentTopic-1];
}
for (int topic = currentTopic+1; topic < numTopics; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + docTopicPrior)
* (wordTopicCountWord[topic] + topicWordPrior)
/ (topicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
int sampledTopic = Statistics.sample(topicDistro,random);
// compute probs before updates
if (sampledTopic != currentTopic) {
currentSampleDoc[tok] = (short) sampledTopic;
--docTopicCountDoc[currentTopic];
--wordTopicCountWord[currentTopic];
--topicTotalCount[currentTopic];
++docTopicCountDoc[sampledTopic];
++wordTopicCountWord[sampledTopic];
++topicTotalCount[sampledTopic];
}
if (sampledTopic != currentTopic)
++numChangedTopics;
double topicProbGivenDoc
= docTopicCountDoc[sampledTopic] / (double) docWordsDoc.length;
double wordProbGivenTopic
= wordTopicCountWord[sampledTopic] / (double) topicTotalCount[sampledTopic];
double tokenLog2Prob = Math.log2(topicProbGivenDoc * wordProbGivenTopic);
corpusLog2Prob += tokenLog2Prob;
}
}
// double crossEntropyRate = -corpusLog2Prob / numTokens;
if ((epoch >= burninEpochs)
&& (((epoch - burninEpochs) % sampleLag) == 0)) {
GibbsSample sample
= new GibbsSample(epoch,
currentSample,
docWords,
docTopicPrior,
topicWordPrior,
docTopicCount,
wordTopicCount,
topicTotalCount,
numChangedTopics,
numWords,
numTokens);
if (handler != null)
handler.handle(sample);
if (epoch == numEpochs)
return sample;
}
}
throw new IllegalStateException("unreachable in practice because of return if epoch==numEpochs");
}
/**
* Return an iterator over Gibbs samples for the specified
* document-word corpus, number of topics, priors and randomizer.
* These are the same Gibbs samples as would be produced by the
* method {@link
* #gibbsSampler(int[][],short,double,double,int,int,int,Random,ObjectHandler)}.
* See that method and the class documentation for more details.
*
* @param docWords Corpus of documents to be processed.
* @param numTopics Number of latent topics to generate.
* @param docTopicPrior Prior count of topics in a document.
* @param topicWordPrior Prior count of words in a topic.
* @param random Random number generator.
* @return An iterator over Gibbs samples for the specified corpus and priors.
*/
public static Iterator<GibbsSample>
gibbsSample(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
Random random) {
validateInputs(docWords,numTopics,docTopicPrior,topicWordPrior);
return new SampleIterator(docWords,numTopics,docTopicPrior,topicWordPrior,random);
}
static class SampleIterator extends Iterators.Buffered<GibbsSample> {
private final int[][] mDocWords; // D x W x 4 (shared across threads)
private final short mNumTopics;
private final double mDocTopicPrior;
private final double mTopicWordPrior;
private final Random mRandom;
private final int mNumDocs;
private final int mNumWords;
private final int mNumTokens;
private final short[][] mCurrentSample; // D x W x 2
private final int[][] mDocTopicCount; // D x K x 4
private final int[][] mWordTopicCount; // K x W x 4
private final int[] mTopicTotalCount; // K
private int mNumChangedTopics;
private int mEpoch = 0;
SampleIterator(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
Random random) {
mDocWords = docWords;
mNumTopics = numTopics;
mDocTopicPrior = docTopicPrior;
mTopicWordPrior = topicWordPrior;
mRandom = random;
mNumDocs = mDocWords.length;
mNumWords = max(mDocWords) + 1;
int numTokens = 0;
for (int doc = 0; doc < mNumDocs; ++doc)
numTokens += mDocWords[doc].length;
mNumTokens = numTokens;
mNumChangedTopics = numTokens;
mCurrentSample = new short[mNumDocs][];
for (int doc = 0; doc < mNumDocs; ++doc)
mCurrentSample[doc] = new short[mDocWords[doc].length];
mDocTopicCount = new int[mNumDocs][numTopics];
mWordTopicCount = new int[mNumWords][numTopics];
mTopicTotalCount = new int[numTopics];
// random initialization
for (int doc = 0; doc < mNumDocs; ++doc) {
for (int tok = 0; tok < docWords[doc].length; ++tok) {
int word = docWords[doc][tok];
int topic = mRandom.nextInt(numTopics);
mCurrentSample[doc][tok] = (short) topic;
++mDocTopicCount[doc][topic];
++mWordTopicCount[word][topic];
++mTopicTotalCount[topic];
}
}
}
@Override
protected GibbsSample bufferNext() {
// create existing sample; then compute next one by setting all vars
GibbsSample sample
= new GibbsSample(mEpoch,
mCurrentSample,
mDocWords,
mDocTopicPrior,
mTopicWordPrior,
mDocTopicCount,
mWordTopicCount,
mTopicTotalCount,
mNumChangedTopics,
mNumWords,
mNumTokens);
++mEpoch;
double numWordsTimesTopicWordPrior = mNumWords * mTopicWordPrior;
double[] topicDistro = new double[mNumTopics];
int numChangedTopics = 0;
for (int doc = 0; doc < mNumDocs; ++doc) {
int[] docWordsDoc = mDocWords[doc];
short[] currentSampleDoc = mCurrentSample[doc];
int[] docTopicCountDoc = mDocTopicCount[doc];
for (int tok = 0; tok < docWordsDoc.length; ++tok) {
int word = docWordsDoc[tok];
int[] wordTopicCountWord = mWordTopicCount[word];
int currentTopic = currentSampleDoc[tok];
if (currentTopic == 0) {
topicDistro[0]
= (docTopicCountDoc[0] - 1.0 + mDocTopicPrior)
* (wordTopicCountWord[0] - 1.0 + mTopicWordPrior)
/ (mTopicTotalCount[0] -1.0 + numWordsTimesTopicWordPrior);
} else {
topicDistro[0]
= (docTopicCountDoc[0] + mDocTopicPrior)
* (wordTopicCountWord[0] + mTopicWordPrior)
/ (mTopicTotalCount[0] + numWordsTimesTopicWordPrior);
for (int topic = 1; topic < currentTopic; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + mDocTopicPrior)
* (wordTopicCountWord[topic] + mTopicWordPrior)
/ (mTopicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
topicDistro[currentTopic]
= (docTopicCountDoc[currentTopic] - 1.0 + mDocTopicPrior)
* (wordTopicCountWord[currentTopic] - 1.0 + mTopicWordPrior)
/ (mTopicTotalCount[currentTopic] -1.0 + numWordsTimesTopicWordPrior)
+ topicDistro[currentTopic-1];
}
for (int topic = currentTopic+1; topic < mNumTopics; ++topic) {
topicDistro[topic]
= (docTopicCountDoc[topic] + mDocTopicPrior)
* (wordTopicCountWord[topic] + mTopicWordPrior)
/ (mTopicTotalCount[topic] + numWordsTimesTopicWordPrior)
+ topicDistro[topic-1];
}
int sampledTopic = Statistics.sample(topicDistro,mRandom);
if (sampledTopic != currentTopic) {
currentSampleDoc[tok] = (short) sampledTopic;
--docTopicCountDoc[currentTopic];
--wordTopicCountWord[currentTopic];
--mTopicTotalCount[currentTopic];
++docTopicCountDoc[sampledTopic];
++wordTopicCountWord[sampledTopic];
++mTopicTotalCount[sampledTopic];
++numChangedTopics;
}
}
}
mNumChangedTopics = numChangedTopics;
return sample;
}
}
/**
* Tokenize an array of text documents represented as character
* sequences into a form usable by LDA, using the specified
* tokenizer factory and symbol table. The symbol table should be
* constructed fresh for this application, but may be used after
* this method is called for further token to symbol conversions.
* Only tokens whose count is equal to or larger than the specified
* minimum count are included. Only tokens meeting the minimum
* count are added to the symbol table, thus producing a
* compact set of symbol assignments to tokens for downstream
* processing.
*
* Warning: With some tokenizer factories and/or minimum
* count thresholds, there may be documents with no tokens in
* them.
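*
* A sketch (illustrative only) of dropping such empty documents
* before handing the result to the sampler:
*
* int[][] docWords = LatentDirichletAllocation
*     .tokenizeDocuments(texts, tokenizerFactory, symbolTable, minCount);
* List<int[]> nonEmpty = new ArrayList<int[]>();
* for (int[] doc : docWords)
*     if (doc.length > 0)
*         nonEmpty.add(doc);
* docWords = nonEmpty.toArray(new int[nonEmpty.size()][]);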
*
* @param texts The text corpus.
* @param tokenizerFactory A tokenizer factory for tokenizing the texts.
* @param symbolTable Symbol table used to convert tokens to identifiers.
* @param minCount Minimum count for a token to be included in a
* document's representation.
* @return The tokenized form of the documents, suitable for input to LDA.
*/
public static int[][] tokenizeDocuments(CharSequence[] texts,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
int minCount) {
ObjectToCounterMap<String> tokenCounter = new ObjectToCounterMap<String>();
for (CharSequence text : texts) {
char[] cs = Strings.toCharArray(text);
Tokenizer tokenizer = tokenizerFactory.tokenizer(cs,0,cs.length);
for (String token : tokenizer)
tokenCounter.increment(token);
}
tokenCounter.prune(minCount);
Set<String> tokenSet = tokenCounter.keySet();
for (String token : tokenSet)
symbolTable.getOrAddSymbol(token);
int[][] docTokenId = new int[texts.length][];
for (int i = 0; i < docTokenId.length; ++i) {
docTokenId[i] = tokenizeDocument(texts[i],tokenizerFactory,symbolTable);
}
return docTokenId;
}
/**
* Tokenizes the specified text document using the specified tokenizer
* factory returning only tokens that exist in the symbol table. This
* method is useful within a given LDA model for tokenizing new documents
* into lists of words.
*
* @param text Character sequence to tokenize.
* @param tokenizerFactory Tokenizer factory for tokenization.
* @param symbolTable Symbol table to use for converting tokens
* to symbols.
* @return The array of integer symbols for tokens that exist in
* the symbol table.
*/
public static int[] tokenizeDocument(CharSequence text,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable) {
char[] cs = Strings.toCharArray(text);
Tokenizer tokenizer = tokenizerFactory.tokenizer(cs,0,cs.length);
List<Integer> idList = new ArrayList<Integer>();
for (String token : tokenizer) {
int id = symbolTable.symbolToID(token);
if (id >= 0)
idList.add(id);
}
int[] tokenIds = new int[idList.size()];
for (int i = 0; i < tokenIds.length; ++i)
tokenIds[i] = idList.get(i);
return tokenIds;
}
static int max(int[][] xs) {
int max = 0;
for (int i = 0; i < xs.length; ++i) {
int[] xsI = xs[i];
for (int j = 0; j < xsI.length; ++j) {
if (xsI[j] > max)
max = xsI[j];
}
}
return max;
}
static double relativeDifference(double x, double y) {
return java.lang.Math.abs(x - y)
/ (java.lang.Math.abs(x) + java.lang.Math.abs(y));
}
static void validateInputs(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior,
int burninEpochs,
int sampleLag,
int numSamples) {
validateInputs(docWords,numTopics,docTopicPrior,topicWordPrior);
if (burninEpochs < 0) {
String msg = "Number of burnin epochs must be non-negative."
+ " Found burninEpochs=" + burninEpochs;
throw new IllegalArgumentException(msg);
}
if (sampleLag < 1) {
String msg = "Sample lag must be positive."
+ " Found sampleLag=" + sampleLag;
throw new IllegalArgumentException(msg);
}
if (numSamples < 1) {
String msg = "Number of samples must be positive."
+ " Found numSamples=" + numSamples;
throw new IllegalArgumentException(msg);
}
}
static void validateInputs(int[][] docWords,
short numTopics,
double docTopicPrior,
double topicWordPrior) {
for (int doc = 0; doc < docWords.length; ++doc) {
for (int tok = 0; tok < docWords[doc].length; ++tok) {
if (docWords[doc][tok] >= 0) continue;
String msg = "All tokens must have IDs greater than 0."
+ " Found docWords[" + doc + "][" + tok + "]="
+ docWords[doc][tok];
throw new IllegalArgumentException(msg);
}
}
if (numTopics < 1) {
String msg = "Num topics must be positive."
+ " Found numTopics=" + numTopics;
throw new IllegalArgumentException(msg);
}
if (Double.isInfinite(docTopicPrior) || Double.isNaN(docTopicPrior) || docTopicPrior <= 0.0) {
String msg = "Document-topic prior must be finite and positive."
+ " Found docTopicPrior=" + docTopicPrior;
throw new IllegalArgumentException(msg);
}
if (Double.isInfinite(topicWordPrior) || Double.isNaN(topicWordPrior) || topicWordPrior <= 0.0) {
String msg = "Topic-word prior must be finite and positive."
+ " Found topicWordPrior=" + topicWordPrior;
throw new IllegalArgumentException(msg);
}
}
/**
* The LatentDirichletAllocation.GibbsSample class
* encapsulates all of the information related to a single Gibbs
* sample for latent Dirichlet allocation (LDA). A sample
* consists of the assignment of a topic identifier to each
* token in the corpus. Other methods in this class are derived
* from the topic samples, the data being estimated, and
* the LDA parameters such as priors.
*
* Instances of
* this class are created by the sampling method in the containing
* class, {@link LatentDirichletAllocation}. For convenience, the
* sample includes all of the data used to construct the sample,
* as well as the hyperparameters used for sampling.
*
*
As described in the class documentation for the containing
* class {@link LatentDirichletAllocation}, the primary content in
* a Gibbs sample for LDA is the assignment of a single topic to
* each token in the corpus. Cumulative counts for topics in
* documents and words in topics as well as total counts are also
* available; they do not entail any additional computation costs
* as the sampler maintains them as part of the sample.
*
*
The sample also contains meta information about the state
* of the sampling procedure. The epoch at which the sample
* was produced is provided, as well as an indication of how many
* topic assignments changed between this sample and the previous
* sample (note that this is the previous sample in the chain, not
* necessarily the previous sample handled by the LDA handler; the
* handler only gets the samples separated by the specified lag).
*
*
The sample may be used to generate an LDA model. The
* resulting model may then be used for estimation of unseen
* documents. Typically, models derived from several samples are
* used for Bayesian computations, as described in the class
* documentation above.
*
* @author Bob Carpenter
* @version 3.3.0
* @since LingPipe3.3
*/
public static class GibbsSample {
private final int mEpoch;
private final short[][] mTopicSample;
private final int[][] mDocWords;
private final double mDocTopicPrior;
private final double mTopicWordPrior;
private final int[][] mDocTopicCount;
private final int[][] mWordTopicCount;
private final int[] mTopicCount;
private final int mNumChangedTopics;
private final int mNumWords;
private final int mNumTokens;
GibbsSample(int epoch,
short[][] topicSample,
int[][] docWords,
double docTopicPrior,
double topicWordPrior,
int[][] docTopicCount,
int[][] wordTopicCount,
int[] topicCount,
int numChangedTopics,
int numWords,
int numTokens) {
mEpoch = epoch;
mTopicSample = topicSample;
mDocWords = docWords;
mDocTopicPrior = docTopicPrior;
mTopicWordPrior = topicWordPrior;
mDocTopicCount = docTopicCount;
mWordTopicCount = wordTopicCount;
mTopicCount = topicCount;
mNumChangedTopics = numChangedTopics;
mNumWords = numWords;
mNumTokens = numTokens;
}
/**
* Returns the epoch in which this sample was generated.
*
* @return The epoch for this sample.
*/
public int epoch() {
return mEpoch;
}
/**
* Returns the number of documents on which the sample was
* based.
*
* @return The number of documents for the sample.
*/
public int numDocuments() {
return mDocWords.length;
}
/**
* Returns the number of distinct words in the documents on
* which the sample was based.
*
* @return The number of words underlying the model.
*/
public int numWords() {
return mNumWords;
}
/**
* Returns the number of tokens in documents on which the
* sample was based. Each token is an instance of a particular
* word.
*
* @return The number of tokens on which the sample was based.
*/
public int numTokens() {
return mNumTokens;
}
/**
* Returns the number of topics for this sample.
*
* @return The number of topics for this sample.
*/
public int numTopics() {
return mTopicCount.length;
}
/**
* Returns the topic identifier sampled for the specified
* token position in the specified document.
*
* @param doc Identifier for a document.
* @param token Token position in the specified document.
* @return The topic assigned to the specified token in this
* sample.
* @throws IndexOutOfBoundsException If the document
* identifier is not between 0 (inclusive) and the number of
* documents (exclusive), or if the token is not between 0
* (inclusive) and the number of tokens (exclusive) in the
* specified document.
*/
public short topicSample(int doc, int token) {
return mTopicSample[doc][token];
}
/**
* Returns the word identifier for the specified token position
* in the specified document.
*
* @param doc Identifier for a document.
* @param token Token position in the specified document.
* @return The word found at the specified position in the
* specified document.
* @throws IndexOutOfBoundsException If the document
* identifier is not between 0 (inclusive) and the number of
* documents (exclusive), or if the token is not between 0
* (inclusive) and the number of tokens (exclusive) in the
* specified document.
*/
public int word(int doc, int token) {
return mDocWords[doc][token];
}
/**
* Returns the uniform Dirichlet concentration hyperparameter
* α for document distributions over topics
* from which this sample was produced.
*
* @return The document-topic prior.
*/
public double documentTopicPrior() {
return mDocTopicPrior;
}
/**
* Returns the uniform Dirichlet concentration hyperparameter
* β for topic distributions over words from
* which this sample was produced.
*
* @return The topic-word prior.
*/
public double topicWordPrior() {
return mTopicWordPrior;
}
/**
* Returns the number of times the specified topic was
* assigned to the specified document in this sample.
*
* @param doc Identifier for a document.
* @param topic Identifier for a topic.
* @return The count of the topic in the document in this
* sample.
* @throws IndexOutOfBoundsException If the document identifier
* is not between 0 (inclusive) and the number of documents
* (exclusive) or if the topic identifier is not between 0 (inclusive)
* and the number of topics (exclusive).
*/
public int documentTopicCount(int doc, int topic) {
return mDocTopicCount[doc][topic];
}
/**
* Returns the length of the specified document in tokens.
*
* @param doc Identifier for a document.
* @return The length of the specified document in tokens.
* @throws IndexOutOfBoundsException If the document
* identifier is not between 0 (inclusive) and the number of
* documents (exclusive).
*/
public int documentLength(int doc) {
return mDocWords[doc].length;
}
/**
* Returns the number of times tokens for the specified word
* were assigned to the specified topic.
*
* @param topic Identifier for a topic.
* @param word Identifier for a word.
* @return The number of tokens of the specified word assigned
* to the specified topic.
* @throws IndexOutOfBoundsException If the specified topic is
* not between 0 (inclusive) and the number of topics (exclusive),
* or if the word is not between 0 (inclusive) and the number of
* words (exclusive).
*/
public int topicWordCount(int topic, int word) {
return mWordTopicCount[word][topic];
}
/**
* Returns the total number of tokens assigned to the specified
* topic in this sample.
*
* @param topic Identifier for a topic.
* @return The total number of tokens assigned to the
* specified topic.
* @throws IndexOutOfBoundsException If the specified topic is
* not between 0 (inclusive) and the number of topics (exclusive).
*/
public int topicCount(int topic) {
return mTopicCount[topic];
}
/**
* Returns the total number of topic assignments to tokens
* that changed between the last sample and this one. Note
* that this is the last sample in the chain, not the last
* sample necessarily passed to a handler, because handlers
* may not be configured to handle every sample.
*
* @return The number of topic assignments that changed in this
* sample relative to the previous sample.
*/
public int numChangedTopics() {
return mNumChangedTopics;
}
/**
* Returns the probability estimate for the specified word in
* the specified topic in this sample. This value is
* calculated as a maximum a posteriori estimate computed as
* described in the class documentation for {@link
* LatentDirichletAllocation} using the topic assignment
* counts in this sample and the topic-word prior.
*
* @param topic Identifier for a topic.
* @param word Identifier for a word.
* @return The probability of generating the specified word in
* the specified topic.
* @throws IndexOutOfBoundsException If the specified topic is
* not between 0 (inclusive) and the number of topics (exclusive),
* or if the word is not between 0 (inclusive) and the number of
* words (exclusive).
*/
public double topicWordProb(int topic, int word) {
return (topicWordCount(topic,word) + topicWordPrior())
/ (topicCount(topic) + numWords() * topicWordPrior());
}
/**
* Returns the number of times tokens of the specified word
* appeared in the corpus.
*
* @param word Identifier of a word.
* @return The number of tokens of the word in the corpus.
* @throws IndexOutOfBoundsException If the word identifier is
* not between 0 (inclusive) and the number of words
* (exclusive).
*/
public int wordCount(int word) {
int count = 0;
for (int topic = 0; topic < numTopics(); ++topic)
count += topicWordCount(topic,word);
return count;
}
/**
* Returns the estimate of the probability of the topic being
* assigned to a word in the specified document given the
* topic assignments in this sample. This is the maximum a
* posteriori estimate computed from the topic assignments
* as described in the class documentation for {@link
* LatentDirichletAllocation} using the topic assignment
* counts in this sample and the document-topic prior.
*
* @param doc Identifier of a document.
* @param topic Identifier for a topic.
* @return An estimate of the probability of the topic in the
* document.
* @throws IndexOutOfBoundsException If the document identifier
* is not between 0 (inclusive) and the number of documents
* (exclusive) or if the topic identifier is not between 0 (inclusive)
* and the number of topics (exclusive).
*/
public double documentTopicProb(int doc, int topic) {
return (documentTopicCount(doc,topic) + documentTopicPrior())
/ (documentLength(doc) + numTopics() * documentTopicPrior());
}
/**
* Returns an estimate of the log (base 2) likelihood of the
* corpus given the point estimates of topic and document
* multinomials determined from this sample.
*
*
* This likelihood calculation uses the methods
* {@link #documentTopicProb(int,int)} and {@link
* #topicWordProb(int,int)} for estimating likelihoods
* according to the following formula:
*
*
* corpusLog2Probability()
* = Σdoc,i log2 Σtopic p(topic|doc) * p(word[doc][i]|topic)
*
* Note that this is not the complete corpus likelihood,
* which requires integrating over possible topic and document
* multinomials given the priors.
*
* @return The log (base 2) likelihood of the training corpus
* given the document and topic multinomials determined by
* this sample.
*/
public double corpusLog2Probability() {
double corpusLog2Prob = 0.0;
int numDocs = numDocuments();
int numTopics = numTopics();
for (int doc = 0; doc < numDocs; ++doc) {
int docLength = documentLength(doc);
for (int token = 0; token < docLength; ++token) {
int word = word(doc,token);
double wordProb = 0.0;
for (int topic = 0; topic < numTopics; ++topic) {
double wordTopicProbGivenDoc = topicWordProb(topic,word) * documentTopicProb(doc,topic);
wordProb += wordTopicProbGivenDoc;
}
corpusLog2Prob += Math.log2(wordProb);
}
}
return corpusLog2Prob;
}
/**
* Returns a latent Dirichlet allocation model corresponding
* to this sample. The topic-word probabilities are
* calculated according to {@link #topicWordProb(int,int)},
* and the document-topic prior is as specified in the call
* to LDA that produced this sample.
*
* @return The LDA model for this sample.
*/
public LatentDirichletAllocation lda() {
int numTopics = numTopics();
int numWords = numWords();
double topicWordPrior = topicWordPrior();
double[][] topicWordProbs = new double[numTopics][numWords];
for (int topic = 0; topic < numTopics; ++topic) {
double topicCount = topicCount(topic);
double denominator = topicCount + numWords * topicWordPrior;
for (int word = 0; word < numWords; ++word)
topicWordProbs[topic][word]
= (topicWordCount(topic,word) + topicWordPrior)/ denominator;
}
return new LatentDirichletAllocation(mDocTopicPrior,
topicWordProbs);
}
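// Illustrative sketch (the variable names are placeholders, not part
// of the API): given a GibbsSample obtained from sampling, an
// immutable LDA model can be extracted and queried, e.g.
//
//   LatentDirichletAllocation.GibbsSample sample = ...; // from sampling
//   LatentDirichletAllocation model = sample.lda();
//   double p = model.wordProbability(0, wordId);        // p(word | topic 0)
//
// The "..." stands for whatever sampling call produced the sample.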
}
/**
* Return a feature extractor for character sequences based
* on tokenizing with the specified tokenizer and generating
* expected values for topics given the words.
*
* The symbol table is used to convert the tokens to the word
* identifiers used by this LDA model. Symbols not in the symbol
* table or with values out of range of the word probabilities are
* ignored.
*
* The feature names are determined by prefixing the specified
* string to the topic number (starting from zero).
*
* In order to maintain some degree of efficiency, the feature
* extraction process estimates topic expectations for each token
* independently, sums them, and then normalizes the result so
* that the values of all generated features sum to 1.
*
* Given a uniform distribution over topics, the probability of
* a topic given a word may be calculated up to a normalizing
* constant using Bayes's law,
*
*   p(topic|word) = p(word|topic) * p(topic) / p(word)
*                 ∝ p(word|topic) * p(topic)
*                 = p(word|topic)
*
* Given the finite number of topics, it is easy to normalize the
* distribution and compute p(topic|word) from p(word|topic).
* The values p(topic|word) are precomputed at construction time.
*
* The document-topic prior count from this LDA model is added to
* the expected counts for each topic before normalization.
*
* The topic expectations for each word are then summed, the
* resulting feature vector is normalized to sum to 1, and the
* result is returned.
*
* Thus the value of each feature is proportional to the
* expected number of words in that topic plus the document-topic
* prior, divided by the total number of words.
*
* Serialization
*
* The resulting feature extractor may be serialized if its component
* tokenizer factory and symbol table are serializable.
*
* @param tokenizerFactory Tokenizer factory to use for token extraction.
* @param symbolTable Symbol table mapping tokens to word identifiers.
* @param featurePrefix Prefix of the resulting feature names.
* @return A feature extractor over character sequences based on
* this LDA model.
*/
FeatureExtractor<CharSequence>
expectedTopicFeatureExtractor(TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix) {
return new ExpectedTopicFeatureExtractor(this,
tokenizerFactory,
symbolTable,
featurePrefix);
}
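// Illustrative usage sketch (hypothetical variable names; note the
// method is package-private as declared here, so this assumes
// same-package access). Given the model, the tokenizer factory, and
// the symbol table used in training, a text maps to a normalized
// topic-expectation feature vector:
//
//   FeatureExtractor<CharSequence> fe
//       = lda.expectedTopicFeatureExtractor(tokenizerFactory, symbolTable, "topic");
//   Map<String,? extends Number> feats = fe.features("some input text");
//   // keys are "topic0", "topic1", ...; the values sum to 1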
/**
* Return a feature extractor for character sequences based on
* this LDA model and the specified tokenizer factory and symbol
* table, which computes the unbiased Bayesian least squares
* estimate of the character sequence's topic distribution.
*
* The method {@link
* #bayesTopicEstimate(int[],int,int,int,Random)} is used to compute
* the topic distribution for the features.
*
* The feature names are determined by prefixing the specified
* string to the topic number (starting from zero).
*
* Serialization
*
* The resulting feature extractor may be serialized if its component
* tokenizer factory and symbol table are serializable.
*
* @param tokenizerFactory Tokenizer for character sequences.
* @param symbolTable Symbol table for mapping tokens to dimensions.
* @param featurePrefix Prefix of the resulting feature names.
* @param burnIn Number of initial Gibbs samples to discard.
* @param sampleLag The interval between saved samples after burn-in.
* @param numSamples Number of Gibbs samples to return.
* @return Feature extractor with the specified parameters.
* @throws IllegalArgumentException If the number of samples is
* not positive, the sample lag is not positive, or if the burn-in
* period is negative.
*/
FeatureExtractor<CharSequence>
bayesTopicFeatureExtractor(TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix,
int burnIn,
int sampleLag,
int numSamples) {
return new BayesTopicFeatureExtractor(this,
tokenizerFactory,
symbolTable,
featurePrefix,
burnIn,
sampleLag,
numSamples);
}
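// Illustrative sketch (hypothetical parameter values; same same-package
// caveat as above). The sampling controls trade accuracy for speed per
// extracted document, e.g.
//
//   FeatureExtractor<CharSequence> fe
//       = lda.bayesTopicFeatureExtractor(tokenizerFactory, symbolTable,
//                                        "topic", 20, 2, 10);
//
// discards 20 burn-in samples, then averages 10 samples taken 2 apart.
// Larger values give more stable estimates at higher per-document cost.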
static String[] genFeatures(String prefix, int numTopics) {
String[] features = new String[numTopics];
for (int k = 0; k < numTopics; ++k)
features[k] = prefix + k;
return features;
}
static class BayesTopicFeatureExtractor
implements FeatureExtractor<CharSequence>,
Serializable {
static final long serialVersionUID = 8883227852502200365L;
private final LatentDirichletAllocation mLda;
private final TokenizerFactory mTokenizerFactory;
private final SymbolTable mSymbolTable;
private final String[] mFeatures;
private final int mBurnin;
private final int mSampleLag;
private final int mNumSamples;
public BayesTopicFeatureExtractor(LatentDirichletAllocation lda,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix,
int burnin,
int sampleLag,
int numSamples) {
this(lda,
tokenizerFactory,
symbolTable,
genFeatures(featurePrefix,lda.numTopics()),
burnin,
sampleLag,
numSamples);
}
BayesTopicFeatureExtractor(LatentDirichletAllocation lda,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String[] features,
int burnin,
int sampleLag,
int numSamples) {
mLda = lda;
mTokenizerFactory = tokenizerFactory;
mSymbolTable = symbolTable;
mFeatures = features;
mBurnin = burnin;
mSampleLag = sampleLag;
mNumSamples = numSamples;
}
public Map<String,? extends Number> features(CharSequence cSeq) {
int numTopics = mLda.numTopics();
char[] cs = Strings.toCharArray(cSeq);
Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
List<Integer> tokenIdList = new ArrayList<Integer>();
for (String token : tokenizer) {
int symbol = mSymbolTable.symbolToID(token);
if (symbol < 0 || symbol >= mLda.numWords())
continue;
tokenIdList.add(symbol);
}
int[] tokens = new int[tokenIdList.size()];
for (int i = 0; i < tokenIdList.size(); ++i)
tokens[i] = tokenIdList.get(i).intValue();
double[] vals
= mLda.mapTopicEstimate(tokens,
mNumSamples,
mBurnin,
mSampleLag,
new Random());
ObjectToDoubleMap<String> features
= new ObjectToDoubleMap<String>((numTopics*3)/2);
for (int k = 0; k < numTopics; ++k) {
if (vals[k] > 0.0)
features.set(mFeatures[k],vals[k]);
}
return features;
}
Object writeReplace() {
return new Serializer(this);
}
static class Serializer extends AbstractExternalizable {
static final long serialVersionUID = 6719636683732661958L;
final BayesTopicFeatureExtractor mFeatureExtractor;
public Serializer() {
this(null);
}
Serializer(BayesTopicFeatureExtractor featureExtractor) {
mFeatureExtractor = featureExtractor;
}
public void writeExternal(ObjectOutput out) throws IOException {
out.writeObject(mFeatureExtractor.mLda);
out.writeObject(mFeatureExtractor.mTokenizerFactory);
out.writeObject(mFeatureExtractor.mSymbolTable);
writeUTFs(mFeatureExtractor.mFeatures,out);
out.writeInt(mFeatureExtractor.mBurnin);
out.writeInt(mFeatureExtractor.mSampleLag);
out.writeInt(mFeatureExtractor.mNumSamples);
}
public Object read(ObjectInput in) throws IOException, ClassNotFoundException {
@SuppressWarnings("unchecked")
LatentDirichletAllocation lda
= (LatentDirichletAllocation)
in.readObject();
@SuppressWarnings("unchecked")
TokenizerFactory tokenizerFactory
= (TokenizerFactory)
in.readObject();
@SuppressWarnings("unchecked")
SymbolTable symbolTable
= (SymbolTable)
in.readObject();
String[] features = readUTFs(in);
int burnIn = in.readInt();
int sampleLag = in.readInt();
int numSamples = in.readInt();
return new BayesTopicFeatureExtractor(lda,tokenizerFactory,symbolTable,
features,
burnIn,sampleLag,numSamples);
}
}
}
static class ExpectedTopicFeatureExtractor
implements FeatureExtractor<CharSequence>,
Serializable {
static final long serialVersionUID = -7996546432550775177L;
private final double[][] mWordTopicProbs;
private final double mDocTopicPrior;
private final TokenizerFactory mTokenizerFactory;
private final SymbolTable mSymbolTable;
private final String[] mFeatures;
public ExpectedTopicFeatureExtractor(LatentDirichletAllocation lda,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String featurePrefix) {
double[][] wordTopicProbs = new double[lda.numWords()][lda.numTopics()];
for (int word = 0; word < lda.numWords(); ++word)
for (int topic = 0; topic < lda.numTopics(); ++topic)
wordTopicProbs[word][topic] = lda.wordProbability(topic,word);
for (double[] topicProbs : wordTopicProbs) {
double sum = com.aliasi.util.Math.sum(topicProbs);
for (int k = 0; k < topicProbs.length; ++k)
topicProbs[k] /= sum;
}
mWordTopicProbs = wordTopicProbs;
mDocTopicPrior = lda.documentTopicPrior();
mTokenizerFactory = tokenizerFactory;
mSymbolTable = symbolTable;
mFeatures = genFeatures(featurePrefix,lda.numTopics());
}
ExpectedTopicFeatureExtractor(double docTopicPrior,
double[][] wordTopicProbs,
TokenizerFactory tokenizerFactory,
SymbolTable symbolTable,
String[] features) {
mWordTopicProbs = wordTopicProbs;
mDocTopicPrior = docTopicPrior;
mTokenizerFactory = tokenizerFactory;
mSymbolTable = symbolTable;
mFeatures = features;
}
public Map<String,? extends Number> features(CharSequence cSeq) {
int numTopics = mWordTopicProbs[0].length;
char[] cs = Strings.toCharArray(cSeq);
Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
double[] vals = new double[numTopics];
Arrays.fill(vals,mDocTopicPrior);
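// vals now holds the document-topic prior as a pseudo-count for each
// topic; per-token topic expectations are accumulated on top of it
// before the final normalization below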
for (String token : tokenizer) {
int symbol = mSymbolTable.symbolToID(token);
if (symbol < 0 || symbol >= mWordTopicProbs.length)
continue;
for (int k = 0; k < numTopics; ++k)
vals[k] += mWordTopicProbs[symbol][k];
}
ObjectToDoubleMap<String> featMap
= new ObjectToDoubleMap<String>((numTopics*3)/2);
double sum = com.aliasi.util.Math.sum(vals);
for (int k = 0; k < numTopics; ++k)
if (vals[k] > 0.0)
featMap.set(mFeatures[k],
vals[k] / sum);
return featMap;
}
Object writeReplace() {
return new Serializer(this);
}
static class Serializer extends AbstractExternalizable {
static final long serialVersionUID = -1472744781627035426L;
final ExpectedTopicFeatureExtractor mFeatures;
public Serializer() {
this(null);
}
public Serializer(ExpectedTopicFeatureExtractor features) {
mFeatures = features;
}
public void writeExternal(ObjectOutput out) throws IOException {
out.writeDouble(mFeatures.mDocTopicPrior);
out.writeInt(mFeatures.mWordTopicProbs.length);
for (int w = 0; w < mFeatures.mWordTopicProbs.length; ++w)
writeDoubles(mFeatures.mWordTopicProbs[w],out);
out.writeObject(mFeatures.mTokenizerFactory);
out.writeObject(mFeatures.mSymbolTable);
writeUTFs(mFeatures.mFeatures,out);
}
public Object read(ObjectInput in) throws IOException, ClassNotFoundException {
double docTopicPrior = in.readDouble();
int numWords = in.readInt();
double[][] wordTopicProbs = new double[numWords][];
for (int w = 0; w < numWords; ++w)
wordTopicProbs[w] = readDoubles(in);
@SuppressWarnings("unchecked")
TokenizerFactory tokenizerFactory
= (TokenizerFactory) in.readObject();
@SuppressWarnings("unchecked")
SymbolTable symbolTable
= (SymbolTable) in.readObject();
String[] features = readUTFs(in);
return new ExpectedTopicFeatureExtractor(docTopicPrior,wordTopicProbs,tokenizerFactory,symbolTable,features);
}
}
}
static class Serializer extends AbstractExternalizable {
static final long serialVersionUID = 4725870665020270825L;
final LatentDirichletAllocation mLda;
public Serializer() {
this(null);
}
public Serializer(LatentDirichletAllocation lda) {
mLda = lda;
}
public Object read(ObjectInput in) throws IOException {
double docTopicPrior = in.readDouble();
int numTopics = in.readInt();
double[][] topicWordProbs = new double[numTopics][];
for (int i = 0; i < topicWordProbs.length; ++i)
topicWordProbs[i] = readDoubles(in);
return new LatentDirichletAllocation(docTopicPrior,topicWordProbs);
}
public void writeExternal(ObjectOutput out) throws IOException {
out.writeDouble(mLda.mDocTopicPrior);
out.writeInt(mLda.mTopicWordProbs.length);
for (double[] topicWordProbs : mLda.mTopicWordProbs)
writeDoubles(topicWordProbs,out);
}
}
}