
com.aliasi.lm.TokenizedLM Maven / Gradle / Ivy


This is the original LingPipe (http://alias-i.com/lingpipe/web/download.html); no changes were made to the source code.

/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */


package com.aliasi.lm;

import com.aliasi.corpus.ObjectHandler;

import com.aliasi.symbol.SymbolTable;
import com.aliasi.symbol.MapSymbolTable;

import com.aliasi.stats.BinomialDistribution;
import com.aliasi.stats.Statistics;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.AbstractExternalizable;
// import com.aliasi.util.Arrays;
import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.Exceptions;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Strings;

import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.IOException;
import java.io.Serializable;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Iterator;
import java.util.SortedSet;

/**
 * A <code>TokenizedLM</code> provides a dynamic sequence language
 * model which models token sequences with an n-gram model, and
 * whitespace and unknown tokens with their own sequence language
 * models.
 *
 * <P>A tokenized language model factors the probability assigned to a
 * character sequence as follows:
 *
 * <blockquote><code>
 * P(cs)
 * = P<sub>tok</sub>(toks(cs))
 * * &Pi;<sub>t in unknownToks(cs)</sub> P<sub>unk</sub>(t)
 * * &Pi;<sub>w in whitespaces(cs)</sub> P<sub>whsp</sub>(w)
 * </code></blockquote>
 *
 * where
 *
 * <UL>
 * <LI> P<sub>tok</sub> is the token model estimate, and
 * toks(cs) replaces known tokens with their integer identifiers,
 * unknown tokens with -1, and adds boundary symbols -2 front and
 * back; the same adjustment is used to remove the initial boundary
 * estimate as in {@link NGramBoundaryLM};
 *
 * <LI> P<sub>unk</sub> is the unknown token sequence language model
 * and unknownToks(cs) is the list of unknown tokens in the input
 * (with duplicates); and
 *
 * <LI> P<sub>whsp</sub> is the whitespace sequence language model
 * and whitespaces(cs) is the list of whitespaces in the character
 * sequence (with duplicates).
 * </UL>
 *
 * <P>The token n-gram model itself uses the same method of counting
 * and smoothing as described in the class documentation for {@link
 * NGramProcessLM}.  Like {@link NGramBoundaryLM}, boundary tokens are
 * inserted before and after other tokens.  And like the n-gram
 * character boundary model, the initial boundary estimate is
 * subtracted from the overall estimate for normalization purposes.
 *
 * <P>Tokens are all converted to integer identifiers using an
 * internal dynamic symbol table.  All symbols in symbol tables get
 * non-negative identifiers; the negative value -1 is used for the
 * unknown token in models, just as in symbol tables.  The value -2
 * is used for the boundary marker in the counters.
 *
 * <P>In order for all estimates to be non-zero, the integer sequence
 * counter used to back the token model is initialized with a count
 * of 1 for the end-of-stream identifier (-2).  The unknown token
 * count for any context is taken to be the number of outcomes in
 * that context.  Because unknowns are estimated directly in this
 * manner, there is no need to interpolate the unigram model with a
 * uniform model for unknown outcomes.  Instead, the occurrence of an
 * unknown is modeled directly and its identity is modeled by the
 * unknown token language model.
 *
 * <P>In order to produce a properly normalized sequence model, the
 * tokens and whitespaces returned by the tokenizer should
 * concatenate together to reproduce the original input.  Note that
 * this condition is not checked at runtime.  Sequences may, however,
 * be normalized before being trained and evaluated by a language
 * model.  For instance, all alphabetic characters might be reduced
 * to lower case, all punctuation characters removed, and all
 * non-empty sequences of whitespace reduced to a single space
 * character.  A language model may then be defined over this
 * normalized space of inputs rather than the original space (and may
 * thus use a reduced number of characters for its uniform
 * estimates).  Although this normalization may be carried out by a
 * tokenizer in practice, for instance for use in a tokenized
 * classifier, such normalization is consistent with the interface
 * specifications for {@link LanguageModel.Sequence} and {@link
 * LanguageModel.Dynamic} only if done on the outside.
 *
 * @author  Bob Carpenter
 * @version 4.0.0
 * @since   LingPipe2.0
 */
public class TokenizedLM
    implements LanguageModel.Dynamic,
               LanguageModel.Sequence,
               LanguageModel.Tokenized,
               ObjectHandler<CharSequence> {

    private final TokenizerFactory mTokenizerFactory;
    private final MapSymbolTable mSymbolTable;
    private final TrieIntSeqCounter mCounter;
    private final LanguageModel.Sequence mUnknownTokenModel;
    private final LanguageModel.Sequence mWhitespaceModel;
    private final double mLambdaFactor;
    private final LanguageModel.Dynamic mDynamicUnknownTokenModel;
    private final LanguageModel.Dynamic mDynamicWhitespaceModel;
    private final int mNGramOrder;

    /**
     * Constructs a tokenized language model with the specified
     * tokenizer factory and n-gram order (see the warning below on
     * where this simple constructor may be used).
     *
     * <P>The unknown token and whitespace models are both uniform
     * sequence language models with default parameters as described
     * in the documentation for the constructor {@link
     * UniformBoundaryLM#UniformBoundaryLM()}.  The default
     * interpolation hyperparameter is equal to the n-gram order.
     *
     * <P><b>Warning:</b> This construction method is probably only
     * going to be useful if you are only using the tokenized LM to
     * store token n-grams.  Because it uses flat constant uniform
     * language models for smoothing tokens and whitespaces, it will
     * provide very high entropy estimates for unseen text.  The
     * other constructors allow smoothing LMs to be supplied (which
     * will take up more space to estimate, but will provide more
     * reasonable estimates).
     *
     * @param factory Tokenizer factory for the model.
     * @param nGramOrder N-gram order.
     * @throws IllegalArgumentException If the n-gram order is less
     * than 0.
     */
    public TokenizedLM(TokenizerFactory factory,
                       int nGramOrder) {
        this(factory, nGramOrder,
             new UniformBoundaryLM(), new UniformBoundaryLM(),
             nGramOrder);
    }
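
    /*
     * A minimal usage sketch (illustrative, with assumed inputs):
     * build a trigram token model with the simple constructor, train
     * it on a few strings, and score held-out text.  The
     * Indo-European tokenizer factory from com.aliasi.tokenizer is
     * assumed as the tokenizer.
     *
     *   TokenizerFactory factory
     *       = com.aliasi.tokenizer.IndoEuropeanTokenizerFactory.INSTANCE;
     *   TokenizedLM lm = new TokenizedLM(factory, 3);
     *   lm.train("the quick brown fox");
     *   lm.train("the quick red fox");
     *   double log2P = lm.log2Estimate("the quick fox");
     */
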
    /**
     * Construct a tokenized language model with the specified
     * tokenizer factory, n-gram order, sequence models for unknown
     * tokens and whitespace, and an interpolation hyperparameter.
     *
     * <P>In order for this model to be serializable, the unknown
     * token and whitespace models must be serializable.  If they are
     * not, a runtime exception will be thrown when attempting to
     * serialize this model.  If these models implement {@link
     * LanguageModel.Dynamic}, they will be trained by calls to the
     * training methods.
     *
     * @param tokenizerFactory Tokenizer factory for the model.
     * @param nGramOrder Length of maximum n-gram for model.
     * @param unknownTokenModel Sequence model for unknown tokens.
     * @param whitespaceModel Sequence model for all whitespace.
     * @param lambdaFactor Value of the interpolation hyperparameter.
     * @throws IllegalArgumentException If the n-gram order is less
     * than 1 or the interpolation hyperparameter is not a
     * non-negative number.
     */
    public TokenizedLM(TokenizerFactory tokenizerFactory,
                       int nGramOrder,
                       LanguageModel.Sequence unknownTokenModel,
                       LanguageModel.Sequence whitespaceModel,
                       double lambdaFactor) {
        this(tokenizerFactory, nGramOrder,
             unknownTokenModel, whitespaceModel, lambdaFactor,
             true);
    }

    /**
     * Construct a tokenized language model with the specified
     * tokenizer factory, n-gram order, sequence models for unknown
     * tokens and whitespace, and an interpolation hyperparameter, as
     * well as a flag indicating whether to automatically increment
     * the boundary token on construction to avoid numerical problems
     * with zero counts.
     *
     * <P>In order for this model to be serializable, the unknown
     * token and whitespace models must be serializable.  If they are
     * not, a runtime exception will be thrown when attempting to
     * serialize this model.  If these models implement {@link
     * LanguageModel.Dynamic}, they will be trained by calls to the
     * training methods.
     *
     * @param tokenizerFactory Tokenizer factory for the model.
     * @param nGramOrder Length of maximum n-gram for model.
     * @param unknownTokenModel Sequence model for unknown tokens.
     * @param whitespaceModel Sequence model for all whitespace.
     * @param lambdaFactor Value of the interpolation hyperparameter.
     * @param initialIncrementBoundary Flag indicating whether or not
     * to increment the subsequence <code>{ BOUNDARY_TOKEN }</code>
     * automatically after construction to avoid {@code NaN} error
     * states.
     * @throws IllegalArgumentException If the n-gram order is less
     * than 1 or the interpolation hyperparameter is not a
     * non-negative number.
     */
    public TokenizedLM(TokenizerFactory tokenizerFactory,
                       int nGramOrder,
                       LanguageModel.Sequence unknownTokenModel,
                       LanguageModel.Sequence whitespaceModel,
                       double lambdaFactor,
                       boolean initialIncrementBoundary) {
        NGramProcessLM.checkMaxNGram(nGramOrder);
        NGramProcessLM.checkLambdaFactor(lambdaFactor);
        mSymbolTable = new MapSymbolTable();
        mNGramOrder = nGramOrder;
        mTokenizerFactory = tokenizerFactory;
        mUnknownTokenModel = unknownTokenModel;
        mWhitespaceModel = whitespaceModel;
        mDynamicUnknownTokenModel
            = (mUnknownTokenModel instanceof LanguageModel.Dynamic)
            ? (LanguageModel.Dynamic) mUnknownTokenModel
            : null;
        mDynamicWhitespaceModel
            = (mWhitespaceModel instanceof LanguageModel.Dynamic)
            ? (LanguageModel.Dynamic) mWhitespaceModel
            : null;
        mCounter = new TrieIntSeqCounter(nGramOrder);
        mLambdaFactor = lambdaFactor;
        // following is so it starts without NaN problems
        // decrement this if necessary when not needed
        if (initialIncrementBoundary)
            mCounter.incrementSubsequences(new int[] { BOUNDARY_TOKEN },0,1);
    }

    /**
     * Returns the interpolation ratio, or lambda factor, for
     * interpolating in this tokenized language model.  See the class
     * documentation above for more details.
     *
     * @return The interpolation ratio for this LM.
     */
    public double lambdaFactor() {
        return mLambdaFactor;
    }

    /**
     * Returns the integer sequence counter underlying this model.
     * Symbols are mapped to integers using the symbol table returned
     * by {@link #symbolTable()}.  Changes to this counter affect
     * this tokenized language model.
     *
     * @return The sequence counter underlying this model.
     */
    public TrieIntSeqCounter sequenceCounter() {
        return mCounter;
    }

    /**
     * Returns the symbol table underlying this tokenized language
     * model's token n-gram model.  Changes to the symbol table
     * affect this tokenized language model.
     *
     * @return The symbol table underlying this language model.
     */
    public SymbolTable symbolTable() {
        return mSymbolTable;
    }

    /**
     * Returns the order of the token n-gram model underlying this
     * tokenized language model.
     *
     * @return The order of the token n-gram model underlying this
     * tokenized language model.
     */
    public int nGramOrder() {
        return mNGramOrder;
    }

    /**
     * Returns the tokenizer factory for this tokenized language
     * model.
     *
     * @return The tokenizer factory for this tokenized language
     * model.
     */
    public TokenizerFactory tokenizerFactory() {
        return mTokenizerFactory;
    }

    /**
     * Returns the unknown token sequence language model for this
     * tokenized language model.  Changes to the returned language
     * model affect this tokenized language model.
     *
     * @return The unknown token language model.
     */
    public LanguageModel.Sequence unknownTokenLM() {
        return mUnknownTokenModel;
    }

    /**
     * Returns the whitespace language model for this tokenized
     * language model.  Changes to the returned language model affect
     * this tokenized language model.
     *
     * @return The whitespace language model.
     */
    public LanguageModel.Sequence whitespaceLM() {
        return mWhitespaceModel;
    }

    /**
     * Writes a compiled version of this tokenized language model to
     * the specified object output.  When the model is read back in,
     * it will be an instance of {@link CompiledTokenizedLM}.
     *
     * @param objOut Object output to which a compiled version of
     * this model is written.
     * @throws IOException If there is an I/O error writing the
     * output.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Visits the n-grams of the specified length with at least the
     * specified minimum count stored in the underlying counter of
     * this tokenized language model and passes them to the specified
     * handler.
     *
     * @param nGramLength Length of n-grams visited.
     * @param minCount Minimum count of a visited n-gram.
     * @param handler Handler whose handle method is called for each
     * visited n-gram.
     */
    public void handleNGrams(int nGramLength, int minCount,
                             ObjectHandler<String[]> handler) {
        StringArrayAdapter adapter = new StringArrayAdapter(handler);
        mCounter.handleNGrams(nGramLength,minCount,adapter);
    }

    double lambda(int[] tokIds) {
        double numExtensionsD
            = mCounter.numExtensions(tokIds,0,tokIds.length);
        double extCountD
            = mCounter.extensionCount(tokIds,0,tokIds.length);
        return extCountD / (extCountD + mLambdaFactor * numExtensionsD);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic).
     *
     * @param cSeq Character sequence to train.
     */
    public void train(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        train(cs,0,cs.length);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic) with the specified number
     * of instances.  Calling <code>train(cs,n)</code> is equivalent
     * to calling <code>train(cs)</code> a total of <code>n</code>
     * times.
     *
     * @param cSeq Character sequence to train.
     * @param count Number of instances to train.
     * @throws IllegalArgumentException If the count is negative.
     */
    public void train(CharSequence cSeq, int count) {
        if (count < 0) {
            String msg = "Counts must be non-negative."
                + " Found count=" + count;
            throw new IllegalArgumentException(msg);
        }
        if (count == 0) return;
        char[] cs = Strings.toCharArray(cSeq);
        train(cs,0,cs.length,count);
    }
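
    /*
     * Sketch of compiling a trained model to bytes and reading it
     * back (illustrative; lm is a trained TokenizedLM as in the
     * earlier sketch).  Per compileTo's contract above, the
     * read-back object is a CompiledTokenizedLM.
     *
     *   java.io.ByteArrayOutputStream bytesOut
     *       = new java.io.ByteArrayOutputStream();
     *   java.io.ObjectOutputStream objOut
     *       = new java.io.ObjectOutputStream(bytesOut);
     *   lm.compileTo(objOut);   // writes the Externalizer wrapper
     *   objOut.close();
     *   java.io.ObjectInputStream objIn
     *       = new java.io.ObjectInputStream(
     *             new java.io.ByteArrayInputStream(bytesOut.toByteArray()));
     *   Object compiledLm = objIn.readObject();  // a CompiledTokenizedLM
     */
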
    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic).
     *
     * @param cs Underlying character array.
     * @param start Index of first character in slice.
     * @param end Index of one plus last character in slice.
     * @throws IndexOutOfBoundsException If the indices are out of
     * range for the character array.
     */
    public void train(char[] cs, int start, int end) {
        Strings.checkArgsStartEnd(cs,start,end);
        Tokenizer tokenizer
            = mTokenizerFactory.tokenizer(cs,start,end-start);
        List<String> tokenList = new ArrayList<String>();
        while (true) {
            if (mDynamicWhitespaceModel != null) {
                String whitespace = tokenizer.nextWhitespace();
                mDynamicWhitespaceModel.train(whitespace);
            }
            // this'll pick up the last whitespace after last token
            String token = tokenizer.nextToken();
            if (token == null) break;
            tokenList.add(token);
        }
        int[] tokIds = new int[tokenList.size()+2];
        tokIds[0] = BOUNDARY_TOKEN;
        tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
        Iterator<String> it = tokenList.iterator();
        for (int i = 1; it.hasNext(); ++i) {
            String token = it.next();
            // train underlying token model just once per token
            if (mDynamicUnknownTokenModel != null
                && mSymbolTable.symbolToID(token) < 0) {
                mDynamicUnknownTokenModel.train(token);
            }
            tokIds[i] = mSymbolTable.getOrAddSymbol(token);
        }
        mCounter.incrementSubsequences(tokIds,0,tokIds.length);
        mCounter.decrementUnigram(BOUNDARY_TOKEN);
    }
    /**
     * Trains the language model on the specified character sequence.
     *
     * <P>This method delegates to the {@link
     * #train(CharSequence,int)} method.
     *
     * <P>This method implements the
     * <code>ObjectHandler&lt;CharSequence&gt;</code> interface.
     */
    public void handle(CharSequence cs) {
        train(cs,1);
    }

    /**
     * Trains the token sequence model, whitespace model (if dynamic)
     * and unknown token model (if dynamic).
     *
     * @param cs Underlying character array.
     * @param start Index of first character in slice.
     * @param end Index of one plus last character in slice.
     * @param count Number of instances of sequence to train.
     * @throws IndexOutOfBoundsException If the indices are out of
     * range for the character array.
     * @throws IllegalArgumentException If the count is negative.
     */
    public void train(char[] cs, int start, int end, int count) {
        Strings.checkArgsStartEnd(cs,start,end);
        if (count < 0) {
            String msg = "Counts must be non-negative."
                + " Found count=" + count;
            throw new IllegalArgumentException(msg);
        }
        if (count == 0) return;
        Tokenizer tokenizer
            = mTokenizerFactory.tokenizer(cs,start,end-start);
        List<String> tokenList = new ArrayList<String>();
        while (true) {
            if (mDynamicWhitespaceModel != null) {
                String whitespace = tokenizer.nextWhitespace();
                mDynamicWhitespaceModel.train(whitespace,count);
            }
            // this'll pick up the last whitespace after last token
            String token = tokenizer.nextToken();
            if (token == null) break;
            tokenList.add(token);
        }
        int[] tokIds = new int[tokenList.size()+2];
        tokIds[0] = BOUNDARY_TOKEN;
        tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
        Iterator<String> it = tokenList.iterator();
        for (int i = 1; it.hasNext(); ++i) {
            String token = it.next();
            // train underlying token model just once per token
            if (mDynamicUnknownTokenModel != null
                && mSymbolTable.symbolToID(token) < 0) {
                mDynamicUnknownTokenModel.train(token,count);
            }
            tokIds[i] = mSymbolTable.getOrAddSymbol(token);
        }
        mCounter.incrementSubsequences(tokIds,0,tokIds.length,count);
        mCounter.decrementUnigram(BOUNDARY_TOKEN,count);
    }

    /**
     * This method trains the last token in the sequence given the
     * previous tokens.  See {@link #trainSequence(CharSequence,int)}
     * for more information.
     *
     * @param cs Underlying character array.
     * @param start Index of first character in slice.
     * @param end Index of one plus last character in slice.
     * @throws IndexOutOfBoundsException If the indices are out of
     * range for the character array.
     * @throws IllegalArgumentException If the count is negative.
     */
    void trainSequence(char[] cs, int start, int end, int count) {
        Strings.checkArgsStartEnd(cs,start,end);
        if (count < 0) {
            String msg = "Count must be non-negative. Found count=" + count;
            throw new IllegalArgumentException(msg);
        }
        Tokenizer tokenizer
            = mTokenizerFactory.tokenizer(cs,start,end-start);
        String[] tokens = tokenizer.tokenize();
        int len = Math.min(tokens.length,nGramOrder());
        int offset = tokens.length - len;
        int[] tokIds = new int[len];
        for (int i = 0; i < len; ++i)
            tokIds[i] = mSymbolTable.getOrAddSymbol(tokens[i+offset]);
        mCounter.incrementSequence(tokIds,0,len,count);
    }
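
    /*
     * Because TokenizedLM implements ObjectHandler<CharSequence>, it
     * may be plugged in anywhere a text handler is expected.  A
     * minimal sketch (illustrative; lm as constructed in the earlier
     * sketch):
     *
     *   String[] corpus = { "first sentence", "second sentence" };
     *   ObjectHandler<CharSequence> handler = lm;
     *   for (String text : corpus)
     *       handler.handle(text);   // delegates to train(text,1)
     */
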
    /**
     * This method increments the count of the entire sequence
     * specified.  Note that this method does not increment any of
     * the token subsequences and does not increment the whitespace
     * or token smoothing models.
     *
     * <P>This method may be used to train a tokenized language model
     * from individual character sequence counts.  Because the token
     * smoothing models are not trained by this method, a pure token
     * model may be constructed with it; to train token smoothing
     * with character subsequences instead, call
     * <code>train(CharSequence,int)</code> on character sequences
     * corresponding to unigrams rather than this method.
     *
     * <P>For instance, with
     * <code>com.aliasi.tokenizer.IndoEuropeanTokenizerFactory</code>,
     * calling <code>trainSequence("the fast computer",5)</code>
     * would extract three tokens, <code>the</code>,
     * <code>fast</code> and <code>computer</code>, and would
     * increment the count of the three-token sequence, but not any
     * of its subsequences.
     *
     * <P>If the number of tokens is longer than the maximum n-gram
     * length, only the final tokens are trained.  For instance, with
     * an n-gram length of 2 and the Indo-European tokenizer factory,
     * calling <code>trainSequence("a slightly faster
     * computer",93)</code> is equivalent to calling
     * <code>trainSequence("faster computer",93)</code>.
     *
     * <P>All tokens trained are added to the symbol table.  This
     * does not include any initial tokens that are not used because
     * the maximum n-gram length is too short.
     *
     * @param cSeq Character sequence to train.
     * @param count Number of instances to train.
     * @throws IllegalArgumentException If the count is negative.
     */
    public void trainSequence(CharSequence cSeq, int count) {
        char[] cs = Strings.toCharArray(cSeq);
        trainSequence(cs,0,cs.length,count);
    }

    public double log2Estimate(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        return log2Estimate(cs,0,cs.length);
    }

    public double log2Estimate(char[] cs, int start, int end) {
        Strings.checkArgsStartEnd(cs,start,end);
        double logEstimate = 0.0;

        // collect tokens, estimate whitespaces
        Tokenizer tokenizer
            = mTokenizerFactory.tokenizer(cs,start,end-start);
        List<String> tokenList = new ArrayList<String>();
        while (true) {
            String whitespace = tokenizer.nextWhitespace();
            logEstimate += mWhitespaceModel.log2Estimate(whitespace);
            String token = tokenizer.nextToken();
            if (token == null) break;
            tokenList.add(token);
        }

        // collect token ids, estimate unknown tokens
        int[] tokIds = new int[tokenList.size()+2];
        tokIds[0] = BOUNDARY_TOKEN;
        tokIds[tokIds.length-1] = BOUNDARY_TOKEN;
        Iterator<String> it = tokenList.iterator();
        for (int i = 1; it.hasNext(); ++i) {
            String token = it.next();
            tokIds[i] = mSymbolTable.symbolToID(token);
            if (tokIds[i] < 0) {
                logEstimate += mUnknownTokenModel.log2Estimate(token);
            }
        }

        // estimate token ids excluding start, including end
        for (int i = 2; i <= tokIds.length; ++i) {
            logEstimate += conditionalLog2TokenEstimate(tokIds,0,i);
        }
        return logEstimate;
    }

    class StringArrayAdapter implements ObjectHandler<int[]> {
        final ObjectHandler<String[]> mHandler;
        public StringArrayAdapter(ObjectHandler<String[]> handler) {
            mHandler = handler;
        }
        public void handle(int[] nGram) {
            mHandler.handle(simpleNGramToTokens(nGram));
        }
        String[] simpleNGramToTokens(int[] nGram) {
            String[] tokens = new String[nGram.length];
            for (int i = 0; i < tokens.length; ++i)
                tokens[i] = nGram[i] >= 0
                    ? mSymbolTable.idToSymbol(nGram[i])
                    : null;
            return tokens;
        }
    }

    abstract class Collector implements ObjectHandler<int[]> {
        final BoundedPriorityQueue<ScoredObject<String[]>> mBPQ;
        Collector(int maxReturned, boolean reverse) {
            Comparator<ScoredObject<String[]>> comparator = null;
            if (reverse)
                comparator = ScoredObject.reverseComparator();
            else
                comparator = ScoredObject.comparator();
            mBPQ = new BoundedPriorityQueue<ScoredObject<String[]>>(comparator,
                                                                    maxReturned);
        }
        SortedSet<ScoredObject<String[]>> nGramSet() {
            return mBPQ;
        }
        ScoredObject<String[]>[] nGrams() {
            // necessary for array
            return mBPQ.toArray(EMPTY_SCORED_OBJECT_STRING_ARRAY_ARRAY);
        }
        public void handle(int[] nGram) {
            for (int i = 0; i < nGram.length; ++i)
                if (nGram[i] < 0) return; // don't include boundaries
            mBPQ.offer(new ScoredObject<String[]>(nGramToTokens(nGram),
                                                  scoreNGram(nGram)));
        }
        abstract double scoreNGram(int[] nGram);
    }

    class FreqTermCollector extends Collector {
        FreqTermCollector(int maxReturned, boolean reverse) {
            super(maxReturned,reverse);
        }
        @Override
        double scoreNGram(int[] nGram) {
            return mCounter.count(nGram,0,nGram.length);
        }
    }

    class CollocationCollector extends Collector {
        CollocationCollector(int maxReturned) {
            super(maxReturned,false);
        }
        @Override
        double scoreNGram(int[] nGram) {
            return chiSquaredIndependence(nGram);
        }
    }

    class SigTermCollector extends Collector {
        final LanguageModel.Tokenized mBGModel;
        SigTermCollector(int maxReturned,
                         LanguageModel.Tokenized bgModel,
                         boolean reverse) {
            super(maxReturned,reverse);
            mBGModel = bgModel;
        }
        @Override
        double scoreNGram(int[] nGram) {
            String[] tokens = nGramToTokens(nGram);
            int totalSampleCount = mCounter.count(nGram,0,0);
            int sampleCount = mCounter.count(nGram,0,nGram.length);
            double bgProb
                = mBGModel.tokenProbability(tokens,0,tokens.length);
            double score = BinomialDistribution.z(bgProb,
                                                  sampleCount,
                                                  totalSampleCount);
            return score;
        }
    }

    String[] nGramToTokens(int[] nGram) {
        String[] toks = new String[nGram.length];
        for (int i = 0; i < nGram.length; ++i) {
            toks[i] = nGram[i] >= 0
                ? mSymbolTable.idToSymbol(nGram[i])
                : (i == 0) ? "*BEGIN*" : "*END*";
        }
        return toks;
    }

    public double tokenProbability(String[] tokens, int start, int end) {
        return java.lang.Math.pow(2.0,tokenLog2Probability(tokens,start,end));
    }

    public double tokenLog2Probability(String[] tokens, int start, int end) {
        // check args!!!
        double log2Estimate = 0.0;
        int[] tokIds = new int[tokens.length];
        for (int i = start; i < end; ++i) {
            tokIds[i] = mSymbolTable.symbolToID(tokens[i]);
            double conditionalLog2TokenEstimate
                = conditionalLog2TokenEstimate(tokIds,0,i+1);
            if (Double.isInfinite(conditionalLog2TokenEstimate)) {
                double extCountD = mCounter.extensionCount(new int[0],0,0);
                double numTokensD = mSymbolTable.numSymbols();
                log2Estimate
                    += com.aliasi.util.Math.log2(extCountD
                                                 / (extCountD + numTokensD));
                log2Estimate += mUnknownTokenModel.log2Estimate(tokens[i]);
            } else {
                log2Estimate += conditionalLog2TokenEstimate;
            }
            if (Double.isInfinite(log2Estimate)) {
                System.out.println("tokens[" + i + "]=" + tokens[i]
                                   + "\n id=" + tokIds[i]);
            }
        }
        return log2Estimate;
    }

    /**
     * Returns the log (base 2) probability of the specified tokens
     * in the underlying token n-gram distribution.  This includes
     * the estimation of the actual token for unknown tokens.
     *
     * @param tokens Tokens whose probability is returned.
     * @return The log (base 2) probability of the tokens.
     */
    public double processLog2Probability(String[] tokens) {
        return tokenLog2Probability(tokens,0,tokens.length);
    }
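
    /*
     * Sketch of scoring an already-tokenized sequence directly
     * (illustrative; lm as above): tokenProbability and
     * tokenLog2Probability bypass the tokenizer and whitespace
     * model.
     *
     *   String[] toks = { "the", "fast", "computer" };
     *   double p = lm.tokenProbability(toks, 0, toks.length);
     *   double log2p = lm.tokenLog2Probability(toks, 0, toks.length);
     */
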
    /**
     * Returns a sorted set of collocations in order of confidence
     * that their token sequences are not independent.  The objects
     * contained in the returned scored objects are instances of
     * <code>String[]</code> containing tokens.  The length of
     * n-gram, minimum count for a result and the maximum number of
     * results returned are all specified.  The confidence ordering
     * is based on the result of Pearson's &chi;<sup>2</sup>
     * independence statistic as computed by {@link
     * #chiSquaredIndependence(int[])}.
     *
     * @param nGram Length of n-grams to search for collocations.
     * @param minCount Minimum count for a returned n-gram.
     * @param maxReturned Maximum number of results returned.
     * @return Sorted set of collocations in confidence order.
     */
    public SortedSet<ScoredObject<String[]>> collocationSet(int nGram,
                                                            int minCount,
                                                            int maxReturned) {
        CollocationCollector collector
            = new CollocationCollector(maxReturned);
        mCounter.handleNGrams(nGram,minCount,collector);
        return collector.nGramSet();
    }

    /**
     * Returns a sorted set of scored n-grams ordered by the
     * significance of the degree to which their counts in this model
     * exceed their expected counts in a specified background model.
     * The returned set contains {@link ScoredObject} instances whose
     * objects are terms represented as string arrays and whose
     * scores are the significance scores for the terms.  For
     * instance, the new terms may be printed in order of
     * significance by:
     *
     * <pre>
     * SortedSet&lt;ScoredObject&lt;String[]&gt;&gt; terms = newTermSet(3,5,100,bgLM);
     * for (ScoredObject&lt;String[]&gt; term : terms) {
     *     String[] tokens = term.getObject();
     *     double score = term.score();
     *     ...
     * }
     * </pre>
     *
     * <P>The exact scoring used is the z-score as defined in {@link
     * BinomialDistribution#z(double,int,int)} with the success
     * probability defined by the n-gram's probability estimate in
     * the background model, the number of successes being the count
     * of the n-gram in this model and the number of trials being the
     * total count in this model.
     *
     * <P>See {@link #oldTermSet(int,int,int,LanguageModel.Tokenized)}
     * for a method that returns the least significant terms in this
     * model relative to a background model.
     *
     * @param nGram Length of n-grams to search for significant new terms.
     * @param minCount Minimum count for a returned n-gram.
     * @param maxReturned Maximum number of results returned.
     * @param backgroundLM Background language model against which
     * significance is measured.
     * @return New terms ordered by significance.
     */
    public SortedSet<ScoredObject<String[]>> newTermSet(int nGram,
                                                        int minCount,
                                                        int maxReturned,
                                                        LanguageModel.Tokenized backgroundLM) {
        return sigTermSet(nGram,minCount,maxReturned,backgroundLM,false);
    }
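
    /*
     * Sketch of contrasting a foreground model against a background
     * model to surface new terminology (illustrative; fgLM and bgLM
     * are assumed to be TokenizedLM instances trained on target and
     * reference text respectively):
     *
     *   SortedSet<ScoredObject<String[]>> newTerms
     *       = fgLM.newTermSet(2, 5, 20, bgLM);
     *   for (ScoredObject<String[]> t : newTerms)
     *       System.out.println(java.util.Arrays.toString(t.getObject())
     *                          + " z=" + t.score());
     */
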

    /**
     * Returns a sorted set of scored n-grams in reverse order of
     * significance with respect to the background model.  In other
     * words, these are the n-grams that occur less often in this
     * model than they would have been expected to given the
     * background model.
     *
     * <P>Note that only terms that exist in the foreground model are
     * considered.  By contrast, reversing the roles of the models in
     * the sister method {@link
     * #newTermSet(int,int,int,LanguageModel.Tokenized)} considers
     * every n-gram in the background model and may return slightly
     * different results.
     *
     * @param nGram Length of n-grams to search for significant old terms.
     * @param minCount Minimum count for a returned n-gram.
     * @param maxReturned Maximum number of results returned.
     * @param backgroundLM Background language model from which
     * expected counts are derived.
     * @return Old terms ordered by significance.
     */
    public SortedSet<ScoredObject<String[]>> oldTermSet(int nGram,
                                                        int minCount,
                                                        int maxReturned,
                                                        LanguageModel.Tokenized backgroundLM) {
        return sigTermSet(nGram,minCount,maxReturned,backgroundLM,true);
    }

    private ScoredObject<String[]>[] sigTerms(int nGram, int minCount,
                                              int maxReturned,
                                              LanguageModel.Tokenized backgroundLM,
                                              boolean reverse) {
        SigTermCollector collector
            = new SigTermCollector(maxReturned,backgroundLM,reverse);
        mCounter.handleNGrams(nGram,minCount,collector);
        return collector.nGrams();
    }

    private SortedSet<ScoredObject<String[]>> sigTermSet(int nGram,
                                                         int minCount,
                                                         int maxReturned,
                                                         LanguageModel.Tokenized backgroundLM,
                                                         boolean reverse) {
        SigTermCollector collector
            = new SigTermCollector(maxReturned,backgroundLM,reverse);
        mCounter.handleNGrams(nGram,minCount,collector);
        return collector.nGramSet();
    }

    /**
     * Returns the most frequent n-gram terms in the training data up
     * to the specified maximum number.  The terms are ordered by raw
     * counts and returned in decreasing order of count.  The scored
     * objects in the returned set have objects that are the terms
     * themselves and scores based on count.
     *
     * <P>See {@link #infrequentTermSet(int,int)} to retrieve the
     * least frequent terms.
     *
     * @param nGram Length of n-grams to search.
     * @param maxReturned Maximum number of results returned.
     */
    public SortedSet<ScoredObject<String[]>> frequentTermSet(int nGram,
                                                             int maxReturned) {
        return freqTermSet(nGram,maxReturned,false);
    }

    private ScoredObject<String[]>[] freqTerms(int nGram, int maxReturned,
                                               boolean reverse) {
        FreqTermCollector collector
            = new FreqTermCollector(maxReturned,reverse);
        mCounter.handleNGrams(nGram,1,collector);
        return collector.nGrams();
    }

    private SortedSet<ScoredObject<String[]>> freqTermSet(int nGram,
                                                          int maxReturned,
                                                          boolean reverse) {
        FreqTermCollector collector
            = new FreqTermCollector(maxReturned,reverse);
        mCounter.handleNGrams(nGram,1,collector);
        return collector.nGramSet();
    }

    /**
     * Returns the least frequent n-gram terms in the training data
     * up to the specified maximum number.  The terms are ordered by
     * raw counts and returned in increasing order of count.  The
     * scored objects in the returned set have objects that are the
     * terms themselves and scores based on count.
     *
     * <P>See {@link #frequentTermSet(int,int)} to retrieve the most
     * frequent terms.
     *
     * @param nGram Length of n-grams to search.
     * @param maxReturned Maximum number of results returned.
     */
    public SortedSet<ScoredObject<String[]>> infrequentTermSet(int nGram,
                                                               int maxReturned) {
        return freqTermSet(nGram,maxReturned,true);
    }
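
    /*
     * Sketch of listing the most frequent bigrams (illustrative; lm
     * as above):
     *
     *   SortedSet<ScoredObject<String[]>> top = lm.frequentTermSet(2, 10);
     *   for (ScoredObject<String[]> t : top)
     *       System.out.println(java.util.Arrays.toString(t.getObject())
     *                          + " count=" + t.score());
     */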

    /**
     * Returns the maximum value of Pearson's &chi;<sup>2</sup>
     * independence test statistic resulting from splitting the
     * specified n-gram in two to derive a contingency matrix.
     * Higher return values indicate more dependence among the terms
     * in the n-gram.
     *
     * <P>The input n-gram is split into two halves,
     * <code>Term1</code> and <code>Term2</code>, each of which is a
     * non-empty sequence of integers.  <code>Term1</code> consists
     * of the tokens indexed <code>0</code> to <code>mid-1</code> and
     * <code>Term2</code> of the tokens indexed <code>mid</code> to
     * <code>end-1</code>.
     *
     * <P>The contingency matrix for computing the independence
     * statistic is:
     *
     * <blockquote><table border="1" cellpadding="5">
     * <tr><td>&nbsp;</td> <td>+Term2</td> <td>-Term2</td></tr>
     * <tr><td>+Term1</td> <td>Term(+,+)</td> <td>Term(+,-)</td></tr>
     * <tr><td>-Term1</td> <td>Term(-,+)</td> <td>Term(-,-)</td></tr>
     * </table></blockquote>
     *
     * where the values for a specified integer sequence
     * <code>nGram</code> and midpoint <code>0 &lt; mid &lt;
     * end</code> are:
     *
     * <blockquote><code>
     * Term(+,+) = count(nGram,0,end)
     * <br>Term(+,-) = count(nGram,0,mid) - count(nGram,0,end)
     * <br>Term(-,+) = count(nGram,mid,end) - count(nGram,0,end)
     * <br>Term(-,-) = totalCount - Term(+,+) - Term(+,-) - Term(-,+)
     * </code></blockquote>
     *
     * Note that using the overall total count provides a slight
     * overapproximation of the count of appropriate-length n-grams.
     *
     * <P>For further information on the independence test, see the
     * documentation for {@link
     * Statistics#chiSquaredIndependence(double,double,double,double)}.
     *
     * @param nGram Array of integers whose independence statistic is
     * returned.
     * @return Maximum independence test statistic score over splits
     * of the n-gram.
     * @throws IllegalArgumentException If the specified n-gram is
     * not at least two elements long.
     */
    public double chiSquaredIndependence(int[] nGram) {
        if (nGram.length < 2) {
            String msg = "Require n-gram >= 2 for chi square independence."
                + " Found nGram length=" + nGram.length;
            throw new IllegalArgumentException(msg);
        }
        if (nGram.length == 2)
            return chiSquaredSplit(nGram,1);
        double bestScore = Double.NEGATIVE_INFINITY;
        for (int mid = 1; mid+1 < nGram.length; ++mid)
            bestScore = Math.max(bestScore,
                                 chiSquaredSplit(nGram,mid));
        return bestScore;
    }
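
    /*
     * Sketch of testing a bigram for collocation strength
     * (illustrative; lm as above): look up symbol ids for two tokens
     * and compute the chi-squared independence statistic.  Both
     * tokens are assumed to have been seen in training, so their ids
     * are non-negative.
     *
     *   SymbolTable st = lm.symbolTable();
     *   int[] bigram = { st.symbolToID("strong"), st.symbolToID("tea") };
     *   double chiSq = lm.chiSquaredIndependence(bigram);
     */
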
    /**
     * Returns the z-score of the specified n-gram with the specified
     * count out of a total sample count, as measured against the
     * expectation of this tokenized language model.  Negative
     * z-scores mean the sample n-gram count is lower than expected
     * and positive z-scores mean the sample n-gram count is higher
     * than expected.  Z-scores close to zero indicate the sample
     * count is in line with expectations according to this language
     * model.
     *
     * <P>Formulas for z-scores and an explanation of their scaling
     * by deviation are given in the documentation for the static
     * method {@link BinomialDistribution#z(double,int,int)}.
     *
     * @param nGram The n-gram to test.
     * @param nGramSampleCount The number of observations of the
     * n-gram in the sample.
     * @param totalSampleCount The total number of samples.
     * @return The z-score for the specified sample counts against
     * the expectations of this language model.
     */
    public double z(int[] nGram, int nGramSampleCount, int totalSampleCount) {
        double totalCount = mCounter.count(nGram,0,0);
        double nGramCount = mCounter.count(nGram,0,nGram.length);
        double successProbability = nGramCount / totalCount;
        return BinomialDistribution.z(successProbability,
                                      nGramSampleCount,
                                      totalSampleCount);
    }

    /**
     * Returns a string-based representation of the token counts for
     * this language model.
     *
     * @return A string-based representation of this model.
     */
    @Override
    public String toString() {
        return mCounter.mRootNode.toString(mSymbolTable);
    }

    private double conditionalLog2TokenEstimate(int[] tokIds,
                                                int start, int end) {
        if (end < 1) return 0.0; // can't get hit from current calls; end >= 1
        int maxLength = mCounter.maxLength();
        int contextEnd = end-1;
        double estimate = tokIds[end-1] == UNKNOWN_TOKEN ? 1.0 : 0.0;
        for (int contextStart = end-1;
             (contextStart >= start
              && (end-contextStart) <= maxLength);
             --contextStart) {
            int numExtensions
                = mCounter.numExtensions(tokIds,contextStart,contextEnd);
            if (numExtensions == 0) break;
            double extCountD
                = mCounter.extensionCount(tokIds,contextStart,contextEnd);
            double lambda
                = extCountD
                / (extCountD + mLambdaFactor * (double) numExtensions);
            estimate = estimate * (1.0 - lambda);
            if (tokIds[end-1] == UNKNOWN_TOKEN) continue;
            int count = mCounter.count(tokIds,contextStart,end);
            if (count > 0)
                estimate += (lambda * ((double) count)) / extCountD;
        }
        return com.aliasi.util.Math.log2(estimate);
    }

    private double chiSquaredSplit(int[] pair, int mid) {
        // contingency table & probabilities
        //      _2    _y
        // 1_   12    1y
        // x_   x2    xy
        long count12 = mCounter.count(pair,0,pair.length);
        long count1_ = mCounter.count(pair,0,mid);
        long count_2 = mCounter.count(pair,mid,pair.length);
        long n = mCounter.extensionCount(pair,0,0);
        long countxy = n - count1_ - count_2 + count12;
        long countx2 = count_2 - count12;
        long count1y = count1_ - count12;
        return Statistics.chiSquaredIndependence(count12,count1y,
                                                 countx2,countxy);
    }

    private int lastInternalNodeIndex() {
        int last = 1;
        LinkedList<IntNode> queue = new LinkedList<IntNode>();
        queue.add(mCounter.mRootNode);
        for (int i = 1; !queue.isEmpty(); ++i) {
            IntNode node = queue.removeFirst();
            if (node.numExtensions() > 0)
                last = i;
            node.addDaughters(queue);
        }
        return last-1;
    }

    /**
     * The symbol used for unknown symbol IDs.
     */
    public static final int UNKNOWN_TOKEN = SymbolTable.UNKNOWN_SYMBOL_ID;

    /**
     * The symbol used for boundaries in the counter, -2.
     */
    public static final int BOUNDARY_TOKEN = -2;

    private static int[] concatenate(int[] is, int i) {
        int[] result = new int[is.length+1];
        System.arraycopy(is,0,result,0,is.length);
        result[is.length] = i;
        return result;
    }

    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = 6135272620545804504L;
        final TokenizedLM mLM;
        public Externalizer() {
            this(null);
        }
        public Externalizer(TokenizedLM lm) {
            mLM = lm;
        }
        @Override
        public Object read(ObjectInput in) throws IOException {
            try {
                return new CompiledTokenizedLM(in);
            } catch (ClassNotFoundException e) {
                throw Exceptions.toIO("TokenizedLM.Externalizer.read()",e);
            }
        }
        @Override
        public void writeExternal(ObjectOutput objOut) throws IOException {
            if (mLM.mTokenizerFactory instanceof Serializable) {
                objOut.writeUTF("");
                objOut.writeObject(mLM.mTokenizerFactory);
            } else {
                objOut.writeUTF(mLM.mTokenizerFactory.getClass().getName());
            }
            objOut.writeObject(mLM.mSymbolTable);
            ((LanguageModel.Dynamic) mLM.mUnknownTokenModel).compileTo(objOut);
            ((LanguageModel.Dynamic) mLM.mWhitespaceModel).compileTo(objOut);
            objOut.writeInt(mLM.mNGramOrder);
            int numNodes = mLM.mCounter.mRootNode.trieSize();
            objOut.writeInt(numNodes);
            int lastInternalNodeIndex = mLM.lastInternalNodeIndex();
            objOut.writeInt(lastInternalNodeIndex);

            // write root node (-int,-logP,-log(1-L),firstDtr)
            objOut.writeInt(Integer.MIN_VALUE); // root symbol unknown
            objOut.writeFloat(Float.NaN);       // no estimate
            objOut.writeFloat((float)
                              com.aliasi.util.Math
                              .log2(1.0 - mLM.lambda(com.aliasi.util.Arrays
                                                     .EMPTY_INT_ARRAY)));
            objOut.writeInt(1);                 // first dtr = 1
            LinkedList<int[]> queue = new LinkedList<int[]>();
            int[] outcomes
                = mLM.mCounter.mRootNode
                .integersFollowing(com.aliasi.util.Arrays.EMPTY_INT_ARRAY,0,0);
            for (int i = 0; i < outcomes.length; ++i)
                queue.add(new int[] { outcomes[i] });
            for (int i = 1; !queue.isEmpty(); ++i) {
                int[] is = queue.removeFirst();
                objOut.writeInt(is[is.length-1]);
                objOut.writeFloat((float)
                                  mLM.conditionalLog2TokenEstimate(is,0,is.length));
                if (i <= lastInternalNodeIndex) {
                    objOut.writeFloat((float)
                                      com.aliasi.util.Math.log2(1.0 - mLM.lambda(is)));
                    objOut.writeInt(i + queue.size() + 1);
                }
                int[] followers
                    = mLM.mCounter.mRootNode.integersFollowing(is,0,is.length);
                for (int j = 0; j < followers.length; ++j)
                    queue.add(concatenate(is,followers[j]));
            }
        }
    }

    @SuppressWarnings("rawtypes")
    static final ScoredObject[] EMPTY_SCORED_OBJECT_ARRAY
        = new ScoredObject[0];

    static final ScoredObject<String[]>[] EMPTY_SCORED_OBJECT_STRING_ARRAY_ARRAY
        = emptyScoredObjectArray();

    static ScoredObject<String[]>[] emptyScoredObjectArray() {
        @SuppressWarnings("unchecked")
        ScoredObject<String[]>[] result
            = (ScoredObject<String[]>[]) EMPTY_SCORED_OBJECT_ARRAY;
        return result;
    }

}




