com.aliasi.chunk.CompiledEstimator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
This is the original LingPipe (http://alias-i.com/lingpipe/web/download.html). No changes were made to the source code.
There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.chunk;

import com.aliasi.symbol.SymbolTable;

import com.aliasi.tokenizer.TokenCategorizer;

import java.io.IOException;
import java.io.ObjectInput;

import java.util.ArrayList;
import java.util.List;

/**
 * A compiled estimator is constructed by reading a binary model
 * compiled by a trainable estimator from a data input stream.  The
 * estimator may then be used to estimate instances of log
 * P(Tag,Token|Tag-1,Token-1,Token-2) using the {@link
 * #estimate(int,int,int,int,int)} method, where the integer values
 * are identifiers of the associated symbols in the appropriate (tag or
 * token) symbol table.  The symbol tables are stored in the compiled
 * estimator.  Various operations on tags as identifiers are
 * precomputed and supplied by methods in this class.
 *
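 * <p>The components of a compiled estimator are stored in the
 * following order.
 *
 * <table border="1" cellpadding="5">
 * <tr><th>Variable</th><th>Type</th><th>Description</th></tr>
 * <tr><td><code>tagSymbolTable</code></td>
 *     <td><code>SymbolTable</code></td>
 *     <td>Symbol table for tags.</td></tr>
 * <tr><td><code>tokenSymbolTable</code></td>
 *     <td><code>SymbolTable</code></td>
 *     <td>Symbol table for tokens.</td></tr>
 * <tr><td><code>tagTrie</code></td>
 *     <td><code>EstimatorTrie</code></td>
 *     <td>Estimator trie for tags.</td></tr>
 * <tr><td><code>tokenTrie</code></td>
 *     <td><code>EstimatorTrie</code></td>
 *     <td>Estimator trie for tokens.</td></tr>
 * <tr><td><code>logVocabEstimate</code></td>
 *     <td><code>double</code></td>
 *     <td>Estimate of log likelihood of a token.</td></tr>
 * </table>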
 *
 * @author  Bob Carpenter
 * @version 3.8
 * @since   LingPipe1.0
 */
final class CompiledEstimator {

    /**
     * A trie of estimates and backoffs for log
     * P(Tag|Tag-1,Token-1,Token-2).
     */
    private final EstimatorTrie mTagTrie;

    /**
     * A trie of estimates and backoffs for log
     * P(Token|Tag,Tag-1,Token-1).
     */
    private final EstimatorTrie mTokenTrie;

    /**
     * A symbol table for tags.
     */
    private final SymbolTable mTagSymbolTable;

    /**
     * The symbol table for tokens.
     */
    private final SymbolTable mTokenSymbolTable;

    /**
     * {@code mCannotFollow[tagID][tagMinus1ID]} is {@code true}
     * if and only if the tag with identifier {@code tagID}
     * <i>cannot</i> follow the tag with identifier
     * {@code tagMinus1ID}; entries are filled in by the constructor
     * from {@code Tags.illegalSequence}.
     */
    private final boolean[][] mCannotFollow;

    /**
     * {@code mConvertToInterior[tagID]} is the interior
     * tag ID with the same base tag as {@code tagID}.
     */
    private final int[] mConvertToInterior;

    /**
     * Array of start tag identifiers; tags are all prefixed by "ST_".
     * Does not include "OUT" tag.  Aligned index-by-index with
     * {@link #mInterior}.
     */
    private final int[] mStart;

    /**
     * Array of interior tag identifiers; tags not prefixed by "ST_".
     * Aligned index-by-index with {@link #mStart}.
     */
    private final int[] mInterior;

    /**
     * Natural log of the uniform vocabulary estimate for this
     * estimator.
     */
    private final double mLogUniformVocabEstimate;

    /**
     * Categorizer to provide token categories for smoothed
     * estimates and unknown estimates.
     */
    private final TokenCategorizer mTokenCategorizer;

    /**
     * Construct a compiled estimator by reading its components from
     * the specified object input.  The components are read in this
     * order: token categorizer, tag symbol table, token symbol table,
     * tag trie, token trie, and the log uniform vocabulary estimate.
     * After reading, the tag-derived lookup tables (interior tag
     * conversion, illegal tag sequences, and the aligned start and
     * interior tag arrays) are precomputed.
     *
     * @param in Object input from which to read the estimator.
     *
     * @throws ClassNotFoundException If the class of a serialized
     * object read from the input cannot be found.
     * @throws IOException If there is an I/O exception reading from
     * the object input.
     */
    public CompiledEstimator(ObjectInput in)
        throws ClassNotFoundException, IOException {

        mTokenCategorizer = (TokenCategorizer) in.readObject();

        mTagSymbolTable = (SymbolTable) in.readObject();
        mTokenSymbolTable = (SymbolTable) in.readObject();

        // the two tries are read directly from the model stream
        mTagTrie = new EstimatorTrie(in);
        mTokenTrie = new EstimatorTrie(in);
        mLogUniformVocabEstimate = in.readDouble();

        int numTags = mTagSymbolTable.numSymbols();
        mConvertToInterior = new int[numTags];
        mCannotFollow = new boolean[numTags][numTags];
        List<Integer> starts = new ArrayList<Integer>();
        List<Integer> interiors = new ArrayList<Integer>();
        for (int tagID = 0; tagID < numTags; ++tagID) {
            String tag = idToTag(tagID);
            mConvertToInterior[tagID] = tagToInteriorID(tag);
            // a tag that differs from its interior form is recorded as a
            // start tag; both lists grow together so they stay aligned
            if (tagID != mConvertToInterior[tagID]) {
                interiors.add(Integer.valueOf(mConvertToInterior[tagID]));
                starts.add(Integer.valueOf(tagID));
            }
            for (int tagMinus1ID = 0; tagMinus1ID < numTags; ++tagMinus1ID) {
                mCannotFollow[tagID][tagMinus1ID]
                    = Tags.illegalSequence(idToTag(tagMinus1ID),tag);
            }
        }
        mStart = convertToIntArray(starts);
        mInterior = convertToIntArray(interiors);
    }

    /**
     * Returns the array of start tag IDs.  The array returned is
     * aligned with the interior tag IDs returned by {@link
     * #interiorTagIDs()}.  The internal array itself is returned,
     * not a copy.
     *
     * @return Array of identifiers for start tags.
     */
    public int[] startTagIDs() {
        return mStart;
    }

    /**
     * Returns the array of interior tag IDs.  The array returned is
     * aligned with the start tag IDs returned by {@link
     * #startTagIDs()}.  The internal array itself is returned,
     * not a copy.
     *
     * @return Array of identifiers for interior tags.
     */
    public int[] interiorTagIDs() {
        return mInterior;
    }

    /**
     * Returns number of possible tags produced by this estimator,
     * including both base and start forms of tags and the
     * distinguished out tag.
     *
     * @return Number of possible tags produced by this estimator.
     */
    public int numTags() {
        return mTagSymbolTable.numSymbols();
    }

    /**
     * Maps a tag to its integer identifier or -1 if it
     * is not in the table.
     *
     * @param tag Name of tag.
     * @return Integer identifier for the specified tag or -1 if it
     * is not in the table.
     */
    public int tagToID(String tag) {
        return mTagSymbolTable.symbolToID(tag);
    }

    /**
     * Maps a tag identifier to the name of that tag by delegating to
     * the tag symbol table.  The symbol table determines the behavior
     * for identifiers it does not contain (originally documented as
     * throwing an array out of bounds exception).
     *
     * @param id Identifier of the tag to return.
     * @return Name of the tag identified by the specified identifier.
     */
    public String idToTag(int id) {
        return mTagSymbolTable.idToSymbol(id);
    }

    /**
     * Maps a token to its integer identifier or -1 if it
     * is not in the table.
     *
     * @param token Name of token.
     * @return Integer identifier for the specified token or -1 if it
     * is not in the table.
     */
    public int tokenToID(String token) {
        return mTokenSymbolTable.symbolToID(token);
    }

    /**
     * Maps a token to its integer identifier if it is in the symbol
     * table, or to the identifier of its token category.  If the
     * category is not in the token symbol table either, a warning is
     * written to standard error and the negative identifier is
     * returned.
     *
     * @param token Token to compute ID for.
     * @return Identifier of token if it exists, or identifier of its
     * category if nothing is known about the token; negative if
     * neither is in the token symbol table.
     */
    public int tokenOrCategoryToID(String token) {
        int id = tokenToID(token);
        if (id < 0) {
            // unknown token: back off to the identifier of its category
            id = tokenToID(mTokenCategorizer.categorize(token));
            if (id < 0) {
                System.err.println("No id for token category: " + token);
            }
        }
        return id;
    }

    /**
     * Maps an integer identifier to the token it represents
     * in the token symbol table.
     *
     * @param id Identifier of the token.
     * @return Token with specified identifier in the token symbol
     * table.
     */
    public String idToToken(int id) {
        return mTokenSymbolTable.idToSymbol(id);
    }

    /**
     * Returns {@code true} if the tag identified by the first
     * identifier cannot follow the tag identified by the second
     * identifier.
     *
     * @param tagID Identifier of tag.
     * @param tagMinus1ID Identifier of preceding tag.
     * @return {@code true} if the tag for {@code tagID}
     * cannot follow the tag for {@code tagMinus1ID}.
     */
    public boolean cannotFollow(int tagID, int tagMinus1ID) {
        return mCannotFollow[tagID][tagMinus1ID];
    }

    /**
     * Returns the identifier for the base tag of
     * the tag picked out by the specified identifier.
     *
     * @param tagID Identifier of tag to convert to base form.
     * @return Identifier of the base form of the tag picked out by
     * the specified identifier.
     */
    private int idToInteriorID(int tagID) {
        return mConvertToInterior[tagID];
    }


    /**
     * Returns log P(tag,token|tag-1,token-1,token-2),
     * where information about the tags and tokens are supplied
     * through symbol table identifiers.  Returns {@code Double.NaN}
     * if the tag cannot follow the previous tag.  The previous tag
     * is converted to its interior form before the tag and token
     * estimates are looked up and summed.
     *
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tokenID Identifier of outcome token to estimate along
     * with the tag.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @param tokenMinus2ID Token two back from token.
     * @return log P(tag,token|tag-1,token-1,token-2).
     */
    public double estimate(int tagID, int tokenID,
                           int tagMinus1ID,
                           int tokenMinus1ID,
                           int tokenMinus2ID) {
        if (cannotFollow(tagID,tagMinus1ID)) return Double.NaN;
        int tagMinus1IDInterior = idToInteriorID(tagMinus1ID);
        return estimateTag(tagID,tagMinus1IDInterior,
                           tokenMinus1ID,tokenMinus2ID)
            + estimateToken(tokenID,tagID,tagMinus1IDInterior,tokenMinus1ID);
    }

    /**
     * Return log P(tag|tag-1,token-1,token-2).  Returns
     * {@code Double.NaN} when nothing is known about
     * tag-1.
     *
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @param tokenMinus2ID Token two back from token.
     * @return log P(tag|tag-1,token-1,token-2).
     */
    private double estimateTag(int tagID,
                               int tagMinus1ID,
                               int tokenMinus1ID,
                               int tokenMinus2ID) {
        // find most specific node matching context,
        // then lookup estimate from there
        // estimating from node follows backoffs,
        // adding 1-lambda from current context as necessary
        int nodeTag1Index = mTagTrie.lookupChild(tagMinus1ID,0);
        if (nodeTag1Index == -1) {
            // no outcomes for simple tag -- really an error
            return Double.NaN;
        }
        int nodeTag1W1Index
            = mTagTrie.lookupChild(tokenMinus1ID,nodeTag1Index);
        if (nodeTag1W1Index == -1) {
            return mTagTrie.estimateFromNode(tagID,nodeTag1Index);
        }
        int nodeTag1W1W2Index
            = mTagTrie.lookupChild(tokenMinus2ID,nodeTag1W1Index);
        if (nodeTag1W1W2Index == -1) {
            return mTagTrie.estimateFromNode(tagID,nodeTag1W1Index);
        }
        return mTagTrie.estimateFromNode(tagID,nodeTag1W1W2Index);
    }

    /**
     * Return log P(token|tag,tag-1,token-1), where
     * information about the tags and tokens are supplied through
     * symbol table identifiers.  Return {@code Double.NaN} if
     * nothing is known about tag.
     *
     * @param tokenID Identifier of outcome token to estimate along
     * with the tag.
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @return  log P(token|tag,tag-1,token-1).
     */
    private double estimateToken(int tokenID,
                                 int tagID, int tagMinus1ID,
                                 int tokenMinus1ID) {
        int nodeTagIndex = mTokenTrie.lookupChild(tagID,0);
        if (nodeTagIndex == -1) {
            return Double.NaN;
        }
        int nodeTagTag1Index = mTokenTrie.lookupChild(tagMinus1ID,nodeTagIndex);
        if (nodeTagTag1Index == -1) {
            // only the tag context is known
            return
                mTokenTrie.estimateFromNodeUniform(tokenID,
                                                   nodeTagIndex,
                                                   mLogUniformVocabEstimate);
        }
        int nodeTagTag1W1Index
            = mTokenTrie.lookupChild(tokenMinus1ID,nodeTagTag1Index);
        if (nodeTagTag1W1Index != -1) {
            // full context (tag, tag-1, token-1) is known
            return
                mTokenTrie.estimateFromNodeUniform(tokenID,
                                                   nodeTagTag1W1Index,
                                                   mLogUniformVocabEstimate);
        }
        // fall back to the (tag, tag-1) context
        return mTokenTrie.estimateFromNodeUniform(tokenID,
                                                  nodeTagTag1Index,
                                                  mLogUniformVocabEstimate);
    }

    /**
     * Return the identifier for the base tag corresponding
     * to the specified tag.
     *
     * @param tag Tag whose base tag ID is returned.
     * @return Identifier for base tag of specified tag.
     */
    private int tagToInteriorID(String tag) {
        return tagToID(Tags.toInnerTag(tag));
    }

    /**
     * Convert the list of {@code Integer} objects to an
     * array of their integer values, preserving order.
     *
     * @param xs List of {@code Integer} objects.
     * @return Array of integer values for the specified list of
     * objects.
     */
    private static int[] convertToIntArray(List<Integer> xs) {
        int[] result = new int[xs.size()];
        for (int i = 0; i < result.length; ++i) {
            result[i] = xs.get(i).intValue();
        }
        return result;
    }

}
Variable	Type	Description
`tagSymbolTable`	`SymbolTable`	Symbol table for tags.
`tokenSymbolTable`	`SymbolTable`	Symbol table for tokens.
`tagTrie`	`EstimatorTrie`	Estimator trie for tags.
`tokenTrie`	`EstimatorTrie`	Estimator trie for tokens.
`logVocabEstimate`	`double`	Estimate of log likelihood of a token.