// Source listing: com.aliasi.chunk.CompiledEstimator (Maven / Gradle / Ivy).
// Go to download.
// Show more of this group. Show more artifacts with this name.
// Show all versions of aliasi-lingpipe. Show documentation.
// This is the original LingPipe:
// http://alias-i.com/lingpipe/web/download.html
// No changes were made to the source code.
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.chunk;
import com.aliasi.symbol.SymbolTable;
import com.aliasi.tokenizer.TokenCategorizer;
import java.io.IOException;
import java.io.ObjectInput;
import java.util.ArrayList;
import java.util.List;
/**
 * A compiled estimator is constructed by reading a binary model
 * compiled by a trainable estimator from a data input stream.  The
 * estimator may then be used to estimate instances of <code>log
 * P(Tag,Token|Tag-1,Token-1,Token-2)</code> using the {@link
 * #estimate(int,int,int,int,int)} method, where the integer values
 * are identifiers of the associated symbols in the appropriate (tag or
 * token) symbol table.  The symbol tables are stored in the compiled
 * estimator.  Various operations on tags as identifiers are
 * precomputed and supplied by methods in this class.
 *
 * <p>The components of a compiled estimator are stored in the
 * following order.
 *
 * <table border="1">
 * <tr><th>Variable</th><th>Type</th><th>Description</th></tr>
 * <tr><td>tagSymbolTable</td>
 *     <td>SymbolTable</td>
 *     <td>Symbol table for tags.</td></tr>
 * <tr><td>tokenSymbolTable</td>
 *     <td>SymbolTable</td>
 *     <td>Symbol table for tokens.</td></tr>
 * <tr><td>tagTrie</td>
 *     <td>EstimatorTrie</td>
 *     <td>Estimator trie for tags.</td></tr>
 * <tr><td>tokenTrie</td>
 *     <td>EstimatorTrie</td>
 *     <td>Estimator trie for tokens.</td></tr>
 * <tr><td>logVocabEstimate</td>
 *     <td>double</td>
 *     <td>Estimate of log likelihood of a token.</td></tr>
 * </table>
 *
 * @author Bob Carpenter
 * @version 3.8
 * @since LingPipe1.0
 */
final class CompiledEstimator {

    /**
     * A trie of estimates and backoffs for <code>log
     * P(Tag|Tag-1,Token-1,Token-2)</code>.
     */
    private final EstimatorTrie mTagTrie;

    /**
     * A trie of estimates and backoffs for <code>log
     * P(Token|Tag,Tag-1,Token-1)</code>.
     */
    private final EstimatorTrie mTokenTrie;

    /**
     * The symbol table for tags.
     */
    private final SymbolTable mTagSymbolTable;

    /**
     * The symbol table for tokens.
     */
    private final SymbolTable mTokenSymbolTable;

    /**
     * <code>mCannotFollow[tagID][tagMinus1ID]</code> is
     * <code>true</code> if and only if the tag with identifier
     * <code>tagID</code> <b>cannot</b> follow the tag with identifier
     * <code>tagMinus1ID</code>, as reported by
     * <code>Tags.illegalSequence(previousTag,tag)</code>.
     */
    private final boolean[][] mCannotFollow;

    /**
     * <code>mConvertToInterior[tagID]</code> is the interior
     * tag ID with the same base tag as <code>tagID</code>.
     */
    private final int[] mConvertToInterior;

    /**
     * Array of start tag identifiers, i.e. those tags whose
     * identifier differs from the identifier of their interior form.
     * Per the original documentation these are the tags prefixed by
     * "ST_", and the "OUT" tag is not included.
     */
    private final int[] mStart;

    /**
     * Array of interior tag identifiers, aligned index-by-index with
     * {@link #mStart}: entry <code>i</code> is the interior form of
     * the start tag <code>mStart[i]</code>.
     */
    private final int[] mInterior;

    /**
     * Natural log of the uniform vocabulary estimate for this
     * estimator.
     */
    private final double mLogUniformVocabEstimate;

    /**
     * Categorizer to provide token categories for smoothed
     * estimates and unknown estimates.
     */
    private final TokenCategorizer mTokenCategorizer;

    /**
     * Construct a compiled estimator by reading it from the specified
     * object input.  The components are read in the following order:
     * token categorizer, tag symbol table, token symbol table, tag
     * trie, token trie, and finally the log uniform vocabulary
     * estimate used to smooth the token model.  The tag lookup tables
     * (interior conversion, illegal sequences, start/interior arrays)
     * are then precomputed from the tag symbol table.
     *
     * @param in Object input from which to read the estimator.
     * @throws ClassNotFoundException If the class of a serialized
     * component read from the input cannot be found.
     * @throws IOException If there is an I/O exception reading from
     * the object input.
     */
    public CompiledEstimator(ObjectInput in)
        throws ClassNotFoundException, IOException {

        mTokenCategorizer = (TokenCategorizer) in.readObject();
        mTagSymbolTable = (SymbolTable) in.readObject();
        mTokenSymbolTable = (SymbolTable) in.readObject();

        // read from model & put in training
        mTagTrie = new EstimatorTrie(in);
        mTokenTrie = new EstimatorTrie(in);
        mLogUniformVocabEstimate = in.readDouble();

        int numTags = mTagSymbolTable.numSymbols();
        mConvertToInterior = new int[numTags];
        mCannotFollow = new boolean[numTags][numTags];

        // Resolve every tag name once so the quadratic loop below
        // does not hit the symbol table on each iteration.
        String[] tags = new String[numTags];
        for (int tagID = 0; tagID < numTags; ++tagID) {
            tags[tagID] = idToTag(tagID);
        }

        List<Integer> starts = new ArrayList<Integer>();
        List<Integer> interiors = new ArrayList<Integer>();
        for (int tagID = 0; tagID < numTags; ++tagID) {
            String tag = tags[tagID];
            mConvertToInterior[tagID] = tagToInteriorID(tag);
            // A tag that maps to a different interior ID is a start
            // tag; record the pair at the same index in both lists.
            if (tagID != mConvertToInterior[tagID]) {
                interiors.add(Integer.valueOf(mConvertToInterior[tagID]));
                starts.add(Integer.valueOf(tagID));
            }
            for (int tagMinus1ID = 0; tagMinus1ID < numTags; ++tagMinus1ID) {
                mCannotFollow[tagID][tagMinus1ID]
                    = Tags.illegalSequence(tags[tagMinus1ID],tag);
            }
        }
        mStart = convertToIntArray(starts);
        mInterior = convertToIntArray(interiors);
    }

    /**
     * Returns the array of start tag IDs.  The array returned is
     * aligned with the interior tag IDs returned by {@link
     * #interiorTagIDs()}.  The returned array is the internal array,
     * not a copy, and must not be modified by the caller.
     *
     * @return Array of identifiers for start tags.
     */
    public int[] startTagIDs() {
        return mStart;
    }

    /**
     * Returns the array of interior tag IDs.  The array returned is
     * aligned with the start tag IDs returned by {@link
     * #startTagIDs()}.  The returned array is the internal array,
     * not a copy, and must not be modified by the caller.
     *
     * @return Array of identifiers for interior tags.
     */
    public int[] interiorTagIDs() {
        return mInterior;
    }

    /**
     * Returns number of possible tags produced by this estimator,
     * including both base and start forms of tags and the
     * distinguished out tag.
     *
     * @return Number of possible tags produced by this estimator.
     */
    public int numTags() {
        return mTagSymbolTable.numSymbols();
    }

    /**
     * Maps a tag to its integer identifier or <code>-1</code> if it
     * is not in the table.
     *
     * @param tag Name of tag.
     * @return Integer identifier for the specified tag or
     * <code>-1</code> if it is not in the table.
     */
    public int tagToID(String tag) {
        return mTagSymbolTable.symbolToID(tag);
    }

    /**
     * Maps a tag identifier to the name of that tag.  Throws an array
     * out of bounds exception if the identifier does not exist in the
     * table.
     *
     * @param id Identifier of the tag to return.
     * @return Name of the tag identified by the specified identifier.
     */
    public String idToTag(int id) {
        return mTagSymbolTable.idToSymbol(id);
    }

    /**
     * Maps a token to its integer identifier or <code>-1</code> if it
     * is not in the table.
     *
     * @param token Name of token.
     * @return Integer identifier for the specified token or
     * <code>-1</code> if it is not in the table.
     */
    public int tokenToID(String token) {
        return mTokenSymbolTable.symbolToID(token);
    }

    /**
     * Maps a token to its integer identifier if it is in the symbol
     * table, or to the identifier of its token category otherwise.
     * If the category is not in the symbol table either, a diagnostic
     * is written to standard error and the negative identifier is
     * returned.
     *
     * @param token Token to compute ID for.
     * @return Identifier of token if it exists, or identifier of its
     * category if nothing is known about the token.
     */
    public int tokenOrCategoryToID(String token) {
        int id = tokenToID(token);
        if (id >= 0) {
            return id;
        }
        id = tokenToID(mTokenCategorizer.categorize(token));
        if (id < 0) {
            System.err.println("No id for token category: " + token);
        }
        return id;
    }

    /**
     * Maps an integer identifier to the token it represents
     * in the token symbol table.
     *
     * @param id Identifier of the token.
     * @return Token with specified identifier in the token symbol
     * table.
     */
    public String idToToken(int id) {
        return mTokenSymbolTable.idToSymbol(id);
    }

    /**
     * Returns <code>true</code> if the tag identified by the first
     * identifier cannot follow the tag identified by the second
     * identifier.
     *
     * @param tagID Identifier of tag.
     * @param tagMinus1ID Identifier of preceding tag.
     * @return <code>true</code> if the tag for <code>tagID</code>
     * cannot follow the tag for <code>tagMinus1ID</code>.
     */
    public boolean cannotFollow(int tagID, int tagMinus1ID) {
        return mCannotFollow[tagID][tagMinus1ID];
    }

    /**
     * Returns the identifier for the base tag of
     * the tag picked out by the specified identifier.
     *
     * @param tagID Identifier of tag to convert to base form.
     * @return Identifier of the base form of the tag picked out by
     * the specified identifier.
     */
    private int idToInteriorID(int tagID) {
        return mConvertToInterior[tagID];
    }

    /**
     * Returns <code>log P(tag,token|tag-1,token-1,token-2)</code>,
     * where information about the tags and tokens are supplied
     * through symbol table identifiers.  The result is the sum of the
     * tag estimate and the token estimate, both conditioned on the
     * interior form of the previous tag.  Returns
     * <code>Double.NaN</code> if the tag cannot follow the previous
     * tag.
     *
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tokenID Identifier of outcome token to estimate along
     * with the tag.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @param tokenMinus2ID Token two back from token.
     * @return <code>log P(tag,token|tag-1,token-1,token-2)</code>.
     */
    public double estimate(int tagID, int tokenID,
                           int tagMinus1ID,
                           int tokenMinus1ID,
                           int tokenMinus2ID) {
        if (cannotFollow(tagID,tagMinus1ID)) {
            return Double.NaN;
        }
        int tagMinus1IDInterior = idToInteriorID(tagMinus1ID);
        return estimateTag(tagID,tagMinus1IDInterior,
                           tokenMinus1ID,tokenMinus2ID)
            + estimateToken(tokenID,tagID,tagMinus1IDInterior,tokenMinus1ID);
    }

    /**
     * Return <code>log P(tag|tag-1,token-1,token-2)</code>.  Returns
     * <code>Double.NaN</code> when nothing is known about
     * <code>tag-1</code>.
     *
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @param tokenMinus2ID Token two back from token.
     * @return <code>log P(tag|tag-1,token-1,token-2)</code>.
     */
    private double estimateTag(int tagID,
                               int tagMinus1ID,
                               int tokenMinus1ID,
                               int tokenMinus2ID) {
        // find most specific node matching context,
        // then lookup estimate from there
        // estimating from node follows backoffs,
        // adding 1-lambda from current context as necessary
        int nodeTag1Index = mTagTrie.lookupChild(tagMinus1ID,0);
        if (nodeTag1Index == -1) {
            // no outcomes for simple tag -- really an error
            return Double.NaN;
        }
        int nodeTag1W1Index
            = mTagTrie.lookupChild(tokenMinus1ID,nodeTag1Index);
        if (nodeTag1W1Index == -1) {
            return mTagTrie.estimateFromNode(tagID,nodeTag1Index);
        }
        int nodeTag1W1W2Index
            = mTagTrie.lookupChild(tokenMinus2ID,nodeTag1W1Index);
        if (nodeTag1W1W2Index == -1) {
            return mTagTrie.estimateFromNode(tagID,nodeTag1W1Index);
        }
        return mTagTrie.estimateFromNode(tagID,nodeTag1W1W2Index);
    }

    /**
     * Return <code>log P(token|tag,tag-1,token-1)</code>, where
     * information about the tags and tokens are supplied through
     * symbol table identifiers.  Return <code>Double.NaN</code> if
     * nothing is known about <code>tag</code>.
     *
     * @param tokenID Identifier of outcome token to estimate along
     * with the tag.
     * @param tagID Identifier of outcome tag to estimate along with
     * the token.
     * @param tagMinus1ID Identifier of the previous tag.
     * @param tokenMinus1ID Identifier of the previous token.
     * @return <code>log P(token|tag,tag-1,token-1)</code>.
     */
    private double estimateToken(int tokenID,
                                 int tagID, int tagMinus1ID,
                                 int tokenMinus1ID) {
        // Walk down the trie as far as the context is known and
        // estimate from the deepest node reached.
        int nodeTagIndex = mTokenTrie.lookupChild(tagID,0);
        if (nodeTagIndex == -1) {
            return Double.NaN;
        }
        int nodeTagTag1Index
            = mTokenTrie.lookupChild(tagMinus1ID,nodeTagIndex);
        if (nodeTagTag1Index == -1) {
            return mTokenTrie.estimateFromNodeUniform(tokenID,
                                                      nodeTagIndex,
                                                      mLogUniformVocabEstimate);
        }
        int nodeTagTag1W1Index
            = mTokenTrie.lookupChild(tokenMinus1ID,nodeTagTag1Index);
        if (nodeTagTag1W1Index == -1) {
            return mTokenTrie.estimateFromNodeUniform(tokenID,
                                                      nodeTagTag1Index,
                                                      mLogUniformVocabEstimate);
        }
        return mTokenTrie.estimateFromNodeUniform(tokenID,
                                                  nodeTagTag1W1Index,
                                                  mLogUniformVocabEstimate);
    }

    /**
     * Return the identifier for the base tag corresponding
     * to the specified tag.
     *
     * @param tag Tag whose base tag ID is returned.
     * @return Identifier for base tag of specified tag.
     */
    private int tagToInteriorID(String tag) {
        return tagToID(Tags.toInnerTag(tag));
    }

    /**
     * Convert the list of <code>Integer</code> objects to an
     * array of their integer values, preserving order.
     *
     * @param xs List of Integer objects.
     * @return Array of integer values for the specified list of
     * objects.
     */
    private static int[] convertToIntArray(List<Integer> xs) {
        int[] result = new int[xs.size()];
        int i = 0;
        for (Integer x : xs) {
            result[i++] = x.intValue();
        }
        return result;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy