All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aliasi.lm.NGramBoundaryLM Maven / Gradle / Ivy

Go to download

This is the original LingPipe distribution: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.lm;

import com.aliasi.corpus.ObjectHandler;

import com.aliasi.io.BitInput;
import com.aliasi.io.BitOutput;

import com.aliasi.stats.Model;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.Strings;

import java.io.InputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.OutputStream;
import java.io.Serializable;

/**
 * An NGramBoundaryLM provides a dynamic sequence
 * language model for which training, estimation and pruning may be
 * interleaved.  A sequence language model normalizes probabilities
 * over all sequences.
 *
 * 

 * <p>This class wraps an n-gram process language model by supplying a
 * special boundary character <code>boundaryChar</code> at
 * construction time, which is added to the total number of
 * characters in defining the estimator.  For each training event, the
 * boundary character is inserted both before and after the character
 * sequence provided.  The unigram count of the boundary character is
 * then decremented so that the initial boundary isn't counted in
 * estimates.  During estimation, the initial boundary character is
 * used as context and the final one is used to estimate the
 * end-of-stream likelihood.  Thus if <code>P<sub>pr</sub></code>
 * is the underlying process model, then the boundary model defines
 * estimates by:
 *

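 * <blockquote><pre>
 * \log_2 P_b(c_1,\ldots,c_N)
 *   = \log_2 P_{pr}(boundaryChar \mid boundaryChar,c_1,\ldots,c_N)
 *     + \sum_{i=1}^{N} \log_2 P_{pr}(c_i \mid boundaryChar,c_1,\ldots,c_{i-1})
 *   = \log_2 P_{pr}(boundaryChar,c_1,\ldots,c_N,boundaryChar)
 *     - \log_2 P_{pr}(boundaryChar)
 * </pre></blockquote>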
 *
 * <p>The result of compiling and deserializing an n-gram boundary
 * language model is a compiled implementation of a conditional
 * sequence language model.  In both the compiled and the serialized
 * formats, the boundary character is written first, followed by the
 * compiled or serialized form of the contained process language model.
 *

 * <p>Models may be pruned by pruning the substring counter returned
 * by {@link #substringCounter()}.  See the documentation for the
 * class of the return object, {@link TrieCharSeqCounter}, for more
 * information.
 *

 * <h3>Serialization and Compilation</h3>
 *
 * <p>N-gram boundary language models are both serializable and compilable,
 * implementing Java's {@link Serializable} interface and
 * LingPipe's {@link Compilable} interface.
 *

<p>Serialization and deserialization returns a copy of the
 * serialized object, which again implements this class, {@code
 * NGramBoundaryLM}.  Compilation and deserialization returns an
 * instance of {@link CompiledNGramBoundaryLM}.  The compiled version
 * is much faster and may also be more compact in memory.
 *
 * @author  Bob Carpenter
 * @version 4.1.0
 * @since   LingPipe2.0
 */
public class NGramBoundaryLM
    implements LanguageModel.Sequence,
               LanguageModel.Conditional,
               LanguageModel.Dynamic,
               // NOTE(review): a type argument on Model may have been lost
               // when this listing was extracted; confirm against the
               // declaration of com.aliasi.stats.Model.
               Model,
               Compilable,
               Serializable {

    static final long serialVersionUID = 2917786830470130748L;

    /** Underlying process model; sees every sequence wrapped in boundaries. */
    private final NGramProcessLM mProcessLM;

    /** Character inserted before and after every sequence. */
    private final char mBoundaryChar;

    /** One-element array holding the boundary, reused by the estimators. */
    private final char[] mBoundaryArray;

    /**
     * Constructs a dynamic n-gram sequence language model with the
     * specified maximum n-gram and default values for other
     * parameters.
     *
     * <p>The default number of characters is {@link
     * Character#MAX_VALUE}<code>-1</code>, the default interpolation
     * parameter ratio is equal to the n-gram length, and the boundary
     * character is the Unicode noncharacter <code>U+FFFF</code>.
     *
     * @param maxNGram Maximum n-gram length in model.
     */
    public NGramBoundaryLM(int maxNGram) {
        this(maxNGram,Character.MAX_VALUE-1);
    }

    /**
     * Constructs a dynamic n-gram sequence language model with the
     * specified maximum n-gram, specified maximum number of observed
     * characters, and default values for other parameters.
     *
     * <p>The default interpolation parameter ratio is equal to the
     * n-gram length, and the boundary character is the Unicode
     * noncharacter <code>U+FFFF</code>.
     *
     * @param maxNGram Maximum n-gram length in model.
     * @param numChars Maximum number of characters seen in training
     * and test sets.
     */
    public NGramBoundaryLM(int maxNGram, int numChars) {
        this(maxNGram,numChars,maxNGram,'\uFFFF');
    }

    /**
     * Construct a dynamic n-gram sequence language model with the
     * specified maximum n-gram length, number of characters,
     * interpolation ratio hyperparameter and boundary character.
     * Note that the boundary character must not occur as a regular
     * character in the input; training and estimation methods throw
     * an {@code IllegalArgumentException} if it does.  Unicode
     * provides several options for marker characters; for instance
     * the noncharacter <code>U+FFFF</code> or the byte-order mark
     * <code>U+FEFF</code> may be used internally by applications and
     * are not expected in the body of ordinary text, which makes them
     * convenient choices for boundary characters.  See:
     *
     * <ul>
     * <li>Unicode Standard, Chapter 15.8: NonCharacters</li>
     * </ul>
     *
     * <p>The underlying process model is created with
     * <code>numChars+1</code> characters to make room for the
     * boundary character.
     *
     * @param maxNGram Maximum n-gram length in model.
     * @param numChars Maximum number of characters seen in training
     * and test sets.
     * @param lambdaFactor Interpolation ratio hyperparameter.
     * @param boundaryChar Boundary character.
     */
    public NGramBoundaryLM(int maxNGram,
                           int numChars,
                           double lambdaFactor,
                           char boundaryChar) {
        this(new NGramProcessLM(maxNGram,numChars+1,lambdaFactor),
             boundaryChar);
    }

    /**
     * Construct an n-gram boundary language model with the specified
     * boundary character and underlying process language model.
     *
     * <p>This constructor may be used to reconstitute a serialized
     * model.  By writing the trie character sequence counter for the
     * underlying process language model, it may be read back in.
     * This may be used to construct a process language model, which
     * may be used to reconstruct a boundary language model using
     * this constructor.
     *
     * <p>The process model is stored by reference, not copied.
     *
     * @param processLm Underlying process language model.
     * @param boundaryChar Character used to encode boundaries.
     */
    public NGramBoundaryLM(NGramProcessLM processLm,
                           char boundaryChar) {
        mBoundaryChar = boundaryChar;
        mBoundaryArray = new char[] { boundaryChar };
        mProcessLM = processLm;
    }

    /**
     * Writes this language model to the specified output stream.
     *
     * <p>A bit output is wrapped around the output stream for
     * writing.  The format begins with a delta-encoding of
     * the boundary character plus 1, and is followed by the
     * bit output of the underlying process language model.
     * The bit output is flushed, but the stream is not closed.
     *
     * @param out Output stream to which the language model is written.
     * @throws IOException If there is an underlying I/O error.
     */
    public void writeTo(OutputStream out) throws IOException {
        BitOutput bitOut = new BitOutput(out);
        // written as boundary+1 so that a boundary of U+0000 is not
        // encoded as the value zero
        bitOut.writeDelta((long)(mBoundaryChar+1));
        mProcessLM.writeTo(bitOut);
        bitOut.flush();
    }

    /**
     * Read a boundary language model from the specified input
     * stream.
     *
     * <p>See {@link #writeTo(OutputStream)} for a description
     * of the binary format.
     *
     * @param in Input stream from which to read the model.
     * @return Boundary language model read from stream.
     * @throws IOException If there is an underlying I/O error.
     */
    public static NGramBoundaryLM readFrom(InputStream in)
        throws IOException {

        BitInput bitIn = new BitInput(in);
        // undo the +1 offset applied by writeTo
        char boundaryChar = (char) (bitIn.readDelta()-1L);
        NGramProcessLM processLM = NGramProcessLM.readFrom(bitIn);
        return new NGramBoundaryLM(processLM,boundaryChar);
    }

    /**
     * Returns the underlying n-gram process language model
     * for this boundary language model.  Changes to the returned
     * model affect this language model.
     *
     * @return The underlying process language model.
     */
    public NGramProcessLM getProcessLM() {
        return mProcessLM;
    }

    /**
     * Returns the characters that have been observed for this
     * language model, including the special boundary character.
     *
     * @return The observed characters for this language model.
     */
    public char[] observedCharacters() {
        return mProcessLM.observedCharacters();
    }

    /**
     * Returns the underlying substring counter for this language
     * model.  This model may be pruned by pruning the counter
     * returned by this method.
     *
     * @return The underlying substring counter for this language model.
     */
    public TrieCharSeqCounter substringCounter() {
        return mProcessLM.substringCounter();
    }

    /**
     * Writes a compiled version of this boundary language model to
     * the specified object output.  The result may be read back in
     * by casting the result of {@link ObjectInput#readObject()} to
     * {@link CompiledNGramBoundaryLM}.
     *
     * @param objOut Object output to which this model is compiled.
     * @throws IOException If there is an I/O exception during the
     * write.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Substitutes a {@link Serializer} proxy for this object when it
     * is serialized; the proxy reads back as an {@code NGramBoundaryLM}.
     *
     * @return Serialization proxy wrapping this model.
     */
    Object writeReplace() {
        return new Serializer(this);
    }

    /**
     * Train the language model on the specified character sequence.
     * This method just delegates to {@link #train(CharSequence)}.
     *
     * @param cSeq Character sequence on which to train.
     */
    public void handle(CharSequence cSeq) {
        train(cSeq);
    }

    /**
     * Trains this model on the specified character sequence with the
     * specified count.  The sequence is wrapped in boundary characters
     * before being passed to the underlying process model.
     *
     * @param cs Character sequence on which to train.
     * @param count Number of training instances the sequence represents.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    public void train(CharSequence cs, int count) {
        char[] csBounded = addBoundaries(cs,mBoundaryChar);
        mProcessLM.train(csBounded,0,csBounded.length,count);
        // don't count initial boundary
        mProcessLM.decrementUnigram(mBoundaryChar,count);
    }

    /**
     * Trains this model on the specified character sequence with a
     * count of one.
     *
     * @param cs Character sequence on which to train.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    public void train(CharSequence cs) {
        train(cs,1);
    }

    /**
     * Trains this model on the specified character slice with a
     * count of one.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in slice.
     * @param end Index of one past the last character in slice.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character.
     * @throws IndexOutOfBoundsException If the indices do not
     * delimit a slice of the array.
     */
    public void train(char[] cs, int start, int end) {
        train(cs,start,end,1);
    }

    /**
     * Trains this model on the specified character slice with the
     * specified count.  The slice is wrapped in boundary characters
     * before being passed to the underlying process model.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in slice.
     * @param end Index of one past the last character in slice.
     * @param count Number of training instances the slice represents.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character.
     * @throws IndexOutOfBoundsException If the indices do not
     * delimit a slice of the array.
     */
    public void train(char[] cs, int start, int end, int count) {
        char[] csBounded = addBoundaries(cs,start,end,mBoundaryChar);
        mProcessLM.train(csBounded,0,csBounded.length,count);
        // don't count initial boundary
        mProcessLM.decrementUnigram(mBoundaryChar,count);
    }

    /**
     * Returns the log (base 2) conditional estimate computed by the
     * underlying process model over the initial boundary followed by
     * the specified sequence; the final boundary is not included.
     *
     * @param cs Character sequence to estimate.
     * @return Log (base 2) conditional estimate.
     * @throws IllegalArgumentException If the sequence is empty or
     * contains the boundary character.
     */
    public double log2ConditionalEstimate(CharSequence cs) {
        if (cs.length() < 1) {
            String msg = "Conditional estimate must be at least one character.";
            throw new IllegalArgumentException(msg);
        }
        char[] csBounded = addBoundaries(cs,mBoundaryChar);
        // length-1 drops the trailing boundary from the estimate
        return mProcessLM.log2ConditionalEstimate(csBounded,0,csBounded.length-1);
    }

    /**
     * Returns the log (base 2) conditional estimate computed by the
     * underlying process model over the initial boundary followed by
     * the specified slice; the final boundary is not included.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in slice.
     * @param end Index of one past the last character in slice.
     * @return Log (base 2) conditional estimate.
     * @throws IllegalArgumentException If the slice is empty or
     * contains the boundary character.
     * @throws IndexOutOfBoundsException If the indices do not
     * delimit a slice of the array.
     */
    public double log2ConditionalEstimate(char[] cs, int start, int end) {
        if (end <= start) {
            String msg = "Conditional estimate must be at least one character.";
            throw new IllegalArgumentException(msg);
        }
        char[] csBounded = addBoundaries(cs,start,end,mBoundaryChar);
        // length-1 drops the trailing boundary from the estimate
        return mProcessLM.log2ConditionalEstimate(csBounded,0,csBounded.length-1);
    }

    /**
     * Returns the log (base 2) estimate of the specified sequence:
     * the process model's estimate of the sequence wrapped in
     * boundaries, minus its estimate of the boundary alone.
     *
     * @param cs Character sequence to estimate.
     * @return Log (base 2) estimate of the sequence.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    public double log2Estimate(CharSequence cs) {
        char[] csBounded = addBoundaries(cs,mBoundaryChar);
        return mProcessLM.log2Estimate(csBounded,0,csBounded.length)
            - mProcessLM.log2Estimate(mBoundaryArray,0,1);
    }

    /**
     * Returns the log (base 2) estimate of the specified slice:
     * the process model's estimate of the slice wrapped in
     * boundaries, minus its estimate of the boundary alone.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in slice.
     * @param end Index of one past the last character in slice.
     * @return Log (base 2) estimate of the slice.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character.
     * @throws IndexOutOfBoundsException If the indices do not
     * delimit a slice of the array.
     */
    public double log2Estimate(char[] cs, int start, int end) {
        char[] csBounded = addBoundaries(cs,start,end,mBoundaryChar);
        return mProcessLM.log2Estimate(csBounded,0,csBounded.length)
            - mProcessLM.log2Estimate(mBoundaryArray,0,1);
    }

    /**
     * This method is a convenience implementation of the {@link
     * Model} interface which delegates the call to {@link
     * #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The log (base 2) probability of the specified character sequence.
     */
    public double log2Prob(CharSequence cSeq) {
        return log2Estimate(cSeq);
    }

    /**
     * This method is a convenience implementation of the {@link Model}
     * interface which returns the result of raising 2.0 to the
     * power of the result of a call to {@link #log2Estimate(CharSequence)}.
     *
     * @param cSeq Character sequence whose probability is returned.
     * @return The probability of the specified character sequence.
     */
    public double prob(CharSequence cSeq) {
        return java.lang.Math.pow(2.0,log2Estimate(cSeq));
    }

    /**
     * Returns a string-based representation of this language model.
     * It displays the boundary character (as its integer code) and
     * the contained process language model.
     *
     * @return A string-based representation of this language model.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Boundary char=").append((int) mBoundaryChar);
        sb.append('\n');
        mProcessLM.toStringBuilder(sb);
        return sb.toString();
    }

    /**
     * Returns a new array holding the boundary character, the
     * characters of the specified sequence, and the boundary
     * character again.
     *
     * @param cs Character sequence to wrap.
     * @param boundaryChar Boundary character.
     * @return Array of length <code>cs.length()+2</code>.
     * @throws IllegalArgumentException If the sequence contains the
     * boundary character.
     */
    static char[] addBoundaries(CharSequence cs, char boundaryChar) {
        int len = cs.length();
        char[] cs2 = new char[len + 2];
        for (int i = 0; i < len; ++i) {
            char c = cs.charAt(i);
            if (c == boundaryChar)
                throw new IllegalArgumentException(boundaryCharMessage(c,i));
            cs2[i+1] = c;
        }
        addBoundaryChars(cs2,boundaryChar);
        return cs2;
    }

    /**
     * Returns a new array holding the boundary character, the
     * characters of the specified slice, and the boundary character
     * again.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in slice.
     * @param end Index of one past the last character in slice.
     * @param boundaryChar Boundary character.
     * @return Array of length <code>(end-start)+2</code>.
     * @throws IllegalArgumentException If the slice contains the
     * boundary character.
     * @throws IndexOutOfBoundsException If the indices do not
     * delimit a slice of the array.
     */
    static char[] addBoundaries(char[] cs, int start, int end,
                                char boundaryChar) {
        if (start < 0 || end > cs.length || start > end) {
            String msg = "Require 0 <= start <= end <= cs.length."
                + " Found start=" + start
                + " end=" + end
                + " cs.length=" + cs.length;
            throw new IndexOutOfBoundsException(msg);
        }
        int len = end-start;
        // Sized by the slice, not the backing array: the result must be
        // exactly boundary + slice + boundary so that the closing boundary
        // neither overwrites the last character nor follows zero padding.
        char[] cs2 = new char[len + 2];
        for (int i = 0; i < len; ++i) {
            char c = cs[i+start];
            if (c == boundaryChar)
                throw new IllegalArgumentException(boundaryCharMessage(c,i+start));
            cs2[i+1] = c;
        }
        addBoundaryChars(cs2,boundaryChar);
        return cs2;
    }

    /**
     * Sets the first and last elements of the specified array to
     * the boundary character.
     *
     * @param cs Array to modify; must be non-empty.
     * @param boundaryChar Boundary character.
     */
    static void addBoundaryChars(char[] cs, char boundaryChar) {
        cs[0] = boundaryChar;
        cs[cs.length-1] = boundaryChar;
    }

    /**
     * Builds the error message shared by both {@code addBoundaries}
     * variants for input that contains the boundary character.
     */
    private static String boundaryCharMessage(char c, int index) {
        return "Estimated string cannot contain boundary char."
            + " Found boundary char=" + c
            + " at index=" + index;
    }

    /**
     * Compilation proxy: writes the boundary character followed by
     * the compiled process model, and reads back as a
     * {@link CompiledNGramBoundaryLM}.
     */
    static class Externalizer extends AbstractExternalizable {
        private static final long serialVersionUID = -7945082563035787530L;
        final NGramBoundaryLM mLM;
        /** No-argument constructor used when the proxy is read back in. */
        public Externalizer() {
            this(null);
        }
        public Externalizer(NGramBoundaryLM lm) {
            mLM = lm;
        }
        @Override
        public Object read(ObjectInput objIn) throws IOException {
            return new CompiledNGramBoundaryLM(objIn);
        }
        @Override
        public void writeExternal(ObjectOutput objOut) throws IOException {
            objOut.writeChar(mLM.mBoundaryChar);
            mLM.mProcessLM.compileTo(objOut);
        }
    }

    /**
     * Serialization proxy: writes the boundary character followed by
     * the serialized process model, and reads back as a new
     * {@code NGramBoundaryLM}.
     */
    static class Serializer extends AbstractExternalizable {
        static final long serialVersionUID = -251292379784295407L;
        final NGramBoundaryLM mLM;
        /** No-argument constructor used when the proxy is read back in. */
        public Serializer() {
            this(null);
        }
        public Serializer(NGramBoundaryLM lm) {
            mLM = lm;
        }
        @Override
        public void writeExternal(ObjectOutput objOut) throws IOException {
            objOut.writeChar(mLM.mBoundaryChar);
            objOut.writeObject(mLM.mProcessLM);
        }
        @Override
        public Object read(ObjectInput objIn)
            throws IOException, ClassNotFoundException {

            char boundaryChar = objIn.readChar();
            NGramProcessLM lm = (NGramProcessLM) objIn.readObject();
            return new NGramBoundaryLM(lm,boundaryChar);
        }
    }

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy