/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.chunk;

import com.aliasi.lm.LanguageModel;

import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * An AbstractCharLmRescoringChunker provides the basic
 * character language-model rescoring model used by the trainable
 * CharLmRescoringChunker and its compiled version.
 *
 *
 * <h3>Rescoring Model</h3>
 *
 * <p>The per-type language models simply model expressions of that
 * type, both within and across tokens.  The non-chunk model is
 * responsible not only for modeling the text not in chunks, but also
 * for predicting what the next chunk is given the text not in a
 * chunk.
 *
 * <p>The exact model used is most easily described through an
 * example.  Consider the sentence <code>John J. Smith lives in
 * Washington.</code>, with <code>John J. Smith</code> as a
 * person-type chunk and <code>Washington</code> as a location-type
 * chunk.  The probability of this analysis derives from alternating
 * chunk/non-chunk spans, starting and ending with non-chunk spans:
 *
 * <blockquote><pre>
 *   P<sub>OUT</sub>(C<sub>PER</sub>|C<sub>BOS</sub>)
 * * P<sub>PER</sub>(John J. Smith)
 * * P<sub>OUT</sub>( lives in C<sub>LOC</sub>|C<sub>PER</sub>)
 * * P<sub>LOC</sub>(Washington)
 * * P<sub>OUT</sub>(.C<sub>EOS</sub>|C<sub>LOC</sub>)
 * </pre></blockquote>
 *
 * <p>Note that the chunk models P<sub>PER</sub> and
 * P<sub>LOC</sub> are bounded models, and thus predict the first
 * letter given the fact that it is the first letter, and also encode
 * an end-of-string probability to model the end.  See {@link
 * com.aliasi.lm.NGramBoundaryLM} for more information on bounded
 * models.
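 *
 * <p>In log (base 2) space this product becomes a sum, which is what
 * {@link #rescore(Chunking)} accumulates.  As an illustrative sketch
 * only, using the package-level helpers defined in this class, the
 * terms for the example above work out as follows; the chunk-type
 * names <code>PERSON</code> and <code>LOCATION</code> are
 * placeholders, not names required by this class:
 *
 * <blockquote><pre>
 * char cPer = typeToChar("PERSON");     // placeholder type name
 * char cLoc = typeToChar("LOCATION");   // placeholder type name
 * double logProb
 *     =   outLMEstimate("", BOS_CHAR, cPer)           // P(C_PER|C_BOS)
 *       + typeLMEstimate("PERSON", "John J. Smith")   // P_PER(John J. Smith)
 *       + outLMEstimate(" lives in ", cPer, cLoc)     // P( lives in  C_LOC|C_PER)
 *       + typeLMEstimate("LOCATION", "Washington")    // P_LOC(Washington)
 *       + outLMEstimate(".", cLoc, EOS_CHAR);         // P(. C_EOS|C_LOC)
 * </pre></blockquote>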

 * <p>The non-chunk P<sub>OUT</sub> model is a process language
 * model, but uses distinguished characters in much the same way as
 * the bounded models do internally.  In particular, we have
 * distinguished characters for each type (e.g. C<sub>PER</sub>), and
 * for begin-of-sentence and end-of-sentence markers
 * (e.g. C<sub>BOS</sub>).  These must be chosen so as not to
 * conflict with any input characters in training or decoding.  With
 * this encoding, the non-chunk model bears the brunt of the burden
 * in predicting types.  To start, it conditions the text it
 * generates on the previous type, encoded as a character.  To end,
 * it generates the next chunk type, also encoded as a character.
 * This allows the models to be sensitive to the fact that phrases
 * like <code> lives in </code> (including the spaces on either side)
 * are conditioned on following a person.  The following chunk type,
 * location, is generated conditional on following
 * C<sub>PER</sub><code> lives in </code>.  The only constraint on
 * the length of these dependencies is the length of the n-gram
 * models (and the size of the chunk/non-chunk spans).
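 *
 * <p>Concretely, each non-chunk span is wrapped in the tag character
 * of the preceding chunk (or the begin-of-sentence character) and the
 * tag character of the following chunk (or the end-of-sentence
 * character), and the process model's estimate of the leading tag
 * character is subtracted out, so that the span and the following tag
 * are predicted conditional on the preceding tag.  A minimal sketch,
 * using made-up tag character values rather than the values stored in
 * the type-to-character map:
 *
 * <blockquote><pre>
 * char cPer = '\uE001';                 // made-up person tag character
 * char cLoc = '\uE002';                 // made-up location tag character
 * String span = " lives in ";           // non-chunk text, spaces included
 * String seq = cPer + span + cLoc;      // tag-wrapped sequence
 * double log2P = outLM().log2Estimate(seq)
 *              - outLM().log2Estimate(seq.substring(0,1));
 * </pre></blockquote>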

 * <p>The resulting model generates a properly normalized probability
 * distribution over chunkings.
 *
 * <h3>Reserved Tag</h3>
 *
 * <p>The tag <code>BOS</code> is reserved for use by the system for
 * encoding document start/end positions.  See {@link HmmChunker} for
 * more information.
 *
 * @author  Bob Carpenter
 * @version 3.0
 * @since   LingPipe2.3
 * @param <B> the type of the underlying n-best chunker being rescored
 * @param <O> the type of the process language model for non-entities
 * @param <C> the type of the sequence language model for entities
 */
public class AbstractCharLmRescoringChunker<B extends NBestChunker,
                                            O extends LanguageModel.Process,
                                            C extends LanguageModel.Sequence>
    extends RescoringChunker<B> {

    final Map<String,Character> mTypeToChar;
    final Map<String,C> mTypeToLM;
    final O mOutLM;

    final static char UNKNOWN_TYPE_CHAR = 0xFFFF;
    final static char BOS_CHAR = (char) 0xFFFE;
    final static char EOS_CHAR = (char) (BOS_CHAR - 1);

    /**
     * Construct a rescoring chunker based on the specified underlying
     * chunker, with the specified number of underlying chunkings
     * rescored, based on the models and type encodings provided in
     * the last three arguments.  See the class documentation for more
     * information on the role of these parameters.
     *
     * @param baseNBestChunker Underlying chunker to rescore.
     * @param numChunkingsRescored Number of underlying chunkings
     * rescored by this chunker.
     * @param outLM The process language model for non-chunks.
     * @param typeToChar A mapping from chunk types to the characters
     * that encode them.
     * @param typeToLM A mapping from chunk types to the language
     * models used to model them.
     */
    public AbstractCharLmRescoringChunker(B baseNBestChunker,
                                          int numChunkingsRescored,
                                          O outLM,
                                          Map<String,Character> typeToChar,
                                          Map<String,C> typeToLM) {
        super(baseNBestChunker,numChunkingsRescored);
        mOutLM = outLM;
        mTypeToChar = typeToChar;
        mTypeToLM = typeToLM;
    }

    /**
     * Returns the character used to encode the specified type in the
     * model.  See the class documentation for more details on the use
     * of this character in the model.
     *
     * @param chunkType Type of chunk.
     * @return The character encoding the type in the model, or the
     * reserved unknown-type character if the type is not known.
     */
    public char typeToChar(String chunkType) {
        Character result = mTypeToChar.get(chunkType);
        if (result == null)
            return UNKNOWN_TYPE_CHAR;
        return result.charValue();
    }

    /**
     * Returns the process language model for non-chunks.  This is the
     * actual language model used, so changes to it affect this
     * chunker.
     *
     * @return The process language model for non-chunks.
     */
    public O outLM() {
        return mOutLM;
    }

    /**
     * Returns the sequence language model for chunks of the specified
     * type.
     *
     * @param chunkType Type of chunk.
     * @return Language model for the specified chunk type.
     */
    public C chunkLM(String chunkType) {
        return mTypeToLM.get(chunkType);
    }

    /**
     * Performs rescoring of the base chunking output using character
     * language models.  See the class documentation above for more
     * information.
     *
     * @param chunking Chunking being rescored.
     * @return New score for chunker.
     */
    @Override
    public double rescore(Chunking chunking) {
        String text = chunking.charSequence().toString();
        double logProb = 0.0;
        int pos = 0;
        char prevTagChar = BOS_CHAR;
        for (Chunk chunk : orderedSet(chunking)) {
            int start = chunk.start();
            int end = chunk.end();
            String chunkType = chunk.type();
            char tagChar = typeToChar(chunkType);
            // score the non-chunk text between the previous chunk (or the
            // start of the text) and this chunk, conditioned on the tags
            logProb += outLMEstimate(text.substring(pos,start),
                                     prevTagChar,tagChar);
            if (mTypeToLM.get(chunkType) == null) {
                System.out.println("\nFound null lm for type=" + chunkType
                                   + " Full type set =" + mTypeToLM.keySet());
                System.out.println("Chunking=" + chunking);
            }
            // score the chunk text itself with its type-specific bounded LM
            logProb += typeLMEstimate(chunkType,text.substring(start,end));
            pos = end;
            prevTagChar = tagChar;
        }
        // score the trailing non-chunk text up to the end-of-sentence tag
        logProb += outLMEstimate(text.substring(pos),
                                 prevTagChar,EOS_CHAR);
        return logProb;
    }

    double typeLMEstimate(String type, String text) {
        LanguageModel.Sequence lm = mTypeToLM.get(type);
        if (lm == null) {
            String msg = "Found null lm for type=" + type
                + " Full type set =" + mTypeToLM.keySet();
            System.out.println("TypeLM Estimate:\n" + msg);
            return -16.0 * text.length(); // crude fallback: 16 bits per character
        }
        double estimate = lm.log2Estimate(text);
        return estimate;
    }

    double outLMEstimate(String text,
                         char prevTagChar,
                         char nextTagChar) {
        // wrap the span in the previous and next tag characters, then
        // subtract the estimate of the leading tag so the span and the
        // following tag are predicted given the preceding tag
        String seq = prevTagChar + text + nextTagChar;
        String start = seq.substring(0,1);
        double estimate = mOutLM.log2Estimate(seq)
            - mOutLM.log2Estimate(start);
        return estimate;
    }

    static char[] wrapText(String text,
                           char prevTagChar,
                           char nextTagChar) {
        char[] cs = new char[text.length()+2];
        cs[0] = prevTagChar;
        cs[cs.length-1] = nextTagChar;
        for (int i = 0; i < text.length(); ++i)
            cs[i+1] = text.charAt(i);
        return cs;
    }

    static Set<Chunk> orderedSet(Chunking chunking) {
        Set<Chunk> orderedChunkSet
            = new TreeSet<Chunk>(Chunk.TEXT_ORDER_COMPARATOR);
        orderedChunkSet.addAll(chunking.chunkSet());
        return orderedChunkSet;
    }

}
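
/*
 * Illustrative usage sketch only.  Assumptions: a trained concrete
 * subclass (e.g. CharLmRescoringChunker) is available as "chunker",
 * and the first-best analysis is retrieved through the Chunker
 * interface's chunk(CharSequence) method:
 *
 *     String text = "John J. Smith lives in Washington.";
 *     Chunking rescored = chunker.chunk(text);
 *     for (Chunk c : rescored.chunkSet())
 *         System.out.println(c.type() + ": "
 *                            + text.substring(c.start(), c.end()));
 */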