All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aliasi.chunk.CharLmRescoringChunker Maven / Gradle / Ivy

Go to download

This is the original Lingpipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.chunk;

import com.aliasi.corpus.ObjectHandler;

import com.aliasi.hmm.HmmCharLmEstimator;

import com.aliasi.lm.LanguageModel;
import com.aliasi.lm.NGramBoundaryLM;
import com.aliasi.lm.NGramProcessLM;

import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.Strings;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;

import java.util.HashMap;
import java.util.Map;

/**
 * A CharLmRescoringChunker provides a long-distance
 * character language model-based chunker that operates by rescoring
 * the output of a contained character language model HMM chunker.

 * 

The Underlying Chunker

* *

This model performs rescoring over an underlying chunker. * The underlying chunkeris an instance of {@link CharLmHmmChunker}, * configured with the specified tokenizer factory, n-gram length, * number of characters and interpolation ratio provided in the * constructor. The underlying chunker may be configured after * retrieving it through the superclass's {@link #baseChunker()} * method. The typical use of this is to configure caching. * *

The Rescoring Model

* *

The rescoring model used by this chunker is based on a bounded * character language model per chunk type with an additional * process character language model for text not in chunks. The * remaining details are described in the class documentation for * the superclass {@link AbstractCharLmRescoringChunker}. * *

Training and Compilation

* *

This chunker is trained in the usual way through calls to the * appropriate handle() method. The method {@link * #handle(Chunking)} implements the {@code ObjectHandler} interface * and allows for training through chunking examples. * A model is compiled by calling the {@link * Compilable} interface method {@link #compileTo(ObjectOutput)}. * The compiled model is an instance of a AbstractCharLmRescoringChunker, * and its underlying chunker may be recovered that way. * *

Runtime Configuration

* *

The underlying chunker is recoverable as a character language * model HMM chunker through {@link #baseChunker()}. The * non-chunk process n-gram character language model is returned by * {@link #outLM()}, whereas the chunk models are returned * by {@link #chunkLM(String)}. * *

The components of a character LM rescoring chunker are accessible * in their training format for methods on this class, as described above. * *

The compiled models are instances of {@link RescoringChunker}, * which allow their underlying chunker to be retrieved through {@link * #baseChunker()} and then configured. The other run-time models, for * may be retrieved through the superclass's * *

Reserved Tag

* *

The tag BOS is reserved for use by the system * for encoding document start/end positions. See {@link HmmChunker} * for more information. * * @author Bob Carpenter * @version 4.0.0 * @since LingPipe2.3 */ public class CharLmRescoringChunker extends AbstractCharLmRescoringChunker implements ObjectHandler, Compilable { final int mNGram; final int mNumChars; final double mInterpolationRatio; char mNextCodeChar = (char) (BOS_CHAR - 2); /** * Construct a character language model rescoring chunker based on * the specified components. Tags in the underlying model are not * smoothed by default (see the full constructor's documentation: * {@link * #CharLmRescoringChunker(TokenizerFactory,int,int,int,double,boolean)}. * * @param tokenizerFactory Tokenizer factory for boundaries. * @param numChunkingsRescored Number of underlying chunkings rescored. * @param nGram N-gram length for all models. * @param numChars Number of characters in the training and * run-time character sets. * @param interpolationRatio Underlying language-model * interpolation ratios. */ public CharLmRescoringChunker(TokenizerFactory tokenizerFactory, int numChunkingsRescored, int nGram, int numChars, double interpolationRatio) { super(new CharLmHmmChunker(tokenizerFactory, new HmmCharLmEstimator(nGram, numChars, interpolationRatio)), numChunkingsRescored, new NGramProcessLM(nGram,numChars,interpolationRatio), new HashMap(), new HashMap()); mNGram = nGram; mNumChars = numChars; mInterpolationRatio = interpolationRatio; } /** * Construct a character language model rescoring chunker based on * the specified components. * *

Whether tags are smoothed in the underlying model is determined * by the flag in the constructor. See {@link CharLmHmmChunker}'s * class documentation for more information on the effects of * smoothing. * * * @param tokenizerFactory Tokenizer factory for boundaries. * @param numChunkingsRescored Number of underlying chunkings rescored. * @param nGram N-gram length for all models. * @param numChars Number of characters in the training and * run-time character sets. * @param interpolationRatio Underlying language-model * interpolation ratios. * @param smoothTags Set to true to smooth tags in underlying * chunker. */ public CharLmRescoringChunker(TokenizerFactory tokenizerFactory, int numChunkingsRescored, int nGram, int numChars, double interpolationRatio, boolean smoothTags) { super(new CharLmHmmChunker(tokenizerFactory, new HmmCharLmEstimator(nGram, numChars, interpolationRatio), smoothTags), numChunkingsRescored, new NGramProcessLM(nGram,numChars,interpolationRatio), new HashMap(), new HashMap()); mNGram = nGram; mNumChars = numChars; mInterpolationRatio = interpolationRatio; } /** * Trains this chunker with the specified chunking. * * @param chunking Training data. 
*/ public void handle(Chunking chunking) { // train underlying ObjectHandler handler2 = baseChunker(); handler2.handle(chunking); // train rescorer String text = chunking.charSequence().toString(); char prevTagChar = BOS_CHAR; int pos = 0; for (Chunk chunk : orderedSet(chunking)) { int start = chunk.start(); int end = chunk.end(); if (pos > start) { String msg = "Chunk overlap for chunk=" + chunk + " in chunking=" + chunking; throw new IllegalArgumentException(msg); } String chunkType = chunk.type(); createTypeIfNecessary(chunkType); char tagChar = typeToChar(chunkType); trainOutLM(text.substring(pos,start), prevTagChar,tagChar); trainTypeLM(chunkType,text.substring(start,end)); pos = end; prevTagChar = tagChar; } trainOutLM(text.substring(pos), prevTagChar,EOS_CHAR); } /** * Compiles this model to the specified object output stream. The * model may then be read back in using {@link * java.io.ObjectInput#readObject()}; the resulting object will be * an instance of {@link AbstractCharLmRescoringChunker}. * * @throws IOException If there is an I/O error during the write. * @throws IllegalArgumentException If the tokenizer factory supplied to * the constructor of this class is not compilable. */ public void compileTo(ObjectOutput objOut) throws IOException { objOut.writeObject(new Externalizer(this)); } /** * Provides the specified character sequence data as training data * for the language model of the specfied type. This method calls * the method of the same signature on the trainable base chunker. * The language model for the specified type will be created if it * has not been seen previously. * *

Warning: It is not sufficient to only train a model * using this method. Annotated data with a representative * balance of entities and non-entity text is required to train * the overall likelihood of entities and the contexts in which * they occur. Use of this method will not bias the * likelihoods of entities occurring. But, it might cause the * common entities in the training data to be overwhelmed if a * large dictionary is used. One possibility is to train the * basic data multiple times relative to the dictionary (or * vice-versa). * * @param cSeq Character sequence for training. * @param type Type of character sequence. */ public void trainDictionary(CharSequence cSeq, String type) { baseChunker().trainDictionary(cSeq,type); trainTypeLM(type,cSeq); } /** * Trains the language model for non-entities using the specified * character sequence. * *

Warning: Training using this method biases the * likelihood of entities downward, because it does not train the * likelihood of a non-entity character sequence ending and being * followed by an entity of a specified type. Thus this method is * best used to seed a dictionary of common words that are * relatively few in number relative to the entity-annotated * training data. * * @param cSeq Data to train the non-entity (out) model. */ public void trainOut(CharSequence cSeq) { outLM().train(cSeq); } void createTypeIfNecessary(String chunkType) { if (mTypeToChar.containsKey(chunkType)) return; Character c = Character.valueOf(mNextCodeChar--); mTypeToChar.put(chunkType,c); NGramBoundaryLM lm = new NGramBoundaryLM(mNGram,mNumChars,mInterpolationRatio, (char) 0xFFFF); mTypeToLM.put(chunkType,lm); } void trainTypeLM(String type, CharSequence text) { createTypeIfNecessary(type); NGramBoundaryLM lm = mTypeToLM.get(type); lm.train(text); } void trainOutLM(String text, char prevTagChar, char nextTagChar) { String trainSeq = prevTagChar + text + nextTagChar; outLM().train(trainSeq); outLM().substringCounter().decrementUnigram(prevTagChar); } static class Externalizer extends AbstractExternalizable { private static final long serialVersionUID = 3555143657918695241L; final CharLmRescoringChunker mChunker; public Externalizer() { this(null); } public Externalizer(CharLmRescoringChunker chunker) { mChunker = chunker; } // base chunker:NBestChunker // numChunkingsRescored:int // numTypes:int // (type:UTF,codeChar:char,ngramBoundaryLM:LanguageModel.Process)**numTypes // outLM:NGram.Process @Override public void writeExternal(ObjectOutput objOut) throws IOException { mChunker.baseChunker().compileTo(objOut); objOut.writeInt(mChunker.numChunkingsRescored()); String[] types = mChunker.mTypeToLM.keySet().toArray(Strings.EMPTY_STRING_ARRAY); objOut.writeInt(types.length); for (int i = 0; i < types.length; ++i) { objOut.writeUTF(types[i]); objOut.writeChar(mChunker.typeToChar(types[i])); 
NGramBoundaryLM lm = mChunker.mTypeToLM.get(types[i]); lm.compileTo(objOut); } mChunker.outLM().compileTo(objOut); } @Override public Object read(ObjectInput in) throws ClassNotFoundException, IOException { NBestChunker baseChunker = (NBestChunker) in.readObject(); int numChunkingsRescored = in.readInt(); int numTypes = in.readInt(); Map typeToChar = new HashMap(); Map typeToLM = new HashMap(); for (int i = 0; i < numTypes; ++i) { String type = in.readUTF(); char c = in.readChar(); LanguageModel.Sequence lm = (LanguageModel.Sequence) in.readObject(); typeToChar.put(type,Character.valueOf(c)); typeToLM.put(type,lm); } LanguageModel.Process outLM = (LanguageModel.Process) in.readObject(); return new AbstractCharLmRescoringChunker(baseChunker, numChunkingsRescored, outLM, typeToChar, typeToLM); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy