/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.chunk;

import com.aliasi.lm.LanguageModel;

import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * An AbstractCharLmRescoringChunker provides the basic
 * character language-model rescoring model used by the trainable
 * CharLmRescoringChunker and its compiled version.
 *
 *
 * <h3>Rescoring Model</h3>
 *
 * <p>The per-type language models simply model expressions of that
 * type, both within and across tokens.  The non-chunk model is
 * responsible not only for modeling the text not in chunks, but also
 * for predicting what the next chunk is given the text not in a
 * chunk.
 *
 * <p>The exact model used is most easily described through an
 * example.  Consider the sentence <code>John J. Smith lives in
 * Washington.</code>, with <code>John J. Smith</code> as a
 * person-type chunk and <code>Washington</code> as a location-type
 * chunk.  The probability of this analysis derives from alternating
 * chunk/non-chunk spans, starting and ending with non-chunk spans:
 *
 * <blockquote><pre>
 *   P<sub>OUT</sub>(C<sub>PER</sub>|C<sub>BOS</sub>)
 * * P<sub>PER</sub>(John J. Smith)
 * * P<sub>OUT</sub>( lives in C<sub>LOC</sub>|C<sub>PER</sub>)
 * * P<sub>LOC</sub>(Washington)
 * * P<sub>OUT</sub>(.C<sub>EOS</sub>|C<sub>LOC</sub>)
 * </pre></blockquote>
 *
 * <p>Note that the chunk models P<sub>PER</sub> and
 * P<sub>LOC</sub> are bounded models, and thus predict the first
 * letter given the fact that it is the first letter, and also encode
 * an end-of-string probability to model the end.  See {@link
 * com.aliasi.lm.NGramBoundaryLM} for more information on bounded
 * models.
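 *
 * <p>In log (base 2) space this product becomes a sum, which is what
 * {@link #rescore(Chunking)} accumulates.  As an illustrative sketch
 * only, using the package-level helpers defined in this class, the
 * terms for the example above work out as follows; the chunk-type
 * names <code>PERSON</code> and <code>LOCATION</code> are
 * placeholders, not names required by this class:
 *
 * <blockquote><pre>
 * char cPer = typeToChar("PERSON");     // placeholder type name
 * char cLoc = typeToChar("LOCATION");   // placeholder type name
 * double logProb
 *     =   outLMEstimate("", BOS_CHAR, cPer)           // P(C_PER|C_BOS)
 *       + typeLMEstimate("PERSON", "John J. Smith")   // P_PER(John J. Smith)
 *       + outLMEstimate(" lives in ", cPer, cLoc)     // P( lives in  C_LOC|C_PER)
 *       + typeLMEstimate("LOCATION", "Washington")    // P_LOC(Washington)
 *       + outLMEstimate(".", cLoc, EOS_CHAR);         // P(. C_EOS|C_LOC)
 * </pre></blockquote>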

 * <p>The non-chunk P<sub>OUT</sub> model is a process language
 * model, but uses distinguished characters in much the same way as
 * the bounded models do internally.  In particular, we have
 * distinguished characters for each type (e.g. C<sub>PER</sub>), and
 * for begin-of-sentence and end-of-sentence markers
 * (e.g. C<sub>BOS</sub>).  These must be chosen so as not to
 * conflict with any input characters in training or decoding.  With
 * this encoding, the non-chunk model bears the brunt of the burden
 * in predicting types.  To start, it conditions the text it
 * generates on the previous type, encoded as a character.  To end,
 * it generates the next chunk type, also encoded as a character.
 * This allows the models to be sensitive to the fact that phrases
 * like <code> lives in </code> (including the spaces on either side)
 * are conditioned on following a person.  The following chunk type,
 * location, is generated conditional on following
 * C<sub>PER</sub><code> lives in </code>.  The only constraint on
 * the length of these dependencies is the length of the n-gram
 * models (and the size of the chunk/non-chunk spans).
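 *
 * <p>Concretely, each non-chunk span is wrapped in the tag character
 * of the preceding chunk (or the begin-of-sentence character) and the
 * tag character of the following chunk (or the end-of-sentence
 * character), and the process model's estimate of the leading tag
 * character is subtracted out, so that the span and the following tag
 * are predicted conditional on the preceding tag.  A minimal sketch,
 * using made-up tag character values rather than the values stored in
 * the type-to-character map:
 *
 * <blockquote><pre>
 * char cPer = '\uE001';                 // made-up person tag character
 * char cLoc = '\uE002';                 // made-up location tag character
 * String span = " lives in ";           // non-chunk text, spaces included
 * String seq = cPer + span + cLoc;      // tag-wrapped sequence
 * double log2P = outLM().log2Estimate(seq)
 *              - outLM().log2Estimate(seq.substring(0,1));
 * </pre></blockquote>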

 * <p>The resulting model generates a properly normalized probability
 * distribution over chunkings.
 *
 * <h3>Reserved Tag</h3>
 *
 * <p>The tag <code>BOS</code> is reserved for use by the system for
 * encoding document start/end positions.  See {@link HmmChunker} for
 * more information.
 *
 * @author  Bob Carpenter
 * @version 3.0
 * @since   LingPipe2.3
 * @param <B> the type of the underlying n-best chunker being rescored
 * @param <O> the type of the process language model for non-entities
 * @param <C> the type of the sequence language model for entities
 */
public class AbstractCharLmRescoringChunker<B extends NBestChunker,
                                            O extends LanguageModel.Process,
                                            C extends LanguageModel.Sequence>
    extends RescoringChunker<B> {

    final Map<String,Character> mTypeToChar;
    final Map<String,C> mTypeToLM;
    final O mOutLM;

    final static char UNKNOWN_TYPE_CHAR = 0xFFFF;
    final static char BOS_CHAR = (char) 0xFFFE;
    final static char EOS_CHAR = (char) (BOS_CHAR - 1);

    /**
     * Construct a rescoring chunker based on the specified underlying
     * chunker, with the specified number of underlying chunkings
     * rescored, based on the models and type encodings provided in
     * the last three arguments.  See the class documentation for more
     * information on the role of these parameters.
     *
     * @param baseNBestChunker Underlying chunker to rescore.
     * @param numChunkingsRescored Number of underlying chunkings
     * rescored by this chunker.
     * @param outLM The process language model for non-chunks.
     * @param typeToChar A mapping from chunk types to the characters
     * that encode them.
     * @param typeToLM A mapping from chunk types to the language
     * models used to model them.
     */
    public AbstractCharLmRescoringChunker(B baseNBestChunker,
                                          int numChunkingsRescored,
                                          O outLM,
                                          Map<String,Character> typeToChar,
                                          Map<String,C> typeToLM) {
        super(baseNBestChunker,numChunkingsRescored);
        mOutLM = outLM;
        mTypeToChar = typeToChar;
        mTypeToLM = typeToLM;
    }

    /**
     * Returns the character used to encode the specified type in the
     * model.  See the class documentation for more details on the use
     * of this character in the model.
     *
     * @param chunkType Type of chunk.
     * @return The character encoding the type in the model, or the
     * reserved unknown-type character if the type is not known.
     */
    public char typeToChar(String chunkType) {
        Character result = mTypeToChar.get(chunkType);
        if (result == null)
            return UNKNOWN_TYPE_CHAR;
        return result.charValue();
    }

    /**
     * Returns the process language model for non-chunks.  This is the
     * actual language model used, so changes to it affect this
     * chunker.
     *
     * @return The process language model for non-chunks.
     */
    public O outLM() {
        return mOutLM;
    }

    /**
     * Returns the sequence language model for chunks of the specified
     * type.
     *
     * @param chunkType Type of chunk.
     * @return Language model for the specified chunk type.
     */
    public C chunkLM(String chunkType) {
        return mTypeToLM.get(chunkType);
    }

    /**
     * Performs rescoring of the base chunking output using character
     * language models.  See the class documentation above for more
     * information.
     *
     * @param chunking Chunking being rescored.
     * @return New score for chunker.
     */
    @Override
    public double rescore(Chunking chunking) {
        String text = chunking.charSequence().toString();
        double logProb = 0.0;
        int pos = 0;
        char prevTagChar = BOS_CHAR;
        for (Chunk chunk : orderedSet(chunking)) {
            int start = chunk.start();
            int end = chunk.end();
            String chunkType = chunk.type();
            char tagChar = typeToChar(chunkType);
            // score the non-chunk text between the previous chunk (or the
            // start of the text) and this chunk, conditioned on the tags
            logProb += outLMEstimate(text.substring(pos,start),
                                     prevTagChar,tagChar);
            if (mTypeToLM.get(chunkType) == null) {
                System.out.println("\nFound null lm for type=" + chunkType
                                   + " Full type set =" + mTypeToLM.keySet());
                System.out.println("Chunking=" + chunking);
            }
            // score the chunk text itself with its type-specific bounded LM
            logProb += typeLMEstimate(chunkType,text.substring(start,end));
            pos = end;
            prevTagChar = tagChar;
        }
        // score the trailing non-chunk text up to the end-of-sentence tag
        logProb += outLMEstimate(text.substring(pos),
                                 prevTagChar,EOS_CHAR);
        return logProb;
    }

    double typeLMEstimate(String type, String text) {
        LanguageModel.Sequence lm = mTypeToLM.get(type);
        if (lm == null) {
            String msg = "Found null lm for type=" + type
                + " Full type set =" + mTypeToLM.keySet();
            System.out.println("TypeLM Estimate:\n" + msg);
            return -16.0 * text.length(); // crude fallback: 16 bits per character
        }
        double estimate = lm.log2Estimate(text);
        return estimate;
    }

    double outLMEstimate(String text,
                         char prevTagChar,
                         char nextTagChar) {
        // wrap the span in the previous and next tag characters, then
        // subtract the estimate of the leading tag so the span and the
        // following tag are predicted given the preceding tag
        String seq = prevTagChar + text + nextTagChar;
        String start = seq.substring(0,1);
        double estimate = mOutLM.log2Estimate(seq)
            - mOutLM.log2Estimate(start);
        return estimate;
    }

    static char[] wrapText(String text,
                           char prevTagChar,
                           char nextTagChar) {
        char[] cs = new char[text.length()+2];
        cs[0] = prevTagChar;
        cs[cs.length-1] = nextTagChar;
        for (int i = 0; i < text.length(); ++i)
            cs[i+1] = text.charAt(i);
        return cs;
    }

    static Set<Chunk> orderedSet(Chunking chunking) {
        Set<Chunk> orderedChunkSet
            = new TreeSet<Chunk>(Chunk.TEXT_ORDER_COMPARATOR);
        orderedChunkSet.addAll(chunking.chunkSet());
        return orderedChunkSet;
    }

}
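
/*
 * Illustrative usage sketch only.  Assumptions: a trained concrete
 * subclass (e.g. CharLmRescoringChunker) is available as "chunker",
 * and the first-best analysis is retrieved through the Chunker
 * interface's chunk(CharSequence) method:
 *
 *     String text = "John J. Smith lives in Washington.";
 *     Chunking rescored = chunker.chunk(text);
 *     for (Chunk c : rescored.chunkSet())
 *         System.out.println(c.type() + ": "
 *                            + text.substring(c.start(), c.end()));
 */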