
com.aliasi.dict.ApproxDictionaryChunker

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html. No changes were made to the source code.

/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.dict;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.ChunkFactory;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.ChunkingImpl;
import com.aliasi.chunk.Chunker;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.spell.WeightedEditDistance;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Scored;
import com.aliasi.util.Strings;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * An ApproxDictionaryChunker implements a chunker that
 * produces chunks based on weighted edit distance of strings from
 * dictionary entries.  This is an approximate or "fuzzy"
 * dictionary matching strategy.
 *
 *
 * <p>The underlying dictionary is required to be an instance of
 * {@link TrieDictionary} in order to support efficient search for
 * matches.  Other dictionaries can be easily converted to trie
 * dictionaries by adding their entries to a fresh trie dictionary.
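 *
 * <p>For instance, assuming the standard entry iteration over
 * dictionaries, a {@link MapDictionary} might be converted with a
 * sketch along these lines (the entry values are illustrative only):
 *
 * <pre>{@code
 * MapDictionary<String> mapDict = new MapDictionary<String>();
 * mapDict.addEntry(new DictionaryEntry<String>("P53", "PROTEIN"));
 *
 * // copy every entry into a fresh trie dictionary
 * TrieDictionary<String> trieDict = new TrieDictionary<String>();
 * for (DictionaryEntry<String> entry : mapDict)
 *     trieDict.addEntry(entry);
 * }</pre>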

 * <p>Entries are matched by weighted edit distance, as supplied by an
 * implementation of {@link WeightedEditDistance}.  All substrings
 * within the maximum distance specified at construction time are
 * returned as part of the chunking.  Keep in mind that weights for
 * weighted edit distance are specified as proximities, that is, as
 * negative distances.
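 *
 * <p>Putting the pieces together, a chunker might be assembled and
 * applied roughly as follows (an illustrative sketch; the entries,
 * the Indo-European tokenizer factory, and the threshold of 100 are
 * arbitrary choices, not requirements):
 *
 * <pre>{@code
 * TrieDictionary<String> dict = new TrieDictionary<String>();
 * dict.addEntry(new DictionaryEntry<String>("P53", "PROTEIN"));
 * dict.addEntry(new DictionaryEntry<String>("NF-kappa B", "PROTEIN"));
 *
 * Chunker chunker
 *     = new ApproxDictionaryChunker(dict,
 *                                   IndoEuropeanTokenizerFactory.INSTANCE,
 *                                   ApproxDictionaryChunker.TT_DISTANCE,
 *                                   100.0);
 *
 * Chunking chunking = chunker.chunk("the p53 protein binds NF kappa B");
 * for (Chunk chunk : chunking.chunkSet())
 *     System.out.println(chunk);  // chunk scores are distances
 * }</pre>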

 * <h3>No Transposition</h3>
 *
 * <p>Transposition is not implemented in the approximate dictionary
 * chunker, so no matches are possible through transposition.
 * Specifically, the transpose weight method is never called on the
 * underlying weighted edit distance.

 * <h3>Token Sensitivity</h3>
 *
 * <p>The tokenizer factory supplied at construction time is only
 * used to constrain search by enforcing boundary conditions.  Chunks
 * are only returned if they start on the first character of a token
 * and end on the last character of a token.

 * <p>Using an instance of {@link
 * com.aliasi.tokenizer.CharacterTokenizerFactory} effectively removes
 * token sensitivity by treating every non-whitespace character as a
 * token and thus rendering every non-whitespace position a possible
 * chunk boundary.
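 *
 * <p>For example, the chunker sketched above could be made
 * insensitive to token boundaries by swapping in the character
 * tokenizer factory (again an illustrative sketch):
 *
 * <pre>{@code
 * Chunker charChunker
 *     = new ApproxDictionaryChunker(dict,
 *                                   CharacterTokenizerFactory.INSTANCE,
 *                                   ApproxDictionaryChunker.TT_DISTANCE,
 *                                   100.0);
 * }</pre>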

 * <h3>Serialization</h3>
 *
 * <p>An approximate dictionary chunker is serializable if its
 * tokenizer factory and edit distance are serializable.  The
 * reconstituted object will be an instance of this class,
 * {@code ApproxDictionaryChunker}.
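 *
 * <p>A serialization round trip may be carried out with the standard
 * JDK object streams (a minimal sketch; error handling omitted):
 *
 * <pre>{@code
 * ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
 * ObjectOutputStream objOut = new ObjectOutputStream(bytesOut);
 * objOut.writeObject(chunker);
 * objOut.close();
 *
 * ObjectInputStream objIn
 *     = new ObjectInputStream(
 *           new ByteArrayInputStream(bytesOut.toByteArray()));
 * ApproxDictionaryChunker deserializedChunker
 *     = (ApproxDictionaryChunker) objIn.readObject();
 * }</pre>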

 * <h3>References</h3>
 *
 * <p>The approach implemented here is very similar to that described
 * in the following paper:
 *
 * <ul>
 * <li>Yoshimasa Tsuruoka and Jun'ichi Tsujii. 2003.  Boosting
 * precision and recall of dictionary-based protein name recognition.
 * In <i>Proceedings of the 2003 ACL Workshop on NLP in
 * Biomedicine</i>.</li>
 * </ul>
 *
 * <p>The best general reference for approximate string matching is:
 *
 * <ul>
 * <li>Gusfield, Dan. 1997.  <i>Algorithms on Strings, Trees and
 * Sequences</i>.  Cambridge University Press.</li>
 * </ul>
 *
 * @author  Bob Carpenter
 * @version 3.9.1
 * @since   LingPipe2.1
 */
public class ApproxDictionaryChunker implements Chunker, Serializable {

    static final long serialVersionUID = 5364907367744655793L;

    private final TrieDictionary<String> mDictionary;
    private final TokenizerFactory mTokenizerFactory;
    private final WeightedEditDistance mEditDistance;
    private double mDistanceThreshold;

    /**
     * Construct an approximate dictionary chunker from the specified
     * dictionary, tokenizer factory, weighted edit distance and
     * distance bound.  The dictionary is used for the candidate
     * matches.  The tokenizer factory is used for determining
     * possible boundaries of matches, which must start on the first
     * character of a token and end on the last character of a token.
     * The edit distance is used for measuring substrings against
     * dictionary entries.  The distance threshold specifies the
     * maximum distance at which matches are returned.
     *
     * @param dictionary Dictionary to use for matching.
     * @param tokenizerFactory Tokenizer factory for boundary
     * determination.
     * @param editDistance Matching distance measure.
     * @param distanceThreshold Distance threshold for matching.
     */
    public ApproxDictionaryChunker(TrieDictionary<String> dictionary,
                                   TokenizerFactory tokenizerFactory,
                                   WeightedEditDistance editDistance,
                                   double distanceThreshold) {
        mDictionary = dictionary;
        mTokenizerFactory = tokenizerFactory;
        mEditDistance = editDistance;
        mDistanceThreshold = distanceThreshold;
    }

    /**
     * Returns the trie dictionary underlying this chunker.  This is
     * the actual dictionary used by the chunker, so changes to it
     * will affect this chunker.
     *
     * @return The trie dictionary underlying this chunker.
     */
    public TrieDictionary<String> dictionary() {
        return mDictionary;
    }

    /**
     * Returns the weighted edit distance for matching with this
     * chunker.  This is the actual edit distance used by the
     * chunker, so changes to it will affect this chunker.
     *
     * @return The weighted edit distance for this chunker.
     */
    public WeightedEditDistance editDistance() {
        return mEditDistance;
    }

    /**
     * Returns the tokenizer factory for matching with this chunker.
     * This is the actual tokenizer factory used by this chunker, so
     * changes to it will affect the behavior of this class.
     *
     * @return The tokenizer factory for this chunker.
     */
    public TokenizerFactory tokenizerFactory() {
        return mTokenizerFactory;
    }

    /**
     * Returns the maximum edit distance a string can be from a
     * dictionary entry in order to be returned by this chunker.
     * This value is set using {@link #setMaxDistance(double)}.
     *
     * @return The maximum edit distance for this chunker.
     */
    public double distanceThreshold() {
        return mDistanceThreshold;
    }

    /**
     * Set the maximum distance a string can be from a dictionary
     * entry in order to be returned as a chunk by this chunker.
     *
     * @param distanceThreshold Distance threshold for matching.
     */
    public void setMaxDistance(double distanceThreshold) {
        mDistanceThreshold = distanceThreshold;
    }

    /**
     * Return the approximate dictionary-based chunking for the
     * specified character sequence.
     *
     * @param cSeq Character sequence to chunk.
     * @return Chunking of the specified character sequence.
     */
    public Chunking chunk(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        return chunk(cs,0,cs.length);
    }

    /**
     * Return the approximate dictionary-based chunking for the
     * specified character sequence.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in the array.
     * @param end Index of one past the last character in the array.
     * @return Chunking of the specified character sequence.
     * @throws IllegalArgumentException If the indices are out of
     * bounds in the character sequence.
     */
    public Chunking chunk(char[] cs, int start, int end) {
        int length = end-start;

        // token start/ends setup; throws exception if args wrong
        Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,length);
        boolean[] startTokens = new boolean[length];
        boolean[] endTokens = new boolean[length+1];
        Arrays.fill(startTokens,false);
        Arrays.fill(endTokens,false);
        String token;
        while ((token = tokenizer.nextToken()) != null) {
            int lastStart = tokenizer.lastTokenStartPosition();
            startTokens[lastStart] = true;
            endTokens[lastStart + token.length()] = true;
        }

        // dynamic programming over search states, one queue per position
        Map<Dp,Chunk> dpToChunk = new HashMap<Dp,Chunk>();
        Map<SearchState,SearchState> queue
            = new HashMap<SearchState,SearchState>();
        for (int i = 0; i < length; ++i) {
            int startPlusI = start + i;
            char c = cs[startPlusI];
            if (startTokens[i]) {
                // start a new candidate match at each token-initial position
                add(queue,mDictionary.mRootNode,startPlusI,
                    0.0,
                    false,dpToChunk,cs,startPlusI);
            }
            Map<SearchState,SearchState> nextQueue
                = new HashMap<SearchState,SearchState>();
            double deleteCost = -mEditDistance.deleteWeight(c);
            for (SearchState state : queue.values()) {
                // delete
                add(nextQueue,state.mNode,state.mStartIndex,
                    state.mScore + deleteCost,
                    endTokens[i+1],dpToChunk,cs,startPlusI);
                // match or subst
                char[] dtrChars = state.mNode.mDtrChars;
                Node[] dtrNodes = state.mNode.mDtrNodes;
                for (int j = 0; j < dtrChars.length; ++j) {
                    add(nextQueue,dtrNodes[j],state.mStartIndex,
                        state.mScore
                        - (dtrChars[j] == c
                           ? mEditDistance.matchWeight(dtrChars[j])
                           : mEditDistance.substituteWeight(dtrChars[j],c)),
                        endTokens[i+1],dpToChunk,cs,startPlusI);
                }
            }
            queue = nextQueue;
        }
        ChunkingImpl result = new ChunkingImpl(cs,start,end);
        for (Chunk chunk : dpToChunk.values())
            result.add(chunk);
        return result;
    }

    Object writeReplace() {
        return new Serializer(this);
    }

    void add(Map<SearchState,SearchState> nextQueue,
             Node node,
             int startIndex,
             double chunkScore,
             boolean isTokenEnd,
             Map<Dp,Chunk> chunking,
             char[] cs, int end) {

        // prune any search state beyond the distance threshold
        if (chunkScore > mDistanceThreshold) return;

        // keep only the best-scoring state per (node,start) pair
        SearchState state2 = new SearchState(node,startIndex,chunkScore);
        SearchState exState = nextQueue.get(state2);
        if (exState != null && exState.mScore < chunkScore) return;
        nextQueue.put(state2,state2);

        // finish match at token end by adding each cat (may be 0)
        if (isTokenEnd) {
            for (int i = 0; i < node.mEntries.length; ++i) {
                Chunk newChunk
                    = ChunkFactory
                    .createChunk(startIndex,end+1,
                                 node.mEntries[i].category().toString(),
                                 chunkScore);
                Dp dpNewChunk = new Dp(newChunk);
                Chunk oldChunk = chunking.get(dpNewChunk);
                if (oldChunk != null && oldChunk.score() <= chunkScore)
                    continue;
                chunking.remove(dpNewChunk);
                chunking.put(dpNewChunk,newChunk);
            }
        }

        // insert
        for (int i = 0; i < node.mDtrChars.length; ++i)
            add(nextQueue,node.mDtrNodes[i],startIndex,
                chunkScore - mEditDistance.insertWeight(node.mDtrChars[i]),
                isTokenEnd,chunking,cs,end);
    }

    // chunk's data less score for efficient dynamic programming key
    static final class Dp {
        final int mStart;
        final int mEnd;
        final String mType;
        final int mHashCode;
        Dp(Chunk chunk) {
            mStart = chunk.start();
            mEnd = chunk.end();
            mType = chunk.type();
            mHashCode = mStart + 31 * (mEnd + 31 * mType.hashCode());
        }
        @Override
        public int hashCode() {
            return mHashCode;
        }
        @Override
        public boolean equals(Object that) {
            Dp thatDp = (Dp) that;
            return mStart == thatDp.mStart
                && mEnd == thatDp.mEnd
                && mType.equals(thatDp.mType);
        }
    }

    static final class SearchState implements Scored {
        private final double mScore;
        private final Node mNode;
        private final int mStartIndex; // absolute in cs
        SearchState(Node node, int startIndex) {
            this(node,startIndex,0.0);
        }
        SearchState(Node node, int startIndex, double score) {
            mNode = node;
            mStartIndex = startIndex;
            mScore = score;
        }
        public double score() {
            return mScore;
        }
        @Override
        public boolean equals(Object that) {
            SearchState thatState = (SearchState) that;
            return mStartIndex == thatState.mStartIndex
                && mNode == thatState.mNode;
        }
        @Override
        public int hashCode() {
            return mStartIndex; // + 31 * mNode.hashCode();
        }
        @Override
        public String toString() {
            return "SearchState(" + mNode + ", "
                + mStartIndex + ", " + mScore + ")";
        }
    }

    /**
     * This is a weighted edit distance defined by Tsuruoka and
     * Tsujii for matching protein names in biomedical texts.
     * Reproducing table 1 from their paper provides the weighting
     * function (converting slightly to our terminology and scale):
     *
     * <table border="1" cellpadding="5">
     * <caption>Tsuruoka and Tsujii's Weighted Edit Distance</caption>
     * <tr><th>Operation</th><th>Character</th><th>Cost</th></tr>
     * <tr><td rowspan="2">Insertion</td>
     *     <td>space or hyphen</td><td>-10</td></tr>
     * <tr><td>other characters</td><td>-100</td></tr>
     * <tr><td rowspan="2">Deletion</td>
     *     <td>space or hyphen</td><td>-10</td></tr>
     * <tr><td>other characters</td><td>-100</td></tr>
     * <tr><td rowspan="4">Substitution</td>
     *     <td>space for hyphen</td><td>-10</td></tr>
     * <tr><td>digit for other digit</td><td>-10</td></tr>
     * <tr><td>capital for lowercase</td><td>-10</td></tr>
     * <tr><td>other characters</td><td>-50</td></tr>
     * <tr><td>Match</td><td>any character</td><td>0</td></tr>
     * <tr><td>Transposition</td><td>any characters</td>
     *     <td>Double.NEGATIVE_INFINITY</td></tr>
     * </table>
     *
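     * <p>As an illustrative sketch, the measure may be probed
     * directly through the distance method inherited from
     * {@link WeightedEditDistance}:
     *
     * <pre>{@code
     * WeightedEditDistance d = ApproxDictionaryChunker.TT_DISTANCE;
     * // space-for-hyphen substitution: distance 10 per the table
     * System.out.println(d.distance("NF-kappa B", "NF kappa B"));
     * // capital-for-lowercase match: distance 10 per the table
     * System.out.println(d.distance("P53", "p53"));
     * }</pre>
     *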
     * <p>Tsuruoka and Tsujii's paper is available online:
     *
     * <ul>
     * <li>Yoshimasa Tsuruoka and Jun'ichi Tsujii. 2003.  Boosting
     * precision and recall of dictionary-based protein name
     * recognition.  In <i>Proceedings of the 2003 ACL Workshop on
     * NLP in Biomedicine</i>.</li>
     * </ul>
     */
    public static final WeightedEditDistance TT_DISTANCE
        = new TTDistance();

    static final class TTDistance extends WeightedEditDistance {
        @Override
        public double deleteWeight(char cDeleted) {
            return (cDeleted == ' ' || cDeleted == '-')
                ? -10.0
                : -100.0;
        }
        @Override
        public double insertWeight(char cInserted) {
            return deleteWeight(cInserted);
        }
        @Override
        public double matchWeight(char cMatched) {
            return 0.0;
        }
        @Override
        public double substituteWeight(char cDeleted, char cInserted) {
            if (cDeleted == ' ' && cInserted == '-') return -10.0;
            if (cDeleted == '-' && cInserted == ' ') return -10.0;
            if (Character.isDigit(cDeleted) && Character.isDigit(cInserted))
                return -10.0;
            if (Character.toLowerCase(cDeleted)
                == Character.toLowerCase(cInserted))
                return -10.0;
            return -50.0;
        }
        @Override
        public double transposeWeight(char c1, char c2) {
            return Double.NEGATIVE_INFINITY;
        }
    }

    static class Serializer extends AbstractExternalizable {
        static final long serialVersionUID = 3935654738558540166L;
        private final ApproxDictionaryChunker mChunker;
        public Serializer() {
            this(null);
        }
        public Serializer(ApproxDictionaryChunker chunker) {
            mChunker = chunker;
        }
        @Override
        public Object read(ObjectInput in)
            throws IOException, ClassNotFoundException {

            @SuppressWarnings("unchecked")
            TrieDictionary<String> dictionary
                = (TrieDictionary<String>) in.readObject();
            TokenizerFactory tokenizerFactory
                = (TokenizerFactory) in.readObject();
            WeightedEditDistance editDistance
                = (WeightedEditDistance) in.readObject();
            double distanceThreshold = in.readDouble();
            return new ApproxDictionaryChunker(dictionary,
                                               tokenizerFactory,
                                               editDistance,
                                               distanceThreshold);
        }
        @Override
        public void writeExternal(ObjectOutput out) throws IOException {
            out.writeObject(mChunker.mDictionary);
            out.writeObject(mChunker.mTokenizerFactory);
            out.writeObject(mChunker.mEditDistance);
            out.writeDouble(mChunker.mDistanceThreshold);
        }
    }

}



