
com.aliasi.dict.ApproxDictionaryChunker

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html. No changes were made to the source code.

/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.dict;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.ChunkFactory;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.ChunkingImpl;
import com.aliasi.chunk.Chunker;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.spell.WeightedEditDistance;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Scored;
import com.aliasi.util.Strings;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * An ApproxDictionaryChunker implements a chunker that
 * produces chunks based on weighted edit distance of strings from
 * dictionary entries.  This is an approximate or "fuzzy"
 * dictionary matching strategy.
 *
 *
 * <p>The underlying dictionary is required to be an instance of
 * {@link TrieDictionary} in order to support efficient search for
 * matches.  Other dictionaries can be easily converted to trie
 * dictionaries by adding their entries to a fresh trie dictionary.
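 *
 * <p>For instance, assuming the standard entry iteration over
 * dictionaries, a {@link MapDictionary} might be converted with a
 * sketch along these lines (the entry values are illustrative only):
 *
 * <pre>{@code
 * MapDictionary<String> mapDict = new MapDictionary<String>();
 * mapDict.addEntry(new DictionaryEntry<String>("P53", "PROTEIN"));
 *
 * // copy every entry into a fresh trie dictionary
 * TrieDictionary<String> trieDict = new TrieDictionary<String>();
 * for (DictionaryEntry<String> entry : mapDict)
 *     trieDict.addEntry(entry);
 * }</pre>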

 * <p>Entries are matched by weighted edit distance, as supplied by an
 * implementation of {@link WeightedEditDistance}.  All substrings
 * within the maximum distance specified at construction time are
 * returned as part of the chunking.  Keep in mind that weights for
 * weighted edit distance are specified as proximities, that is, as
 * negative distances.
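 *
 * <p>Putting the pieces together, a chunker might be assembled and
 * applied roughly as follows (an illustrative sketch; the entries,
 * the Indo-European tokenizer factory, and the threshold of 100 are
 * arbitrary choices, not requirements):
 *
 * <pre>{@code
 * TrieDictionary<String> dict = new TrieDictionary<String>();
 * dict.addEntry(new DictionaryEntry<String>("P53", "PROTEIN"));
 * dict.addEntry(new DictionaryEntry<String>("NF-kappa B", "PROTEIN"));
 *
 * Chunker chunker
 *     = new ApproxDictionaryChunker(dict,
 *                                   IndoEuropeanTokenizerFactory.INSTANCE,
 *                                   ApproxDictionaryChunker.TT_DISTANCE,
 *                                   100.0);
 *
 * Chunking chunking = chunker.chunk("the p53 protein binds NF kappa B");
 * for (Chunk chunk : chunking.chunkSet())
 *     System.out.println(chunk);  // chunk scores are distances
 * }</pre>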

 * <h3>No Transposition</h3>
 *
 * <p>Transposition is not implemented in the approximate dictionary
 * chunker, so no matches are possible through transposition.
 * Specifically, the transpose weight method is never called on the
 * underlying weighted edit distance.

 * <h3>Token Sensitivity</h3>
 *
 * <p>The tokenizer factory supplied at construction time is only
 * used to constrain search by enforcing boundary conditions.  Chunks
 * are only returned if they start on the first character of a token
 * and end on the last character of a token.

 * <p>Using an instance of {@link
 * com.aliasi.tokenizer.CharacterTokenizerFactory} effectively removes
 * token sensitivity by treating every non-whitespace character as a
 * token and thus rendering every non-whitespace position a possible
 * chunk boundary.
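 *
 * <p>For example, the chunker sketched above could be made
 * insensitive to token boundaries by swapping in the character
 * tokenizer factory (again an illustrative sketch):
 *
 * <pre>{@code
 * Chunker charChunker
 *     = new ApproxDictionaryChunker(dict,
 *                                   CharacterTokenizerFactory.INSTANCE,
 *                                   ApproxDictionaryChunker.TT_DISTANCE,
 *                                   100.0);
 * }</pre>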

 * <h3>Serialization</h3>
 *
 * <p>An approximate dictionary chunker is serializable if its
 * tokenizer factory and edit distance are serializable.  The
 * reconstituted object will be an instance of this class,
 * {@code ApproxDictionaryChunker}.
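 *
 * <p>A serialization round trip may be carried out with the standard
 * JDK object streams (a minimal sketch; error handling omitted):
 *
 * <pre>{@code
 * ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
 * ObjectOutputStream objOut = new ObjectOutputStream(bytesOut);
 * objOut.writeObject(chunker);
 * objOut.close();
 *
 * ObjectInputStream objIn
 *     = new ObjectInputStream(
 *           new ByteArrayInputStream(bytesOut.toByteArray()));
 * ApproxDictionaryChunker deserializedChunker
 *     = (ApproxDictionaryChunker) objIn.readObject();
 * }</pre>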

 * <h3>References</h3>
 *
 * <p>The approach implemented here is very similar to that described
 * in the following paper:
 *
 * <ul>
 * <li>Yoshimasa Tsuruoka and Jun'ichi Tsujii. 2003.  Boosting
 * precision and recall of dictionary-based protein name recognition.
 * In <i>Proceedings of the 2003 ACL Workshop on NLP in
 * Biomedicine</i>.</li>
 * </ul>
 *
 * <p>The best general reference for approximate string matching is:
 *
 * <ul>
 * <li>Gusfield, Dan. 1997.  <i>Algorithms on Strings, Trees and
 * Sequences</i>.  Cambridge University Press.</li>
 * </ul>
 *
 * @author  Bob Carpenter
 * @version 3.9.1
 * @since   LingPipe2.1
 */
public class ApproxDictionaryChunker implements Chunker, Serializable {

    static final long serialVersionUID = 5364907367744655793L;

    private final TrieDictionary<String> mDictionary;
    private final TokenizerFactory mTokenizerFactory;
    private final WeightedEditDistance mEditDistance;
    private double mDistanceThreshold;

    /**
     * Construct an approximate dictionary chunker from the specified
     * dictionary, tokenizer factory, weighted edit distance and
     * distance bound.  The dictionary is used for the candidate
     * matches.  The tokenizer factory is used for determining
     * possible boundaries of matches, which must start on the first
     * character of a token and end on the last character of a token.
     * The edit distance is used for measuring substrings against
     * dictionary entries.  The distance threshold specifies the
     * maximum distance at which matches are returned.
     *
     * @param dictionary Dictionary to use for matching.
     * @param tokenizerFactory Tokenizer factory for boundary
     * determination.
     * @param editDistance Matching distance measure.
     * @param distanceThreshold Distance threshold for matching.
     */
    public ApproxDictionaryChunker(TrieDictionary<String> dictionary,
                                   TokenizerFactory tokenizerFactory,
                                   WeightedEditDistance editDistance,
                                   double distanceThreshold) {
        mDictionary = dictionary;
        mTokenizerFactory = tokenizerFactory;
        mEditDistance = editDistance;
        mDistanceThreshold = distanceThreshold;
    }

    /**
     * Returns the trie dictionary underlying this chunker.  This is
     * the actual dictionary used by the chunker, so changes to it
     * will affect this chunker.
     *
     * @return The trie dictionary underlying this chunker.
     */
    public TrieDictionary<String> dictionary() {
        return mDictionary;
    }

    /**
     * Returns the weighted edit distance for matching with this
     * chunker.  This is the actual edit distance used by the
     * chunker, so changes to it will affect this chunker.
     *
     * @return The weighted edit distance for this chunker.
     */
    public WeightedEditDistance editDistance() {
        return mEditDistance;
    }

    /**
     * Returns the tokenizer factory for matching with this chunker.
     * This is the actual tokenizer factory used by this chunker, so
     * changes to it will affect the behavior of this class.
     *
     * @return The tokenizer factory for this chunker.
     */
    public TokenizerFactory tokenizerFactory() {
        return mTokenizerFactory;
    }

    /**
     * Returns the maximum edit distance a string can be from a
     * dictionary entry in order to be returned by this chunker.
     * This value is set using {@link #setMaxDistance(double)}.
     *
     * @return The maximum edit distance for this chunker.
     */
    public double distanceThreshold() {
        return mDistanceThreshold;
    }

    /**
     * Set the maximum distance a string can be from a dictionary
     * entry in order to be returned as a chunk by this chunker.
     *
     * @param distanceThreshold Distance threshold for matching.
     */
    public void setMaxDistance(double distanceThreshold) {
        mDistanceThreshold = distanceThreshold;
    }

    /**
     * Return the approximate dictionary-based chunking for the
     * specified character sequence.
     *
     * @param cSeq Character sequence to chunk.
     * @return Chunking of the specified character sequence.
     */
    public Chunking chunk(CharSequence cSeq) {
        char[] cs = Strings.toCharArray(cSeq);
        return chunk(cs,0,cs.length);
    }

    /**
     * Return the approximate dictionary-based chunking for the
     * specified character sequence.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in the array.
     * @param end Index of one past the last character in the array.
     * @return Chunking of the specified character sequence.
     * @throws IllegalArgumentException If the indices are out of
     * bounds in the character sequence.
     */
    public Chunking chunk(char[] cs, int start, int end) {
        int length = end-start;

        // token start/ends setup; throws exception if args wrong
        Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,length);
        boolean[] startTokens = new boolean[length];
        boolean[] endTokens = new boolean[length+1];
        Arrays.fill(startTokens,false);
        Arrays.fill(endTokens,false);
        String token;
        while ((token = tokenizer.nextToken()) != null) {
            int lastStart = tokenizer.lastTokenStartPosition();
            startTokens[lastStart] = true;
            endTokens[lastStart + token.length()] = true;
        }

        // dynamic programming over search states, one queue per position
        Map<Dp,Chunk> dpToChunk = new HashMap<Dp,Chunk>();
        Map<SearchState,SearchState> queue
            = new HashMap<SearchState,SearchState>();
        for (int i = 0; i < length; ++i) {
            int startPlusI = start + i;
            char c = cs[startPlusI];
            if (startTokens[i]) {
                // start a new candidate match at each token-initial position
                add(queue,mDictionary.mRootNode,startPlusI,
                    0.0,
                    false,dpToChunk,cs,startPlusI);
            }
            Map<SearchState,SearchState> nextQueue
                = new HashMap<SearchState,SearchState>();
            double deleteCost = -mEditDistance.deleteWeight(c);
            for (SearchState state : queue.values()) {
                // delete
                add(nextQueue,state.mNode,state.mStartIndex,
                    state.mScore + deleteCost,
                    endTokens[i+1],dpToChunk,cs,startPlusI);
                // match or subst
                char[] dtrChars = state.mNode.mDtrChars;
                Node[] dtrNodes = state.mNode.mDtrNodes;
                for (int j = 0; j < dtrChars.length; ++j) {
                    add(nextQueue,dtrNodes[j],state.mStartIndex,
                        state.mScore
                        - (dtrChars[j] == c
                           ? mEditDistance.matchWeight(dtrChars[j])
                           : mEditDistance.substituteWeight(dtrChars[j],c)),
                        endTokens[i+1],dpToChunk,cs,startPlusI);
                }
            }
            queue = nextQueue;
        }
        ChunkingImpl result = new ChunkingImpl(cs,start,end);
        for (Chunk chunk : dpToChunk.values())
            result.add(chunk);
        return result;
    }

    Object writeReplace() {
        return new Serializer(this);
    }

    void add(Map<SearchState,SearchState> nextQueue,
             Node node,
             int startIndex,
             double chunkScore,
             boolean isTokenEnd,
             Map<Dp,Chunk> chunking,
             char[] cs, int end) {

        // prune any search state beyond the distance threshold
        if (chunkScore > mDistanceThreshold) return;

        // keep only the best-scoring state per (node,start) pair
        SearchState state2 = new SearchState(node,startIndex,chunkScore);
        SearchState exState = nextQueue.get(state2);
        if (exState != null && exState.mScore < chunkScore) return;
        nextQueue.put(state2,state2);

        // finish match at token end by adding each cat (may be 0)
        if (isTokenEnd) {
            for (int i = 0; i < node.mEntries.length; ++i) {
                Chunk newChunk
                    = ChunkFactory
                    .createChunk(startIndex,end+1,
                                 node.mEntries[i].category().toString(),
                                 chunkScore);
                Dp dpNewChunk = new Dp(newChunk);
                Chunk oldChunk = chunking.get(dpNewChunk);
                if (oldChunk != null && oldChunk.score() <= chunkScore)
                    continue;
                chunking.remove(dpNewChunk);
                chunking.put(dpNewChunk,newChunk);
            }
        }

        // insert
        for (int i = 0; i < node.mDtrChars.length; ++i)
            add(nextQueue,node.mDtrNodes[i],startIndex,
                chunkScore - mEditDistance.insertWeight(node.mDtrChars[i]),
                isTokenEnd,chunking,cs,end);
    }

    // chunk's data less score for efficient dynamic programming key
    static final class Dp {
        final int mStart;
        final int mEnd;
        final String mType;
        final int mHashCode;
        Dp(Chunk chunk) {
            mStart = chunk.start();
            mEnd = chunk.end();
            mType = chunk.type();
            mHashCode = mStart + 31 * (mEnd + 31 * mType.hashCode());
        }
        @Override
        public int hashCode() {
            return mHashCode;
        }
        @Override
        public boolean equals(Object that) {
            Dp thatDp = (Dp) that;
            return mStart == thatDp.mStart
                && mEnd == thatDp.mEnd
                && mType.equals(thatDp.mType);
        }
    }

    static final class SearchState implements Scored {
        private final double mScore;
        private final Node mNode;
        private final int mStartIndex; // absolute in cs
        SearchState(Node node, int startIndex) {
            this(node,startIndex,0.0);
        }
        SearchState(Node node, int startIndex, double score) {
            mNode = node;
            mStartIndex = startIndex;
            mScore = score;
        }
        public double score() {
            return mScore;
        }
        @Override
        public boolean equals(Object that) {
            SearchState thatState = (SearchState) that;
            return mStartIndex == thatState.mStartIndex
                && mNode == thatState.mNode;
        }
        @Override
        public int hashCode() {
            return mStartIndex; // + 31 * mNode.hashCode();
        }
        @Override
        public String toString() {
            return "SearchState(" + mNode + ", "
                + mStartIndex + ", " + mScore + ")";
        }
    }

    /**
     * This is a weighted edit distance defined by Tsuruoka and
     * Tsujii for matching protein names in biomedical texts.
     * Reproducing table 1 from their paper provides the weighting
     * function (converting slightly to our terminology and scale):
     *
     * <table border="1" cellpadding="5">
     * <caption>Tsuruoka and Tsujii's Weighted Edit Distance</caption>
     * <tr><th>Operation</th><th>Character</th><th>Cost</th></tr>
     * <tr><td rowspan="2">Insertion</td>
     *     <td>space or hyphen</td><td>-10</td></tr>
     * <tr><td>other characters</td><td>-100</td></tr>
     * <tr><td rowspan="2">Deletion</td>
     *     <td>space or hyphen</td><td>-10</td></tr>
     * <tr><td>other characters</td><td>-100</td></tr>
     * <tr><td rowspan="4">Substitution</td>
     *     <td>space for hyphen</td><td>-10</td></tr>
     * <tr><td>digit for other digit</td><td>-10</td></tr>
     * <tr><td>capital for lowercase</td><td>-10</td></tr>
     * <tr><td>other characters</td><td>-50</td></tr>
     * <tr><td>Match</td><td>any character</td><td>0</td></tr>
     * <tr><td>Transposition</td><td>any characters</td>
     *     <td>Double.NEGATIVE_INFINITY</td></tr>
     * </table>
     *
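     * <p>As an illustrative sketch, the measure may be probed
     * directly through the distance method inherited from
     * {@link WeightedEditDistance}:
     *
     * <pre>{@code
     * WeightedEditDistance d = ApproxDictionaryChunker.TT_DISTANCE;
     * // space-for-hyphen substitution: distance 10 per the table
     * System.out.println(d.distance("NF-kappa B", "NF kappa B"));
     * // capital-for-lowercase match: distance 10 per the table
     * System.out.println(d.distance("P53", "p53"));
     * }</pre>
     *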
     * <p>Tsuruoka and Tsujii's paper is available online:
     *
     * <ul>
     * <li>Yoshimasa Tsuruoka and Jun'ichi Tsujii. 2003.  Boosting
     * precision and recall of dictionary-based protein name
     * recognition.  In <i>Proceedings of the 2003 ACL Workshop on
     * NLP in Biomedicine</i>.</li>
     * </ul>
     */
    public static final WeightedEditDistance TT_DISTANCE
        = new TTDistance();

    static final class TTDistance extends WeightedEditDistance {
        @Override
        public double deleteWeight(char cDeleted) {
            return (cDeleted == ' ' || cDeleted == '-')
                ? -10.0
                : -100.0;
        }
        @Override
        public double insertWeight(char cInserted) {
            return deleteWeight(cInserted);
        }
        @Override
        public double matchWeight(char cMatched) {
            return 0.0;
        }
        @Override
        public double substituteWeight(char cDeleted, char cInserted) {
            if (cDeleted == ' ' && cInserted == '-') return -10.0;
            if (cDeleted == '-' && cInserted == ' ') return -10.0;
            if (Character.isDigit(cDeleted) && Character.isDigit(cInserted))
                return -10.0;
            if (Character.toLowerCase(cDeleted)
                == Character.toLowerCase(cInserted))
                return -10.0;
            return -50.0;
        }
        @Override
        public double transposeWeight(char c1, char c2) {
            return Double.NEGATIVE_INFINITY;
        }
    }

    static class Serializer extends AbstractExternalizable {
        static final long serialVersionUID = 3935654738558540166L;
        private final ApproxDictionaryChunker mChunker;
        public Serializer() {
            this(null);
        }
        public Serializer(ApproxDictionaryChunker chunker) {
            mChunker = chunker;
        }
        @Override
        public Object read(ObjectInput in)
            throws IOException, ClassNotFoundException {

            @SuppressWarnings("unchecked")
            TrieDictionary<String> dictionary
                = (TrieDictionary<String>) in.readObject();
            TokenizerFactory tokenizerFactory
                = (TokenizerFactory) in.readObject();
            WeightedEditDistance editDistance
                = (WeightedEditDistance) in.readObject();
            double distanceThreshold = in.readDouble();
            return new ApproxDictionaryChunker(dictionary,
                                               tokenizerFactory,
                                               editDistance,
                                               distanceThreshold);
        }
        @Override
        public void writeExternal(ObjectOutput out) throws IOException {
            out.writeObject(mChunker.mDictionary);
            out.writeObject(mChunker.mTokenizerFactory);
            out.writeObject(mChunker.mEditDistance);
            out.writeDouble(mChunker.mDistanceThreshold);
        }
    }

}



