All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aliasi.crf.ChainCrfChunker Maven / Gradle / Ivy

Go to download

This is the original LingPipe: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.

There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.crf;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.ChunkFactory;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.ChunkingImpl;
import com.aliasi.chunk.ConfidenceChunker;
import com.aliasi.chunk.NBestChunker;
import com.aliasi.chunk.TagChunkCodec;

import com.aliasi.corpus.Corpus;
import com.aliasi.corpus.ObjectHandler;

import com.aliasi.io.Reporter;
import com.aliasi.io.Reporters;

import com.aliasi.stats.AnnealingSchedule;
import com.aliasi.stats.RegressionPrior;

import com.aliasi.tag.ScoredTagging;
import com.aliasi.tag.StringTagging;
import com.aliasi.tag.Tagging;
import com.aliasi.tag.TagLattice;

import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.FeatureExtractor;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Strings;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * A {@code ChainCrfChunker} implements chunking based on a chain CRF
 * over string sequences, a tokenizer factory, and a tag to chunk
 * coder/decoder.
 *
 * 

The tokenizer factory is used to turn an input sequence into * a list of tokens. The codec is used to convert taggings into * chunkings and vice-versa. * *

Codec-Based Features

* *

For chunking, feature extraction is over the same two implicit * data structures as for chain CRFs, nodes and edges. For chunkers, * the labels are coded and decoded by an instance of {@link * TagChunkCodec}, such as the BIO-based codec. In order to generate * token-based representations on which to hang tags, an instance of * {@link TokenizerFactory} is supplied in the chunker constructor. * *

Training

* * The static {@code estimate()} method is used to train a chain * CRF-based chunker. The training data is provided as a corpus of * chunkings. The tag-chunk codec and tokenizer factory are then used * to convert the chunkings to taggings, and the resulting tag corpus * passed off to the chain CRF estimator method. Feature extractors * are the same as for a chain CRF, with one for nodes and one for * edges. The tags passed in to these feature extractors will be * determiend by the tag-chunk codec. The remaining inputs are * identical to those for chain CRFs; see the method documentation for * more information. * *

Decoding

* * A chain CRF chunker implements all three chunker interfaces in * order to return first-best chunkings, n-best chunkings (with or * without normalization of scores to conditional probabilities), and * to iterate over the n-best chunks in decreasing order of * probability. * *

Serialization

* * Chain CRF chunkers are serializable if their contained tokenizer * factories and codecs are serializable. The chunker read back in * will be of this class, {@code ChainCrfChunker}, with components * derived from serialization and deserialization. * *

Thread Safety

* * The chain CRF chunker class is thread safe if the tokenizer * factory and tag/chunk coder/decoder are thread safe. * * @author Bob Carpenter * @version 3.9 * @since LingPipe3.9 */ public class ChainCrfChunker implements Chunker, ConfidenceChunker, NBestChunker, Serializable { static final long serialVersionUID = -2244399751558084581L; private final ChainCrf mCrf; private final TokenizerFactory mTokenizerFactory; private final TagChunkCodec mCodec; /** * Construct a chunker based on the specified chain conditional * random field, tokenizer factory and tag-chunk coder/decoder. If * the codec requires a tokenizer factory, it should be the same * one as supplied to this chunker constructor. * * @param crf Underlying conditional random field. * @param tokenizerFactory Tokenizer factory for converting chunkings * to token sequences. * @param codec Coder/decoder for converting taggings to chunkings * and vice-versa. */ public ChainCrfChunker(ChainCrf crf, TokenizerFactory tokenizerFactory, TagChunkCodec codec) { mCrf = crf; mTokenizerFactory = tokenizerFactory; mCodec = codec; } /** * Returns the underlying CRF for this chunker. * * @return CRF for this chunker. */ public ChainCrf crf() { return mCrf; } /** * Returns the tag/chunk coder/decoder for this chunker. * * @return The tag chunk codec for this chunker. */ public TagChunkCodec codec() { return mCodec; } /** * Return the tokenizer factory for this chunker. * * @return The tokenizer factory for this chunker. */ public TokenizerFactory tokenizerFactory() { return mTokenizerFactory; } /** * Return a string-based representation of this CRF chunker. * * @return String representation of this chunker. 
*/ @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("TagChunkCodec=" + codec()); sb.append("\n"); sb.append("Tokenizer Factory=" + tokenizerFactory()); sb.append("\n"); sb.append("CRF=\n"); sb.append(crf().toString()); return sb.toString(); } public Chunking chunk(CharSequence cSeq) { char[] cs = Strings.toCharArray(cSeq); return chunk(cs,0,cs.length); } public Chunking chunk(char[] cs, int start, int end) { PreTagging preTagging = preTag(cs,start,end); List tokens = preTagging.mTokens; Tagging tagging = mCrf.tag(tokens); return toChunking(tagging,preTagging,cs,start,end,mCodec); } public Iterator> nBest(char[] cs, int start, int end, int maxResults) { PreTagging preTagging = preTag(cs,start,end); List tokens = preTagging.mTokens; Iterator> it = mCrf.tagNBest(tokens,maxResults); return new IteratorWrapper(it,preTagging,cs,start,end,mCodec); } /** * Returns an iterator over n-best chunkings with scores * normalized to conditional probabilities of the output given the * input string slice. The same chunkings will be returned in the * same order as for the unnormalized method, {@link * #nBest(char[],int,int,int)}. Like that method, the maximum number * of results parameter should be set as low as practical, as it * cuts down on memory requirement for outputs that will never be * returned. * *

Conditional probability normalization requires an additional * forward-backward pass to derive the normalizing factor, but the * benefit is that results become comparable across input strings. * * @param cs Underlying characters. * @param start First character in slice. * @param end One past the last character in the slice. * @param maxResults Maximum number of results to return. */ public Iterator> nBestConditional(char[] cs, int start, int end, int maxResults) { PreTagging preTagging = preTag(cs,start,end); List tokens = preTagging.mTokens; Iterator> it = mCrf.tagNBestConditional(tokens,maxResults); return new IteratorWrapper(it,preTagging,cs,start,end,mCodec); } public Iterator nBestChunks(char[] cs, int start, int end, int maxNBest) { PreTagging preTagging = preTag(cs,start,end); List tokens = preTagging.mTokens; TagLattice lattice = mCrf.tagMarginal(tokens); return mCodec.nBestChunks(lattice, preTagging.mTokenStarts, preTagging.mTokenEnds, maxNBest); } PreTagging preTag(char[] cs, int start, int end) { List tokenStarts = new ArrayList(); List tokenEnds = new ArrayList(); List tokens = new ArrayList(); Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,end-start); String token; while ((token = tokenizer.nextToken()) != null) { tokens.add(token); tokenStarts.add(tokenizer.lastTokenStartPosition()); tokenEnds.add(tokenizer.lastTokenEndPosition()); } return new PreTagging(tokens, toArray(tokenStarts), toArray(tokenEnds)); } Object writeReplace() { return new Serializer(this); } static final boolean ALLOW_UNSEEN_TAG_TRANSITIONS = false; /** * Return the chain CRF-based chunker estimated from the specified * corpus, which is converted to a tagging corpus using the * specified coder/decoder and tokenizer factory, then passed to * the chain CRF estimate method along with the rest of the * arguments. * *

Estimation is based on regularized stochastic gradient * descent. See {@link * ChainCrf#estimate(Corpus,ChainCrfFeatureExtractor,boolean,int,boolean,boolean,RegressionPrior,int,AnnealingSchedule,double,int,int,Reporter)} * for more information. * * @param chunkingCorpus Training corpus of chunkings. * @param codec Coder/decoder for translating chunkings to * taggings and vice-versa. * @param tokenizerFactory Tokenizer factory for converting inputs to * token sequences for the underlying chain CRF. * @param featureExtractor Feature extractor for the underlying chain CRF. * @param addInterceptFeature Set to {@code true} to automatically add an * intercept feature with constant value 1.0 in position 0. * @param minFeatureCount Minimum number of times a feature must * show up in the tagging corpus given the feature extractors to * be retained for training. * @param cacheFeatureVectors Flag indicating whether or not to cache * extracted features. * @param prior Prior to use to regularize the underlying chain * CRF estimates. * @param priorBlockSize Number of instances to update by gradeint * for every prior update. * @param annealingSchedule Annealing schedule to determine * learning rates for stochastic gradient descent training. * @param minImprovement Minimum improvement in epoch to terminate training (computed * with a rolling average). * @param minEpochs Minimum number of epochs for which to train. * @param maxEpochs Maximum nubmer of epochs for which to train. * @param reporter Reporter to which reports of training are sent, or * {@code null} for silent operation. * @throws IOException If there is an underlying I/O exception reading * the corpus. 
*/ public static ChainCrfChunker estimate(Corpus> chunkingCorpus, TagChunkCodec codec, TokenizerFactory tokenizerFactory, ChainCrfFeatureExtractor featureExtractor, boolean addInterceptFeature, int minFeatureCount, boolean cacheFeatureVectors, RegressionPrior prior, int priorBlockSize, AnnealingSchedule annealingSchedule, double minImprovement, int minEpochs, int maxEpochs, Reporter reporter) throws IOException { if (reporter == null) reporter = Reporters.silent(); reporter.info("Training chain CRF chunker"); reporter.info("Converting chunk corpus to tag corpus using codec."); Corpus>> taggingCorpus = new TagCorpus(chunkingCorpus,codec); ChainCrf crf = ChainCrf.estimate(taggingCorpus, featureExtractor, addInterceptFeature, minFeatureCount, cacheFeatureVectors, ALLOW_UNSEEN_TAG_TRANSITIONS, prior, priorBlockSize, annealingSchedule, minImprovement, minEpochs, maxEpochs, reporter); return new ChainCrfChunker(crf,tokenizerFactory,codec); } static Chunking toChunking(Tagging tagging, PreTagging preTagging, char[] cs, int start, int end, TagChunkCodec codec) { String s = new String(cs,start,end-start); List tokens = preTagging.mTokens; int[] tokenStarts = preTagging.mTokenStarts; int[] tokenEnds = preTagging.mTokenEnds; List tags = tagging.tags(); StringTagging stringTagging = new StringTagging(tokens,tags,s,tokenStarts,tokenEnds); return codec.toChunking(stringTagging); } static int[] toArray(List xs) { int len = xs.size(); int[] ys = new int[len]; for (int i = 0; i < len; ++i) ys[i] = xs.get(i); return ys; } static class PreTagging { final List mTokens; final int[] mTokenStarts; final int[] mTokenEnds; public PreTagging(List tokens, int[] tokenStarts, int[] tokenEnds) { mTokens = tokens; mTokenStarts = tokenStarts; mTokenEnds = tokenEnds; } } static class ChunkingAdapter implements ObjectHandler { private final ObjectHandler> mTagHandler; private final TagChunkCodec mCodec; public ChunkingAdapter(ObjectHandler> tagHandler, TagChunkCodec codec) { mTagHandler = 
tagHandler; mCodec = codec; } public void handle(Chunking chunking) { Tagging tagging = mCodec.toTagging(chunking); mTagHandler.handle(tagging); } } static class TagCorpus extends Corpus>> { private final Corpus> mChunkingCorpus; private final TagChunkCodec mCodec; public TagCorpus(Corpus> chunkingCorpus, TagChunkCodec codec) { mChunkingCorpus = chunkingCorpus; mCodec = codec; } public void visitTrain(ObjectHandler> handler) throws IOException { mChunkingCorpus.visitTrain(new ChunkingAdapter(handler,mCodec)); } public void visitTest(ObjectHandler> handler) throws IOException { mChunkingCorpus.visitTest(new ChunkingAdapter(handler,mCodec)); } } static class IteratorWrapper implements Iterator> { private final Iterator> mIt; private final PreTagging mPreTagging; private final char[] mCs; private final int mStart; private final int mEnd; private final TagChunkCodec mCodec; IteratorWrapper(Iterator> it, PreTagging preTagging, char[] cs, int start, int end, TagChunkCodec codec) { mIt = it; mPreTagging = preTagging; mCs = cs; mStart = start; mEnd = end; mCodec = codec; } public boolean hasNext() { return mIt.hasNext(); } public void remove() { mIt.remove(); } public ScoredObject next() { ScoredTagging tagging = mIt.next(); double score = tagging.score(); Chunking chunking = toChunking(tagging,mPreTagging,mCs,mStart,mEnd,mCodec); return new ScoredObject(chunking,score); } } static class Serializer extends AbstractExternalizable { static final long serialVersionUID = 2460314741682974199L; private final ChainCrfChunker mChunker; public Serializer() { this(null); } public Serializer(ChainCrfChunker chunker) { mChunker = chunker; } public Object read(ObjectInput in) throws IOException, ClassNotFoundException { @SuppressWarnings("unchecked") ChainCrf crf = (ChainCrf) in.readObject(); @SuppressWarnings("unchecked") TokenizerFactory factory = (TokenizerFactory) in.readObject(); @SuppressWarnings("unchecked") TagChunkCodec codec = (TagChunkCodec) in.readObject(); return new 
ChainCrfChunker(crf,factory,codec); } public void writeExternal(ObjectOutput out) throws IOException { out.writeObject(mChunker.mCrf); out.writeObject(mChunker.mTokenizerFactory); out.writeObject(mChunker.mCodec); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy