
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize;

import java.io.IOException;
import java.io.ObjectStreamException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import opennlp.maxent.GIS;
import opennlp.maxent.GISModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;

/**
 * A Tokenizer for converting raw text into separated tokens.  It uses
 * Maximum Entropy to make its decisions.  The features are loosely
 * based off of Jeff Reynar's UPenn thesis "Topic Segmentation:
 * Algorithms and Applications.", which is available from his
 * homepage.
 * <p>
 * This tokenizer needs a statistical model to tokenize a text which reproduces
 * the tokenization observed in the training data used to create the model.
 * The {@link TokenizerModel} class encapsulates the model and provides
 * methods to create it from the binary representation.
 * <p>
 * A tokenizer instance is not thread safe. For each thread one tokenizer
 * must be instantiated which can share one {@link TokenizerModel} instance
 * to save memory.
 * <p>
 * To train a new model the {@link #train(String, ObjectStream, boolean)} method
 * can be used.
 * <p>
 * Sample usage:
 * <pre>
 * InputStream modelIn;
 *
 * ...
 *
 * TokenizerModel model = new TokenizerModel(modelIn);
 *
 * Tokenizer tokenizer = new TokenizerME(model);
 *
 * String tokens[] = tokenizer.tokenize("A sentence to be tokenized.");
 * </pre>
 *
 * @see Tokenizer
 * @see TokenizerModel
 * @see TokenSample
 */
public class TokenizerME extends AbstractTokenizer {

  /**
   * Constant indicates a token split.
   */
  public static final String SPLIT = "T";

  /**
   * Constant indicates no token split.
   */
  public static final String NO_SPLIT = "F";

  /**
   * Alpha-Numeric Pattern
   */
  public static final Pattern alphaNumeric = Pattern.compile("^[A-Za-z0-9]+$");

  /**
   * The maximum entropy model to use to evaluate contexts.
   */
  private MaxentModel model;

  /**
   * The context generator.
   */
  private final TokenContextGenerator cg = new DefaultTokenContextGenerator();

  /**
   * Optimization flag to skip alpha numeric tokens for further
   * tokenization.
   */
  private boolean useAlphaNumericOptimization;

  /**
   * List of probabilities for each token returned from a call to
   * tokenize or tokenizePos.
   */
  private List<Double> tokProbs;

  private List<Span> newTokens;

  public TokenizerME(TokenizerModel model) {
    this.model = model.getMaxentModel();
    useAlphaNumericOptimization = model.useAlphaNumericOptimization();

    newTokens = new ArrayList<Span>();
    tokProbs = new ArrayList<Double>(50);
  }

  /**
   * Returns the probabilities associated with the most recent
   * calls to {@link TokenizerME#tokenize(String)} or {@link TokenizerME#tokenizePos(String)}.
   *
   * @return probability for each token returned for the most recent
   *     call to tokenize. If not applicable an empty array is returned.
   */
  public double[] getTokenProbabilities() {
    double[] tokProbArray = new double[tokProbs.size()];
    for (int i = 0; i < tokProbArray.length; i++) {
      tokProbArray[i] = tokProbs.get(i);
    }
    return tokProbArray;
  }

  /**
   * Tokenizes the string.
   *
   * @param d The string to be tokenized.
   *
   * @return A span array containing individual tokens as elements.
   */
  public Span[] tokenizePos(String d) {
    // Start from whitespace-separated fragments and let the maxent model
    // decide, for each in-fragment position, whether to split there.
    Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
    newTokens.clear();
    tokProbs.clear();
    for (int i = 0, il = tokens.length; i < il; i++) {
      Span s = tokens[i];
      String tok = d.substring(s.getStart(), s.getEnd());
      // Can't tokenize single characters
      if (tok.length() < 2) {
        newTokens.add(s);
        tokProbs.add(1d);
      }
      else if (useAlphaNumericOptimization() && alphaNumeric.matcher(tok).matches()) {
        newTokens.add(s);
        tokProbs.add(1d);
      }
      else {
        int start = s.getStart();
        int end = s.getEnd();
        final int origStart = s.getStart();
        double tokenProb = 1.0;
        for (int j = origStart + 1; j < end; j++) {
          double[] probs =
              model.eval(cg.getContext(tok, j - origStart));
          String best = model.getBestOutcome(probs);
          tokenProb *= probs[model.getIndex(best)];
          if (best.equals(TokenizerME.SPLIT)) {
            newTokens.add(new Span(start, j));
            tokProbs.add(tokenProb);
            start = j;
            tokenProb = 1.0;
          }
        }
        newTokens.add(new Span(start, end));
        tokProbs.add(tokenProb);
      }
    }

    Span[] spans = new Span[newTokens.size()];
    newTokens.toArray(spans);
    return spans;
  }

  /**
   * Trains a model for the {@link TokenizerME}.
   *
   * @param languageCode the language of the natural text
   * @param samples the samples used for the training
   * @param useAlphaNumericOptimization if true alpha numerics are skipped
   * @param cutoff number of times a feature must be seen to be considered
   * @param iterations number of iterations to train the maxent model
   *
   * @return the trained {@link TokenizerModel}
   *
   * @throws IOException if an {@link IOException} is thrown during IO operations
   *     on a temp file which is created during training, or if reading from
   *     the {@link ObjectStream} fails
   */
  public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
      boolean useAlphaNumericOptimization, int cutoff, int iterations) throws IOException {

    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);

    EventStream eventStream = new TokSpanEventStream(samples,
        useAlphaNumericOptimization);

    HashSumEventStream hses = new HashSumEventStream(eventStream);

    GISModel maxentModel =
        GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));

    manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
        hses.calculateHashSum().toString(16));

    return new TokenizerModel(languageCode, maxentModel,
        useAlphaNumericOptimization, manifestInfoEntries);
  }

  /**
   * Trains a model for the {@link TokenizerME} with a default cutoff of 5 and 100 iterations.
   *
   * @param languageCode the language of the natural text
   * @param samples the samples used for the training
   * @param useAlphaNumericOptimization if true alpha numerics are skipped
   *
   * @return the trained {@link TokenizerModel}
   *
   * @throws IOException if an {@link IOException} is thrown during IO operations
   *     on a temp file which is created during training
   * @throws ObjectStreamException if reading from the {@link ObjectStream} fails
   */
  public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
      boolean useAlphaNumericOptimization) throws IOException, ObjectStreamException {
    return train(languageCode, samples, useAlphaNumericOptimization, 5, 100);
  }

  /**
   * Returns the value of the alpha-numeric optimization flag.
   *
   * @return true if the tokenizer should use alpha-numeric optimization, false otherwise.
   */
  public boolean useAlphaNumericOptimization() {
    return useAlphaNumericOptimization;
  }
}
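
// --- Hedged usage sketches, not part of the original Apache OpenNLP source ---

// A minimal training sketch for the train(...) method documented above. The input
// file name "train.tok", the language code "en", and the output file name
// "en-token.bin" are assumptions; the file is expected to contain one sentence
// per line with token splits marked by the <SPLIT> tag, the format parsed by
// TokenSampleStream.
class TokenizerTrainingSketch {

  public static void main(String[] args) throws IOException {
    java.io.Reader in = new java.io.InputStreamReader(
        new java.io.FileInputStream("train.tok"), "UTF-8");

    // Wrap the annotated lines into TokenSample objects.
    ObjectStream<TokenSample> samples =
        new TokenSampleStream(new opennlp.tools.util.PlainTextByLineStream(in));

    // Train with alpha-numeric optimization enabled, cutoff 5, 100 iterations.
    TokenizerModel model = TokenizerME.train("en", samples, true, 5, 100);
    in.close();

    // Persist the model so it can later be loaded with new TokenizerModel(InputStream).
    java.io.OutputStream modelOut = new java.io.FileOutputStream("en-token.bin");
    model.serialize(modelOut);
    modelOut.close();
  }
}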
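
// A minimal sketch of the threading rule from the class documentation: TokenizerME
// is not thread safe, so each thread instantiates its own tokenizer, while a single
// TokenizerModel instance is shared to save memory, as the documentation permits.
// The model file name "en-token.bin" is an assumption.
class SharedModelSketch {

  public static void main(String[] args) throws Exception {
    java.io.InputStream modelIn = new java.io.FileInputStream("en-token.bin");
    final TokenizerModel model = new TokenizerModel(modelIn);
    modelIn.close();

    Runnable work = new Runnable() {
      public void run() {
        // One TokenizerME per thread; only the model is shared.
        Tokenizer tokenizer = new TokenizerME(model);
        String tokens[] = tokenizer.tokenize("A sentence to be tokenized.");
        System.out.println(Thread.currentThread().getName() + ": " + tokens.length + " tokens");
      }
    };

    new Thread(work).start();
    new Thread(work).start();
  }
}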




