All downloads are free. The search and download functionality uses the official Maven repository.

opennlp.tools.tokenize.TokenizerME — Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
import opennlp.tools.util.TrainingParameters;

/**
 * A {@link Tokenizer} for converting raw text into separated tokens. It uses
 * Maximum Entropy to make its decisions. The features are loosely
 * based off of Jeff Reynar's UPenn thesis "Topic Segmentation:
 * Algorithms and Applications.", which is available from his
 * homepage: http://www.cis.upenn.edu/~jcreynar.
 * <p>
 * This implementation needs a statistical model to tokenize a text which reproduces
 * the tokenization observed in the training data used to create the model.
 * The {@link TokenizerModel} class encapsulates that model and provides
 * methods to create it from the binary representation.
 * <p>
 * A tokenizer instance is not thread-safe. For each thread, one tokenizer
 * must be instantiated which can share one {@link TokenizerModel} instance
 * to save memory.
 * <p>
 * To train a new model, the {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)}
 * method can be used.
 * <p>
 * Sample usage:
 * <pre>
 * InputStream modelIn;
 * ...
 * TokenizerModel model = new TokenizerModel(modelIn);
 * Tokenizer tokenizer = new TokenizerME(model);
 * String[] tokens = tokenizer.tokenize("A sentence to be tokenized.");
 * </pre>
 *
* @see Tokenizer * @see TokenizerModel * @see TokenSample */ public class TokenizerME extends AbstractTokenizer { /** * Constant indicates a token split. */ public static final String SPLIT = "T"; /** * Constant indicates no token split. */ public static final String NO_SPLIT = "F"; private final Pattern alphanumeric; /* * The maximum entropy model to use to evaluate contexts. */ private final MaxentModel model; /* * The context generator. */ private final TokenContextGenerator cg; /* * Optimization flag to skip alphanumeric tokens for further tokenization */ private final boolean useAlphaNumericOptimization; /* * List of probabilities for each token returned from a call to * tokenize or tokenizePos. */ private final List tokProbs; private final List newTokens; /* * The {@link Dictionary abbreviation dictionary} if available (may be {@code null}). */ private final Dictionary abbDict; /** * Initializes a {@link TokenizerME} by downloading a default model. * @param language The language of the tokenizer. * @throws IOException Thrown if the model cannot be downloaded or saved. */ public TokenizerME(String language) throws IOException { this(DownloadUtil.downloadModel(language, DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class)); } /** * Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}. * * @param model The {@link TokenizerModel} to be used. */ public TokenizerME(TokenizerModel model) { TokenizerFactory factory = model.getFactory(); this.alphanumeric = factory.getAlphaNumericPattern(); this.cg = factory.getContextGenerator(); this.model = model.getMaxentModel(); this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization(); abbDict = model.getAbbreviations(); newTokens = new ArrayList<>(); tokProbs = new ArrayList<>(50); } /** * @return the probabilities associated with the most recent calls to * {@link TokenizerME#tokenize(String)} or {@link TokenizerME#tokenizePos(String)}. * If not applicable an empty array is returned. 
*/ public double[] getTokenProbabilities() { double[] tokProbArray = new double[tokProbs.size()]; for (int i = 0; i < tokProbArray.length; i++) { tokProbArray[i] = tokProbs.get(i); } return tokProbArray; } /** * Tokenizes the string. * * @param d The string to be tokenized. * * @return A {@link Span} array containing individual tokens as elements. */ @Override public Span[] tokenizePos(String d) { WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE; whitespaceTokenizer.setKeepNewLines(keepNewLines); Span[] tokens = whitespaceTokenizer.tokenizePos(d); newTokens.clear(); tokProbs.clear(); for (Span s : tokens) { String tok = d.substring(s.getStart(), s.getEnd()); // Can't tokenize single characters if (tok.length() < 2) { newTokens.add(s); tokProbs.add(1d); } else if (useAlphaNumericOptimization() && alphanumeric.matcher(tok).matches()) { newTokens.add(s); tokProbs.add(1d); } else { int start = s.getStart(); int end = s.getEnd(); final int origStart = s.getStart(); double tokenProb = 1.0; for (int j = origStart + 1; j < end; j++) { double[] probs = model.eval(cg.getContext(tok, j - origStart)); String best = model.getBestOutcome(probs); tokenProb *= probs[model.getIndex(best)]; if (best.equals(TokenizerME.SPLIT)) { if (isAcceptableAbbreviation(tok)) { newTokens.add(new Span(start, end)); tokProbs.add(tokenProb); long numberOfDots = tok.codePoints().filter(ch -> ch == '.').count(); j = j + (int) numberOfDots; // To compensate for abbreviation dot(s) start = j + 1; } else { newTokens.add(new Span(start, j)); tokProbs.add(tokenProb); start = j; } tokenProb = 1.0; } } if (start < end) { newTokens.add(new Span(start, end)); tokProbs.add(tokenProb); } } } Span[] spans = new Span[newTokens.size()]; newTokens.toArray(spans); return spans; } /** * Trains a model for the {@link TokenizerME}. * * @param samples The samples used for the training. * @param factory A {@link TokenizerFactory} to get resources from. 
* @param mlParams The machine learning {@link TrainingParameters train parameters}. * @return A trained {@link TokenizerModel}. * @throws IOException Thrown during IO operations on a temp file which is created * during training. Or if reading from the {@link ObjectStream} fails. */ public static TokenizerModel train(ObjectStream samples, TokenizerFactory factory, TrainingParameters mlParams) throws IOException { Map manifestInfoEntries = new HashMap<>(); ObjectStream eventStream = new TokSpanEventStream(samples, factory.isUseAlphaNumericOptimization(), factory.getAlphaNumericPattern(), factory.getContextGenerator()); EventTrainer trainer = TrainerFactory.getEventTrainer( mlParams, manifestInfoEntries); MaxentModel maxentModel = trainer.train(eventStream); return new TokenizerModel(maxentModel, manifestInfoEntries, factory); } /** * @return {@code true} if the tokenizer uses alphanumeric optimization, {@code false} otherwise. */ public boolean useAlphaNumericOptimization() { return useAlphaNumericOptimization; } /** * Allows checking a token abbreviation candidate for acceptability. * *

Note: The implementation always returns {@code false} if no * abbreviation dictionary is available for the underlying model.

* * @param s the {@link CharSequence token} to check for. * @return {@code true} if the candidate is acceptable, {@code false} otherwise. */ protected boolean isAcceptableAbbreviation(CharSequence s) { if (abbDict == null) return false; return abbDict.contains(new StringList(s.toString())); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy