/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.langdetect;
import java.io.IOException;
import java.io.Serial;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import opennlp.tools.ml.AbstractEventTrainer;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.util.MutableInt;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
/**
* Implements a learnable {@link LanguageDetector}.
*
*
* This will process the entire string when called with
* {@link #predictLanguage(CharSequence)} or
* {@link #predictLanguages(CharSequence)}.
*
*
* If you want this to stop early, use {@link #probingPredictLanguages(CharSequence)}
* or {@link #probingPredictLanguages(CharSequence, LanguageDetectorConfig)}.
* When run in probing mode, this starts at the beginning of the char sequence
* and runs language detection on chunks of text. If the end of the
* string is reached or there are {@link LanguageDetectorConfig#getMinConsecImprovements()}
* consecutive predictions for the best language and the confidence
* increases over those last predictions and if the difference
* in confidence between the highest confidence language
* and the second highest confidence language is greater than
* {@link LanguageDetectorConfig#getMinDiff()}, the language detector will
* stop and report the results.
*
*
* The authors wish to thank Ken Krugler and
* <a href="https://github.com/kkrugler/yalder">Yalder</a>
* for the inspiration for many of the design
* components of this detector.
*
*/
public class LanguageDetectorME implements LanguageDetector {

  @Serial
  private static final long serialVersionUID = 2426614409522429702L;

  // The maxent-backed model whose outcomes are the supported language codes.
  protected final LanguageDetectorModel model;

  // Generates the n-gram context features for a piece of text.
  private final LanguageDetectorContextGenerator mContextGenerator;

  /**
   * Initializes an instance with a specific {@link LanguageDetectorModel}.
   * Default feature generation is used.
   *
   * @param model the {@link LanguageDetectorModel} to be used.
   */
  public LanguageDetectorME(LanguageDetectorModel model) {
    this.model = model;
    this.mContextGenerator = model.getFactory().getContextGenerator();
  }

  @Override
  public Language[] predictLanguages(CharSequence content) {
    return predict(arrayToCounts(mContextGenerator.getContext(content)));
  }

  @Override
  public Language predictLanguage(CharSequence content) {
    return predictLanguages(content)[0];
  }

  @Override
  public String[] getSupportedLanguages() {
    int numberLanguages = model.getMaxentModel().getNumOutcomes();
    String[] languages = new String[numberLanguages];
    for (int i = 0; i < numberLanguages; i++) {
      languages[i] = model.getMaxentModel().getOutcome(i);
    }
    return languages;
  }

  /**
   * This will stop processing early if the stopping criteria
   * specified in {@link LanguageDetectorConfig#DEFAULT_LANGUAGE_DETECTOR_CONFIG}
   * are met.
   *
   * @param content content to be processed
   * @return A computed {@link ProbingLanguageDetectionResult}.
   */
  public ProbingLanguageDetectionResult probingPredictLanguages(CharSequence content) {
    return probingPredictLanguages(content,
        LanguageDetectorConfig.DEFAULT_LANGUAGE_DETECTOR_CONFIG);
  }

  /**
   * This will stop processing early if the stopping criteria
   * specified in the given {@code config} are met.
   *
   * @param content The textual content to process.
   * @param config The {@link LanguageDetectorConfig} to customize detection.
   *
   * @return A computed {@link ProbingLanguageDetectionResult}.
   */
  public ProbingLanguageDetectionResult probingPredictLanguages(CharSequence content,
                                                               LanguageDetectorConfig config) {
    // Queue of the predictions made over the last n chunk detections,
    // used by seenEnough(...) to decide whether to stop early.
    List<Language[]> predictions = new LinkedList<>();
    int start = 0; // where to start the next chunk, in codepoints
    Language[] currPredictions = null;
    // N-gram counts are accumulated across chunks so that each prediction
    // is based on all text seen so far, not just the current chunk.
    Map<CharSequence, MutableInt> ngramCounts = new HashMap<>();
    while (true) {
      // Clamp the chunk so that no more than config.getMaxLength() codepoints
      // are ever consumed in total.
      int actualChunkSize =
          (start + config.getChunkSize() > config.getMaxLength()) ?
              config.getMaxLength() - start : config.getChunkSize();
      StringCPLengthPair chunk = chunk(content, start, actualChunkSize);
      if (chunk.length() == 0) {
        // Out of text (or maxLength reached) before the stop criteria were met.
        if (currPredictions == null) {
          // No chunk was ever processed (empty input): predict on empty counts.
          return new ProbingLanguageDetectionResult(predict(ngramCounts), start);
        } else {
          return new ProbingLanguageDetectionResult(currPredictions, start);
        }
      }
      start += chunk.length();
      updateCounts(mContextGenerator.getContext(chunk.s()), ngramCounts);
      currPredictions = predict(ngramCounts);
      if (seenEnough(predictions, currPredictions, ngramCounts, config)) {
        return new ProbingLanguageDetectionResult(currPredictions, start);
      }
    }
  }

  /**
   * Increments the count in {@code ngrams} by one for every n-gram
   * occurring in {@code context}, inserting new entries as needed.
   *
   * @param context n-gram features produced by the context generator
   * @param ngrams mutable accumulator of n-gram counts
   */
  private void updateCounts(CharSequence[] context, Map<CharSequence, MutableInt> ngrams) {
    for (CharSequence ngram : context) {
      MutableInt i = ngrams.get(ngram);
      if (i == null) {
        ngrams.put(ngram, new MutableInt(1));
      } else {
        i.increment();
      }
    }
  }

  /**
   * Converts an array of n-gram features into a map of per-n-gram counts.
   */
  private Map<CharSequence, MutableInt> arrayToCounts(CharSequence[] context) {
    Map<CharSequence, MutableInt> ngrams = new HashMap<>();
    updateCounts(context, ngrams);
    return ngrams;
  }

  /**
   * Evaluates the maxent model on the given n-gram counts and returns all
   * languages, sorted by descending confidence (best prediction at index 0).
   */
  private Language[] predict(Map<CharSequence, MutableInt> ngramCounts) {
    String[] allGrams = new String[ngramCounts.size()];
    float[] counts = new float[ngramCounts.size()];
    int i = 0;
    for (Map.Entry<CharSequence, MutableInt> e : ngramCounts.entrySet()) {
      allGrams[i] = e.getKey().toString();
      // TODO -- once OPENNLP-1261 is fixed,
      // change this to e.getValue().getValue().
      counts[i] = 1;
      i++;
    }
    double[] eval = model.getMaxentModel().eval(allGrams, counts);
    Language[] arr = new Language[eval.length];
    for (int j = 0; j < eval.length; j++) {
      arr[j] = new Language(model.getMaxentModel().getOutcome(j), eval[j]);
    }
    Arrays.sort(arr, (o1, o2) -> Double.compare(o2.getConfidence(), o1.getConfidence()));
    return arr;
  }

  /**
   * Override this for different behavior to determine if there is enough
   * confidence in the predictions to stop.
   *
   * <p>Stops when the queue holds {@code config.getMinConsecImprovements()}
   * predictions that all agree on the top language with non-decreasing
   * confidence, and (if {@code config.getMinDiff() > 0}) the gap between the
   * top two languages is at least {@code config.getMinDiff()}.
   *
   * @param predictionsQueue queue of earlier predictions; mutated in place
   *                         (bounded to minConsecImprovements + 1 entries)
   * @param newPredictions most recent predictions, sorted best-first
   * @param ngramCounts not currently used, but might be useful to overriders
   * @param config supplies the stopping thresholds
   * @return {@code true} if enough text has been processed to make a determination,
   *         else {@code false}.
   */
  boolean seenEnough(List<Language[]> predictionsQueue, Language[] newPredictions,
                     Map<CharSequence, MutableInt> ngramCounts, LanguageDetectorConfig config) {
    if (predictionsQueue.size() < config.getMinConsecImprovements()) {
      // Not enough history yet to make a determination.
      predictionsQueue.add(newPredictions);
      return false;
    } else if (predictionsQueue.size() > config.getMinConsecImprovements()
        && predictionsQueue.size() > 0) {
      // Keep the queue bounded: drop the oldest prediction.
      predictionsQueue.remove(0);
    }
    predictionsQueue.add(newPredictions);
    if (config.getMinDiff() > 0.0 &&
        newPredictions[0].getConfidence() -
            newPredictions[1].getConfidence() < config.getMinDiff()) {
      // Best and runner-up are still too close to call.
      return false;
    }
    String lastLang = null;
    double lastConf = -1.0;
    // Iterate through the last predictions and check that the language with
    // the highest confidence hasn't changed, and that the confidence in it
    // hasn't decreased.
    for (Language[] predictions : predictionsQueue) {
      if (lastLang == null) {
        lastLang = predictions[0].getLang();
        lastConf = predictions[0].getConfidence();
        continue;
      } else {
        if (!lastLang.equals(predictions[0].getLang())) {
          return false;
        }
        if (lastConf > predictions[0].getConfidence()) {
          return false;
        }
      }
      lastLang = predictions[0].getLang();
      lastConf = predictions[0].getConfidence();
    }
    return true;
  }

  /**
   * Extracts up to {@code chunkSize} codepoints from {@code content},
   * starting at codepoint offset {@code start}.
   *
   * @return the extracted chunk paired with its length in codepoints;
   *         the length is 0 when {@code start} is at or past the end.
   */
  private StringCPLengthPair chunk(CharSequence content, int start, int chunkSize) {
    // Fast path: the whole content fits in one chunk; content.length() is a
    // char count (>= codepoint count), so chunkSize > length() is sufficient.
    if (start == 0 && chunkSize > content.length()) {
      String s = content.toString();
      int codePointLength = s.codePointCount(0, s.length());
      return new StringCPLengthPair(s, codePointLength);
    }
    int[] codepoints = content.codePoints().skip(start).limit(chunkSize).toArray();
    return new StringCPLengthPair(
        new String(codepoints, 0, codepoints.length),
        codepoints.length);
  }

  /**
   * Starts a training of a {@link LanguageDetectorModel} with the given parameters.
   *
   * @param samples The {@link ObjectStream} of {@link LanguageSample} used as input for training.
   * @param mlParams The {@link TrainingParameters} for the context of the training.
   * @param factory The {@link LanguageDetectorFactory} for creating related objects defined
   *                via {@code mlParams}.
   *
   * @return A valid, trained {@link LanguageDetectorModel} instance.
   * @throws IOException Thrown if IO errors occurred.
   */
  public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples,
                                            TrainingParameters mlParams,
                                            LanguageDetectorFactory factory)
      throws IOException {
    Map<String, String> manifestInfoEntries = new HashMap<>();
    // Language detection only needs a single pass over the data.
    mlParams.putIfAbsent(AbstractEventTrainer.DATA_INDEXER_PARAM,
        AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);
    EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);
    MaxentModel model = trainer.train(
        new LanguageDetectorEventStream(samples, factory.getContextGenerator()));
    return new LanguageDetectorModel(model, manifestInfoEntries, factory);
  }

  /**
   * Pairs a string with its precomputed length in Unicode codepoints,
   * which may be smaller than {@link String#length()} for supplementary
   * characters.
   */
  private record StringCPLengthPair(String s, int length) {
  }
}