com.optimaize.langdetect.LanguageDetectorImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of language-detector Show documentation
Language Detection Library for Java.
package com.optimaize.langdetect;
import com.cybozu.labs.langdetect.util.Util;
import com.google.common.base.Optional;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
*
*
* This class is immutable and thus thread-safe.
*
* @author Nakatani Shuyo
* @author Fabian Kessler
* @author Elmer Garduno
*/
public final class LanguageDetectorImpl implements LanguageDetector {
private static final Logger logger = LoggerFactory.getLogger(LanguageDetectorImpl.class);
/**
* TODO document what this is for, and why that value is chosen.
*/
private static final double ALPHA_WIDTH = 0.05;
/**
* TODO document what this is for, and why that value is chosen.
*/
private static final int ITERATION_LIMIT = 1000;
/**
* TODO document what this is for, and why that value is chosen.
*/
private static final double CONV_THRESHOLD = 0.99999;
/**
* TODO document what this is for, and why that value is chosen.
*/
private static final int BASE_FREQ = 10000;
/**
* TODO document what this is for, and why that value is chosen.
*/
private static final int N_TRIAL = 7;
@NotNull
private final NgramFrequencyData ngramFrequencyData;
/**
* User-defined language priorities, in the same order as {@code langlist}.
*/
@Nullable
private final double[] priorMap;
private final double alpha;
private final int shortTextAlgorithm;
private final double prefixFactor;
private final double suffixFactor;
private final double probabilityThreshold;
private final double minimalConfidence;
private final NgramExtractor ngramExtractor;
/**
* Use the {@link LanguageDetectorBuilder}.
*/
LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData,
double alpha, int shortTextAlgorithm,
double prefixFactor, double suffixFactor,
double probabilityThreshold,
double minimalConfidence,
@Nullable Map langWeightingMap,
@NotNull NgramExtractor ngramExtractor) {
if (alpha<0d || alpha >1d) throw new IllegalArgumentException(""+alpha);
if (prefixFactor <0d || prefixFactor >10d) throw new IllegalArgumentException(""+ prefixFactor);
if (suffixFactor <0d || suffixFactor >10d) throw new IllegalArgumentException(""+ suffixFactor);
if (probabilityThreshold<0d || probabilityThreshold>1d) throw new IllegalArgumentException(""+probabilityThreshold);
if (minimalConfidence<0d || minimalConfidence>1d) throw new IllegalArgumentException(""+minimalConfidence);
if (langWeightingMap!=null && langWeightingMap.isEmpty()) langWeightingMap = null;
this.ngramFrequencyData = ngramFrequencyData;
this.alpha = alpha;
this.shortTextAlgorithm = shortTextAlgorithm;
this.prefixFactor = prefixFactor;
this.suffixFactor = suffixFactor;
this.probabilityThreshold = probabilityThreshold;
this.minimalConfidence = minimalConfidence;
this.priorMap = (langWeightingMap==null) ? null : Util.makeInternalPrioMap(langWeightingMap, ngramFrequencyData.getLanguageList());
this.ngramExtractor = ngramExtractor;
}
@Override
public Optional detect(CharSequence text) {
List probabilities = getProbabilities(text);
if (probabilities.isEmpty()) {
return Optional.absent();
} else {
DetectedLanguage best = probabilities.get(0);
if (best.getProbability() >= minimalConfidence) {
return Optional.of(best.getLanguage());
} else {
return Optional.absent();
}
}
}
@Override
public List getProbabilities(CharSequence text) {
double[] langprob = detectBlock(text);
if (langprob==null) {
return Collections.emptyList();
} else {
return sortProbability(langprob);
}
}
/**
* @return null if there are no "features" in the text (just noise).
*/
@Nullable
private double[] detectBlock(CharSequence text) {
if (text.length() <= shortTextAlgorithm) {
Map ngrams = ngramExtractor.extractCountedGrams(text);
if (ngrams.isEmpty()) return null;
return detectBlockShortText(ngrams);
} else {
List strings = ngramExtractor.extractGrams(text);
if (strings.isEmpty()) return null;
return detectBlockLongText(strings);
}
}
/**
*/
private double[] detectBlockShortText(Map ngrams) {
double[] prob = initProbability();
double alpha = this.alpha; //TODO I don't understand what this does.
for (Map.Entry gramWithCount : ngrams.entrySet()) {
updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha);
}
Util.normalizeProb(prob);
if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob));
return prob;
}
/**
* This is the original algorithm used for all text length.
* It is inappropriate for short text.
*/
private double[] detectBlockLongText(List ngrams) {
assert !ngrams.isEmpty();
double[] langprob = new double[ngramFrequencyData.getLanguageList().size()];
Random rand = new Random();
for (int t = 0; t < N_TRIAL; ++t) {
double[] prob = initProbability();
double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH);
for (int i=0; i CONV_THRESHOLD) break; //this looks like an optimization to return quickly when sure. TODO document what's the plan.
if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob));
}
}
for(int j=0;j " + sortProbability(prob));
}
return langprob;
}
/**
* Initialize the map of language probabilities.
* If there is the specified prior map, use it as initial map.
* @return initialized map of language probabilities
*/
private double[] initProbability() {
double[] prob = new double[ngramFrequencyData.getLanguageList().size()];
if (priorMap != null) {
//TODO analyze and optimize this code, looks like double copy.
System.arraycopy(priorMap, 0, prob, 0, prob.length);
for(int i=0;i1) {
if (prefixFactor !=1.0 && ngram.charAt(0)==' ') {
weight *= prefixFactor;
} else if (suffixFactor!=1.0 && ngram.charAt(ngram.length()-1)==' ') {
weight *= suffixFactor;
}
}
for (int i=0; i sortProbability(double[] prob) {
List list = new ArrayList<>();
for (int j=0;j= probabilityThreshold) {
for (int i=0; i<=list.size(); ++i) {
if (i == list.size() || list.get(i).getProbability() < p) {
list.add(i, new DetectedLanguage(ngramFrequencyData.getLanguage(j), p));
break;
}
}
}
}
return list;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy