All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.optimaize.langdetect.LanguageDetectorImpl Maven / Gradle / Ivy

The newest version!
package com.optimaize.langdetect;

import com.optimaize.langdetect.cybozu.util.Util;
import com.google.common.base.Optional;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
 *
 *
 * 

This class is immutable and thus thread-safe.

* * @author Nakatani Shuyo * @author Fabian Kessler * @author Elmer Garduno */ public final class LanguageDetectorImpl implements LanguageDetector { private static final Logger logger = LoggerFactory.getLogger(LanguageDetectorImpl.class); /** * TODO document what this is for, and why that value is chosen. */ private static final double ALPHA_WIDTH = 0.05; /** * TODO document what this is for, and why that value is chosen. */ private static final int ITERATION_LIMIT = 1000; /** * TODO document what this is for, and why that value is chosen. */ private static final double CONV_THRESHOLD = 0.99999; /** * TODO document what this is for, and why that value is chosen. */ private static final int BASE_FREQ = 10000; /** * TODO document what this is for, and why that value is chosen. */ private static final int N_TRIAL = 7; /** * This is used when no custom seed was passed in. * By using the same seed for different calls, the results are consistent also. * * Changing this number means that users of the library might suddenly see other results after updating. * So don't change it hastily. I chose a prime number *clueless*. * See https://github.com/optimaize/language-detector/issues/14 */ private static final long DEFAULT_SEED = 41L; private static final Comparator PROBABILITY_SORTING_COMPARATOR = new Comparator() { public int compare(DetectedLanguage a, DetectedLanguage b) { return Double.compare(b.getProbability(), a.getProbability()); } }; @NotNull private final NgramFrequencyData ngramFrequencyData; /** * User-defined language priorities, in the same order as {@code langlist}. */ @Nullable private final double[] priorMap; private final double alpha; private final Optional seed; private final int shortTextAlgorithm; private final double prefixFactor; private final double suffixFactor; private final double probabilityThreshold; private final double minimalConfidence; private final NgramExtractor ngramExtractor; /** * Use the {@link LanguageDetectorBuilder}. */ LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData, double alpha, Optional seed, int shortTextAlgorithm, double prefixFactor, double suffixFactor, double probabilityThreshold, double minimalConfidence, @Nullable Map langWeightingMap, @NotNull NgramExtractor ngramExtractor) { if (alpha<0d || alpha >1d) throw new IllegalArgumentException("alpha must be between 0 and 1, but was: "+alpha); if (prefixFactor <0d || prefixFactor >10d) throw new IllegalArgumentException("prefixFactor must be between 0 and 10, but was: "+prefixFactor); if (suffixFactor <0d || suffixFactor >10d) throw new IllegalArgumentException("suffixFactor must be between 0 and 10, but was: "+suffixFactor); if (probabilityThreshold<0d || probabilityThreshold>1d) throw new IllegalArgumentException("probabilityThreshold must be between 0 and 1, but was: "+probabilityThreshold); if (minimalConfidence<0d || minimalConfidence>1d) throw new IllegalArgumentException("minimalConfidence must be between 0 and 1, but was: "+minimalConfidence); if (langWeightingMap!=null && langWeightingMap.isEmpty()) langWeightingMap = null; this.ngramFrequencyData = ngramFrequencyData; this.alpha = alpha; this.seed = seed; this.shortTextAlgorithm = shortTextAlgorithm; this.prefixFactor = prefixFactor; this.suffixFactor = suffixFactor; this.probabilityThreshold = probabilityThreshold; this.minimalConfidence = minimalConfidence; this.priorMap = (langWeightingMap==null) ? null : Util.makeInternalPrioMap(langWeightingMap, ngramFrequencyData.getLanguageList()); this.ngramExtractor = ngramExtractor; } @Override public Optional detect(CharSequence text) { List probabilities = getProbabilities(text); if (probabilities.isEmpty()) { return Optional.absent(); } else { DetectedLanguage best = probabilities.get(0); if (best.getProbability() >= minimalConfidence) { return Optional.of(best.getLocale()); } else { return Optional.absent(); } } } @Override public List getProbabilities(CharSequence text) { double[] langprob = detectBlock(text); if (langprob==null) { return Collections.emptyList(); } else { return sortProbability(langprob); } } /** * @return null if there are no "features" in the text (just noise). */ @Nullable private double[] detectBlock(CharSequence text) { if (text.length() <= shortTextAlgorithm) { Map ngrams = ngramExtractor.extractCountedGrams(text); if (ngrams.isEmpty()) return null; return detectBlockShortText(ngrams); } else { List strings = ngramExtractor.extractGrams(text); if (strings.isEmpty()) return null; return detectBlockLongText(strings); } } /** */ private double[] detectBlockShortText(Map ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; } /** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j " + sortProbability(prob)); } return langprob; } /** * Initialize the map of language probabilities. * If there is the specified prior map, use it as initial map. * @return initialized map of language probabilities */ private double[] initProbability() { double[] prob = new double[ngramFrequencyData.getLanguageList().size()]; if (priorMap != null) { //TODO analyze and optimize this code, looks like double copy. System.arraycopy(priorMap, 0, prob, 0, prob.length); for(int i=0;i1) { if (prefixFactor !=1.0 && ngram.charAt(0)==' ') { weight *= prefixFactor; } else if (suffixFactor!=1.0 && ngram.charAt(ngram.length()-1)==' ') { weight *= suffixFactor; } } for (int i=0; i sortProbability(double[] prob) { List list = new ArrayList<>(); //step 1: add all that have reached a minimal probability: for (int j=0;j= probabilityThreshold) { list.add(new DetectedLanguage(ngramFrequencyData.getLanguage(j), p)); } } //step 2: sort in descending order if (list.size() >= 2) { Collections.sort(list, PROBABILITY_SORTING_COMPARATOR); } return list; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy