com.optimaize.langdetect.LanguageDetectorImpl Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of language-detector Show documentation
Language Detection Library for Java.
The newest version!
package com.optimaize.langdetect;

import com.optimaize.langdetect.cybozu.util.Util;
import com.google.common.base.Optional;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
 *
 *
 * This class is immutable and thus thread-safe.
 *
 * @author Nakatani Shuyo
 * @author Fabian Kessler
 * @author Elmer Garduno
 */
public final class LanguageDetectorImpl implements LanguageDetector {
    private static final Logger logger = LoggerFactory.getLogger(LanguageDetectorImpl.class);

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final double ALPHA_WIDTH = 0.05;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final int ITERATION_LIMIT = 1000;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final double CONV_THRESHOLD = 0.99999;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final int BASE_FREQ = 10000;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final int N_TRIAL = 7;

    /**
     * This is used when no custom seed was passed in.
     * By using the same seed for different calls, the results are consistent also.
     *
     * Changing this number means that users of the library might suddenly see other results after updating.
     * So don't change it hastily. I chose a prime number *clueless*.
     * See https://github.com/optimaize/language-detector/issues/14
     */
    private static final long DEFAULT_SEED = 41L;

    private static final Comparator PROBABILITY_SORTING_COMPARATOR = new Comparator() {
        public int compare(DetectedLanguage a, DetectedLanguage b) {
            return Double.compare(b.getProbability(), a.getProbability());
        }
    };

    @NotNull
    private final NgramFrequencyData ngramFrequencyData;

    /**
     * User-defined language priorities, in the same order as {@code langlist}.
     */
    @Nullable
    private final double[] priorMap;

    private final double alpha;
    private final Optional seed;
    private final int shortTextAlgorithm;
    private final double prefixFactor;
    private final double suffixFactor;

    private final double probabilityThreshold;
    private final double minimalConfidence;

    private final NgramExtractor ngramExtractor;


    /**
     * Use the {@link LanguageDetectorBuilder}.
     */
    LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData,
                         double alpha, Optional seed, int shortTextAlgorithm,
                         double prefixFactor, double suffixFactor,
                         double probabilityThreshold,
                         double minimalConfidence,
                         @Nullable Map langWeightingMap,
                         @NotNull NgramExtractor ngramExtractor) {
        if (alpha<0d || alpha >1d) throw new IllegalArgumentException("alpha must be between 0 and 1, but was: "+alpha);
        if (prefixFactor <0d || prefixFactor >10d) throw new IllegalArgumentException("prefixFactor must be between 0 and 10, but was: "+prefixFactor);
        if (suffixFactor <0d || suffixFactor >10d) throw new IllegalArgumentException("suffixFactor must be between 0 and 10, but was: "+suffixFactor);
        if (probabilityThreshold<0d || probabilityThreshold>1d) throw new IllegalArgumentException("probabilityThreshold must be between 0 and 1, but was: "+probabilityThreshold);
        if (minimalConfidence<0d || minimalConfidence>1d) throw new IllegalArgumentException("minimalConfidence must be between 0 and 1, but was: "+minimalConfidence);
        if (langWeightingMap!=null && langWeightingMap.isEmpty()) langWeightingMap = null;

        this.ngramFrequencyData = ngramFrequencyData;
        this.alpha = alpha;
        this.seed = seed;
        this.shortTextAlgorithm = shortTextAlgorithm;
        this.prefixFactor = prefixFactor;
        this.suffixFactor = suffixFactor;
        this.probabilityThreshold = probabilityThreshold;
        this.minimalConfidence = minimalConfidence;
        this.priorMap = (langWeightingMap==null) ? null : Util.makeInternalPrioMap(langWeightingMap, ngramFrequencyData.getLanguageList());
        this.ngramExtractor = ngramExtractor;
    }


    @Override
    public Optional detect(CharSequence text) {
        List probabilities = getProbabilities(text);
        if (probabilities.isEmpty()) {
            return Optional.absent();
        } else {
            DetectedLanguage best = probabilities.get(0);
            if (best.getProbability() >= minimalConfidence) {
                return Optional.of(best.getLocale());
            } else {
                return Optional.absent();
            }
        }
    }

    @Override
    public List getProbabilities(CharSequence text) {
        double[] langprob = detectBlock(text);
        if (langprob==null) {
            return Collections.emptyList();
        } else {
            return sortProbability(langprob);
        }
    }


    /**
     * @return null if there are no "features" in the text (just noise).
     */
    @Nullable
    private double[] detectBlock(CharSequence text) {
        if (text.length() <= shortTextAlgorithm) {
            Map ngrams = ngramExtractor.extractCountedGrams(text);
            if (ngrams.isEmpty()) return null;
            return detectBlockShortText(ngrams);
        } else {
            List strings = ngramExtractor.extractGrams(text);
            if (strings.isEmpty()) return null;
            return detectBlockLongText(strings);
        }
    }

    /**
     */
    private double[] detectBlockShortText(Map ngrams) {
        double[] prob = initProbability();
        double alpha = this.alpha; //TODO I don't understand what this does.
        for (Map.Entry gramWithCount : ngrams.entrySet()) {
            updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha);
            if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0
        }
        Util.normalizeProb(prob);
        if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob));
        return prob;
    }

    /**
     * This is the original algorithm used for all text length.
     * It is inappropriate for short text.
     */
    private double[] detectBlockLongText(List ngrams) {
        assert !ngrams.isEmpty();
        double[] langprob = new double[ngramFrequencyData.getLanguageList().size()];
        Random rand = new Random(seed.or(DEFAULT_SEED));
        for (int t = 0; t < N_TRIAL; ++t) {
            double[] prob = initProbability();
            double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH);

            for (int i=0; i CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0
                    if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob));
                }
            }
            for(int j=0;j " + sortProbability(prob));
        }
        return langprob;
    }

    /**
     * Initialize the map of language probabilities.
     * If there is the specified prior map, use it as initial map.
     * @return initialized map of language probabilities
     */
    private double[] initProbability() {
        double[] prob = new double[ngramFrequencyData.getLanguageList().size()];
        if (priorMap != null) {
            //TODO analyze and optimize this code, looks like double copy.
            System.arraycopy(priorMap, 0, prob, 0, prob.length);
            for(int i=0;i1) {
            if (prefixFactor !=1.0 && ngram.charAt(0)==' ') {
                weight *= prefixFactor;
            } else if (suffixFactor!=1.0 && ngram.charAt(ngram.length()-1)==' ') {
                weight *= suffixFactor;
            }
        }
        for (int i=0; i sortProbability(double[] prob) {
        List list = new ArrayList<>();
        //step 1: add all that have reached a minimal probability:
        for (int j=0;j= probabilityThreshold) {
                list.add(new DetectedLanguage(ngramFrequencyData.getLanguage(j), p));
            }
        }
        //step 2: sort in descending order
        if (list.size() >= 2) {
            Collections.sort(list, PROBABILITY_SORTING_COMPARATOR);
        }
        return list;
    }

}