/*
 * com.optimaize.langdetect.LanguageDetectorImpl Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of language-detector Show documentation
 * Language Detection Library for Java.
 * There is a newer version: 0.6
 */
package com.optimaize.langdetect;

import com.cybozu.labs.langdetect.util.Util;
import com.google.common.base.Optional;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
 *
 *
 * This class is immutable and thus thread-safe.
 *
 * @author Nakatani Shuyo
 * @author Fabian Kessler
 * @author Elmer Garduno
 */
public final class LanguageDetectorImpl implements LanguageDetector {

    private static final Logger logger = LoggerFactory.getLogger(LanguageDetectorImpl.class);

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final double ALPHA_WIDTH = 0.05;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final int ITERATION_LIMIT = 1000;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final double CONV_THRESHOLD = 0.99999;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final int BASE_FREQ = 10000;

    /**
     * TODO document what this is for, and why that value is chosen.
     */
    private static final int N_TRIAL = 7;

    @NotNull
    private final NgramFrequencyData ngramFrequencyData;

    /**
     * User-defined language priorities, in the same order as {@code langlist}.
     */
    @Nullable
    private final double[] priorMap;

    private final double alpha;
    private final int shortTextAlgorithm;
    private final double prefixFactor;
    private final double suffixFactor;

    private final double probabilityThreshold;
    private final double minimalConfidence;

    private final NgramExtractor ngramExtractor;


    /**
     * Use the {@link LanguageDetectorBuilder}.
     */
    LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData,
                         double alpha, int shortTextAlgorithm,
                         double prefixFactor, double suffixFactor,
                         double probabilityThreshold,
                         double minimalConfidence,
                         @Nullable Map langWeightingMap,
                         @NotNull NgramExtractor ngramExtractor) {
        if (alpha<0d || alpha >1d) throw new IllegalArgumentException(""+alpha);
        if (prefixFactor <0d || prefixFactor >10d) throw new IllegalArgumentException(""+ prefixFactor);
        if (suffixFactor <0d || suffixFactor >10d) throw new IllegalArgumentException(""+ suffixFactor);
        if (probabilityThreshold<0d || probabilityThreshold>1d) throw new IllegalArgumentException(""+probabilityThreshold);
        if (minimalConfidence<0d || minimalConfidence>1d) throw new IllegalArgumentException(""+minimalConfidence);
        if (langWeightingMap!=null && langWeightingMap.isEmpty()) langWeightingMap = null;

        this.ngramFrequencyData = ngramFrequencyData;
        this.alpha = alpha;
        this.shortTextAlgorithm = shortTextAlgorithm;
        this.prefixFactor = prefixFactor;
        this.suffixFactor = suffixFactor;
        this.probabilityThreshold = probabilityThreshold;
        this.minimalConfidence = minimalConfidence;
        this.priorMap = (langWeightingMap==null) ? null : Util.makeInternalPrioMap(langWeightingMap, ngramFrequencyData.getLanguageList());
        this.ngramExtractor = ngramExtractor;
    }


    @Override
    public Optional detect(CharSequence text) {
        List probabilities = getProbabilities(text);
        if (probabilities.isEmpty()) {
            return Optional.absent();
        } else {
            DetectedLanguage best = probabilities.get(0);
            if (best.getProbability() >= minimalConfidence) {
                return Optional.of(best.getLanguage());
            } else {
                return Optional.absent();
            }
        }
    }

    @Override
    public List getProbabilities(CharSequence text) {
        double[] langprob = detectBlock(text);
        if (langprob==null) {
            return Collections.emptyList();
        } else {
            return sortProbability(langprob);
        }
    }


    /**
     * @return null if there are no "features" in the text (just noise).
     */
    @Nullable
    private double[] detectBlock(CharSequence text) {
        if (text.length() <= shortTextAlgorithm) {
            Map ngrams = ngramExtractor.extractCountedGrams(text);
            if (ngrams.isEmpty()) return null;
            return detectBlockShortText(ngrams);
        } else {
            List strings = ngramExtractor.extractGrams(text);
            if (strings.isEmpty()) return null;
            return detectBlockLongText(strings);
        }
    }

    /**
     */
    private double[] detectBlockShortText(Map ngrams) {
        double[] prob = initProbability();
        double alpha = this.alpha; //TODO I don't understand what this does.
        for (Map.Entry gramWithCount : ngrams.entrySet()) {
            updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha);
        }
        Util.normalizeProb(prob);
        if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob));
        return prob;
    }

    /**
     * This is the original algorithm used for all text length.
     * It is inappropriate for short text.
     */
    private double[] detectBlockLongText(List ngrams) {
        assert !ngrams.isEmpty();
        double[] langprob = new double[ngramFrequencyData.getLanguageList().size()];
        Random rand = new Random();
        for (int t = 0; t < N_TRIAL; ++t) {
            double[] prob = initProbability();
            double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH);

            for (int i=0; i CONV_THRESHOLD) break; //this looks like an optimization to return quickly when sure. TODO document what's the plan.
                    if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob));
                }
            }
            for(int j=0;j " + sortProbability(prob));
        }
        return langprob;
    }

    /**
     * Initialize the map of language probabilities.
     * If there is the specified prior map, use it as initial map.
     * @return initialized map of language probabilities
     */
    private double[] initProbability() {
        double[] prob = new double[ngramFrequencyData.getLanguageList().size()];
        if (priorMap != null) {
            //TODO analyze and optimize this code, looks like double copy.
            System.arraycopy(priorMap, 0, prob, 0, prob.length);
            for(int i=0;i1) {
            if (prefixFactor !=1.0 && ngram.charAt(0)==' ') {
                weight *= prefixFactor;
            } else if (suffixFactor!=1.0 && ngram.charAt(ngram.length()-1)==' ') {
                weight *= suffixFactor;
            }
        }
        for (int i=0; i sortProbability(double[] prob) {
        List list = new ArrayList<>();
        for (int j=0;j= probabilityThreshold) {
                for (int i=0; i<=list.size(); ++i) {
                    if (i == list.size() || list.get(i).getProbability() < p) {
                        list.add(i, new DetectedLanguage(ngramFrequencyData.getLanguage(j), p));
                        break;
                    }
                }
            }
        }
        return list;
    }

}