All downloads are free. The search and download features use the official Maven repository.

com.optimaize.langdetect.LanguageDetectorImpl Maven / Gradle / Ivy

There is a newer version: 0.6
Show newest version
package com.optimaize.langdetect;

import com.cybozu.labs.langdetect.util.Util;
import com.google.common.base.Optional;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
 *
 *
 * 

This class is immutable and thus thread-safe.

* * @author Nakatani Shuyo * @author Fabian Kessler * @author Elmer Garduno */ public final class LanguageDetectorImpl implements LanguageDetector { private static final Logger logger = LoggerFactory.getLogger(LanguageDetectorImpl.class); /** * TODO document what this is for, and why that value is chosen. */ private static final double ALPHA_WIDTH = 0.05; /** * TODO document what this is for, and why that value is chosen. */ private static final int ITERATION_LIMIT = 1000; /** * TODO document what this is for, and why that value is chosen. */ private static final double CONV_THRESHOLD = 0.99999; /** * TODO document what this is for, and why that value is chosen. */ private static final int BASE_FREQ = 10000; /** * TODO document what this is for, and why that value is chosen. */ private static final int N_TRIAL = 7; @NotNull private final NgramFrequencyData ngramFrequencyData; /** * User-defined language priorities, in the same order as {@code langlist}. */ @Nullable private final double[] priorMap; private final double alpha; private final int shortTextAlgorithm; private final double prefixFactor; private final double suffixFactor; private final double probabilityThreshold; private final double minimalConfidence; private final NgramExtractor ngramExtractor; /** * Use the {@link LanguageDetectorBuilder}. 
*/ LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData, double alpha, int shortTextAlgorithm, double prefixFactor, double suffixFactor, double probabilityThreshold, double minimalConfidence, @Nullable Map langWeightingMap, @NotNull NgramExtractor ngramExtractor) { if (alpha<0d || alpha >1d) throw new IllegalArgumentException(""+alpha); if (prefixFactor <0d || prefixFactor >10d) throw new IllegalArgumentException(""+ prefixFactor); if (suffixFactor <0d || suffixFactor >10d) throw new IllegalArgumentException(""+ suffixFactor); if (probabilityThreshold<0d || probabilityThreshold>1d) throw new IllegalArgumentException(""+probabilityThreshold); if (minimalConfidence<0d || minimalConfidence>1d) throw new IllegalArgumentException(""+minimalConfidence); if (langWeightingMap!=null && langWeightingMap.isEmpty()) langWeightingMap = null; this.ngramFrequencyData = ngramFrequencyData; this.alpha = alpha; this.shortTextAlgorithm = shortTextAlgorithm; this.prefixFactor = prefixFactor; this.suffixFactor = suffixFactor; this.probabilityThreshold = probabilityThreshold; this.minimalConfidence = minimalConfidence; this.priorMap = (langWeightingMap==null) ? null : Util.makeInternalPrioMap(langWeightingMap, ngramFrequencyData.getLanguageList()); this.ngramExtractor = ngramExtractor; } @Override public Optional detect(CharSequence text) { List probabilities = getProbabilities(text); if (probabilities.isEmpty()) { return Optional.absent(); } else { DetectedLanguage best = probabilities.get(0); if (best.getProbability() >= minimalConfidence) { return Optional.of(best.getLanguage()); } else { return Optional.absent(); } } } @Override public List getProbabilities(CharSequence text) { double[] langprob = detectBlock(text); if (langprob==null) { return Collections.emptyList(); } else { return sortProbability(langprob); } } /** * @return null if there are no "features" in the text (just noise). 
*/ @Nullable private double[] detectBlock(CharSequence text) { if (text.length() <= shortTextAlgorithm) { Map ngrams = ngramExtractor.extractCountedGrams(text); if (ngrams.isEmpty()) return null; return detectBlockShortText(ngrams); } else { List strings = ngramExtractor.extractGrams(text); if (strings.isEmpty()) return null; return detectBlockLongText(strings); } } /** */ private double[] detectBlockShortText(Map ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; } /** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i CONV_THRESHOLD) break; //this looks like an optimization to return quickly when sure. TODO document what's the plan. if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j " + sortProbability(prob)); } return langprob; } /** * Initialize the map of language probabilities. * If there is the specified prior map, use it as initial map. * @return initialized map of language probabilities */ private double[] initProbability() { double[] prob = new double[ngramFrequencyData.getLanguageList().size()]; if (priorMap != null) { //TODO analyze and optimize this code, looks like double copy. 
System.arraycopy(priorMap, 0, prob, 0, prob.length); for(int i=0;i1) { if (prefixFactor !=1.0 && ngram.charAt(0)==' ') { weight *= prefixFactor; } else if (suffixFactor!=1.0 && ngram.charAt(ngram.length()-1)==' ') { weight *= suffixFactor; } } for (int i=0; i sortProbability(double[] prob) { List list = new ArrayList<>(); for (int j=0;j= probabilityThreshold) { for (int i=0; i<=list.size(); ++i) { if (i == list.size() || list.get(i).getProbability() < p) { list.add(i, new DetectedLanguage(ngramFrequencyData.getLanguage(j), p)); break; } } } } return list; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy