All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.codelibs.elasticsearch.langfield.detect.LangDetector Maven / Gradle / Ivy

The newest version!
package org.codelibs.elasticsearch.langfield.detect;

import java.io.IOException;
import java.io.Reader;
import java.lang.Character.UnicodeBlock;
import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Pattern;

import org.codelibs.elasticsearch.langfield.detect.util.NGram;
import org.elasticsearch.ElasticsearchException;

/**
 * {@link LangDetector} class is to detect language from specified text.
 * Its instance is able to be constructed via the factory class {@link LangDetectorFactory}.
 * <p>
 * After appending a target text to the {@link LangDetector} instance with {@link #append(Reader)} or {@link #append(String)},
 * the detector provides the language detection results for target text via {@link #detect()} or {@link #getProbabilities()}.
 * {@link #detect()} method returns a single language name which has the highest probability.
 * {@link #getProbabilities()} methods returns a list of multiple languages and their probabilities.
 * <p>
 * The detector has some parameters for language detection.
 * See {@link #setAlpha(double)}, {@link #setMaxTextLength(int)} and {@link #setPriorMap(Map)}.
 *
 * <pre>
 * import java.util.List;
 * import org.codelibs.elasticsearch.langfield.detect.LangDetector;
 * import org.codelibs.elasticsearch.langfield.detect.LangDetectorFactory;
 * import org.codelibs.elasticsearch.langfield.detect.Language;
 *
 * class LangDetectSample {
 *     public void init(String profileDirectory)  {
 *         LangDetectorFactory.loadProfile(profileDirectory);
 *     }
 *     public String detect(String text)  {
 *         LangDetector detector = LangDetectorFactory.create();
 *         detector.append(text);
 *         return detector.detect();
 *     }
 *     public List&lt;Language&gt; detectLangs(String text)  {
 *         LangDetector detector = LangDetectorFactory.create();
 *         detector.append(text);
 *         return detector.getProbabilities();
 *     }
 * }
 * </pre>
 *
 * <ul>
 * <li>4x faster improvement based on Elmer Garduno's code. Thanks!</li>
 * </ul>
 *
 * @author Nakatani Shuyo
 * @author shinsuke
 * @see LangDetectorFactory
 */
public class LangDetector {
    private static final double ALPHA_DEFAULT = 0.5;

    private static final double ALPHA_WIDTH = 0.05;

    private static final int ITERATION_LIMIT = 1000;

    private static final double PROB_THRESHOLD = 0.1;

    private static final double CONV_THRESHOLD = 0.99999;

    private static final int BASE_FREQ = 10000;

    public static final String UNKNOWN_LANG = "unknown";

    private static final Pattern URL_REGEX = Pattern
            .compile("https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}");

    private static final Pattern MAIL_REGEX = Pattern.compile(
            "[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}");

    /** N-gram string mapped to one probability per language, indexed like {@link #langlist}. */
    private final Map<String, double[]> wordLangProbMap;

    /** Language names; the index of a name is its slot in every probability array. */
    private final List<String> langlist;

    /** Accumulated (and, after detection, cleaned) target text. */
    private StringBuilder text;

    /** Cached detection result; {@code null} until the first detection has run. */
    private double[] langprob = null;

    private double alpha = ALPHA_DEFAULT;

    private final int nTrial = 7;

    private int maxTextLength = 10000;

    /** Normalized prior probabilities, or {@code null} when no prior has been set. */
    private double[] priorMap = null;

    private boolean verbose = false;

    /** Seed for the random generator used by detection, or {@code null} for an unseeded generator. */
    private Long seed = null;

    /**
     * Constructor.
     * LangDetector instance can be constructed via {@link LangDetectorFactory#getLangDetector()}.
     * @param factory {@link LangDetectorFactory} instance (only LangDetectorFactory inside)
     */
    public LangDetector(final LangDetectorFactory factory) {
        this.wordLangProbMap = factory.wordLangProbMap;
        this.langlist = factory.langlist;
        this.text = new StringBuilder();
        this.seed = factory.seed;
    }

    /**
     * Set Verbose Mode(use for debug).
     * In verbose mode intermediate probabilities are printed to {@code System.out}.
     */
    public void setVerbose() {
        this.verbose = true;
    }

    /**
     * Set smoothing parameter.
     * The default value is 0.5(i.e. Expected Likelihood Estimate).
     * @param alpha the smoothing parameter
     */
    public void setAlpha(final double alpha) {
        this.alpha = alpha;
    }

    /**
     * Set prior information about language probabilities.
     * Languages missing from the given map (or mapped to {@code null}) get a prior of 0.
     * The priors are normalized so that they sum to 1.
     * If validation fails, the previously configured prior is left untouched.
     *
     * @param priorMap the priorMap to set (language name to non-negative weight)
     * @throws ElasticsearchException if a weight is negative or if no weight is positive
     */
    public void setPriorMap(final Map<String, Double> priorMap) {
        // Build into a local array first so that a validation failure cannot leave
        // a half-filled, unnormalized prior behind in this detector.
        final double[] priors = new double[langlist.size()];
        double sump = 0;
        for (int i = 0; i < priors.length; ++i) {
            final Double p = priorMap.get(langlist.get(i));
            if (p == null) {
                continue;
            }
            if (p < 0) {
                throw new ElasticsearchException("Prior probability must be non-negative.");
            }
            priors[i] = p;
            sump += p;
        }
        if (sump <= 0) {
            throw new ElasticsearchException("More one of prior probability must be non-zero.");
        }
        for (int i = 0; i < priors.length; ++i) {
            priors[i] /= sump;
        }
        this.priorMap = priors;
    }

    /**
     * Specify max size of target text to use for language detection.
     * The default value is 10000(10KB).
     * @param maxTextLength the maxTextLength to set
     */
    public void setMaxTextLength(final int maxTextLength) {
        this.maxTextLength = maxTextLength;
    }

    /**
     * Append the target text for language detection.
     * This method read the text from specified input reader until the end of the stream
     * is reached or the accumulated text reaches the limit.
     * If the total size of target text exceeds the limit size specified by {@link LangDetector#setMaxTextLength(int)},
     * the rest is cut down.
     *
     * @param reader the input reader (BufferedReader as usual)
     * @throws IOException Can't read the reader.
     */
    public void append(final Reader reader) throws IOException {
        // A zero-length buffer would make read() return 0 forever, so keep at least one char.
        final char[] buf = new char[Math.max(1, maxTextLength / 2)];
        while (text.length() < maxTextLength) {
            final int length = reader.read(buf);
            if (length < 0) {
                // End of stream. Reader.ready() is not an end-of-stream test, so rely on read().
                break;
            }
            append(new String(buf, 0, length));
        }
    }

    /**
     * Append the target text for language detection.
     * URLs and e-mail addresses are replaced by a space, the text is passed through
     * {@link NGram#normalize_vi(String)} and runs of spaces are collapsed to one space.
     * If the total size of target text exceeds the limit size specified by {@link LangDetector#setMaxTextLength(int)},
     * the rest is cut down.
     *
     * @param text the target text to append
     */
    public void append(final String text) {
        String cleaned = URL_REGEX.matcher(text).replaceAll(" ");
        cleaned = MAIL_REGEX.matcher(cleaned).replaceAll(" ");
        cleaned = NGram.normalize_vi(cleaned);
        char pre = 0;
        // "i < maxTextLength" bounds how much of this call's input is scanned;
        // the length check on the buffer enforces the documented limit on the total text.
        for (int i = 0; i < cleaned.length() && i < maxTextLength && this.text.length() < maxTextLength; ++i) {
            final char c = cleaned.charAt(i);
            if (c != ' ' || pre != ' ') {
                this.text.append(c);
            }
            pre = c;
        }
    }

    /**
     * Cleaning text to detect
     * (eliminate Latin characters if the text is mostly not written in Latin alphabet).
     * Characters in the range {@code 'A'..'z'} are counted as Latin; that range also contains
     * the ASCII punctuation located between {@code 'Z'} and {@code 'a'}.
     * When twice the Latin count is still below the non-Latin count, every character in that
     * range is dropped from the text.
     */
    private void cleaningText() {
        int latinCount = 0;
        int nonLatinCount = 0;
        for (int i = 0; i < text.length(); ++i) {
            final char c = text.charAt(i);
            if (c <= 'z' && c >= 'A') {
                ++latinCount;
            } else if (c >= '\u0300' && UnicodeBlock.of(c) != UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
                ++nonLatinCount;
            }
        }
        if (latinCount * 2 < nonLatinCount) {
            final StringBuilder textWithoutLatin = new StringBuilder();
            for (int i = 0; i < text.length(); ++i) {
                final char c = text.charAt(i);
                if (c > 'z' || c < 'A') {
                    textWithoutLatin.append(c);
                }
            }
            text = textWithoutLatin;
        }
    }

    /**
     * Detect language of the target text and return the language name which has the highest probability.
     * @return detected language name which has most probability,
     *         or {@link #UNKNOWN_LANG} when no language exceeds the probability threshold
     * @throws ElasticsearchException Can't detect because of no valid features in text
     */
    public String detect() {
        final List<Language> probabilities = getProbabilities();
        if (probabilities.isEmpty()) {
            return UNKNOWN_LANG;
        }
        return probabilities.get(0).lang;
    }

    /**
     * Get language candidates which have high probabilities.
     * The detection itself runs only once; later calls reuse the cached probabilities.
     * @return possible languages list (whose probabilities are over PROB_THRESHOLD, ordered by probabilities descendently)
     * @throws ElasticsearchException Can't detect because of no valid features in text
     */
    public List<Language> getProbabilities() {
        if (langprob == null) {
            detectBlock();
        }
        return sortProbability(langprob);
    }

    /**
     * Run the detection: clean the text, extract its n-grams and average the language
     * probabilities obtained over {@code nTrial} randomized trials into {@code langprob}.
     *
     * @throws ElasticsearchException if the text contains no known n-gram
     */
    private void detectBlock() {
        cleaningText();
        final List<String> ngrams = extractNGrams();
        if (ngrams.isEmpty()) {
            throw new ElasticsearchException("no features in text");
        }

        langprob = new double[langlist.size()];

        final Random rand = new Random();
        if (seed != null) {
            rand.setSeed(seed);
        }
        for (int t = 0; t < nTrial; ++t) {
            final double[] prob = initProbability();
            // Each trial perturbs the smoothing parameter with Gaussian noise.
            final double trialAlpha = this.alpha + rand.nextGaussian() * ALPHA_WIDTH;

            for (int i = 0;; ++i) {
                final int r = rand.nextInt(ngrams.size());
                updateLangProb(prob, ngrams.get(r), trialAlpha);
                // Normalize and test for convergence only on every fifth iteration.
                if (i % 5 == 0) {
                    if (normalizeProb(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT) {
                        break;
                    }
                    if (verbose) {
                        System.out.println("> " + sortProbability(prob));
                    }
                }
            }
            for (int j = 0; j < langprob.length; ++j) {
                langprob[j] += prob[j] / nTrial;
            }
            if (verbose) {
                System.out.println("==> " + sortProbability(prob));
            }
        }
    }

    /**
     * Initialize the map of language probabilities.
     * If there is the specified prior map, use it as initial map;
     * otherwise every language starts with the same probability.
     * @return initialized map of language probabilities
     */
    private double[] initProbability() {
        final double[] prob = new double[langlist.size()];
        if (priorMap != null) {
            System.arraycopy(priorMap, 0, prob, 0, prob.length);
        } else {
            final double uniform = 1.0 / langlist.size();
            for (int i = 0; i < prob.length; ++i) {
                prob[i] = uniform;
            }
        }
        return prob;
    }

    /**
     * Extract n-grams from target text.
     * Only n-grams that are keys of {@code wordLangProbMap} are kept.
     * @return n-grams list
     */
    private List<String> extractNGrams() {
        final List<String> list = new ArrayList<>();
        final NGram ngram = new NGram();
        for (int i = 0; i < text.length(); ++i) {
            ngram.addChar(text.charAt(i));
            for (int n = 1; n <= NGram.N_GRAM; ++n) {
                final String w = ngram.get(n);
                if (w != null && wordLangProbMap.containsKey(w)) {
                    list.add(w);
                }
            }
        }
        return list;
    }

    /**
     * update language probabilities with N-gram string(N=1,2,3)
     * @param prob language probabilities to update in place
     * @param word N-gram string
     * @param alpha smoothing parameter; {@code alpha / BASE_FREQ} is added to every word probability
     * @return {@code false} when the word is {@code null} or unknown (nothing is updated), {@code true} otherwise
     */
    private boolean updateLangProb(final double[] prob, final String word, final double alpha) {
        if (word == null || !wordLangProbMap.containsKey(word)) {
            return false;
        }

        final double[] langProbMap = wordLangProbMap.get(word);
        if (verbose) {
            System.out.println(word + "(" + unicodeEncode(word) + "):" + wordProbToString(langProbMap));
        }

        final double weight = alpha / BASE_FREQ;
        for (int i = 0; i < prob.length; ++i) {
            prob[i] *= weight + langProbMap[i];
        }
        return true;
    }

    /**
     * Format the per-language probabilities of a word (for verbose mode).
     * Probabilities below 0.00001 are omitted.
     * @param prob probabilities indexed like the language list
     * @return formatted text
     */
    private String wordProbToString(final double[] prob) {
        // try-with-resources closes the Formatter even if formatting throws.
        try (Formatter formatter = new Formatter()) {
            for (int j = 0; j < prob.length; ++j) {
                final double p = prob[j];
                if (p >= 0.00001) {
                    formatter.format(" %s:%.5f", langlist.get(j), p);
                }
            }
            return formatter.toString();
        }
    }

    /**
     * normalize probabilities and check convergence by the maximum probability
     * @param prob probabilities to normalize in place so that they sum to 1
     * @return maximum of probabilities, or 0 when the probabilities sum to 0 (array left unchanged)
     */
    private static double normalizeProb(final double[] prob) {
        double sump = 0;
        for (final double element : prob) {
            sump += element;
        }
        if (sump == 0) {
            return 0;
        }
        double maxp = 0;
        for (int i = 0; i < prob.length; ++i) {
            final double p = prob[i] / sump;
            if (maxp < p) {
                maxp = p;
            }
            prob[i] = p;
        }
        return maxp;
    }

    /**
     * Build the list of language candidates from a probability array.
     * @param prob probabilities indexed like the language list
     * @return language candidates (probability over PROB_THRESHOLD) ordered by probabilities descendently
     */
    private List<Language> sortProbability(final double[] prob) {
        final List<Language> list = new ArrayList<>();
        for (int j = 0; j < prob.length; ++j) {
            final double p = prob[j];
            if (p <= PROB_THRESHOLD) {
                continue;
            }
            // Insert before the first entry with a strictly smaller probability,
            // so that entries with equal probability keep their language-list order.
            int pos = 0;
            while (pos < list.size() && list.get(pos).prob >= p) {
                ++pos;
            }
            list.add(pos, new Language(langlist.get(j), p));
        }
        return list;
    }

    /**
     * unicode encoding (for verbose mode).
     * Characters at or above U+0080 are written as a backslash, the letter u and four
     * lowercase hexadecimal digits; other characters are copied unchanged.
     * @param word the text to encode
     * @return encoded word
     */
    private static String unicodeEncode(final String word) {
        final StringBuilder buf = new StringBuilder();
        for (int i = 0; i < word.length(); ++i) {
            final char ch = word.charAt(i);
            if (ch >= '\u0080') {
                // 0x10000 + ch always has exactly five hex digits; dropping the leading
                // "1" leaves the zero-padded four-digit code unit.
                final String hex = Integer.toHexString(0x10000 + ch);
                buf.append("\\u").append(hex, 1, 5);
            } else {
                buf.append(ch);
            }
        }
        return buf.toString();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy