// Source listing: org.codelibs.elasticsearch.langfield.detect.LangDetector (Maven / Gradle / Ivy)
package org.codelibs.elasticsearch.langfield.detect;

import java.io.IOException;
import java.io.Reader;
import java.lang.Character.UnicodeBlock;
import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Pattern;

import org.codelibs.elasticsearch.langfield.detect.util.NGram;
import org.elasticsearch.ElasticsearchException;

/**
 * {@link LangDetector} class is to detect language from specified text.
 * Its instance is able to be constructed via the factory class {@link LangDetectorFactory}.
 * 
 * After appending a target text to the {@link LangDetector} instance with {@link #append(Reader)} or {@link #append(String)},
 * the detector provides the language detection results for target text via {@link #detect()} or {@link #getProbabilities()}.
 * {@link #detect()} method returns a single language name which has the highest probability.
 * {@link #getProbabilities()} method returns a list of multiple languages and their probabilities.
 * <p>
 * The detector has some parameters for language detection.
 * See {@link #setAlpha(double)}, {@link #setMaxTextLength(int)} and {@link #setPriorMap(Map)}.
 *
 * <pre>
 * import java.util.List;
 * import org.codelibs.elasticsearch.langfield.detect.LangDetector;
 * import org.codelibs.elasticsearch.langfield.detect.LangDetectorFactory;
 * import org.codelibs.elasticsearch.langfield.detect.Language;
 *
 * class LangDetectSample {
 *     public void init(String profileDirectory)  {
 *         LangDetectorFactory.loadProfile(profileDirectory);
 *     }
 *     public String detect(String text)  {
 *         LangDetector detector = LangDetectorFactory.create();
 *         detector.append(text);
 *         return detector.detect();
 *     }
 *     public List&lt;Language&gt; detectLangs(String text)  {
 *         LangDetector detector = LangDetectorFactory.create();
 *         detector.append(text);
 *         return detector.getProbabilities();
 *     }
 * }
 * </pre>
 *
 * <ul>
 * <li>4x faster improvement based on Elmer Garduno's code. Thanks!</li>
 * </ul>
 *
 * @author Nakatani Shuyo
 * @author shinsuke
 * @see LangDetectorFactory
 */
public class LangDetector {
    /** Default smoothing parameter used when {@link #setAlpha(double)} is never called. */
    private static final double ALPHA_DEFAULT = 0.5;

    /** Scale of the Gaussian jitter added to the smoothing parameter in every trial. */
    private static final double ALPHA_WIDTH = 0.05;

    /** Iteration index at which a single trial is stopped even without convergence. */
    private static final int ITERATION_LIMIT = 1000;

    /** Languages whose probability is not above this value are left out of the result. */
    private static final double PROB_THRESHOLD = 0.1;

    /** A trial is considered converged once the largest normalized probability exceeds this value. */
    private static final double CONV_THRESHOLD = 0.99999;

    /** Divisor that turns the smoothing parameter into the additive weight of an update step. */
    private static final int BASE_FREQ = 10000;

    /** Number of independent randomized trials whose results are averaged. */
    private static final int N_TRIAL = 7;

    /** Language name returned by {@link #detect()} when no candidate passes the threshold. */
    public static final String UNKNOWN_LANG = "unknown";

    private static final Pattern URL_REGEX = Pattern
            .compile("https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}");

    private static final Pattern MAIL_REGEX = Pattern.compile(
            "[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}");

    /** N-gram to per-language probability table; the array index matches {@link #langlist}. */
    private final Map<String, double[]> wordLangProbMap;

    /** Language names; the position of a name is its index in every probability array. */
    private final List<String> langlist;

    /** Accumulated target text; replaced by a filtered copy in {@link #cleaningText()}. */
    private StringBuilder text;

    /** Cached detection result, computed lazily by {@link #getProbabilities()}. */
    private double[] langprob = null;

    private double alpha = ALPHA_DEFAULT;

    private int maxTextLength = 10000;

    /** Normalized prior probabilities, or {@code null} to start from a uniform distribution. */
    private double[] priorMap = null;

    private boolean verbose = false;

    /** Seed for the random generator, or {@code null} to leave the generator unseeded. */
    private final Long seed;

    /**
     * Constructor.
     * The detector shares the n-gram table, the language list and the seed of the given factory.
     *
     * @param factory {@link LangDetectorFactory} instance providing the language profiles
     */
    public LangDetector(final LangDetectorFactory factory) {
        this.wordLangProbMap = factory.wordLangProbMap;
        this.langlist = factory.langlist;
        this.text = new StringBuilder();
        this.seed = factory.seed;
    }

    /**
     * Set Verbose Mode(use for debug).
     * In verbose mode intermediate probabilities are printed to standard output.
     */
    public void setVerbose() {
        this.verbose = true;
    }

    /**
     * Set smoothing parameter.
     * The default value is 0.5(i.e. Expected Likelihood Estimate).
     * @param alpha the smoothing parameter
     */
    public void setAlpha(final double alpha) {
        this.alpha = alpha;
    }

    /**
     * Set prior information about language probabilities.
     * Languages missing from the map (or mapped to {@code null}) get a prior of zero;
     * the remaining values are normalized so that they sum to one.
     * The previously configured prior is kept untouched when the given map is rejected.
     *
     * @param priorMap the priorMap to set (language name to non-negative weight)
     * @throws ElasticsearchException if a weight is negative or no known language has a positive weight
     */
    public void setPriorMap(final Map<String, Double> priorMap) {
        // Build into a local array so that a rejected map cannot leave a broken prior behind.
        final double[] priors = new double[langlist.size()];
        double sump = 0;
        for (int i = 0; i < priors.length; ++i) {
            final Double p = priorMap.get(langlist.get(i));
            if (p == null) {
                continue;
            }
            if (p < 0) {
                throw new ElasticsearchException("Prior probability must be non-negative.");
            }
            priors[i] = p;
            sump += p;
        }
        if (sump <= 0) {
            throw new ElasticsearchException("More one of prior probability must be non-zero.");
        }
        for (int i = 0; i < priors.length; ++i) {
            priors[i] /= sump;
        }
        this.priorMap = priors;
    }

    /**
     * Specify max size of target text to use for language detection.
     * The default value is 10000 characters.
     * @param maxTextLength the maxTextLength to set
     */
    public void setMaxTextLength(final int maxTextLength) {
        this.maxTextLength = maxTextLength;
    }

    /**
     * Append the target text for language detection.
     * This method reads the text from the specified input reader in chunks of half the
     * maximum text length and stops as soon as the accumulated text has reached the limit set by
     * {@link LangDetector#setMaxTextLength(int)}, the reader reports that it is not ready,
     * or the end of the stream is reached.
     *
     * @param reader the input reader (BufferedReader as usual)
     * @throws IOException Can't read the reader.
     */
    public void append(final Reader reader) throws IOException {
        // At least one slot: a zero-length buffer would make read() return 0 forever.
        final char[] buf = new char[Math.max(1, maxTextLength / 2)];
        while (text.length() < maxTextLength && reader.ready()) {
            final int length = reader.read(buf);
            if (length < 0) {
                // End of stream; ready() may still report true here.
                break;
            }
            append(new String(buf, 0, length));
        }
    }

    /**
     * Append the target text for language detection.
     * URLs and e-mail addresses are replaced by a blank, the text is passed through
     * {@code NGram.normalize_vi} and runs of blanks are collapsed into one.
     * At most {@link LangDetector#setMaxTextLength(int)} characters of the normalized text
     * are taken from a single call; the rest is cut down.
     *
     * @param text the target text to append
     */
    public void append(String text) {
        String cleaned = URL_REGEX.matcher(text).replaceAll(" ");
        cleaned = MAIL_REGEX.matcher(cleaned).replaceAll(" ");
        cleaned = NGram.normalize_vi(cleaned);
        final int limit = Math.min(cleaned.length(), maxTextLength);
        char pre = 0;
        for (int i = 0; i < limit; ++i) {
            final char c = cleaned.charAt(i);
            // Skip a blank that directly follows another blank.
            if (c != ' ' || pre != ' ') {
                this.text.append(c);
            }
            pre = c;
        }
    }

    /**
     * Cleaning text to detect
     * (eliminate the characters between 'A' and 'z' if the text is mostly not written in Latin alphabet).
     * The characters are dropped when they are outnumbered more than two to one by
     * characters at or above U+0300 that are outside the Latin Extended Additional block.
     */
    private void cleaningText() {
        int latinCount = 0;
        int nonLatinCount = 0;
        for (int i = 0; i < text.length(); ++i) {
            final char c = text.charAt(i);
            if (c <= 'z' && c >= 'A') {
                ++latinCount;
            } else if (c >= '\u0300' && UnicodeBlock
                    .of(c) != UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
                ++nonLatinCount;
            }
        }
        if (latinCount * 2 >= nonLatinCount) {
            return;
        }
        final StringBuilder textWithoutLatin = new StringBuilder();
        for (int i = 0; i < text.length(); ++i) {
            final char c = text.charAt(i);
            if (c > 'z' || c < 'A') {
                textWithoutLatin.append(c);
            }
        }
        text = textWithoutLatin;
    }

    /**
     * Detect language of the target text and return the language name which has the highest probability.
     * @return detected language name which has most probability,
     *  or {@link #UNKNOWN_LANG} when no language passes the probability threshold
     * @throws ElasticsearchException if the text contains no known n-gram
     */
    public String detect() {
        final List<Language> probabilities = getProbabilities();
        if (probabilities.isEmpty()) {
            return UNKNOWN_LANG;
        }
        return probabilities.get(0).lang;
    }

    /**
     * Get language candidates which have high probabilities.
     * The detection itself runs on the first call only; later calls reuse its result.
     * @return possible languages list (whose probabilities are over PROB_THRESHOLD, ordered by probabilities descendently)
     * @throws ElasticsearchException if the text contains no known n-gram
     */
    public List<Language> getProbabilities() {
        if (langprob == null) {
            detectBlock();
        }
        return sortProbability(langprob);
    }

    /**
     * Run the randomized trials and store the averaged language probabilities in {@link #langprob}.
     * Each trial repeatedly picks a random n-gram of the text and updates the probabilities
     * until they converge or the iteration limit is reached.
     */
    private void detectBlock() {
        cleaningText();
        final List<String> ngrams = extractNGrams();
        if (ngrams.isEmpty()) {
            throw new ElasticsearchException("no features in text");
        }

        // Accumulate locally; the cache field is only published once every trial has finished.
        final double[] total = new double[langlist.size()];

        final Random rand = new Random();
        if (seed != null) {
            rand.setSeed(seed);
        }
        for (int t = 0; t < N_TRIAL; ++t) {
            final double[] prob = initProbability();
            final double trialAlpha = this.alpha + rand.nextGaussian() * ALPHA_WIDTH;

            for (int i = 0;; ++i) {
                final int r = rand.nextInt(ngrams.size());
                updateLangProb(prob, ngrams.get(r), trialAlpha);
                // Normalize and test for convergence on every fifth iteration only.
                if (i % 5 == 0) {
                    if (normalizeProb(prob) > CONV_THRESHOLD
                            || i >= ITERATION_LIMIT) {
                        break;
                    }
                    if (verbose) {
                        System.out.println("> " + sortProbability(prob));
                    }
                }
            }
            for (int j = 0; j < total.length; ++j) {
                total[j] += prob[j] / N_TRIAL;
            }
            if (verbose) {
                System.out.println("==> " + sortProbability(prob));
            }
        }
        langprob = total;
    }

    /**
     * Initialize the map of language probabilities.
     * If there is the specified prior map, use it as initial map.
     * @return initialized map of language probabilities
     */
    private double[] initProbability() {
        if (priorMap != null) {
            // Copy, because the trial mutates the returned array in place.
            return priorMap.clone();
        }
        final double[] prob = new double[langlist.size()];
        for (int i = 0; i < prob.length; ++i) {
            prob[i] = 1.0 / langlist.size();
        }
        return prob;
    }

    /**
     * Extract n-grams from target text.
     * Only n-grams that are present in the n-gram table are kept.
     * @return n-grams list
     */
    private List<String> extractNGrams() {
        final List<String> list = new ArrayList<>();
        final NGram ngram = new NGram();
        for (int i = 0; i < text.length(); ++i) {
            ngram.addChar(text.charAt(i));
            for (int n = 1; n <= NGram.N_GRAM; ++n) {
                final String w = ngram.get(n);
                if (w != null && wordLangProbMap.containsKey(w)) {
                    list.add(w);
                }
            }
        }
        return list;
    }

    /**
     * update language probabilities with N-gram string(N=1,2,3)
     * @param prob language probabilities, updated in place
     * @param word N-gram string
     * @param alpha smoothing parameter of the current trial
     * @return {@code true} if the probabilities were updated, {@code false} if the word is unknown
     */
    private boolean updateLangProb(final double[] prob, final String word,
            final double alpha) {
        if (word == null) {
            return false;
        }
        final double[] langProbMap = wordLangProbMap.get(word);
        if (langProbMap == null) {
            return false;
        }

        if (verbose) {
            System.out.println(word + "(" + unicodeEncode(word) + "):"
                    + wordProbToString(langProbMap));
        }

        final double weight = alpha / BASE_FREQ;
        for (int i = 0; i < prob.length; ++i) {
            prob[i] *= weight + langProbMap[i];
        }
        return true;
    }

    /**
     * Format the per-language probabilities of one n-gram (for verbose mode).
     * Probabilities below 0.00001 are left out.
     * @param prob per-language probabilities
     * @return formatted text of language names and probabilities
     */
    private String wordProbToString(final double[] prob) {
        try (Formatter formatter = new Formatter()) {
            for (int j = 0; j < prob.length; ++j) {
                final double p = prob[j];
                if (p >= 0.00001) {
                    formatter.format(" %s:%.5f", langlist.get(j), p);
                }
            }
            return formatter.toString();
        }
    }

    /**
     * normalize probabilities and check convergence by the maximum probability
     * @param prob probabilities, divided in place by their sum
     * @return maximum of probabilities
     */
    private static double normalizeProb(final double[] prob) {
        double sump = 0;
        for (final double element : prob) {
            sump += element;
        }
        double maxp = 0;
        for (int i = 0; i < prob.length; ++i) {
            final double p = prob[i] / sump;
            if (maxp < p) {
                maxp = p;
            }
            prob[i] = p;
        }
        return maxp;
    }

    /**
     * @param prob probabilities indexed like the language list
     * @return language candidates above PROB_THRESHOLD, ordered by probabilities descendently
     */
    private List<Language> sortProbability(final double[] prob) {
        final List<Language> list = new ArrayList<>();
        for (int j = 0; j < prob.length; ++j) {
            final double p = prob[j];
            if (p > PROB_THRESHOLD) {
                // Insert before the first entry with a smaller probability (ties keep index order).
                int pos = 0;
                while (pos < list.size() && list.get(pos).prob >= p) {
                    ++pos;
                }
                list.add(pos, new Language(langlist.get(j), p));
            }
        }
        return list;
    }

    /**
     * unicode encoding (for verbose mode).
     * Characters at or above U+0080 are written as a backslash, the letter u and four hex digits.
     * @param word the text to encode
     * @return encoded word
     */
    private static String unicodeEncode(final String word) {
        final StringBuilder buf = new StringBuilder();
        for (int i = 0; i < word.length(); ++i) {
            final char ch = word.charAt(i);
            if (ch >= '\u0080') {
                // Adding 0x10000 always yields five hex digits; dropping the first leaves four, zero-padded.
                buf.append("\\u").append(Integer.toHexString(0x10000 + ch), 1, 5);
            } else {
                buf.append(ch);
            }
        }
        return buf.toString();
    }

}