com.optimaize.langdetect.profiles.LanguageProfileBuilder Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of language-detector Show documentation
Language Detection Library for Java.
The newest version!
package com.optimaize.langdetect.profiles;

import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Builder for {@link LanguageProfile}.
 *
 * This class does no internal synchronization.
 *
 * @author Fabian Kessler
 */
public class LanguageProfileBuilder {

    @NotNull
    private final LdLocale locale;
    private int minimalFrequency = 1;
    private NgramExtractor ngramExtractor;
    private final Map> ngrams = new HashMap<>();


    public LanguageProfileBuilder(@NotNull LdLocale locale) {
        this.locale = locale;
    }
    @Deprecated
    public LanguageProfileBuilder(@NotNull String locale) {
        this.locale = LdLocale.fromString(locale);
    }

    public LanguageProfileBuilder ngramExtractor(@NotNull NgramExtractor ngramExtractor) {
        this.ngramExtractor = ngramExtractor;
        return this;
    }

    /**
     * @param minimalFrequency 1-n, the default is 1. n-grams that occurred less often in the text are removed.
     *                         This really should be set to something higher.
     *                         Try to play with the number until you get a profile file of satisfying size,
     *                         that produces good language detection results.
     */
    public LanguageProfileBuilder minimalFrequency(int minimalFrequency) {
        if (minimalFrequency < 1) throw new IllegalArgumentException("minimalFrequency must be >= 1, but was: "+minimalFrequency);
        this.minimalFrequency = minimalFrequency;
        return this;
    }

    /**
     * In order to use this you must set the {@link #ngramExtractor} first.
     */
    public LanguageProfileBuilder addText(CharSequence text) {
        if (ngramExtractor==null) {
            throw new IllegalStateException("NgramExtractor has not been set yet!");
        }
        for (Map.Entry entry : ngramExtractor.extractCountedGrams(text).entrySet()) {
            addGram(entry.getKey(), entry.getValue());
        }
        return this;
    }

    /**
     * Shortcut for addGram(ngram, 1).
     */
    public LanguageProfileBuilder addGram(String ngram) {
        return addGram(ngram, 1);
    }
    /**
     * If the builder already has this ngram, the given frequency is added to the current count.
     */
    public LanguageProfileBuilder addGram(String ngram, int frequency) {
        Map map = ngrams.get(ngram.length());
        if (map==null) {
            map = new HashMap<>();
            ngrams.put(ngram.length(), map);
        }
        Integer total = map.get(ngram);
        if (total==null) total = 0;
        total += frequency;
        map.put(ngram, total);
        return this;
    }


    public LanguageProfile build() {
        if (minimalFrequency >1) {
            removeNgramsWithLessFrequency();
        }
        return new LanguageProfileImpl(locale, ngrams);
    }


    private void removeNgramsWithLessFrequency() {
        for (Map map : ngrams.values()) {
            Iterator> iterator = map.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry next = iterator.next();
                if (next.getValue() < minimalFrequency) {
                    iterator.remove();
                }
            }
        }
    }

}