All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.optimaize.langdetect.profiles.LanguageProfileBuilder Maven / Gradle / Ivy

The newest version!
package com.optimaize.langdetect.profiles;

import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Builder for {@link LanguageProfile}.
 *
 * 

This class does no internal synchronization.

* * @author Fabian Kessler */ public class LanguageProfileBuilder { @NotNull private final LdLocale locale; private int minimalFrequency = 1; private NgramExtractor ngramExtractor; private final Map> ngrams = new HashMap<>(); public LanguageProfileBuilder(@NotNull LdLocale locale) { this.locale = locale; } @Deprecated public LanguageProfileBuilder(@NotNull String locale) { this.locale = LdLocale.fromString(locale); } public LanguageProfileBuilder ngramExtractor(@NotNull NgramExtractor ngramExtractor) { this.ngramExtractor = ngramExtractor; return this; } /** * @param minimalFrequency 1-n, the default is 1. n-grams that occurred less often in the text are removed. * This really should be set to something higher. * Try to play with the number until you get a profile file of satisfying size, * that produces good language detection results. */ public LanguageProfileBuilder minimalFrequency(int minimalFrequency) { if (minimalFrequency < 1) throw new IllegalArgumentException("minimalFrequency must be >= 1, but was: "+minimalFrequency); this.minimalFrequency = minimalFrequency; return this; } /** * In order to use this you must set the {@link #ngramExtractor} first. */ public LanguageProfileBuilder addText(CharSequence text) { if (ngramExtractor==null) { throw new IllegalStateException("NgramExtractor has not been set yet!"); } for (Map.Entry entry : ngramExtractor.extractCountedGrams(text).entrySet()) { addGram(entry.getKey(), entry.getValue()); } return this; } /** * Shortcut for addGram(ngram, 1). */ public LanguageProfileBuilder addGram(String ngram) { return addGram(ngram, 1); } /** * If the builder already has this ngram, the given frequency is added to the current count. */ public LanguageProfileBuilder addGram(String ngram, int frequency) { Map map = ngrams.get(ngram.length()); if (map==null) { map = new HashMap<>(); ngrams.put(ngram.length(), map); } Integer total = map.get(ngram); if (total==null) total = 0; total += frequency; map.put(ngram, total); return this; } public LanguageProfile build() { if (minimalFrequency >1) { removeNgramsWithLessFrequency(); } return new LanguageProfileImpl(locale, ngrams); } private void removeNgramsWithLessFrequency() { for (Map map : ngrams.values()) { Iterator> iterator = map.entrySet().iterator(); while (iterator.hasNext()) { Map.Entry next = iterator.next(); if (next.getValue() < minimalFrequency) { iterator.remove(); } } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy