/*
 * com.optimaize.langdetect.profiles.LanguageProfileBuilder Maven / Gradle / Ivy
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of language-detector Show documentation
 * Language Detection Library for Java.
 * The newest version!
 */
package com.optimaize.langdetect.profiles;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractor;
import org.jetbrains.annotations.NotNull;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* Builder for {@link LanguageProfile}.
*
* This class does no internal synchronization.
*
* @author Fabian Kessler
*/
public class LanguageProfileBuilder {
@NotNull
private final LdLocale locale;
private int minimalFrequency = 1;
private NgramExtractor ngramExtractor;
private final Map> ngrams = new HashMap<>();
public LanguageProfileBuilder(@NotNull LdLocale locale) {
this.locale = locale;
}
@Deprecated
public LanguageProfileBuilder(@NotNull String locale) {
this.locale = LdLocale.fromString(locale);
}
public LanguageProfileBuilder ngramExtractor(@NotNull NgramExtractor ngramExtractor) {
this.ngramExtractor = ngramExtractor;
return this;
}
/**
* @param minimalFrequency 1-n, the default is 1. n-grams that occurred less often in the text are removed.
* This really should be set to something higher.
* Try to play with the number until you get a profile file of satisfying size,
* that produces good language detection results.
*/
public LanguageProfileBuilder minimalFrequency(int minimalFrequency) {
if (minimalFrequency < 1) throw new IllegalArgumentException("minimalFrequency must be >= 1, but was: "+minimalFrequency);
this.minimalFrequency = minimalFrequency;
return this;
}
/**
* In order to use this you must set the {@link #ngramExtractor} first.
*/
public LanguageProfileBuilder addText(CharSequence text) {
if (ngramExtractor==null) {
throw new IllegalStateException("NgramExtractor has not been set yet!");
}
for (Map.Entry entry : ngramExtractor.extractCountedGrams(text).entrySet()) {
addGram(entry.getKey(), entry.getValue());
}
return this;
}
/**
* Shortcut for addGram(ngram, 1).
*/
public LanguageProfileBuilder addGram(String ngram) {
return addGram(ngram, 1);
}
/**
* If the builder already has this ngram, the given frequency is added to the current count.
*/
public LanguageProfileBuilder addGram(String ngram, int frequency) {
Map map = ngrams.get(ngram.length());
if (map==null) {
map = new HashMap<>();
ngrams.put(ngram.length(), map);
}
Integer total = map.get(ngram);
if (total==null) total = 0;
total += frequency;
map.put(ngram, total);
return this;
}
public LanguageProfile build() {
if (minimalFrequency >1) {
removeNgramsWithLessFrequency();
}
return new LanguageProfileImpl(locale, ngrams);
}
private void removeNgramsWithLessFrequency() {
for (Map map : ngrams.values()) {
Iterator> iterator = map.entrySet().iterator();
while (iterator.hasNext()) {
Map.Entry next = iterator.next();
if (next.getValue() < minimalFrequency) {
iterator.remove();
}
}
}
}
}