
/*
 * com.optimaize.langdetect.NgramFrequencyData Maven / Gradle / Ivy
 * Go to download
 * Show more of this group. Show more artifacts with this name.
 * Show all versions of language-detector. Show documentation.
 * Language Detection Library for Java.
 */
package com.optimaize.langdetect;
import com.optimaize.langdetect.profiles.LanguageProfile;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.util.*;
/**
 * Contains frequency information for ngram strings, per loaded language.
 *
 * <p>Treated as immutable by convention: the map and the language list are wrapped as
 * unmodifiable, but the {@code double[]} values handed out by
 * {@link #getProbabilities(String)} are the internal arrays (Java arrays cannot be made
 * unmodifiable), so callers must not write to them.</p>
 *
 * @author Fabian Kessler
 */
public final class NgramFrequencyData {

    /**
     * Key = ngram.
     * Value = array with probabilities per loaded language, in the same order as {@code langlist}.
     */
    @NotNull
    private final Map<String, double[]> wordLangProbMap;

    /**
     * All the loaded languages, order is important: position {@code i} here corresponds to
     * index {@code i} of every array in {@code wordLangProbMap}.
     */
    @NotNull
    private final List<String> langlist;


    /**
     * Builds the frequency data from the given language profiles.
     *
     * <p>For every requested gram length and every ngram of that length in a profile, the
     * stored value is the ngram's frequency divided by the profile's total number of gram
     * occurrences for the ngram's length.</p>
     *
     * @param languageProfiles the profiles to load; their iteration order defines the language order.
     * @param gramLengths for example [1,2,3]
     * @return the populated frequency data.
     * @throws java.lang.IllegalArgumentException if languageProfiles or gramLengths is empty, or if one of the
     *                                            languageProfiles does not have the grams of the required sizes.
     */
    @NotNull
    public static NgramFrequencyData create(@NotNull Collection<LanguageProfile> languageProfiles, @NotNull Collection<Integer> gramLengths) throws IllegalArgumentException {
        if (languageProfiles.isEmpty()) throw new IllegalArgumentException("No languageProfiles provided!");
        if (gramLengths.isEmpty()) throw new IllegalArgumentException("No gramLengths provided!");

        int langsize = languageProfiles.size();
        Map<String, double[]> wordLangProbMap = new HashMap<>();
        List<String> langlist = new ArrayList<>(langsize);

        // Position of the current profile; doubles as the slot in each probability array.
        int index = 0;
        for (LanguageProfile profile : languageProfiles) {
            langlist.add(profile.getLanguage());

            for (Integer gramLength : gramLengths) {
                if (!profile.getGramLengths().contains(gramLength)) {
                    throw new IllegalArgumentException("The language profile for "+profile.getLanguage()+" does not contain "+gramLength+"-grams!");
                }
                for (Map.Entry<String, Integer> ngramEntry : profile.iterateGrams(gramLength)) {
                    String ngram = ngramEntry.getKey();
                    Integer frequency = ngramEntry.getValue();

                    // Single lookup; a fresh array is zero-filled, so languages that never
                    // see this ngram keep probability 0.
                    double[] probabilities = wordLangProbMap.get(ngram);
                    if (probabilities == null) {
                        probabilities = new double[langsize];
                        wordLangProbMap.put(ngram, probabilities);
                    }
                    probabilities[index] = frequency.doubleValue() / profile.getNumGramOccurrences(ngram.length());
                }
            }
            index++;
        }

        return new NgramFrequencyData(wordLangProbMap, langlist);
    }

    /**
     * @param wordLangProbMap ngram to per-language probabilities; stored as an unmodifiable view, not copied.
     * @param langlist the languages in array-index order; stored as an unmodifiable view, not copied.
     */
    private NgramFrequencyData(@NotNull Map<String, double[]> wordLangProbMap,
                               @NotNull List<String> langlist) {
        // Only wrapping, not copying: both collections are created by create() and never
        // leaked elsewhere (optimization).
        this.wordLangProbMap = Collections.unmodifiableMap(wordLangProbMap);
        this.langlist = Collections.unmodifiableList(langlist);
    }


    /**
     * @return the loaded languages as an unmodifiable list, in the order used by the
     *         arrays returned from {@link #getProbabilities(String)}.
     */
    @NotNull
    public List<String> getLanguageList() {
        return langlist;
    }

    /**
     * @param pos zero-based position in the {@link #getLanguageList() language list}.
     * @return the language at that position.
     */
    @NotNull
    public String getLanguage(int pos) {
        return langlist.get(pos);
    }

    /**
     * Don't modify the returned data structure! (Can't make an array immutable...)
     *
     * @param ngram the ngram to look up.
     * @return null if no language profile knows that ngram.
     *         Entries are 0 for languages that don't know that ngram at all.
     *         The array is in the order of the {@link #getLanguageList()} language list, and has exactly that size.
     *         impl note: returning null lets the caller handle the unknown case more efficiently than an empty array would.
     */
    @Nullable
    public double[] getProbabilities(String ngram) {
        return wordLangProbMap.get(ngram);
    }

}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy