org.languagetool.language.LanguageIdentifier Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of languagetool-core Show documentation
Show all versions of languagetool-core Show documentation
LanguageTool is an Open Source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect like mixing up there/their and it detects some grammar problems.
/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.language;
import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import org.jetbrains.annotations.Nullable;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Identify the language of a text. Note that some languages might never be
* detected because they are close to another language. Language variants like
* en-US or en-GB are not detected, the result will be {@code en} for those.
* @since 2.9
*/
public class LanguageIdentifier {
private static final double MINIMAL_CONFIDENCE = 0.9;
// ast and gl often prevent the correct detection of Spanish (as the are quite similar
// to Spanish, I assume) so we disable them for now. See LanguageDetectionEval.java:
private static final List ignoreLangCodes = Arrays.asList("ast", "gl");
// languages that we offer profiles for as they are not yet supported by language-detector:
private static final List externalLangCodes = Arrays.asList("eo");
private final LanguageDetector languageDetector;
private final TextObjectFactory textObjectFactory;
public LanguageIdentifier() {
try {
List profiles = loadProfiles(getLanguageCodes());
languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.minimalConfidence(MINIMAL_CONFIDENCE)
.withProfiles(profiles)
.build();
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
} catch (IOException e) {
throw new RuntimeException("Could not set up language identifier", e);
}
}
private static List getLanguageCodes() {
List langCodes = new ArrayList<>();
for (Language lang : Languages.get()) {
String langCode = lang.getShortCode();
boolean ignore = lang.isVariant() || ignoreLangCodes.contains(langCode) || externalLangCodes.contains(langCode);
if (ignore) {
continue;
}
if ("zh".equals(langCode)) {
langCodes.add("zh-CN");
langCodes.add("zh-TW");
} else {
langCodes.add(langCode);
}
}
return langCodes;
}
private List loadProfiles(List langCodes) throws IOException {
LanguageProfileReader profileReader = new LanguageProfileReader();
List profiles = profileReader.read(langCodes);
for (String externalLangCode : externalLangCodes) {
String profilePath = "/" + externalLangCode + "/" + externalLangCode + ".profile";
if (JLanguageTool.getDataBroker().resourceExists(profilePath)) { // not all languages are always available
try (InputStream profile = JLanguageTool.getDataBroker().getFromResourceDirAsStream(profilePath)) {
profiles.add(new LanguageProfileReader().read(profile));
}
}
}
return profiles;
}
/**
* @return language or {@code null} if language could not be identified
*/
@Nullable
public Language detectLanguage(String text) {
String languageCode = detectLanguageCode(text);
if (languageCode != null) {
return Languages.getLanguageForShortCode(languageCode);
} else {
return null;
}
}
/**
* @return language or {@code null} if language could not be identified
*/
@Nullable
private String detectLanguageCode(String text) {
TextObject textObject = textObjectFactory.forText(text);
Optional lang = languageDetector.detect(textObject);
// comment in for debugging:
//System.out.println(languageDetector.getProbabilities(textObject));
if (lang.isPresent()) {
return lang.get().getLanguage();
} else {
return null;
}
}
}