// org.apache.commons.codec.language.bm.Lang Maven / Gradle / Ivy
// Show all versions of gwt-commons-codec Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.codec.language.bm;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import com.google.gwt.core.shared.GwtIncompatible;
import com.google.gwt.regexp.shared.RegExp;
import org.apache.commons.codec.Resources;
/**
* Language guessing utility.
*
* This class encapsulates rules used to guess the possible languages that a word originates from. This is
* done by reference to a whole series of rules distributed in resource files.
*
* Instances of this class are typically managed through the static factory method instance().
* Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
*
* This class is intended to be immutable and thread-safe.
*
* Lang resources
*
* Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
* They are systematically named following the pattern:
*
* org/apache/commons/codec/language/bm/%s_lang.txt
* The format of these resources is the following:
*
* - Rules: whitespace separated strings.
* There should be 3 columns to each row, and these will be interpreted as:
*
* - pattern: a regular expression.
* - languages: a '+'-separated list of languages.
* - acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.
*
*
* - End-of-line comments: Any occurrence of '//' will cause all text following on that line to be
* discarded as a comment.
* - Multi-line comments: Any line starting with '/*' will start multi-line commenting mode.
* This will skip all content until a line ending in '*' and '/' is found.
* - Blank lines: All blank lines will be skipped.
*
*
* Port of lang.php
*
* @since 1.6
*/
public class Lang {
// Implementation note: This class is divided into two sections. The first part is a static factory interface that
// exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
// encapsulate a particular language-guessing rule table and the language guessing itself.
//
// It may make sense in the future to expose the private constructor to allow power users to build custom language-
// guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
// should be strongly encouraged to use the static factory {@code instance} method to get their Lang instances.
/**
 * One language-guessing rule: a pattern, the set of language names the rule speaks for, and a flag
 * saying whether a match rules those languages in or out.
 *
 * All three fields are final and assigned exactly once, in the constructor. The language set is
 * stored by reference; it is not copied.
 */
private static final class LangRule {
    /** {@code true} if a match rules the languages in, {@code false} if a match rules them out. */
    private final boolean acceptOnMatch;
    /** Names of the languages this rule applies to. */
    private final Set<String> languages;
    /** Pattern that is tested against candidate text. */
    private final RegExp pattern;

    /**
     * Creates a rule.
     *
     * @param pattern
     *            the pattern to test input text against
     * @param languages
     *            the language names this rule applies to
     * @param acceptOnMatch
     *            whether a match rules the languages in ({@code true}) or out ({@code false})
     */
    private LangRule(final RegExp pattern, final Set<String> languages, final boolean acceptOnMatch) {
        this.pattern = pattern;
        this.languages = languages;
        this.acceptOnMatch = acceptOnMatch;
    }

    /**
     * Checks the given text against this rule's pattern.
     *
     * @param txt
     *            the text to check
     * @return the result of {@code RegExp.test} for this rule's pattern on {@code txt}
     */
    public boolean matches(final String txt) {
        return this.pattern.test(txt);
    }
}
/**
 * Registry of Lang instances keyed by NameType. It is filled once by the static initializer below
 * and read by {@code instance(NameType)}.
 *
 * The type arguments are spelled out so that lookups yield a Lang rather than an Object.
 */
private static final Map<NameType, Lang> Langs = new EnumMap<>(NameType.class);
/**
 * Resource-name template containing a single {@code %s} placeholder.
 * NOTE(review): presumably the placeholder is filled with a per-NameType prefix - confirm against
 * the code that formats this string.
 */
private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/%s_lang.txt";
// Eagerly build one Lang per NameType at class-initialization time.
static {
    for (final NameType s : NameType.values()) {
        Langs.put(s, loadFromResource(s, Languages.getInstance(s)));
    }
}
/**
 * Looks up the Lang registered for a NameType.
 *
 * @param nameType
 *            the NameType whose Lang is wanted
 * @return the Lang stored for {@code nameType} in the class-level registry
 */
public static Lang instance(final NameType nameType) {
    final Lang registered = Langs.get(nameType);
    return registered;
}
/**
* Loads language rules from a resource.
*
* In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
* You will only need to call this yourself if you are developing custom language mapping rules.
*
* @param languageRulesResourceName
* the NameType that selects which set of language rules to load
* @param languages
* the languages that these rules will support
* @return a Lang encapsulating the loaded language-guessing rules.
*/
public static Lang loadFromResource(final NameType languageRulesResourceName, final Languages languages) {
final List rules = new ArrayList();
switch (languageRulesResourceName) {
case ASHKENAZI:
rules.add(new LangRule(RegExp.compile("zh"), new HashSet(Arrays.asList("polish", "russian", "german", "english")), true));
rules.add(new LangRule(RegExp.compile("eau"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("[aoeiuäöü]h"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("^vogel"), new HashSet(Arrays.asList("german,")), true));
rules.add(new LangRule(RegExp.compile("vogel$"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("witz"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("tz$"), new HashSet(Arrays.asList("german", "russian", "english")), true));
rules.add(new LangRule(RegExp.compile("^tz"), new HashSet(Arrays.asList("russian", "english")), true));
rules.add(new LangRule(RegExp.compile("güe"), new HashSet(Arrays.asList("spanish")), true));
rules.add(new LangRule(RegExp.compile("güi"), new HashSet(Arrays.asList("spanish")), true));
rules.add(new LangRule(RegExp.compile("ghe"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("ghi"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("vici$"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("schi$"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("chsch"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("tsch"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("ssch"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("sch$"), new HashSet(Arrays.asList("german", "russian")), true));
rules.add(new LangRule(RegExp.compile("^sch"), new HashSet(Arrays.asList("german", "russian")), true));
rules.add(new LangRule(RegExp.compile("^rz"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("rz$"), new HashSet(Arrays.asList("polish", "german")), true));
rules.add(new LangRule(RegExp.compile("[^aoeiuäöü]rz"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("rz[^aoeiuäöü]"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("cki$"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ska$"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("cka$"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ue"), new HashSet(Arrays.asList("german", "russian")), true));
rules.add(new LangRule(RegExp.compile("ae"), new HashSet(Arrays.asList("german", "russian", "english")), true));
rules.add(new LangRule(RegExp.compile("oe"), new HashSet(Arrays.asList("german", "french", "russian", "english")), true));
rules.add(new LangRule(RegExp.compile("th$"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("^th"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("th[^aoeiu]"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("mann"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("cz"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("cy"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("niew"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("stein"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("heim$"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("heimer$"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("ii$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("iy$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("yy$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("yi$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("yj$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("ij$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gaus$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gauz$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gauz$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("goltz$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gol'tz$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("golts$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gol'ts$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("^goltz"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("^gol'tz"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("^golts"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("^gol'ts"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gendler$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gejmer$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gejm$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("geimer$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("geim$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("geymer"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("geym$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("gof$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("thal"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("zweig"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("ck$"), new HashSet(Arrays.asList("german", "english")), true));
rules.add(new LangRule(RegExp.compile("c$"), new HashSet(Arrays.asList("polish", "romanian", "hungarian")), true));
rules.add(new LangRule(RegExp.compile("sz"), new HashSet(Arrays.asList("polish", "hungarian")), true));
rules.add(new LangRule(RegExp.compile("gue"), new HashSet(Arrays.asList("spanish", "french")), true));
rules.add(new LangRule(RegExp.compile("gui"), new HashSet(Arrays.asList("spanish", "french")), true));
rules.add(new LangRule(RegExp.compile("guy"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("cs$"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("^cs"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("dzs"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("zs$"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("^zs"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("^wl"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("^wr"), new HashSet(Arrays.asList("polish", "english", "german")), true));
rules.add(new LangRule(RegExp.compile("gy$"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("gy[aeou]"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("gy"), new HashSet(Arrays.asList("hungarian", "russian")), true));
rules.add(new LangRule(RegExp.compile("ly"), new HashSet(Arrays.asList("hungarian", "russian", "polish")), true));
rules.add(new LangRule(RegExp.compile("ny"), new HashSet(Arrays.asList("hungarian", "russian", "polish")), true));
rules.add(new LangRule(RegExp.compile("ty"), new HashSet(Arrays.asList("hungarian", "russian", "polish")), true));
rules.add(new LangRule(RegExp.compile("â"), new HashSet(Arrays.asList("romanian", "french")), true));
rules.add(new LangRule(RegExp.compile("ă"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("à"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("ä"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("á"), new HashSet(Arrays.asList("hungarian", "spanish")), true));
rules.add(new LangRule(RegExp.compile("ą"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ć"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ç"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("ę"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("é"), new HashSet(Arrays.asList("french", "hungarian", "spanish")), true));
rules.add(new LangRule(RegExp.compile("è"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("ê"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("í"), new HashSet(Arrays.asList("hungarian", "spanish")), true));
rules.add(new LangRule(RegExp.compile("î"), new HashSet(Arrays.asList("romanian", "french")), true));
rules.add(new LangRule(RegExp.compile("ł"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ń"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ñ"), new HashSet(Arrays.asList("spanish")), true));
rules.add(new LangRule(RegExp.compile("ó"), new HashSet(Arrays.asList("polish", "hungarian", "spanish")), true));
rules.add(new LangRule(RegExp.compile("ö"), new HashSet(Arrays.asList("german", "hungarian")), true));
rules.add(new LangRule(RegExp.compile("õ"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("ş"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("ś"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ţ"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("ü"), new HashSet(Arrays.asList("german", "hungarian")), true));
rules.add(new LangRule(RegExp.compile("ù"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("ű"), new HashSet(Arrays.asList("hungarian")), true));
rules.add(new LangRule(RegExp.compile("ú"), new HashSet(Arrays.asList("hungarian", "spanish")), true));
rules.add(new LangRule(RegExp.compile("ź"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ż"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ß"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("а"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("ё"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("о"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("е"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("и"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("у"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("ы"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("э"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("ю"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("я"), new HashSet(Arrays.asList("cyrillic")), true));
rules.add(new LangRule(RegExp.compile("א"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ב"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ג"), new HashSet(Arrays.asList("ebrew")), true));
rules.add(new LangRule(RegExp.compile("ד"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ה"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ו"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ז"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ח"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ט"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("י"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("כ"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ל"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("מ"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("נ"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ס"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ע"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("פ"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("צ"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ק"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ר"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ש"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("ת"), new HashSet(Arrays.asList("hebrew")), true));
rules.add(new LangRule(RegExp.compile("a"), new HashSet(Arrays.asList("cyrillic", "hebrew")), false));
rules.add(new LangRule(RegExp.compile("o"), new HashSet(Arrays.asList("cyrillic", "hebrew")), false));
rules.add(new LangRule(RegExp.compile("e"), new HashSet(Arrays.asList("cyrillic", "hebrew")), false));
rules.add(new LangRule(RegExp.compile("i"), new HashSet(Arrays.asList("cyrillic", "hebrew")), false));
rules.add(new LangRule(RegExp.compile("y"), new HashSet(Arrays.asList("cyrillic", "hebrew", "romanian")), false));
rules.add(new LangRule(RegExp.compile("u"), new HashSet(Arrays.asList("cyrillic", "hebrew")), false));
rules.add(new LangRule(RegExp.compile("v[^aoeiuäüö]"), new HashSet(Arrays.asList("german")), false));
rules.add(new LangRule(RegExp.compile("y[^aoeiu]"), new HashSet(Arrays.asList("german")), false));
rules.add(new LangRule(RegExp.compile("c[^aohk]"), new HashSet(Arrays.asList("german")), false));
rules.add(new LangRule(RegExp.compile("dzi"), new HashSet(Arrays.asList("german", "english", "french")), false));
rules.add(new LangRule(RegExp.compile("ou"), new HashSet(Arrays.asList("german")), false));
rules.add(new LangRule(RegExp.compile("aj"), new HashSet(Arrays.asList("german", "english", "french")), false));
rules.add(new LangRule(RegExp.compile("ej"), new HashSet(Arrays.asList("german", "english", "french")), false));
rules.add(new LangRule(RegExp.compile("oj"), new HashSet(Arrays.asList("german", "english", "french")), false));
rules.add(new LangRule(RegExp.compile("uj"), new HashSet(Arrays.asList("german", "english", "french")), false));
rules.add(new LangRule(RegExp.compile("k"), new HashSet(Arrays.asList("romanian")), false));
rules.add(new LangRule(RegExp.compile("v"), new HashSet(Arrays.asList("polish")), false));
rules.add(new LangRule(RegExp.compile("ky"), new HashSet(Arrays.asList("polish")), false));
rules.add(new LangRule(RegExp.compile("eu"), new HashSet(Arrays.asList("russian", "polish")), false));
rules.add(new LangRule(RegExp.compile("w"), new HashSet(Arrays.asList("french", "romanian", "spanish", "hungarian", "russian")), false));
rules.add(new LangRule(RegExp.compile("kie"), new HashSet(Arrays.asList("french", "spanish")), false));
rules.add(new LangRule(RegExp.compile("gie"), new HashSet(Arrays.asList("french", "romanian", "spanish")), false));
rules.add(new LangRule(RegExp.compile("q"), new HashSet(Arrays.asList("hungarian", "polish", "russian", "romanian")), false));
rules.add(new LangRule(RegExp.compile("sch"), new HashSet(Arrays.asList("hungarian", "polish", "french", "spanish")), false));
rules.add(new LangRule(RegExp.compile("^h"), new HashSet(Arrays.asList("russian")), false));
break;
case GENERIC:
rules.add(new LangRule(RegExp.compile("^o’"), new HashSet(Arrays.asList("english")), true));
rules.add(new LangRule(RegExp.compile("^o'"), new HashSet(Arrays.asList("english")), true));
rules.add(new LangRule(RegExp.compile("^mc"), new HashSet(Arrays.asList("english")), true));
rules.add(new LangRule(RegExp.compile("^fitz"), new HashSet(Arrays.asList("english")), true));
rules.add(new LangRule(RegExp.compile("ceau"), new HashSet(Arrays.asList("french", "romanian")), true));
rules.add(new LangRule(RegExp.compile("eau"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("eau$"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("eaux$"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("ault$"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("oult$"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("eux$"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("eix$"), new HashSet(Arrays.asList("french")), true));
rules.add(new LangRule(RegExp.compile("glou$"), new HashSet(Arrays.asList("greeklatin")), true));
rules.add(new LangRule(RegExp.compile("uu"), new HashSet(Arrays.asList("dutch")), true));
rules.add(new LangRule(RegExp.compile("tx"), new HashSet(Arrays.asList("spanish")), true));
rules.add(new LangRule(RegExp.compile("witz"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("tz$"), new HashSet(Arrays.asList("german", "russian", "english")), true));
rules.add(new LangRule(RegExp.compile("^tz"), new HashSet(Arrays.asList("russian", "english")), true));
rules.add(new LangRule(RegExp.compile("poulos$"), new HashSet(Arrays.asList("greeklatin")), true));
rules.add(new LangRule(RegExp.compile("pulos$"), new HashSet(Arrays.asList("greeklatin")), true));
rules.add(new LangRule(RegExp.compile("iou"), new HashSet(Arrays.asList("greeklatin")), true));
rules.add(new LangRule(RegExp.compile("sj$"), new HashSet(Arrays.asList("dutch")), true));
rules.add(new LangRule(RegExp.compile("^sj"), new HashSet(Arrays.asList("dutch")), true));
rules.add(new LangRule(RegExp.compile("güe"), new HashSet(Arrays.asList("spanish")), true));
rules.add(new LangRule(RegExp.compile("güi"), new HashSet(Arrays.asList("spanish")), true));
rules.add(new LangRule(RegExp.compile("ghe"), new HashSet(Arrays.asList("romanian", "greeklatin")), true));
rules.add(new LangRule(RegExp.compile("ghi"), new HashSet(Arrays.asList("romanian", "greeklatin")), true));
rules.add(new LangRule(RegExp.compile("escu$"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("esco$"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("vici$"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("schi$"), new HashSet(Arrays.asList("romanian")), true));
rules.add(new LangRule(RegExp.compile("ii$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("iy$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("yy$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("yi$"), new HashSet(Arrays.asList("russian")), true));
rules.add(new LangRule(RegExp.compile("^rz"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("rz$"), new HashSet(Arrays.asList("polish", "german")), true));
rules.add(new LangRule(RegExp.compile("[bcdfgklmnpstwz]rz"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("rz[bcdfghklmnpstw]"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("cki$"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ska$"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("cka$"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("ae"), new HashSet(Arrays.asList("german", "russian", "english")), true));
rules.add(new LangRule(RegExp.compile("oe"), new HashSet(Arrays.asList("german", "french", "russian", "english", "dutch")), true));
rules.add(new LangRule(RegExp.compile("th$"), new HashSet(Arrays.asList("german", "english")), true));
rules.add(new LangRule(RegExp.compile("^th"), new HashSet(Arrays.asList("german", "english", "greeklatin")), true));
rules.add(new LangRule(RegExp.compile("mann"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("cz"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("cy"), new HashSet(Arrays.asList("polish", "greeklatin")), true));
rules.add(new LangRule(RegExp.compile("niew"), new HashSet(Arrays.asList("polish")), true));
rules.add(new LangRule(RegExp.compile("etti$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("eti$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("ati$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("ato$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("[aoei]no$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("[aoei]ni$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("esi$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("oli$"), new HashSet(Arrays.asList("italian")), true));
rules.add(new LangRule(RegExp.compile("field$"), new HashSet(Arrays.asList("english")), true));
rules.add(new LangRule(RegExp.compile("stein"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("heim$"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("heimer$"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("thal"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("zweig"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("[aeou]h"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("äh"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("öh"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("üh"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("[ln]h[ao]$"), new HashSet(Arrays.asList("portuguese")), true));
rules.add(new LangRule(RegExp.compile("[ln]h[aou]"), new HashSet(Arrays.asList("portuguese", "french", "german", "dutch", "czech", "spanish", "turkish")), true));
rules.add(new LangRule(RegExp.compile("chsch"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("tsch"), new HashSet(Arrays.asList("german")), true));
rules.add(new LangRule(RegExp.compile("sch$"), new HashSet