com.ibm.icu.util.LocaleMatcher Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support
There is a newer version: 76.1
Show newest version
/*
 ****************************************************************************************
 * Copyright (C) 2009-2014, Google, Inc.; International Business Machines Corporation   *
 * and others. All Rights Reserved.                                                     *
 ****************************************************************************************
 */
package com.ibm.icu.util;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Row.R3;

/**
 * Provides a way to match the languages (locales) supported by a product to the
 * languages (locales) acceptable to a user, and get the best match. For
 * example:
 * 
 *  * LocaleMatcher matcher = new LocaleMatcher("fr, en-GB, en");
 * 
 * // afterwards:
 * matcher.getBestMatch("en-US").toLanguageTag() => "en"
 * 
 * 
 * It takes into account when languages are close to one another, such as fil
 * and tl, and when language regional variants are close, like en-GB and en-AU.
 * It also handles scripts, like zh-Hant vs zh-TW. For examples, see the test
 * file.
 * All classes implementing this interface should be immutable. Often a
 * product will just need one static instance, built with the languages
 * that it supports. However, it may want multiple instances with different
 * default languages based on additional information, such as the domain.
 * 
 * @author [email protected]
 * @stable ICU 4.4
 */
public class LocaleMatcher {
    
    private static boolean DEBUG = false;

    private static final ULocale UNKNOWN_LOCALE = new ULocale("und");

    /**
     * Threshold for falling back to the default (first) language. May make this
     * a parameter in the future.
     */
    private static final double DEFAULT_THRESHOLD = 0.5;

    /**
     * The default language, in case the threshold is not met.
     */
    private final ULocale defaultLanguage;

    /**
     * The default language, in case the threshold is not met.
     */
    private final double threshold;

    /**
     * Create a new language matcher. The highest-weighted language is the
     * default. That means that if no other language is matches closer than a given
     * threshold, that default language is chosen. Typically the default is English,
     * but it could be different based on additional information, such as the domain
     * of the page.
     * 
     * @param languagePriorityList weighted list
     * @stable ICU 4.4
     */
    public LocaleMatcher(LocalePriorityList languagePriorityList) {
        this(languagePriorityList, defaultWritten);
    }

    /**
     * Create a new language matcher from a String form. The highest-weighted
     * language is the default.
     * 
     * @param languagePriorityListString String form of LanguagePriorityList
     * @stable ICU 4.4
     */
    public LocaleMatcher(String languagePriorityListString) {
        this(LocalePriorityList.add(languagePriorityListString).build());
    }

    /**
     * Internal testing function; may expose API later.
     * @param languagePriorityList LocalePriorityList to match
     * @param matcherData Internal matching data
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) {
        this(languagePriorityList, matcherData, DEFAULT_THRESHOLD);
    }

    /**
     * Internal testing function; may expose API later.
     * @param languagePriorityList LocalePriorityList to match
     * @param matcherData Internal matching data
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData, double threshold) {
        this.matcherData = matcherData == null ? defaultWritten : matcherData;
        for (final ULocale language : languagePriorityList) {
            add(language, languagePriorityList.getWeight(language));
        }
        Iterator it = languagePriorityList.iterator();
        defaultLanguage = it.hasNext() ? it.next() : null;
        this.threshold = threshold;
    }


    /**
     * Returns a fraction between 0 and 1, where 1 means that the languages are a
     * perfect match, and 0 means that they are completely different. Note that
     * the precise values may change over time; no code should be made dependent
     * on the values remaining constant.
     * @param desired Desired locale
     * @param desiredMax Maximized locale (using likely subtags)
     * @param supported Supported locale
     * @param supportedMax Maximized locale (using likely subtags)
     * @return value between 0 and 1, inclusive.
     * @stable ICU 4.4
     */
    public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) {
        return matcherData.match(desired, desiredMax, supported, supportedMax);
    }


    /**
     * Canonicalize a locale (language). Note that for now, it is canonicalizing
     * according to CLDR conventions (he vs iw, etc), since that is what is needed
     * for likelySubtags.
     * @param ulocale language/locale code
     * @return ULocale with remapped subtags.
     * @stable ICU 4.4
     */
    public ULocale canonicalize(ULocale ulocale) {
        // TODO Get the data from CLDR, use Java conventions.
        String lang = ulocale.getLanguage();
        String lang2 = canonicalMap.get(lang);
        String script = ulocale.getScript();
        String script2 = canonicalMap.get(script);
        String region = ulocale.getCountry();
        String region2 = canonicalMap.get(region);
        if (lang2 != null || script2 != null || region2 != null) {
            return new ULocale(
                    lang2 == null ? lang : lang2,
                            script2 == null ? script : script2,
                                    region2 == null ? region : region2
                    );
        }
        return ulocale;
    }

    /**
     * Get the best match for a LanguagePriorityList
     * 
     * @param languageList list to match
     * @return best matching language code
     * @stable ICU 4.4
     */
    public ULocale getBestMatch(LocalePriorityList languageList) {
        double bestWeight = 0;
        ULocale bestTableMatch = null;
        for (final ULocale language : languageList) {
            final Row.R2 matchRow = getBestMatchInternal(language);
            final double weight = matchRow.get1() * languageList.getWeight(language);
            if (weight > bestWeight) {
                bestWeight = weight;
                bestTableMatch = matchRow.get0();
            }
        }
        if (bestWeight < threshold) {
            bestTableMatch = defaultLanguage;
        }
        return bestTableMatch;
    }

    /**
     * Convenience method: Get the best match for a LanguagePriorityList
     * 
     * @param languageList String form of language priority list
     * @return best matching language code
     * @stable ICU 4.4
     */
    public ULocale getBestMatch(String languageList) {
        return getBestMatch(LocalePriorityList.add(languageList).build());
    }

    /**
     * Get the best match for an individual language code.
     * 
     * @param ulocale locale/language code to match
     * @return best matching language code
     * @stable ICU 4.4
     */
    public ULocale getBestMatch(ULocale ulocale) {
        return getBestMatchInternal(ulocale).get0();
    }

    /**
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public ULocale getBestMatch(ULocale... ulocales) {
        return getBestMatch(LocalePriorityList.add(ulocales).build());
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public String toString() {
        return "{" + defaultLanguage + ", " 
                + maximizedLanguageToWeight + "}";
    }
    // ================= Privates =====================

    /**
     * Get the best match for an individual language code.
     * 
     * @param languageCode
     * @return best matching language code and weight (as per
     *         {@link #match(ULocale, ULocale)})
     */
    private Row.R2 getBestMatchInternal(ULocale languageCode) {
        languageCode = canonicalize(languageCode);
        final ULocale maximized = addLikelySubtags(languageCode);
        if (DEBUG) {
            System.out.println("\n" + languageCode + ";\t" + maximized);
        }
        double bestWeight = 0;
        ULocale bestTableMatch = null;
        for (final ULocale tableKey : maximizedLanguageToWeight.keySet()) {
            R2 row = maximizedLanguageToWeight.get(tableKey);
            final double match = match(languageCode, maximized, tableKey, row.get0());
            if (DEBUG) {
                System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match + "\n");
            }
            final double weight = match * row.get1();
            if (weight > bestWeight) {
                bestWeight = weight;
                bestTableMatch = tableKey;
            }
        }
        if (bestWeight < threshold) {
            bestTableMatch = defaultLanguage;
        }
        return Row.R2.of(bestTableMatch, bestWeight);
    }

    private void add(ULocale language, Double weight) {
        language = canonicalize(language);
        R2 row = Row.of(addLikelySubtags(language), weight);
        maximizedLanguageToWeight.put(language, row);
    }

    Map> maximizedLanguageToWeight = new LinkedHashMap>();


    // =============== Special Mapping Information ==============

    /**
     * We need to add another method to addLikelySubtags that doesn't return
     * null, but instead substitutes Zzzz and ZZ if unknown. There are also
     * a few cases where addLikelySubtags needs to have expanded data, to handle
     * all deprecated codes, and to update to CLDR 1.6.
     * @param languageCode
     * @return "fixed" addLikelySubtags
     */
    // TODO(markdavis): update the above when CLDR 1.6 is final.
    private ULocale addLikelySubtags(ULocale languageCode) {
        // max("und") = "en_Latn_US", and since matching is based on maximized tags, the undefined
        // language would normally match English.  But that would produce the counterintuitive results
        // that getBestMatch("und", LocaleMatcher("it,en")) would be "en", and
        // getBestMatch("en", LocaleMatcher("it,und")) would be "und".
        //
        // To avoid that, we change the matcher's definitions of max (AddLikelySubtagsWithDefaults)
        // so that max("und")="und". That produces the following, more desirable results:
        if (languageCode.equals(UNKNOWN_LOCALE)) {
            return UNKNOWN_LOCALE;
        }
        final ULocale result = ULocale.addLikelySubtags(languageCode);
        // should have method on getLikelySubtags for this
        if (result == null || result.equals(languageCode)) {
            final String language = languageCode.getLanguage();
            final String script = languageCode.getScript();
            final String region = languageCode.getCountry();
            return new ULocale((language.length()==0 ? "und"
                    : language)
                    + "_"
                    + (script.length()==0 ? "Zzzz" : script)
                    + "_"
                    + (region.length()==0 ? "ZZ" : region));
        }
        return result;
    }

    private static class LocalePatternMatcher {
        // a value of null means a wildcard; matches any.
        private String lang;
        private String script;
        private String region;
        private Level level;
        static Pattern pattern = Pattern.compile(
                "([a-z]{1,8}|\\*)"
                        + "(?:[_-]([A-Z][a-z]{3}|\\*))?"
                        + "(?:[_-]([A-Z]{2}|[0-9]{3}|\\*))?");

        public LocalePatternMatcher(String toMatch) {
            Matcher matcher = pattern.matcher(toMatch);
            if (!matcher.matches()) {
                throw new IllegalArgumentException("Bad pattern: " + toMatch);
            }
            lang = matcher.group(1);
            script = matcher.group(2);
            region = matcher.group(3);
            level = region != null ? Level.region : script != null ? Level.script : Level.language;

            if (lang.equals("*")) {
                lang = null;
            }
            if (script != null && script.equals("*")) {
                script = null;
            }
            if (region != null && region.equals("*")) {
                region = null;
            }
        }

        boolean matches(ULocale ulocale) {
            if (lang != null && !lang.equals(ulocale.getLanguage())) {
                return false;
            }
            if (script != null && !script.equals(ulocale.getScript())) {
                return false;
            }
            if (region != null && !region.equals(ulocale.getCountry())) {
                return false;
            }
            return true;
        }

        public Level getLevel() {
            return level;
        }

        public String getLanguage() {
            return (lang == null ? "*" : lang);
        }

        public String getScript() {
            return (script == null ? "*" : script);
        }

        public String getRegion() {
            return (region == null ? "*" : region);
        }

        public String toString() {
            String result = getLanguage();
            if (level != Level.language) {
                result += "-" + getScript();
                if (level != Level.script) {
                    result += "-" + getRegion();
                }
            }
            return result;
        }
    }

    enum Level {
        language(0.99),
        script(0.2), 
        region(0.04);

        final double worst;

        Level(double d) {
            worst = d;
        }
    }

    private static class ScoreData implements Freezable {
        @SuppressWarnings("unused")
        private static final double maxUnequal_changeD_sameS = 0.5;

        @SuppressWarnings("unused")
        private static final double maxUnequal_changeEqual = 0.75;

        LinkedHashSet> scores = new LinkedHashSet>();
        final Level level;

        public ScoreData(Level level) {
            this.level = level;
        }

        void addDataToScores(String desired, String supported, R3 data) {
            //            Map>> lang_result = scores.get(desired);
            //            if (lang_result == null) {
            //                scores.put(desired, lang_result = new HashMap());
            //            }
            //            Set> result = lang_result.get(supported);
            //            if (result == null) {
            //                lang_result.put(supported, result = new LinkedHashSet());
            //            }
            //            result.add(data);
            scores.add(data);
        }

        double getScore(ULocale desiredLocale, ULocale dMax, String desiredRaw, String desiredMax, 
                ULocale supportedLocale, ULocale sMax, String supportedRaw, String supportedMax) {

            /*
             * d, dm, s, sm
             * dc = d != dm
             * sc = s != sm
             * if dm != sm
             *   rd = rd(dm,sm) // line 4
             *   if dc != sc
             *     rd *= 0.75 // lines 3,8
             *   ef dc
             *     rd *= 0.5 // lines 7
             *   end
             *  ef dc == sc
             *   rd = 0 // line 6
             *  else
             *   rd = 0.25*StdRDiff // lines 2,5
             */
            
            // example: input en-GB, supported en en-GB
            // we want to have a closer match with 

//            boolean desiredChange = desiredRaw.equals(desiredMax);
//            boolean supportedChange = supportedRaw.equals(supportedMax);
            double distance = 0;
            if (!desiredMax.equals(supportedMax)) {
//                Map>> lang_result = scores.get(desiredMax);
//                if (lang_result == null) {
//                    distance = worst;
//                } else {
//                    Set> result = lang_result.get(supportedMax);
//                    skip:
//                    if (result == null) {
//                        distance = worst;
//                    } else {
                distance = getRawScore(dMax, sMax);
//                }
//                if (desiredChange == supportedChange) {
//                    distance *= maxUnequal_changeEqual;
//                    if (DEBUG) {
//                        System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD=changeS)\t" + distance);
//                    }
//                } else if (desiredChange) {
//                    distance *= maxUnequal_changeD_sameS;
//                    if (DEBUG) {
//                        System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD, !changeS)\t" + distance);
//                    }
//                } else {
//                    if (DEBUG) {
//                        System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, !changeD, changeS)\t" + distance);
//                    }
//                }
            } else if (!desiredRaw.equals(supportedRaw)) { // maxes are equal, changes are equal
                distance += 0.001;
//                if (DEBUG) {
//                    System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD=changeS)\t" + distance);
//                }
            } else { // maxes are equal, changes are different
//                distance = 0.25*level.worst;
//                if (DEBUG) {
//                    System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD≠changeS)\t" + distance);
//                }
            }
            return distance;
        }

        private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) {
            if (DEBUG) {
                System.out.println("\t\t\t" + level + " Raw Score:\t" + desiredLocale + ";\t" + supportedLocale);
            }
            for (R3 datum : scores) { // : result
                if (datum.get0().matches(desiredLocale) 
                        && datum.get1().matches(supportedLocale)) {
                    if (DEBUG) {
                        System.out.println("\t\t\t\tFOUND\t" + datum);
                    }
                    return datum.get2();
                }
            }
            if (DEBUG) {
                System.out.println("\t\t\t\tNOTFOUND\t" + level.worst);
            }
            return level.worst;
        }

        public String toString() {
            StringBuilder result = new StringBuilder().append(level);
            for (R3 score : scores) {
                result.append("\n\t\t").append(score);
            }
            return result.toString();
        }


        @SuppressWarnings("unchecked")
        public ScoreData cloneAsThawed() {
            try {
                ScoreData result = (ScoreData) clone();
                result.scores = (LinkedHashSet>) result.scores.clone();
                result.frozen = false;
                return result;
            } catch (CloneNotSupportedException e) {
                throw new ICUCloneNotSupportedException(e); // will never happen
            }

        }

        private boolean frozen = false;

        public ScoreData freeze() {
            return this;
        }

        public boolean isFrozen() {
            return frozen;
        }
    }

    /**
     * Only for testing and use by tools. Interface may change!!
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static class LanguageMatcherData implements Freezable {
        ScoreData languageScores = new ScoreData(Level.language);
        ScoreData scriptScores = new ScoreData(Level.script);
        ScoreData regionScores = new ScoreData(Level.region);

        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public LanguageMatcherData() {
        }

        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public String toString() {
            return languageScores + "\n\t" + scriptScores + "\n\t" + regionScores;
        }

        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) {
            double diff = 0;
            diff += languageScores.getScore(a, aMax, a.getLanguage(), aMax.getLanguage(), b, bMax, b.getLanguage(), bMax.getLanguage());
            diff += scriptScores.getScore(a, aMax, a.getScript(), aMax.getScript(), b, bMax, b.getScript(), bMax.getScript());
            diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry());

            if (!a.getVariant().equals(b.getVariant())) {
                diff += 0.01;
            }
            if (diff < 0.0d) {
                diff = 0.0d;
            } else if (diff > 1.0d) {
                diff = 1.0d;
            }
            if (DEBUG) {
                System.out.println("\t\t\tTotal Distance\t" + diff);
            }
            return 1.0 - diff;
        }


        /**
         * Add an exceptional distance between languages, typically because regional
         * dialects were given their own language codes. At this point the code is
         * symmetric. We don't bother producing an equivalence class because there are
         * so few cases; this function depends on the other permutations being
         * added specifically.
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        private LanguageMatcherData addDistance(String desired, String supported, int percent) {
            return addDistance(desired, supported, percent, false, null);
        }
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public LanguageMatcherData addDistance(String desired, String supported, int percent, String comment) {
            return addDistance(desired, supported, percent, false, comment);
        }
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway) {
            return addDistance(desired, supported, percent, oneway, null);
        }

        private LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway, String comment) {
            if (DEBUG) {
                System.out.println("\t"
                        + (comment == null ? "" : "\t"));
                //                    //     .addDistance("nn", "nb", 4, true)
                //                        System.out.println(".addDistance(\"" + desired + "\"" +
                //                                ", \"" + supported + "\"" +
                //                                ", " + percent + ""
                //                                + (oneway ? "" : ", true")
                //                                + (comment == null ? "" : ", \"" + comment + "\"")
                //                                + ")"
                //                        );

            }
            double score = 1-percent/100.0; // convert from percentage
            LocalePatternMatcher desiredMatcher = new LocalePatternMatcher(desired);
            Level desiredLen = desiredMatcher.getLevel();
            LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported);
            Level supportedLen = supportedMatcher.getLevel();
            if (desiredLen != supportedLen) {
                throw new IllegalArgumentException("Lengths unequal: " + desired + ", " + supported);
            }
            R3 data = Row.of(desiredMatcher, supportedMatcher, score);
            R3 data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score);
            switch (desiredLen) {
            case language:
                String dlanguage = desiredMatcher.getLanguage();
                String slanguage = supportedMatcher.getLanguage();
                languageScores.addDataToScores(dlanguage, slanguage, data);
                if (!oneway) {
                    languageScores.addDataToScores(slanguage, dlanguage, data2);
                }
                break;
            case script:
                String dscript = desiredMatcher.getScript();
                String sscript = supportedMatcher.getScript();
                scriptScores.addDataToScores(dscript, sscript, data);
                if (!oneway) {
                    scriptScores.addDataToScores(sscript, dscript, data2);
                }
                break;
            case region:
                String dregion = desiredMatcher.getRegion();
                String sregion = supportedMatcher.getRegion();
                regionScores.addDataToScores(dregion, sregion, data);
                if (!oneway) {
                    regionScores.addDataToScores(sregion, dregion, data2);
                }
                break;
            }
            return this;
        }

        /** 
         * {@inheritDoc}
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public LanguageMatcherData cloneAsThawed() {
            LanguageMatcherData result;
            try {
                result = (LanguageMatcherData) clone();
                result.languageScores = languageScores.cloneAsThawed();
                result.scriptScores = scriptScores.cloneAsThawed();
                result.regionScores = regionScores.cloneAsThawed();
                result.frozen = false;
                return result;
            } catch (CloneNotSupportedException e) {
                throw new ICUCloneNotSupportedException(e); // will never happen
            }
        }

        private boolean frozen = false;

        /** 
         * {@inheritDoc}
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public LanguageMatcherData freeze() {
            return this;
        }

        /** 
         * {@inheritDoc}
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        public boolean isFrozen() {
            return frozen;
        }
    }

    LanguageMatcherData matcherData;

    private static final LanguageMatcherData defaultWritten;
//    = new LanguageMatcherData()
//    // TODO get data from CLDR
//    .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
//    .addDistance("nn", "nb", 96)
//    .addDistance("nn", "no", 96)
//    .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
//    .addDistance("da", "nb", 90)
//    .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
//    .addDistance("sh", "br", 96)
//    .addDistance("sr", "br", 96)
//    .addDistance("sh", "hr", 96)
//    .addDistance("sr", "hr", 96)
//    .addDistance("sh", "sr", 96)
//    .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
//    .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
//    .addDistance("*-Hant", "*-Hans", 75, true)
//    .addDistance("en-*-US", "en-*-*", 97, "Non-US English variants are closer to each other (written). Make en-US be further from everything else.")
//    .addDistance("en-*-*", "en-*-*", 99)
//    .addDistance("es-*-ES", "es-*-*", 97, "Latin American Spanishes are closer to each other. Make es-ES be further from everything else.")
//    .addDistance("es-*-419", "es-*-*", 99, "Have es-MX, es-AR, etc be closer to es-419 than to each other")
//    .addDistance("es-*-*", "es-*-*", 97)
//    .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
//    .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
//    .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
//    .freeze();

    private static HashMap canonicalMap = new HashMap();

    static class DataHack implements Comparable{
        final String source;
        final String target;
        int percent;
        public DataHack(String source, String target, int percent) {
            this.source = source;
            this.target = target.equals("de_CH") ? "de" : target; // hack to fix bad data
            this.percent = percent;
        }
        static final Pattern STAR_KEEP = Pattern.compile("([^_]+)(?:_[^_]+(?:_[^_]+)?)?");
        public int compareTo(DataHack other) {
            // this is just a one-time hack so we don't need to optimize
            int diff = getUnderbars(source) - getUnderbars(other.source);
            if (0 != diff) {
                return diff;
            }
            String thisSource = source.replace('*', 'þ'); // just something after Z
            String otherSource = other.source.replace('*', 'þ'); // just something after Z
            diff = thisSource.compareTo(otherSource);
            if (0 != diff) {
                return diff;
            }
            String thisTarget = target.replace('*', 'þ'); // just something after Z
            String otherTarget = other.target.replace('*', 'þ'); // just something after Z
            diff = thisTarget.compareTo(otherTarget);

//            Matcher matcher = STAR_KEEP.matcher(source);
//            matcher.matches();
//            String first = matcher.group(0);
//            String second = matcher.group(1);
//            String third = matcher.group(2);
//            Matcher matcherB = STAR_KEEP.matcher(source);
//            String firstB = matcher.group(0);
//            String secondB = matcher.group(1);
//            String thirdB = matcher.group(2);
//
//            int diff = onlyStars.length() - onlyStarsOther.length();
            
            if (0 != diff) {
                return diff;
            }
            diff = source.compareTo(other.source);
            if (0 != diff) {
                return diff;
            }
            return target.compareTo(other.target);
        }
        /**
         * @param source2
         */
        private int getUnderbars(String source2) {
            int pos = source2.indexOf('_');
            if (pos < 0) {
                return 0;
            }
            pos = source2.indexOf('_',pos+1);
            return pos < 0 ? 1 : 2;
        }
        public String toString() {
            return source + ", " + target + " => " + percent;
        }
    }
    
    static {
        // TODO get data from CLDR
        canonicalMap.put("iw", "he");
        canonicalMap.put("mo", "ro");
        canonicalMap.put("tl", "fil");
        
        ICUResourceBundle suppData = getICUSupplementalData();
        ICUResourceBundle languageMatching = suppData.findTopLevel("languageMatching");
        ICUResourceBundle written = (ICUResourceBundle) languageMatching.get("written");
        defaultWritten = new LanguageMatcherData();
        // HACK
        // The data coming from ICU may be old, and badly ordered.
        TreeSet hack = new TreeSet();
        defaultWritten.addDistance("en_*_US", "en_*_*", 97);
        defaultWritten.addDistance("en_*_GB", "en_*_*", 98);
        defaultWritten.addDistance("es_*_ES", "es_*_*", 97);
        defaultWritten.addDistance("es_*_419", "es_*_*", 99);
        defaultWritten.addDistance("es_*_*", "es_*_*", 98);

        for(UResourceBundleIterator iter = written.getIterator(); iter.hasNext();) {
            ICUResourceBundle item = (ICUResourceBundle) iter.next();
            /*
            "*_*_*",
            "*_*_*",
            "96",
             */
            hack.add(new DataHack(item.getString(0), item.getString(1), Integer.parseInt(item.getString(2))));
        }
        for (DataHack dataHack : hack) {
            defaultWritten.addDistance(dataHack.source, dataHack.target, dataHack.percent);
        }
        defaultWritten.freeze();
    }
    
    /**
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static ICUResourceBundle getICUSupplementalData() {
        ICUResourceBundle suppData = (ICUResourceBundle) UResourceBundle.getBundleInstance(
                ICUResourceBundle.ICU_BASE_NAME,
                "supplementalData",
                ICUResourceBundle.ICU_DATA_CLASS_LOADER);
        return suppData;
    }

    /**
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static double match(ULocale a, ULocale b) {
        final LocaleMatcher matcher = new LocaleMatcher("");
        return matcher.match(a, matcher.addLikelySubtags(a), b, matcher.addLikelySubtags(b));
    }
}