All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.util.LocaleMatcher Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/*
 ****************************************************************************************
 * Copyright (C) 2009-2010, Google, Inc.; International Business Machines Corporation   *
 * and others. All Rights Reserved.                                                     *
 ****************************************************************************************
 */
package com.ibm.icu.util;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Row.R3;

/**
 * Provides a way to match the languages (locales) supported by a product to the
 * languages (locales) acceptable to a user, and get the best match. For
 * example:
 * 
 * 
 * LanguageMatcher matcher = new StandardLanguageMatcher("fr, en-GB, en");
 * 
 * // afterwards:
 * matcher.getBestMatch(LanguageCode.US).first == LanguageCode.ENGLISH
 * 
* * It takes into account when languages are close to one another, such as fil * and tl, and when language regional variants are close, like en-GB and en-AU. * It also handles scripts, like zh-Hant vs zh-TW. For examples, see the test * file. *

All classes implementing this interface should be immutable. Often a * product will just need one static instance, built with the languages * that it supports. However, it may want multiple instances with different * default languages based on additional information, such as the domain. * * @author [email protected] * @stable ICU 4.4 */ public class LocaleMatcher { private static final boolean DEBUG = false; /** * Threshold for falling back to the default (first) language. May make this * a parameter in the future. */ private static final double DEFAULT_THRESHOLD = 0.5; /** * The default language, in case the threshold is not met. */ private final ULocale defaultLanguage; /** * Create a new language matcher. The highest-weighted language is the * default. That means that if no other language is matches closer than a given * threshold, that default language is chosen. Typically the default is English, * but it could be different based on additional information, such as the domain * of the page. * * @param languagePriorityList weighted list * @stable ICU 4.4 */ public LocaleMatcher(LocalePriorityList languagePriorityList) { this(languagePriorityList, defaultWritten); } /** * Create a new language matcher from a String form. The highest-weighted * language is the default. * * @param languagePriorityListString String form of LanguagePriorityList * @stable ICU 4.4 */ public LocaleMatcher(String languagePriorityListString) { this(LocalePriorityList.add(languagePriorityListString).build()); } /** * Internal testing function; may expose API later. * @param languagePriorityList LocalePriorityList to match * @param matcherData Internal matching data * @internal * @deprecated This API is ICU internal only. */ public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) { this.matcherData = matcherData; for (final ULocale language : languagePriorityList) { add(language, languagePriorityList.getWeight(language)); } Iterator it = languagePriorityList.iterator(); defaultLanguage = it.hasNext() ? it.next() : null; } /** * Returns a fraction between 0 and 1, where 1 means that the languages are a * perfect match, and 0 means that they are completely different. Note that * the precise values may change over time; no code should be made dependent * on the values remaining constant. * @param desired Desired locale * @param desiredMax Maximized locale (using likely subtags) * @param supported Supported locale * @param supportedMax Maximized locale (using likely subtags) * @return value between 0 and 1, inclusive. * @stable ICU 4.4 */ public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) { return matcherData.match(desired, desiredMax, supported, supportedMax); } /** * Canonicalize a locale (language). Note that for now, it is canonicalizing * according to CLDR conventions (he vs iw, etc), since that is what is needed * for likelySubtags. * @param ulocale language/locale code * @return ULocale with remapped subtags. * @stable ICU 4.4 */ public ULocale canonicalize(ULocale ulocale) { // TODO Get the data from CLDR, use Java conventions. String lang = ulocale.getLanguage(); String lang2 = canonicalMap.get(lang); String script = ulocale.getScript(); String script2 = canonicalMap.get(script); String region = ulocale.getCountry(); String region2 = canonicalMap.get(region); if (lang2 != null || script2 != null || region2 != null) { return new ULocale( lang2 == null ? lang : lang2, script2 == null ? script : script2, region2 == null ? region : region2 ); } return ulocale; } /** * Get the best match for a LanguagePriorityList * * @param languageList list to match * @return best matching language code * @stable ICU 4.4 */ public ULocale getBestMatch(LocalePriorityList languageList) { double bestWeight = 0; ULocale bestTableMatch = null; for (final ULocale language : languageList) { final Row.R2 matchRow = getBestMatchInternal(language); final double weight = matchRow.get1() * languageList.getWeight(language); if (weight > bestWeight) { bestWeight = weight; bestTableMatch = matchRow.get0(); } } if (bestWeight < DEFAULT_THRESHOLD) { bestTableMatch = defaultLanguage; } return bestTableMatch; } /** * Convenience method: Get the best match for a LanguagePriorityList * * @param languageList String form of language priority list * @return best matching language code * @stable ICU 4.4 */ public ULocale getBestMatch(String languageList) { return getBestMatch(LocalePriorityList.add(languageList).build()); } /** * Get the best match for an individual language code. * * @param ulocale locale/language code to match * @return best matching language code * @stable ICU 4.4 */ public ULocale getBestMatch(ULocale ulocale) { return getBestMatchInternal(ulocale).get0(); } /** * {@inheritDoc} * @stable ICU 4.4 */ @Override public String toString() { return "{" + defaultLanguage + ", " + maximizedLanguageToWeight + "}"; } // ================= Privates ===================== /** * Get the best match for an individual language code. * * @param languageCode * @return best matching language code and weight (as per * {@link #match(ULocale, ULocale)}) */ private Row.R2 getBestMatchInternal(ULocale languageCode) { languageCode = canonicalize(languageCode); final ULocale maximized = addLikelySubtags(languageCode); if (DEBUG) { System.out.println("\n" + languageCode + ";\t" + maximized); } double bestWeight = 0; ULocale bestTableMatch = null; for (final ULocale tableKey : maximizedLanguageToWeight.keySet()) { R2 row = maximizedLanguageToWeight.get(tableKey); final double match = match(languageCode, maximized, tableKey, row.get0()); if (DEBUG) { System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match); } final double weight = match * row.get1(); if (weight > bestWeight) { bestWeight = weight; bestTableMatch = tableKey; } } if (bestWeight < DEFAULT_THRESHOLD) { bestTableMatch = defaultLanguage; } return Row.R2.of(bestTableMatch, bestWeight); } private void add(ULocale language, Double weight) { language = canonicalize(language); R2 row = Row.of(addLikelySubtags(language), weight); maximizedLanguageToWeight.put(language, row); } Map> maximizedLanguageToWeight = new LinkedHashMap>(); // =============== Special Mapping Information ============== /** * We need to add another method to addLikelySubtags that doesn't return * null, but instead substitutes Zzzz and ZZ if unknown. There are also * a few cases where addLikelySubtags needs to have expanded data, to handle * all deprecated codes, and to update to CLDR 1.6. * @param languageCode * @return "fixed" addLikelySubtags */ // TODO(markdavis): update the above when CLDR 1.6 is final. private ULocale addLikelySubtags(ULocale languageCode) { final ULocale result = ULocale.addLikelySubtags(languageCode); // should have method on getLikelySubtags for this if (result == null || result.equals(languageCode)) { final String language = languageCode.getLanguage(); final String script = languageCode.getScript(); final String region = languageCode.getCountry(); return new ULocale((language.length()==0 ? "und" : language) + "_" + (script.length()==0 ? "Zzzz" : script) + "_" + (region.length()==0 ? "ZZ" : region)); } return result; } private static class LocalePatternMatcher { // a value of null means a wildcard; matches any. private String lang; private String script; private String region; private Level level; static Pattern pattern = Pattern.compile( "([a-zA-Z]{1,8}|\\*)" + "(?:-([a-zA-Z]{4}|\\*))?" + "(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?"); public LocalePatternMatcher(String toMatch) { Matcher matcher = pattern.matcher(toMatch); if (!matcher.matches()) { throw new IllegalArgumentException("Bad pattern: " + toMatch); } lang = matcher.group(1); script = matcher.group(2); region = matcher.group(3); level = region != null ? Level.region : script != null ? Level.script : Level.language; if (lang.equals("*")) { lang = null; } if (script != null && script.equals("*")) { script = null; } if (region != null && region.equals("*")) { region = null; } } boolean matches(ULocale ulocale) { if (lang != null && !lang.equals(ulocale.getLanguage())) { return false; } if (script != null && !script.equals(ulocale.getScript())) { return false; } if (region != null && !region.equals(ulocale.getCountry())) { return false; } return true; } public Level getLevel() { return level; } public String getLanguage() { return (lang == null ? "*" : lang); } public String getScript() { return (script == null ? "*" : script); } public String getRegion() { return (region == null ? "*" : region); } public String toString() { String result = getLanguage(); if (level != Level.language) { result += "-" + getScript(); if (level != Level.script) { result += "-" + getRegion(); } } return result; } } enum Level {language, script, region} private static class ScoreData implements Freezable { LinkedHashSet> scores = new LinkedHashSet>(); final double worst; final Level level; public ScoreData(Level level) { this.level = level; this.worst = (1-(level == Level.language ? 90 : level == Level.script ? 20 : 4))/100.0; } void addDataToScores(String desired, String supported, R3 data) { // Map>> lang_result = scores.get(desired); // if (lang_result == null) { // scores.put(desired, lang_result = new HashMap()); // } // Set> result = lang_result.get(supported); // if (result == null) { // lang_result.put(supported, result = new LinkedHashSet()); // } // result.add(data); scores.add(data); } double getScore(ULocale desiredLocale, ULocale dMax, String desiredRaw, String desiredMax, ULocale supportedLocale, ULocale sMax, String supportedRaw, String supportedMax) { /* * d, dm, s, sm * dc = d != dm * sc = s != sm * if dm != sm * rd = rd(dm,sm) // line 4 * if dc != sc * rd *= 0.75 // lines 3,8 * ef dc * rd *= 0.5 // lines 7 * end * ef dc == sc * rd = 0 // line 6 * else * rd = 0.25*StdRDiff // lines 2,5 */ boolean desiredChange = desiredRaw.equals(desiredMax); boolean supportedChange = supportedRaw.equals(supportedMax); double distance; if (!desiredMax.equals(supportedMax)) { // Map>> lang_result = scores.get(desiredMax); // if (lang_result == null) { // distance = worst; // } else { // Set> result = lang_result.get(supportedMax); // skip: // if (result == null) { // distance = worst; // } else { distance = getRawScore(dMax, sMax); // } if (desiredChange == supportedChange) { distance *= 0.75; } else if (desiredChange) { distance *= 0.5; } } else if (desiredChange == supportedChange) { // maxes are equal, changes are equal distance = 0; } else { // maxes are equal, changes are different distance = 0.25*worst; } return distance; } private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) { if (DEBUG) { System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale); } for (R3 datum : scores) { // : result if (datum.get0().matches(desiredLocale) && datum.get1().matches(supportedLocale)) { if (DEBUG) { System.out.println("\t\t\tFOUND\t" + datum); } return datum.get2(); } } if (DEBUG) { System.out.println("\t\t\tNOTFOUND\t" + worst); } return worst; } public String toString() { return level + ", " + scores; } @SuppressWarnings("unchecked") public ScoreData cloneAsThawed() { try { ScoreData result = (ScoreData) clone(); result.scores = (LinkedHashSet>) result.scores.clone(); result.frozen = false; return result; } catch (CloneNotSupportedException e) { throw new IllegalArgumentException(e); // will never happen } } private boolean frozen = false; public ScoreData freeze() { return this; } public boolean isFrozen() { return frozen; } } /** * Only for testing and use by tools. Interface may change!! * @internal * @deprecated This API is ICU internal only. */ public static class LanguageMatcherData implements Freezable { ScoreData languageScores = new ScoreData(Level.language); ScoreData scriptScores = new ScoreData(Level.script); ScoreData regionScores = new ScoreData(Level.region); /** * @internal * @deprecated This API is ICU internal only. */ public LanguageMatcherData() { } /** * @internal * @deprecated This API is ICU internal only. */ public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) { double diff = 0; diff += languageScores.getScore(a, aMax, a.getLanguage(), aMax.getLanguage(), b, bMax, b.getLanguage(), bMax.getLanguage()); diff += scriptScores.getScore(a, aMax, a.getScript(), aMax.getScript(), b, bMax, b.getScript(), bMax.getScript()); diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry()); if (!a.getVariant().equals(b.getVariant())) { diff += 1; } if (diff < 0.0d) { diff = 0.0d; } else if (diff > 1.0d) { diff = 1.0d; } return 1.0 - diff; } /** * Add an exceptional distance between languages, typically because regional * dialects were given their own language codes. At this point the code is * symmetric. We don't bother producing an equivalence class because there are * so few cases; this function depends on the other permutations being * added specifically. * @internal * @deprecated This API is ICU internal only. */ private LanguageMatcherData addDistance(String desired, String supported, int percent) { return addDistance(desired, supported, percent, false, null); } /** * @internal * @deprecated This API is ICU internal only. */ public LanguageMatcherData addDistance(String desired, String supported, int percent, String comment) { return addDistance(desired, supported, percent, false, comment); } /** * @internal * @deprecated This API is ICU internal only. */ public LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway) { return addDistance(desired, supported, percent, oneway, null); } private LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway, String comment) { if (DEBUG) { System.out.println("\t" + (comment == null ? "" : "\t")); // // .addDistance("nn", "nb", 4, true) // System.out.println(".addDistance(\"" + desired + "\"" + // ", \"" + supported + "\"" + // ", " + percent + "" // + (oneway ? "" : ", true") // + (comment == null ? "" : ", \"" + comment + "\"") // + ")" // ); } double score = 1-percent/100.0; // convert from percentage LocalePatternMatcher desiredMatcher = new LocalePatternMatcher(desired); Level desiredLen = desiredMatcher.getLevel(); LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported); Level supportedLen = supportedMatcher.getLevel(); if (desiredLen != supportedLen) { throw new IllegalArgumentException(); } R3 data = Row.of(desiredMatcher, supportedMatcher, score); R3 data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score); switch (desiredLen) { case language: String dlanguage = desiredMatcher.getLanguage(); String slanguage = supportedMatcher.getLanguage(); languageScores.addDataToScores(dlanguage, slanguage, data); if (!oneway) { languageScores.addDataToScores(slanguage, dlanguage, data2); } break; case script: String dscript = desiredMatcher.getScript(); String sscript = supportedMatcher.getScript(); scriptScores.addDataToScores(dscript, sscript, data); if (!oneway) { scriptScores.addDataToScores(sscript, dscript, data2); } break; case region: String dregion = desiredMatcher.getRegion(); String sregion = supportedMatcher.getRegion(); regionScores.addDataToScores(dregion, sregion, data); if (!oneway) { regionScores.addDataToScores(sregion, dregion, data2); } break; } return this; } /** * {@inheritDoc} * @internal * @deprecated This API is ICU internal only. */ public LanguageMatcherData cloneAsThawed() { LanguageMatcherData result; try { result = (LanguageMatcherData) clone(); result.languageScores = languageScores.cloneAsThawed(); result.scriptScores = scriptScores.cloneAsThawed(); result.regionScores = regionScores.cloneAsThawed(); result.frozen = false; return result; } catch (CloneNotSupportedException e) { throw new IllegalArgumentException(e); // will never happen } } private boolean frozen = false; /** * {@inheritDoc} * @internal * @deprecated This API is ICU internal only. */ public LanguageMatcherData freeze() { return this; } /** * {@inheritDoc} * @internal * @deprecated This API is ICU internal only. */ public boolean isFrozen() { return frozen; } } LanguageMatcherData matcherData; private static LanguageMatcherData defaultWritten = new LanguageMatcherData() // TODO get data from CLDR .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.") .addDistance("nn", "nb", 96) .addDistance("nn", "no", 96) .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.") .addDistance("da", "nb", 90) .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.") .addDistance("sh", "br", 96) .addDistance("sr", "br", 96) .addDistance("sh", "hr", 96) .addDistance("sr", "hr", 96) .addDistance("sh", "sr", 96) .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.") .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.") .addDistance("*-Hant", "*-Hans", 75, true) .addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.") .addDistance("en-*-US", "en-*-*", 97) .addDistance("en-*-CA", "en-*-*", 98) .addDistance("en-*-*", "en-*-*", 99) .addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.") .addDistance("es-*-ES", "es-*-*", 93) .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.") .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.") .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.") .freeze(); private static HashMap canonicalMap = new HashMap(); static { // TODO get data from CLDR canonicalMap.put("iw", "he"); canonicalMap.put("mo", "ro"); canonicalMap.put("tl", "fil"); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy