com.ibm.icu.impl.locale.LocaleValidityChecker Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and globalization support.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2015-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl.locale;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import com.ibm.icu.impl.ValidIdentifiers;
import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
import com.ibm.icu.impl.ValidIdentifiers.Datatype;
import com.ibm.icu.impl.locale.KeyTypeData.ValueType;
import com.ibm.icu.util.IllformedLocaleException;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
/**
* @author markdavis
*
*/
public class LocaleValidityChecker {
/** Data subtypes that this checker accepts when looking up a code; assigned once in the constructors. */
private final Set<Datasubtype> datasubtypes;
/** Whether {@link Datasubtype#deprecated} is among the accepted subtypes (computed in the constructors). */
private final boolean allowsDeprecated;
/**
 * Mutable holder that reports which field and code made a validity check fail.
 * Both fields are {@code null} when no failure has been recorded.
 */
public static class Where {
    /** Kind of field that failed, or {@code null} when nothing has been recorded. */
    public Datatype fieldFailure;
    /** The offending code, or {@code null} when nothing has been recorded. */
    public String codeFailure;

    /**
     * Stores the given failure location, overwriting any previous one.
     *
     * @param datatype kind of field that failed; {@code null} clears the record
     * @param code the offending code; may be {@code null}
     * @return always {@code false}, so callers can write {@code return where.set(...)}
     */
    public boolean set(Datatype datatype, String code) {
        this.fieldFailure = datatype;
        this.codeFailure = code;
        return false;
    }

    /**
     * @return {@code "OK"} when no failing field is recorded, otherwise
     *         <code>{field, code}</code>
     */
    @Override
    public String toString() {
        if (fieldFailure == null) {
            return "OK";
        }
        return "{" + fieldFailure + ", " + codeFailure + "}";
    }
}
public LocaleValidityChecker(Set datasubtypes) {
this.datasubtypes = EnumSet.copyOf(datasubtypes);
allowsDeprecated = datasubtypes.contains(Datasubtype.deprecated);
}
public LocaleValidityChecker(Datasubtype... datasubtypes) {
this.datasubtypes = EnumSet.copyOf(Arrays.asList(datasubtypes));
allowsDeprecated = this.datasubtypes.contains(Datasubtype.deprecated);
}
/**
 * Returns the data subtypes accepted by this checker.
 *
 * @return a fresh copy of the accepted subtypes; modifying it does not affect this checker
 */
public Set<Datasubtype> getDatasubtypes() {
    return EnumSet.copyOf(datasubtypes);
}
/** Splits a string on the subtag separators {@code '-'} and {@code '_'}. */
static final Pattern SEPARATOR = Pattern.compile("[-_]");
/** Shape of a private-use value: hyphen-separated alphanumeric subtags of 2 to 8 characters. Not referenced in this class. */
@SuppressWarnings("unused")
private static final Pattern VALID_X = Pattern.compile("[a-zA-Z0-9]{2,8}(-[a-zA-Z0-9]{2,8})*");
/**
 * Checks the language, script, region, variants and extensions of a locale against
 * the data subtypes accepted by this checker.
 *
 * @param locale the locale to check
 * @param where receives the failing field and code; reset to "no failure" on entry.
 *        Must not be {@code null}.
 * @return {@code true} if every checked component is valid; otherwise {@code false},
 *         with {@code where} describing the first failure found
 */
public boolean isValid(ULocale locale, Where where) {
    where.set(null, null);
    final String language = locale.getLanguage();
    final String script = locale.getScript();
    final String region = locale.getCountry();
    final String variantString = locale.getVariant();
    final Set<Character> extensionKeys = locale.getExtensionKeys();

    // An empty language is not rejected here: the per-field check accepts empty codes.
    if (!isValid(Datatype.language, language, where)) {
        // special case x
        if (language.equals("x")) {
            where.set(null, null); // for x, well-formed == valid
            return true;
        }
        return false;
    }
    if (!isValid(Datatype.script, script, where)) {
        return false;
    }
    if (!isValid(Datatype.region, region, where)) {
        return false;
    }
    if (!variantString.isEmpty()) {
        for (String variant : SEPARATOR.split(variantString)) {
            if (!isValid(Datatype.variant, variant, where)) {
                return false;
            }
        }
    }
    for (Character c : extensionKeys) {
        try {
            Datatype datatype = Datatype.valueOf(c + "");
            switch (datatype) {
            case x:
                // if it is syntactic (checked by ULocale) it is valid
                // NOTE(review): this returns without looking at any keys that iterate
                // after 'x' -- confirm the key set's iteration order makes that safe.
                return true;
            case t:
            case u:
                if (!isValidU(locale, datatype, locale.getExtension(c), where)) {
                    return false;
                }
                break;
            default:
                break;
            }
        } catch (Exception e) {
            // Datatype.valueOf throws for a key that names no Datatype constant. The catch
            // is deliberately broad: any exception escaping isValidU is also reported as
            // an illegal extension rather than propagated.
            return where.set(Datatype.illegal, c + "");
        }
    }
    return true;
}
// TODO combine this with the KeyTypeData.SpecialType, and get it from the type, not the key
/**
 * Classifies keys whose type subtags are validated by dedicated logic rather than
 * by the regular key/type lookup.
 */
enum SpecialCase {
    normal, anything, reorder, codepoints, subdivision, rgKey;

    /**
     * Maps a key to the special handling its types require.
     *
     * @param key the key to classify; must not be {@code null}
     * @return the matching special case, or {@link #normal} for any other key
     */
    static SpecialCase get(String key) {
        switch (key) {
        case "kr":
            return reorder;
        case "vt":
            return codepoints;
        case "sd":
            return subdivision;
        case "rg":
            return rgKey;
        case "x0":
            return anything;
        default:
            return normal;
        }
    }
}
/**
 * Validates the value of a {@code t} or {@code u} extension, subtag by subtag.
 *
 * <p>A two-character subtag starts a new key (for {@code t}, only when its second
 * character is {@code <= '9'}). For {@code t}, the subtags that precede the first key
 * are collected and validated as a locale of their own. Every other subtag is treated
 * as a type of the current key and checked according to that key's value type and
 * special case.
 *
 * @param locale the locale being validated; used for the subdivision consistency check
 * @param datatype {@code Datatype.t} or {@code Datatype.u}; also used as the failing
 *        field in most failure reports
 * @param extensionString the extension value, with subtags separated by '-' or '_'
 * @param where receives the failing field and code
 * @return {@code true} if every subtag is valid, otherwise {@code false}
 */
private boolean isValidU(ULocale locale, Datatype datatype, String extensionString, Where where) {
    String key = "";
    int typeCount = 0;
    ValueType valueType = null;
    SpecialCase specialCase = null; // stays null until the first key subtag has been seen
    StringBuilder prefix = new StringBuilder();
    Set<String> seen = new HashSet<String>();

    // Only a t extension collects leading subtags; null means "not collecting".
    StringBuilder tBuffer = datatype == Datatype.t ? new StringBuilder() : null;

    // TODO: is empty -u- valid?

    for (String subtag : SEPARATOR.split(extensionString)) {
        if (subtag.length() == 2
                && (tBuffer == null || subtag.charAt(1) <= '9')) {
            // if we have accumulated a t buffer, check that first
            if (tBuffer != null) {
                // Check t buffer. Empty after 't' is ok.
                if (tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(), where)) {
                    return false;
                }
                tBuffer = null;
            }
            key = KeyTypeData.toBcpKey(subtag);
            if (key == null) {
                return where.set(datatype, subtag);
            }
            if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) {
                return where.set(datatype, key);
            }
            valueType = KeyTypeData.getValueType(key);
            specialCase = SpecialCase.get(key);
            typeCount = 0;
        } else if (tBuffer != null) {
            if (tBuffer.length() != 0) {
                tBuffer.append('-');
            }
            tBuffer.append(subtag);
        } else {
            if (specialCase == null) {
                // A type subtag with no preceding key. Previously this fell through to
                // switch (valueType) with a null value and surfaced only as a
                // NullPointerException for the caller to swallow; report it directly,
                // in the same form as the unknown-key failure above.
                return where.set(datatype, subtag);
            }
            ++typeCount;
            switch (valueType) {
            case single:
                if (typeCount > 1) {
                    return where.set(datatype, key + "-" + subtag);
                }
                break;
            case incremental:
                // Each further subtag is validated as the hyphen-joined prefix so far.
                if (typeCount == 1) {
                    prefix.setLength(0);
                    prefix.append(subtag);
                } else {
                    prefix.append('-').append(subtag);
                    subtag = prefix.toString();
                }
                break;
            case multiple:
                // New key: forget the values recorded for the previous one.
                if (typeCount == 1) {
                    seen.clear();
                }
                break;
            default:
                break;
            }
            switch (specialCase) {
            case anything:
                continue;
            case codepoints:
                // Must parse as hexadecimal and not exceed 0x10FFFF.
                try {
                    if (Integer.parseInt(subtag, 16) > 0x10FFFF) {
                        return where.set(datatype, key + "-" + subtag);
                    }
                } catch (NumberFormatException e) {
                    return where.set(datatype, key + "-" + subtag);
                }
                continue;
            case reorder:
                // "zzzz" is recorded as "others", so the two count as duplicates.
                boolean newlyAdded = seen.add(subtag.equals("zzzz") ? "others" : subtag);
                if (!newlyAdded || !isScriptReorder(subtag)) {
                    return where.set(datatype, key + "-" + subtag);
                }
                continue;
            case subdivision:
                if (!isSubdivision(locale, subtag)) {
                    return where.set(datatype, key + "-" + subtag);
                }
                continue;
            case rgKey:
                // Expected shape: a region code followed by "zzzz".
                if (subtag.length() < 6 || !subtag.endsWith("zzzz")) {
                    return where.set(datatype, subtag);
                }
                if (!isValid(Datatype.region, subtag.substring(0, subtag.length() - 4), where)) {
                    return false;
                }
                continue;
            default:
                break;
            }

            // en-u-sd-usca
            // en-US-u-sd-usca
            // The two outputs are filled in by toBcpType but are not consulted here.
            Output<Boolean> isKnownKey = new Output<Boolean>();
            Output<Boolean> isSpecialType = new Output<Boolean>();
            String type = KeyTypeData.toBcpType(key, subtag, isKnownKey, isSpecialType);
            if (type == null) {
                return where.set(datatype, key + "-" + subtag);
            }
            if (!allowsDeprecated && KeyTypeData.isDeprecated(key, subtag)) {
                return where.set(datatype, key + "-" + subtag);
            }
        }
    }
    // Check t buffer. Empty after 't' is ok.
    if (tBuffer != null && tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(), where)) {
        return false;
    }
    return true;
}
/**
 * Checks that a subdivision subtag is valid and belongs to the locale's region.
 *
 * <p>The subtag is split into a region prefix (three characters when the first character
 * is {@code <= '9'}, otherwise two) and the remaining subdivision part.
 *
 * @param locale the locale whose region the subdivision must match; when the locale has
 *        no region, the region of its likely-subtags expansion is used
 * @param subtag the subdivision subtag to check
 * @return {@code true} if the subtag is a valid subdivision whose region prefix equals
 *         the locale's region, ignoring case
 */
private boolean isSubdivision(ULocale locale, String subtag) {
    // Too short to hold both a region prefix and a subdivision part.
    if (subtag.length() < 3) {
        return false;
    }
    final int regionLength = subtag.charAt(0) <= '9' ? 3 : 2;
    final String region = subtag.substring(0, regionLength);
    final String subdivision = subtag.substring(regionLength);
    if (ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision) == null) {
        return false;
    }

    // The subdivision's region has to agree with the region of the locale.
    String localeRegion = locale.getCountry();
    if (localeRegion.isEmpty()) {
        localeRegion = ULocale.addLikelySubtags(locale).getCountry();
    }
    return region.equalsIgnoreCase(localeRegion);
}
/** Reorder codes accepted without consulting the script validity data. */
static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others", "zzzz"));
/** Script codes that are never accepted as reorder codes. */
static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList("zinh", "zyyy"));
/** Subtype filter used when validating a reorder code as a script: regular codes only. */
static final Set<Datasubtype> REGULAR_ONLY = EnumSet.of(Datasubtype.regular);
/**
 * Tells whether a subtag is an acceptable reorder code. The comparison is done on the
 * lowercased subtag.
 *
 * <p>Accepted codes:
 * <ul>
 * <li>space, punct, symbol, currency, digit - core groups of characters below 'a';</li>
 * <li>others - where all codes not explicitly mentioned should be ordered. The script
 *     code Zzzz (Unknown Script) is a synonym for others;</li>
 * <li>any regular script code except Zinh (Inherited) and Zyyy (Common).</li>
 * </ul>
 * Some pairs of scripts sort primary-equal and always reorder together. For example,
 * Katakana characters are always reordered with Hiragana.
 *
 * @param subtag the reorder code to check
 * @return {@code true} if the subtag is an acceptable reorder code
 */
private boolean isScriptReorder(String subtag) {
    final String lowered = AsciiUtil.toLowerString(subtag);
    if (REORDERING_INCLUDE.contains(lowered)) {
        return true;
    }
    if (REORDERING_EXCLUDE.contains(lowered)) {
        return false;
    }
    // Anything else has to be a regular script code.
    return null != ValidIdentifiers.isValid(Datatype.script, REGULAR_ONLY, lowered);
}
/**
 * Parses a string as a language tag and validates the resulting locale.
 *
 * @param extensionString the language tag to parse and validate
 * @param where receives the failure location. When the tag cannot be parsed, the field
 *        is {@code Datatype.t} and the code is the subtag at the reported error
 *        position, or the exception message for any other exception.
 * @return {@code true} if the tag parses and the resulting locale is valid
 */
private boolean isValidLocale(String extensionString, Where where) {
    try {
        ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build();
        return isValid(locale, where);
    } catch (IllformedLocaleException e) {
        // The error index can be negative when the position is unknown; clamp it so
        // that substring() cannot throw from inside this handler.
        int startIndex = Math.min(Math.max(e.getErrorIndex(), 0), extensionString.length());
        String remainder = extensionString.substring(startIndex);
        String[] list = SEPARATOR.split(remainder);
        // split() yields an empty array when the remainder consists only of separators.
        return where.set(Datatype.t, list.length == 0 ? remainder : list[0]);
    } catch (Exception e) {
        // Deliberately broad: any other failure is reported as an invalid value.
        return where.set(Datatype.t, e.getMessage());
    }
}
/**
 * Checks a single code of the given kind against the accepted data subtypes.
 *
 * @param datatype the kind of code being checked
 * @param code the code to check; an empty code is always accepted
 * @param where receives the failing field and code when the check fails; may be
 *        {@code null} if the caller does not need the location
 * @return {@code true} if the code is empty, is the special variant "posix", or is
 *         found in the validity data; {@code false} otherwise
 */
private boolean isValid(Datatype datatype, String code, Where where) {
    if (code.isEmpty()) {
        return true;
    }

    // Note:
    // BCP 47 -u- locale extension '-u-va-posix' is mapped to variant 'posix' automatically.
    // For example, ULocale.forLanguageTag("en-u-va-posix").getVariant() returns "posix".
    // This is the only exceptional case where a -u- locale extension is mapped to a
    // subtag type other than keyword.
    //
    // The locale validity data is based on IANA language subtag registry data and "posix"
    // is not a valid variant. So we need to handle this specific case here. There are no
    // other exceptions.
    if (datatype == Datatype.variant && "posix".equalsIgnoreCase(code)) {
        return true;
    }

    if (ValidIdentifiers.isValid(datatype, datasubtypes, code) != null) {
        return true;
    }
    if (where == null) {
        return false;
    }
    return where.set(datatype, code);
}
}