All downloads are free. The search and download functionality uses the official Maven repository.

net.sf.okapi.lib.xliff2.lang.Language Maven / Gradle / Ivy

There is a newer version: 1.47.0
Show newest version
/*===========================================================================
 * Adapted from Henri Sivonen's code
 * --------------------------------------------------------------------------
 * Copyright (c) 2006 Henri Sivonen
 * Copyright (c) 2007-2010 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 * -------------------------------------------------------------------------
 * The adapted code has bug fixes
 ===========================================================================*/

package net.sf.okapi.lib.xliff2.lang;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import net.sf.okapi.lib.xliff2.InvalidParameterException;
import net.sf.okapi.lib.xliff2.XLIFFException;

/**
 * Verifies the validity of language tags.
 * <p>
 * The subtag tables are loaded once, in the static initializer, from
 * {@link LanguageData}. Every table is queried with
 * {@link Arrays#binarySearch(Object[], Object)}, so this class relies on
 * {@code LanguageData} returning them sorted (NOTE(review): confirm against
 * {@code LanguageData}).
 * <p>
 * All checks are done on the ASCII-lowercased form of the tag.
 */
public final class Language {

    /**
     * The singleton instance.
     */
    public static final Language THE_INSTANCE = new Language();

    private static final Pattern HYPHEN = Pattern.compile("-");

    /** When true, deprecation and style warnings are reported (as exceptions). */
    private static final boolean WARN = true;

    private static final String[] languages;
    private static final String[] extlangs;
    private static final String[] scripts;
    private static final String[] regions;
    private static final String[] variants;
    private static final String[] grandfathered;
    private static final String[] redundant;
    private static final String[] deprecated;
    private static final String[] deprecatedLang;
    /** For each index in {@link #languages}: index in {@link #scripts} of the script to omit, or a negative value. */
    private static final int[] suppressedScriptByLanguage;
    /** Replacement values for deprecated items; NOTE(review): key set is defined by LanguageData, confirm there. */
    private static final Map<String, String> preferredValueByLanguageMap;
    /** For each index in {@link #variants}: the allowed prefixes, each one a list of required subtags. */
    private static final String[][][] prefixesByVariant;
    /** For each index in {@link #extlangs}: index in {@link #languages} of the required primary language. */
    private static final int[] prefixByExtlang;

    static {
        try {
            LanguageData data = new LanguageData();
            languages = data.getLanguages();
            extlangs = data.getExtlangs();
            scripts = data.getScripts();
            regions = data.getRegions();
            variants = data.getVariants();
            grandfathered = data.getGrandfathered();
            redundant = data.getRedundant();
            deprecated = data.getDeprecated();
            deprecatedLang = data.getDeprecatedLang();
            suppressedScriptByLanguage = data.getSuppressedScriptByLanguage();
            prefixByExtlang = data.getPrefixByExtlang();
            preferredValueByLanguageMap = data.getPreferredValueByLanguageMap();
            prefixesByVariant = data.getPrefixesByVariant();
        }
        catch (IOException e) {
            throw new XLIFFException(e);
        }
    }

    /**
     * Private constructor: the only instance is {@link #THE_INSTANCE}.
     */
    private Language() {
    }

    /**
     * Verify the validity of a given language tag.
     * <p>
     * The tag is lowercased (ASCII only), then checked in this order: grandfathered tag,
     * redundant tag, then subtag by subtag: language, optional extlang, optional script,
     * optional region, and finally any number of variants. A private-use sequence
     * ({@code x-...}) may start at any subtag position that is tested for it and ends the
     * verification. Extension singletons are always rejected as unknown.
     * @param literal the language tag to verify.
     * @throws InvalidParameterException if the tag is null, invalid, or if there is a warning for it.
     * If the issue is just a warning, the error message starts with "Warning: ".
     */
    public void checkValid (String literal) {
        if (literal == null) {
            // Report null like any other invalid tag instead of failing with a bare NullPointerException.
            throw new InvalidParameterException("A null value is not a valid language tag.");
        }
        if (literal.isEmpty()) {
            throw new InvalidParameterException("The empty string is not a valid language tag.");
        }
        literal = toAsciiLowerCase(literal);

        // Whole-tag registrations are accepted as-is (possibly with a deprecation warning).
        if (isGrandfathered(literal)) {
            if (WARN && isDeprecated(literal)) {
                throw deprecationWarning("grandfathered language tag", literal, literal);
            }
            return;
        }
        if (isRedundant(literal)) {
            if (WARN && isDeprecated(literal)) {
                throw deprecationWarning("language tag", literal, literal);
            }
            return;
        }

        // Pattern.split drops trailing empty strings, so a trailing hyphen must be caught here.
        if (literal.startsWith("-")) {
            throw new InvalidParameterException("Language tag must not start with HYPHEN-MINUS.");
        }
        if (literal.endsWith("-")) {
            throw new InvalidParameterException("Language tag must not end with HYPHEN-MINUS.");
        }

        String[] subtags = HYPHEN.split(literal);
        for (String s : subtags) {
            if (s.isEmpty()) {
                throw new InvalidParameterException("Zero-length subtag.");
            }
            if (s.length() > 8) {
                throw new InvalidParameterException("Subtags must not exceed 8 characters in length.");
            }
        }

        // Language

        int i = 0;
        String subtag = subtags[i];
        int len = subtag.length();
        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }
        if (len < 2 || !isLowerCaseAlpha(subtag)) {
            throw new InvalidParameterException("The language subtag "+ subtag+
                    " is not a valid language subtag.");
        }
        if (len == 4) {
            throw new InvalidParameterException("Found reserved language tag: "+ subtag+
                    ".");
        }
        if (!isLanguage(subtag)) {
            // 2-3 letter codes are reported as ISO codes, 5-8 letter ones as IANA-registered.
            String registry = (len <= 3) ? "ISO" : "IANA";
            throw new InvalidParameterException("The language subtag "+ subtag+
                    " is not a valid "+ registry+ " language part of a language tag.");
        }
        if (WARN && isDeprecatedLang(subtag)) {
            throw deprecationWarning("language subtag", subtag, literal);
        }
        i++;
        if (i == subtags.length) {
            return;
        }
        subtag = subtags[i];
        len = subtag.length();

        // extlang

        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }
        if (len == 3 && isLowerCaseAlpha(subtag)) {
            if (!isExtlang(subtag)) {
                throw new InvalidParameterException("Bad extlang subtag "+ subtag+ ".");
            }
            if (!usesPrefixByExtlang(subtags[0], subtag)) {
                // IANA language tags are never correct prefixes.
                throw new InvalidParameterException("Extlang subtag "+ subtag+
                        " has an incorrect prefix.");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }

        // Script?

        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }
        if (len == 4 && isLowerCaseAlpha(subtag)) {
            if (!isScript(subtag)) {
                throw new InvalidParameterException("Bad script subtag.");
            }
            if (WARN && isDeprecated(subtag)) {
                throw deprecationWarning("script subtag", subtag, literal);
            }
            if (WARN && shouldSuppressScript(subtags[0], subtag)) {
                throw new InvalidParameterException("Warning: Language tag should omit the default script for the language.");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }

        // Region

        if ((len == 3 && isDigit(subtag))
                || (len == 2 && isLowerCaseAlpha(subtag))) {
            if (!isRegion(subtag)) {
                throw new InvalidParameterException("Bad region subtag.");
            }
            if (WARN && isDeprecated(subtag)) {
                throw deprecationWarning("region subtag", subtag, literal);
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }

        // Variant

        for (;;) {
            if ("x".equals(subtag)) {
                checkPrivateUse(i, subtags);
                return;
            }
            // cutting corners here a bit since there are no extensions at this
            // time
            if (len == 1 && isLowerCaseAlphaNumeric(subtag)) {
                throw new InvalidParameterException("Unknown extension "+ subtag+ ".");
            }
            // A variant is either 4 alphanumerics starting with a digit, or 5+ alphanumerics.
            boolean variantShaped =
                    (len == 4 && isDigit(subtag.charAt(0)) && isLowerCaseAlphaNumeric(subtag))
                    || (len >= 5 && isLowerCaseAlphaNumeric(subtag));
            if (!variantShaped) {
                throw new InvalidParameterException("The subtag "+ subtag+
                        " does not match the format for any permissible subtag type.");
            }
            if (!isVariant(subtag)) {
                throw new InvalidParameterException("Bad variant subtag "+ subtag+ ".");
            }
            if (WARN && isDeprecated(subtag)) {
                throw deprecationWarning("variant subtag", subtag, literal);
            }
            if (!hasGoodPrefix(subtags, i)) {
                throw new InvalidParameterException("Variant "+ subtag+ " lacks required prefix.");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }
    }

    /**
     * Builds the exception reporting a deprecated tag or subtag.
     * <p>
     * The replacement is looked up under the whole tag first and then under the deprecated
     * item itself. When neither key is mapped the "Use ... instead." hint is left out,
     * rather than printing the text "null" as the replacement.
     * @param kind description of the deprecated item (e.g. "region subtag").
     * @param value the deprecated tag or subtag, as shown in the message.
     * @param literal the whole (lowercased) language tag being verified.
     * @return the exception to throw; its message starts with "Warning: ".
     */
    private static InvalidParameterException deprecationWarning (String kind,
        String value,
        String literal)
    {
        StringBuilder msg = new StringBuilder("Warning: The ");
        msg.append(kind).append(' ').append(value).append(" is deprecated.");
        String preferred = preferredValueByLanguageMap.get(literal);
        if (preferred == null) {
            preferred = preferredValueByLanguageMap.get(value);
        }
        if (preferred != null) {
            msg.append(" Use \u201C").append(preferred).append("\u201D instead.");
        }
        return new InvalidParameterException(msg.toString());
    }

    /**
     * Indicates if the variant at a given position is preceded by one of its allowed prefixes.
     * @param subtags all the subtags of the tag.
     * @param i index of the variant (it must be a known variant).
     * @return true if the variant has no prefix restriction, or if all the components of
     * at least one of its prefixes are found among the subtags before it.
     */
    private static boolean hasGoodPrefix(String[] subtags, int i) {
        int index = Arrays.binarySearch(variants, subtags[i]);
        assert index >= 0; // Callers check isVariant() first.
        String[][] prefixes = prefixesByVariant[index];
        if (prefixes.length == 0) {
            return true;
        }
        for (String[] prefix : prefixes) {
            if (prefixMatches(prefix, subtags, i)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Indicates if every component of a prefix occurs among the first {@code limit} subtags.
     */
    private static boolean prefixMatches(String[] prefix, String[] subtags, int limit) {
        for (String prefixComponent : prefix) {
            if (!subtagsContainPrefixComponent(prefixComponent, subtags, limit)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Indicates if a prefix component equals one of the first {@code limit} subtags.
     */
    private static boolean subtagsContainPrefixComponent(String prefixComponent,
            String[] subtags, int limit) {
        for (int i = 0; i < limit; i++) {
            if (subtags[i].equals(prefixComponent)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Indicates if a language is the prefix registered for an extlang.
     * @param language the primary language subtag.
     * @param extlang the extlang subtag.
     * @return true if the language is the one expected for the extlang; false otherwise,
     * including when either subtag is not in its table (e.g. a language accepted only
     * through the qaa-qtz range).
     */
    private static boolean usesPrefixByExtlang(String language, String extlang) {
        int langIndex = Arrays.binarySearch(languages, language);
        int extlangIndex = Arrays.binarySearch(extlangs, extlang);
        if (langIndex < 0 || extlangIndex < 0) {
            return false;
        }
        return prefixByExtlang[extlangIndex] == langIndex;
    }

    /**
     * Indicates if a script is the default one for a language and therefore should be omitted.
     * @param language the primary language subtag.
     * @param script the script subtag.
     * @return true if the script is the suppressed script of the language; false otherwise,
     * including when the language is not in the language table (isLanguage() also accepts
     * the qaa-qtz range, which would otherwise yield a negative array index here).
     */
    private static boolean shouldSuppressScript(String language, String script) {
        int langIndex = Arrays.binarySearch(languages, language);
        if (langIndex < 0) {
            return false;
        }
        int scriptIndex = suppressedScriptByLanguage[langIndex];
        return scriptIndex >= 0 && scripts[scriptIndex].equals(script);
    }

    /** Indicates if a subtag is in the variant table. */
    private static boolean isVariant(String subtag) {
        return Arrays.binarySearch(variants, subtag) >= 0;
    }

    /**
     * Indicates if a subtag is in the region table or is one of the codes accepted
     * without lookup: aa, qm-qz, xa-xz and zz.
     */
    private static boolean isRegion(String subtag) {
        return (Arrays.binarySearch(regions, subtag) >= 0)
                || "aa".equals(subtag)
                || ("qm".compareTo(subtag) <= 0 && "qz".compareTo(subtag) >= 0)
                || ("xa".compareTo(subtag) <= 0 && "xz".compareTo(subtag) >= 0)
                || "zz".equals(subtag);
    }

    /** Indicates if a subtag is in the script table or in the range qaaa-qabx. */
    private static boolean isScript(String subtag) {
        return (Arrays.binarySearch(scripts, subtag) >= 0)
                || ("qaaa".compareTo(subtag) <= 0 && "qabx".compareTo(subtag) >= 0);
    }

    /** Indicates if a subtag is in the extlang table. */
    private static boolean isExtlang(String subtag) {
        return Arrays.binarySearch(extlangs, subtag) >= 0;
    }

    /** Indicates if a subtag is in the language table or in the range qaa-qtz. */
    private static boolean isLanguage(String subtag) {
        return (Arrays.binarySearch(languages, subtag) >= 0)
                || ("qaa".compareTo(subtag) <= 0 && "qtz".compareTo(subtag) >= 0);
    }

    /**
     * Verifies a private-use sequence.
     * @param i index of the "x" subtag starting the sequence.
     * @param subtags all the subtags of the tag.
     * @throws InvalidParameterException if nothing follows the "x", or if a following
     * subtag has a character other than a-z or 0-9.
     */
    private void checkPrivateUse (int i,
        String[] subtags)
    {
        int first = i + 1;
        if (first == subtags.length) {
            throw new InvalidParameterException("No subtags in private use sequence.");
        }
        for (int k = first; k < subtags.length; k++) {
            String subtag = subtags[k];
            if (!isLowerCaseAlphaNumeric(subtag)) {
                throw new InvalidParameterException("Bad character in private use subtag "+ subtag+ ".");
            }
        }
    }

    private static boolean isLowerCaseAlphaNumeric (char c) {
        return isLowerCaseAlpha(c) || isDigit(c);
    }

    /** Indicates if all characters are a-z or 0-9 (true for an empty string). */
    private static boolean isLowerCaseAlphaNumeric (String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isLowerCaseAlphaNumeric(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    private static boolean isDigit(char c) {
        return (c >= '0' && c <= '9');
    }

    /** Indicates if all characters are 0-9 (true for an empty string). */
    private static boolean isDigit(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isDigit(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    private static boolean isLowerCaseAlpha(char c) {
        return (c >= 'a' && c <= 'z');
    }

    /** Indicates if all characters are a-z (true for an empty string). */
    private static boolean isLowerCaseAlpha(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isLowerCaseAlpha(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Indicates if a whole tag is in the grandfathered table. */
    private static boolean isGrandfathered (String literal) {
        return Arrays.binarySearch(grandfathered, literal) >= 0;
    }

    /** Indicates if a whole tag is in the redundant table. */
    private static boolean isRedundant(String literal) {
        return Arrays.binarySearch(redundant, literal) >= 0;
    }

    /** Indicates if a tag or a non-language subtag is in the deprecated table. */
    private static boolean isDeprecated(String subtag) {
        return Arrays.binarySearch(deprecated, subtag) >= 0;
    }

    /** Indicates if a language subtag is in the deprecated-language table. */
    private static boolean isDeprecatedLang (String subtag) {
        return Arrays.binarySearch(deprecatedLang, subtag) >= 0;
    }

    /**
     * Converts the characters A-Z to lowercase, leaving every other character untouched
     * (unlike String.toLowerCase(), this does not depend on a locale).
     * @param str the text to convert (can be null).
     * @return the converted text, or null if the input was null.
     */
    private static String toAsciiLowerCase (CharSequence str) {
        if (str == null) {
            return null;
        }
        char[] buf = new char[str.length()];
        for (int i = 0; i < buf.length; i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                c += 0x20;
            }
            buf[i] = c;
        }
        return new String(buf);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy