All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.upokecenter.text.Idna Maven / Gradle / Ivy

package com.upokecenter.text;
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.dreamhosters.com/articles/donate-now-2/
 */

    /**
     * 

Contains methods that implement Internationalized Domain Names in * Applications (IDNA). IDNA enables using a wider range of letters, * numbers, and certain other characters in domain names.

NOTICE: * While this class's source code is in the public domain, the class * uses two internal classes, called NormalizationData and * IdnaData, that include data derived from the Unicode Character * Database. See the documentation for the NormalizingCharacterInput * class for the permission notice for the Unicode Character * Database.

*/ public final class Idna { private Idna() { } private static final int Unassigned = 0; // PValid = 1; private static final int Disallowed = 2; private static final int ContextJ = 3; private static final int ContextO = 4; private static final int BidiClassL = 0; private static final int BidiClassR = 1; private static final int BidiClassAL = 2; private static final int BidiClassEN = 3; private static final int BidiClassES = 4; private static final int BidiClassET = 5; private static final int BidiClassAN = 6; private static final int BidiClassCS = 7; private static final int BidiClassNSM = 8; private static final int BidiClassBN = 9; private static final int BidiClassON = 10; private static ByteData bidiClasses; private static ByteData joiningTypes; private static ByteData scripts; private static Object bidiClassesSync = new Object(); private static Object joiningTypesSync = new Object(); private static Object scriptsSync = new Object(); static int CodePointBefore(String str, int index) { if (str == null) { throw new NullPointerException("str"); } if (index <= 0) { return -1; } if (index > str.length()) { return -1; } int c = str.charAt(index - 1); if ((c & 0xfc00) == 0xdc00 && index - 2 >= 0 && str.charAt(index - 2) >= 0xd800 && str.charAt(index - 2) <= 0xdbff) { // Get the Unicode code point for the surrogate pair return 0x10000 + ((str.charAt(index - 2) - 0xd800) << 10) + (c - 0xdc00); } return ((c & 0xf800) == 0xd800) ? 0xfffd : c; } static int CodePointAt(String str, int index) { if (str == null) { throw new NullPointerException("str"); } if (index >= str.length()) { return -1; } if (index < 0) { return -1; } int c = str.charAt(index); if ((c & 0xfc00) == 0xd800 && index + 1 < str.length() && str.charAt(index + 1) >= 0xdc00 && str.charAt(index + 1) <= 0xdfff) { // Get the Unicode code point for the surrogate pair return 0x10000 + ((c - 0xd800) << 10) + (str.charAt(index + 1) - 0xdc00); } return ((c & 0xf800) == 0xd800) ? 0xfffd : c; } static int GetBidiClass(int ch) { ByteData table = null; synchronized (bidiClassesSync) { bidiClasses = (bidiClasses == null) ? (ByteData.Decompress(IdnaData.BidiClasses)) : bidiClasses; table = bidiClasses; } return table.GetByte(ch); } private static int GetJoiningType(int ch) { ByteData table = null; synchronized (joiningTypesSync) { joiningTypes = (joiningTypes == null) ? (ByteData.Decompress(IdnaData.JoiningTypes)) : joiningTypes; table = joiningTypes; } return table.GetByte(ch); } private static int GetScript(int ch) { ByteData table = null; synchronized (scriptsSync) { scripts = (scripts == null) ? (ByteData.Decompress(IdnaData.IdnaRelevantScripts)) : scripts; table = scripts; } return table.GetByte(ch); } private static boolean JoiningTypeTransparent(int ch) { return GetJoiningType(ch) == 1; } private static boolean JoiningTypeLeftOrDual(int ch) { int jtype = GetJoiningType(ch); return jtype == 3 || jtype == 4; } private static boolean JoiningTypeRightOrDual(int ch) { int jtype = GetJoiningType(ch); return jtype == 2 || jtype == 4; } private static boolean IsGreek(int ch) { return GetScript(ch) == 1; } private static boolean IsHebrew(int ch) { return GetScript(ch) == 2; } private static boolean IsKanaOrHan(int ch) { return GetScript(ch) == 3; } private static boolean IsValidConjunct(String str, int index) { // Assumes that the character at the given index // is Zero-Width Non-Joiner // Check the left boolean found = false; int oldIndex = index; while (index > 0) { int ch = CodePointBefore(str, index); index -= (ch >= 0x10000) ? 2 : 1; if (JoiningTypeLeftOrDual(ch)) { found = true; } else if (!JoiningTypeTransparent(ch)) { return false; } } if (!found) { return false; } // Check the right index = oldIndex + 1; while (index < str.length()) { int ch = CodePointAt(str, index); index += (ch >= 0x10000) ? 2 : 1; if (JoiningTypeRightOrDual(ch)) { return true; } if (!JoiningTypeTransparent(ch)) { return false; } } return false; } private static boolean HasRtlCharacters(String str) { for (int i = 0; i < str.length(); ++i) { if (str.charAt(i) >= 0x80) { int c = CodePointAt(str, i); if (c >= 0x10000) { ++i; } int bidiClass = GetBidiClass(c); if (bidiClass == BidiClassAL || bidiClass == BidiClassAN || bidiClass == BidiClassR) { return true; } } } return false; } /** * Tries to encode each label of a domain name into Punycode. * @param value A domain name. * @return The domain name where each label with non-ASCII characters is * encoded into Punycode. Labels where this is not possible remain * unchanged. * @throws NullPointerException Value is null. */ public static String EncodeDomainName(String value) { if (value == null) { throw new NullPointerException("value"); } if (value.length() == 0) { return ""; } StringBuilder builder = new StringBuilder(); String retval = null; int lastIndex = 0; for (int i = 0; i < value.length(); ++i) { char c = value.charAt(i); if (c == '.') { if (i != lastIndex) { retval = DomainUtility.PunycodeEncodePortion(value, lastIndex, i); if (retval == null) { // Append the unmodified domain plus the dot builder.append(value.substring(lastIndex, (lastIndex)+((i + 1) - lastIndex))); } else { builder.append(retval); builder.append('.'); } } lastIndex = i + 1; } } retval = DomainUtility.PunycodeEncodePortion( value, lastIndex, value.length()); if (retval == null) { builder.append(value.substring(lastIndex, (lastIndex)+(value.length() - lastIndex))); } else { builder.append(retval); } return builder.toString(); } /** * Determines whether the given string is a syntactically valid domain name. * @param str A string object. * @param lookupRules If true, uses rules to apply when looking up the string * as a domain name. If false, uses rules to apply when registering the * string as a domain name. * @return True if the given string is a syntactically valid domain name; * otherwise; false. */ public static boolean IsValidDomainName(String str, boolean lookupRules) { if (((str) == null || (str).length() == 0)) { return false; } boolean bidiRule = HasRtlCharacters(str); int lastIndex = 0; for (int i = 0; i < str.length(); ++i) { char c = str.charAt(i); if (c == '.') { if (i == lastIndex) { // Empty label return false; } if (!IsValidLabel( str.substring(lastIndex, (lastIndex)+(i - lastIndex)), lookupRules, bidiRule)) { return false; } lastIndex = i + 1; } } return (str.length() != lastIndex) && IsValidLabel( str.substring(lastIndex, (lastIndex)+(str.length() - lastIndex)), lookupRules, bidiRule); } private static String ToLowerCaseAscii(String str) { if (str == null) { return null; } int len = str.length(); char c = (char)0; boolean hasUpperCase = false; for (int i = 0; i < len; ++i) { c = str.charAt(i); if (c >= 'A' && c <= 'Z') { hasUpperCase = true; break; } } if (!hasUpperCase) { return str; } StringBuilder builder = new StringBuilder(); for (int i = 0; i < len; ++i) { c = str.charAt(i); if (c >= 'A' && c <= 'Z') { builder.append((char)(c + 0x20)); } else { builder.append(c); } } return builder.toString(); } private static boolean IsValidLabel( String str, boolean lookupRules, boolean bidiRule) { if (((str) == null || (str).length() == 0)) { return false; } boolean maybeALabel = str.length() >= 4 && (str.charAt(0) == 'x' || str.charAt(0) == 'X') && (str.charAt(1) == 'n' || str.charAt(1) == 'N') && str.charAt(2) == '-' && str.charAt(3) == '-'; boolean allLDH = true; for (int i = 0; i < str.length(); ++i) { if ((str.charAt(i) >= 'a' && str.charAt(i) <= 'z') || (str.charAt(i) >= 'A' && str.charAt(i) <= 'Z' ) || (str.charAt(i) >= '0' && str.charAt(i) <= '9') || str.charAt(i) == '-') { // LDH character continue; } if (str.charAt(i) >= 0x80) { // Non-ASCII character allLDH = false; continue; } return false; } if (maybeALabel) { str = ToLowerCaseAscii(str); String ustr = DomainUtility.PunycodeDecode(str, 4, str.length()); if (ustr == null) { // NOTE: Returns null if "str" contains non-ASCII characters return false; } if (!IsValidULabel(ustr, lookupRules, bidiRule)) { return false; } String astr = DomainUtility.PunycodeEncodePortion(ustr, 0, ustr.length()); // NOTE: "astr" and "str" will contain only ASCII characters // at this point, so a simple null check and // binary comparison are enough return (astr != null) && astr.equals(str); } if (allLDH) { if (str.length() >= 4 && str.charAt(2) == '-' && str.charAt(3) == '-') { // Contains a hyphen at the third and fourth (one-based) character // positions return false; } if (str.charAt(0) != '-' && str.charAt(str.length() - 1) != '-' && !(str.charAt(0) >= '0' && str.charAt(0) <= '9')) { // Only LDH characters, doesn't start with hyphen or digit, // and doesn't end with hyphen return true; } } return IsValidULabel(str, lookupRules, bidiRule); } private static boolean IsValidULabel( String str, boolean lookupRules, boolean bidiRule) { if (((str) == null || (str).length() == 0)) { return false; } if (str.length() > 63 && !lookupRules) { // Too long return false; } if (str.length() >= 4 && str.charAt(2) == '-' && str.charAt(3) == '-') { // Contains a hyphen at the third and fourth (one-based) character // positions return false; } if (!lookupRules) { // Checking for a hyphen at the start and // the end is part of the registration validity // rules (sec. 4.2 of RFC 5891), but not the lookup // rules (sec. 5.4). if (str.charAt(0) == '-' || str.charAt(str.length() - 1) == '-') { return false; } } if (!NormalizingCharacterInput.IsNormalized(str, Normalization.NFC)) { return false; } int ch; boolean first = true; boolean haveContextual = false; boolean rtl = false; int bidiClass; for (int i = 0; i < str.length(); ++i) { ch = CodePointAt(str, i); if (ch >= 0x10000) { ++i; } int category = UnicodeDatabase.GetIdnaCategory(ch); if (category == Disallowed || category == Unassigned) { return false; } if (first) { if (UnicodeDatabase.IsCombiningMark(ch)) { return false; } if (bidiRule) { bidiClass = GetBidiClass(ch); if (bidiClass == BidiClassR || bidiClass == BidiClassAL) { rtl = true; } else if (bidiClass != BidiClassL) { // forbidden bidi type as the first character return false; } } } haveContextual |= category == ContextO || category == ContextJ; first = false; } if (haveContextual) { boolean regArabDigits = false; boolean extArabDigits = false; boolean haveKatakanaMiddleDot = false; boolean haveKanaOrHan = false; int lastChar = 0; for (int i = 0; i < str.length(); ++i) { int thisChar = CodePointAt(str, i); if (thisChar >= 0x660 && thisChar <= 0x669) { // Arabic-Indic digits // NOTE: Test done here even under lookup rules, // even though they're CONTEXTO characters if (extArabDigits) { return false; } regArabDigits = true; } else if (thisChar >= 0x6f0 && thisChar <= 0x6f9) { // Extended Arabic-Indic digits // NOTE: Test done here even under lookup rules, // even though they're CONTEXTO characters if (regArabDigits) { return false; } extArabDigits = true; } else if (thisChar == 0xb7) { // Middle dot // NOTE: Test done here even under lookup rules, // even though it's a CONTEXTO character if (!(i - 1 >= 0 && i + 1 < str.length() && lastChar == 0x6c && str.charAt(i + 1) == 0x6c)) { // Dot must come between two l's return false; } } else if (thisChar == 0x200d) { // Zero-width joiner if (UnicodeDatabase.GetCombiningClass(lastChar) != 9) { return false; } } else if (thisChar == 0x200c) { // Zero-width non-joiner if (UnicodeDatabase.GetCombiningClass(lastChar) != 9 && !IsValidConjunct(str, i)) { return false; } } else if (thisChar == 0x375) { // Keraia // NOTE: Test done here even under lookup rules, // even though it's a CONTEXTO character if (i + 1 >= str.length() || !IsGreek(CodePointAt(str, i + 1))) { return false; } } else if (thisChar == 0x5f3 || thisChar == 0x5f4) { // Geresh or gershayim // NOTE: Test done here even under lookup rules, // even though they're CONTEXTO characters if (i <= 0 || !IsHebrew(lastChar)) { return false; } } else if (thisChar == 0x30fb) { haveKatakanaMiddleDot = true; } else { int category = UnicodeDatabase.GetIdnaCategory(thisChar); if (category == ContextJ || category == ContextO) { // Context character without a rule return false; } } if (!haveKanaOrHan && IsKanaOrHan(thisChar)) { haveKanaOrHan = true; } if (thisChar >= 0x10000) { ++i; } lastChar = thisChar; } if (haveKatakanaMiddleDot && !haveKanaOrHan) { // NOTE: Test done here even under lookup rules, // even though it's a CONTEXTO character return false; } } // Bidi Rule if (bidiRule) { boolean found = false; for (int i = str.length(); i > 0; --i) { int c = CodePointBefore(str, i); if (c >= 0x10000) { --i; } bidiClass = GetBidiClass(c); if (rtl && (bidiClass == BidiClassR || bidiClass == BidiClassAL || bidiClass == BidiClassAN)) { found = true; break; } if (!rtl && (bidiClass == BidiClassL)) { found = true; break; } if (bidiClass == BidiClassEN) { found = true; break; } if (bidiClass != BidiClassNSM) { return false; } } if (!found) { return false; } boolean haveEN = false; boolean haveAN = false; for (int i = 0; i < str.length(); ++i) { int c = CodePointAt(str, i); if (c >= 0x10000) { ++i; } bidiClass = GetBidiClass(c); if (rtl && (bidiClass == BidiClassR || bidiClass == BidiClassAL || bidiClass == BidiClassAN)) { if (bidiClass == BidiClassAN) { if (haveEN) { return false; } haveAN = true; } continue; } if (!rtl && (bidiClass == BidiClassL)) { continue; } if (bidiClass == BidiClassEN) { if (rtl) { if (haveAN) { return false; } haveEN = false; } continue; } if (bidiClass == BidiClassES || bidiClass == BidiClassCS || bidiClass == BidiClassET || bidiClass == BidiClassON || bidiClass == BidiClassBN || bidiClass == BidiClassNSM) { continue; } return false; } } int aceLength = DomainUtility.PunycodeLength(str, 0, str.length()); if (aceLength < 0) { return false; // Overflow error } if (!lookupRules) { // Additional rules for non-lookup validation if (aceLength > 63) { return false; } } return true; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy