com.ibm.icu.impl.UTS46 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and globalization support.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.util.EnumSet;
import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.util.ICUException;
// Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG:
//
// The domain name length limit is 255 octets in an internal DNS representation
// where the last ("root") label is the empty label
// represented by length byte 0 alone.
// In a conventional string, this translates to 253 characters, or 254
// if there is a trailing dot for the root label.
/**
* UTS #46 (IDNA2008) implementation.
* @author Markus Scherer
* @since 2010jul09
*/
public final class UTS46 extends IDNA {
/**
 * Creates a UTS #46 processor that uses the given option bits.
 *
 * @param options option bit set; stored as-is and later tested against flags
 *                such as USE_STD3_RULES, CHECK_BIDI, CHECK_CONTEXTJ and CHECK_CONTEXTO
 */
public UTS46(int options) {
    this.options = options;
}
/**
 * Converts a single label to its ASCII form by delegating to
 * {@code process()} in single-label, to-ASCII mode.
 *
 * @param label the input label
 * @param dest  receives the result; must not be the same object as {@code label}
 * @param info  receives the errors found while processing
 * @return the value returned by {@code process()}
 */
@Override
public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) {
    final boolean isLabel = true;
    final boolean toASCII = true;
    return process(label, isLabel, toASCII, dest, info);
}
/**
 * Converts a single label to its Unicode form by delegating to
 * {@code process()} in single-label, to-Unicode mode.
 *
 * @param label the input label
 * @param dest  receives the result; must not be the same object as {@code label}
 * @param info  receives the errors found while processing
 * @return the value returned by {@code process()}
 */
@Override
public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) {
    final boolean isLabel = true;
    final boolean toASCII = false;
    return process(label, isLabel, toASCII, dest, info);
}
/**
 * Converts a whole domain name to its ASCII form and, on top of what
 * {@code process()} reports, flags names that are too long.
 *
 * A result of 254 chars is accepted only when its last char is a dot
 * (the trailing root-label dot); anything longer is always too long.
 * The extra check runs only for all-ASCII results that do not already
 * carry DOMAIN_NAME_TOO_LONG.
 *
 * @param name the input domain name
 * @param dest receives the result; must not be the same object as {@code name}
 * @param info receives the errors found while processing
 * @return {@code dest}
 */
@Override
public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) {
    process(name, false, true, dest, info);

    final int resultLength = dest.length();
    if (resultLength < 254 || info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG)) {
        // Short enough, or the error has been recorded already.
        return dest;
    }
    if (!isASCIIString(dest)) {
        return dest;
    }
    final boolean exactlyMaxWithTrailingDot = resultLength == 254 && dest.charAt(253) == '.';
    if (!exactlyMaxWithTrailingDot) {
        addError(info, Error.DOMAIN_NAME_TOO_LONG);
    }
    return dest;
}
/**
 * Converts a whole domain name to its Unicode form by delegating to
 * {@code process()} in multi-label, to-Unicode mode.
 *
 * @param name the input domain name
 * @param dest receives the result; must not be the same object as {@code name}
 * @param info receives the errors found while processing
 * @return the value returned by {@code process()}
 */
@Override
public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) {
    final boolean isLabel = false;
    final boolean toASCII = false;
    return process(name, isLabel, toASCII, dest, info);
}
// Shared normalizer instance requested by the name "uts46" in COMPOSE mode;
// used for mapping/normalizing input and for validating decoded Punycode labels.
private static final Normalizer2 uts46Norm2=
Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm
// Option bits given to the constructor; tested against flags such as
// USE_STD3_RULES, CHECK_BIDI, CHECK_CONTEXTJ and CHECK_CONTEXTO.
final int options;
// Severe errors which usually result in a U+FFFD replacement character in the result string.
private static final EnumSet severeErrors=EnumSet.of(
Error.LEADING_COMBINING_MARK,
Error.DISALLOWED,
Error.PUNYCODE,
Error.LABEL_HAS_DOT,
Error.INVALID_ACE_LABEL);
private static boolean
isASCIIString(CharSequence dest) {
int length=dest.length();
for(int i=0; i0x7f) {
return false;
}
}
return true;
}
// UTS #46 data for ASCII characters.
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
// and passes through all other ASCII characters.
// If USE_STD3_RULES is set, then non-LDH characters are disallowed
// using this data.
// The ASCII fastpath also uses this data.
// Values: -1=disallowed 0==valid 1==mapped (lowercase)
private static final byte asciiData[]={
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
// 002D..002E; valid # HYPHEN-MINUS..FULL STOP
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1,
// 0030..0039; valid # DIGIT ZERO..DIGIT NINE
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
// 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
// 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1
};
private StringBuilder
process(CharSequence src,
boolean isLabel, boolean toASCII,
StringBuilder dest,
Info info) {
// uts46Norm2.normalize() would do all of this error checking and setup,
// but with the ASCII fastpath we do not always call it, and do not
// call it first.
if(dest==src) {
throw new IllegalArgumentException();
}
// Arguments are fine, reset output values.
dest.delete(0, 0x7fffffff);
resetInfo(info);
int srcLength=src.length();
if(srcLength==0) {
addError(info, Error.EMPTY_LABEL);
return dest;
}
// ASCII fastpath
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
int labelStart=0;
int i;
for(i=0;; ++i) {
if(i==srcLength) {
if(toASCII) {
if((i-labelStart)>63) {
addLabelError(info, Error.LABEL_TOO_LONG);
}
// There is a trailing dot if labelStart==i.
if(!isLabel && i>=254 && (i>254 || labelStart0x7f) {
break;
}
int cData=asciiData[c];
if(cData>0) {
dest.append((char)(c+0x20)); // Lowercase an uppercase ASCII letter.
} else if(cData<0 && disallowNonLDHDot) {
break; // Replacing with U+FFFD can be complicated for toASCII.
} else {
dest.append(c);
if(c=='-') { // hyphen
if(i==(labelStart+3) && src.charAt(i-1)=='-') {
// "??--..." is Punycode or forbidden.
++i; // '-' was copied to dest already
break;
}
if(i==labelStart) {
// label starts with "-"
addLabelError(info, Error.LEADING_HYPHEN);
}
if((i+1)==srcLength || src.charAt(i+1)=='.') {
// label ends with "-"
addLabelError(info, Error.TRAILING_HYPHEN);
}
} else if(c=='.') { // dot
if(isLabel) {
// Replacing with U+FFFD can be complicated for toASCII.
++i; // '.' was copied to dest already
break;
}
if(i==labelStart) {
addLabelError(info, Error.EMPTY_LABEL);
}
if(toASCII && (i-labelStart)>63) {
addLabelError(info, Error.LABEL_TOO_LONG);
}
promoteAndResetLabelErrors(info);
labelStart=i+1;
}
}
}
promoteAndResetLabelErrors(info);
processUnicode(src, labelStart, i, isLabel, toASCII, dest, info);
if( isBiDi(info) && !hasCertainErrors(info, severeErrors) &&
(!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart)))
) {
addError(info, Error.BIDI);
}
return dest;
}
// Maps/normalizes src from mappingStart on into dest via uts46Norm2, then scans dest
// from labelStart, handling deviation characters and unpaired surrogates.
// NOTE(review): this span is damaged. Text is missing in three places (marked below),
// so the end of processUnicode(), some sibling declarations and the header of the
// label-processing method that the second half belongs to are not visible here.
// Restore from the upstream ICU4J file before relying on this code.
private StringBuilder
processUnicode(CharSequence src,
int labelStart, int mappingStart,
boolean isLabel, boolean toASCII,
StringBuilder dest,
Info info) {
// Nothing was copied by the caller yet: normalize all of src into dest.
// Otherwise append the normalized remainder to what dest already holds.
if(mappingStart==0) {
uts46Norm2.normalize(src, dest);
} else {
uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length()));
}
// Deviation characters are mapped unless the matching NONTRANSITIONAL option bit is set.
boolean doMapDevChars=
toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 :
(options&NONTRANSITIONAL_TO_UNICODE)==0;
int destLength=dest.length();
int labelLimit=labelStart;
// NOTE(review): the next line is truncated; the text between "labelLimit" and "=0x200c"
// (loop condition, the read of c, and the start of the deviation-character test) is missing.
while(labelLimit=0x200c)) {
setTransitionalDifferent(info);
if(doMapDevChars) {
destLength=mapDevChars(dest, labelStart, labelLimit);
// All deviation characters have been mapped, no need to check for them again.
doMapDevChars=false;
// Do not increment labelLimit in case c was removed.
continue;
}
} else if(Character.isSurrogate(c)) {
if(UTF16Plus.isSurrogateLead(c) ?
(labelLimit+1)==destLength ||
!Character.isLowSurrogate(dest.charAt(labelLimit+1)) :
labelLimit==labelStart ||
!Character.isHighSurrogate(dest.charAt(labelLimit-1))) {
// Map an unpaired surrogate to U+FFFD before normalization so that when
// that removes characters we do not turn two unpaired ones into a pair.
addLabelError(info, Error.DISALLOWED);
dest.setCharAt(labelLimit, '\ufffd');
}
}
++labelLimit;
}
// NOTE(review): the comment line below is fused with code after text loss. The end of
// processUnicode() and the header of the following label-processing method are missing;
// fromPunycode, labelString, wasPunycode, labelLength, destLabelStart and destLabelLength
// are used below but their declarations are not visible in this span.
// Permit an empty label at the end (0=4 &&
dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' &&
dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
) {
// Label starts with "xn--", try to un-Punycode it.
// In IDNA2008, labels like "xn--" (decodes to an empty string) and
// "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
// comparing the ToUnicode input with the back-to-ToASCII output.
// They are alternate encodings of the respective ASCII labels.
// Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
// the round-trip verification.
if(labelLength==4 || (labelLength>5 && dest.charAt(labelStart+labelLength-1)=='-')) {
addLabelError(info, Error.INVALID_ACE_LABEL);
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
}
wasPunycode=true;
try {
fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
} catch (StringPrepParseException e) {
addLabelError(info, Error.PUNYCODE);
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
}
// Check for NFC, and for characters that are not
// valid or deviation characters according to the normalizer.
// If there is something wrong, then the string will change.
// Note that the normalizer passes through non-LDH ASCII and deviation characters.
// Deviation characters are ok in Punycode even in transitional processing.
// In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
// then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
boolean isValid=uts46Norm2.isNormalized(fromPunycode);
if(!isValid) {
addLabelError(info, Error.INVALID_ACE_LABEL);
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
}
// From here on the checks run on the decoded text, which starts at index 0.
labelString=fromPunycode;
labelStart=0;
labelLength=fromPunycode.length();
} else {
wasPunycode=false;
labelString=dest;
}
// Validity check
if(labelLength==0) {
addLabelError(info, Error.EMPTY_LABEL);
return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
}
// labelLength>0
if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') {
// label starts with "??--"
addLabelError(info, Error.HYPHEN_3_4);
}
if(labelString.charAt(labelStart)=='-') {
// label starts with "-"
addLabelError(info, Error.LEADING_HYPHEN);
}
if(labelString.charAt(labelStart+labelLength-1)=='-') {
// label ends with "-"
addLabelError(info, Error.TRAILING_HYPHEN);
}
// If the label was not a Punycode label, then it was the result of
// mapping, normalization and label segmentation.
// If the label was in Punycode, then we mapped it again above
// and checked its validity.
// Now we handle the STD3 restriction to LDH characters (if set)
// and we look for U+FFFD which indicates disallowed characters
// in a non-Punycode label or U+FFFD itself in a Punycode label.
// We also check for dots which can come from the input to a single-label function.
// Ok to cast away const because we own the UnicodeString.
int i=labelStart;
int limit=labelStart+labelLength;
// Bitwise OR of all non-ASCII chars seen; used below as a cheap filter for later checks.
char oredChars=0;
// If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
do {
char c=labelString.charAt(i);
if(c<=0x7f) {
if(c=='.') {
addLabelError(info, Error.LABEL_HAS_DOT);
labelString.setCharAt(i, '\ufffd');
} else if(disallowNonLDHDot && asciiData[c]<0) {
addLabelError(info, Error.DISALLOWED);
labelString.setCharAt(i, '\ufffd');
}
} else {
oredChars|=c;
if(c==0xfffd) {
addLabelError(info, Error.DISALLOWED);
}
}
++i;
// NOTE(review): the next line is truncated; the loop condition and the statements
// between "while(i" and "0xffff) {" are missing.
} while(i0xffff) {
// Remove c's trail surrogate.
labelString.deleteCharAt(labelStart+1);
--labelLength;
if(labelString==dest) {
--destLabelLength;
}
}
}
if(!hasCertainLabelErrors(info, severeErrors)) {
// Do contextual checks only if we do not have U+FFFD from a severe error
// because U+FFFD can make these checks fail.
if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) {
checkLabelBiDi(labelString, labelStart, labelLength, info);
}
if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
!isLabelOkContextJ(labelString, labelStart, labelLength)
) {
addLabelError(info, Error.CONTEXTJ);
}
if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
checkLabelContextO(labelString, labelStart, labelLength, info);
}
if(toASCII) {
if(wasPunycode) {
// Leave a Punycode label unchanged if it has no severe errors.
if(destLabelLength>63) {
addLabelError(info, Error.LABEL_TOO_LONG);
}
return destLabelLength;
} else if(oredChars>=0x80) {
// Contains non-ASCII characters.
StringBuilder punycode;
try {
punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null);
} catch (StringPrepParseException e) {
throw new ICUException(e); // unexpected
}
punycode.insert(0, "xn--");
if(punycode.length()>63) {
addLabelError(info, Error.LABEL_TOO_LONG);
}
return replaceLabel(dest, destLabelStart, destLabelLength,
punycode, punycode.length());
} else {
// all-ASCII label
if(labelLength>63) {
addLabelError(info, Error.LABEL_TOO_LONG);
}
}
}
} else {
// If a Punycode label has severe errors,
// then leave it but make sure it does not look valid.
if(wasPunycode) {
addLabelError(info, Error.INVALID_ACE_LABEL);
return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
}
}
return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
}
// Called for an "xn--" label that failed decoding or validation (see the callers that
// pair it with INVALID_ACE_LABEL / PUNYCODE errors). Returns the label length.
// NOTE(review): the loop below is damaged; everything between "i" in the for-header and
// "63) {" is missing, so how isASCII/onlyLDH are updated and how dest is modified cannot
// be seen here. Restore from the upstream ICU4J file.
private int
markBadACELabel(StringBuilder dest,
int labelStart, int labelLength,
boolean toASCII, Info info) {
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
boolean isASCII=true;
boolean onlyLDH=true;
int limit=labelStart+labelLength;
// Start after the initial "xn--".
for(int i=labelStart+4; i63) {
addLabelError(info, Error.LABEL_TOO_LONG);
}
}
return labelLength;
}
// Bit masks over UCharacterDirection values, each built with U_MASK().
// A mask's name lists the bidi classes it contains, joined by underscores.
private static final int L_MASK = U_MASK(UCharacterDirection.LEFT_TO_RIGHT);
private static final int R_AL_MASK =
    U_MASK(UCharacterDirection.RIGHT_TO_LEFT)
    | U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC);
private static final int L_R_AL_MASK = R_AL_MASK | L_MASK;
private static final int R_AL_AN_MASK =
    U_MASK(UCharacterDirection.ARABIC_NUMBER) | R_AL_MASK;
private static final int EN_AN_MASK =
    U_MASK(UCharacterDirection.EUROPEAN_NUMBER)
    | U_MASK(UCharacterDirection.ARABIC_NUMBER);
private static final int R_AL_EN_AN_MASK = EN_AN_MASK | R_AL_MASK;
private static final int L_EN_MASK =
    U_MASK(UCharacterDirection.EUROPEAN_NUMBER) | L_MASK;
// Separators, terminators, neutrals and non-spacing marks.
private static final int ES_CS_ET_ON_BN_NSM_MASK =
    U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)
    | U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)
    | U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)
    | U_MASK(UCharacterDirection.OTHER_NEUTRAL)
    | U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)
    | U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK);
private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK =
    ES_CS_ET_ON_BN_NSM_MASK | L_EN_MASK;
private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK =
    ES_CS_ET_ON_BN_NSM_MASK | R_AL_EN_AN_MASK;
// We scan the whole label and check both for whether it contains RTL characters
// and whether it passes the BiDi Rule.
// In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
// that a domain name is a BiDi domain name (has an RTL label) only after
// processing several earlier labels.
// Checks one label against the IDNA2008 BiDi rule, recording failures via setNotOkBiDi(info).
// label/labelStart/labelLength select the label text; the label is expected to be non-empty
// since the first code point is read unconditionally.
// NOTE(review): this span is damaged; text is missing after "while(i" further down, so the
// rest of this method and the header of the following ASCII-only check are not visible.
private void
checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) {
// IDNA2008 BiDi rule
// Get the directionality of the first character.
int c;
int i=labelStart;
c=Character.codePointAt(label, i);
i+=Character.charCount(c);
int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c));
// 1. The first character must be a character with BIDI property L, R
// or AL. If it has the R or AL property, it is an RTL label; if it
// has the L property, it is an LTR label.
if((firstMask&~L_R_AL_MASK)!=0) {
setNotOkBiDi(info);
}
// Get the directionality of the last non-NSM character.
int lastMask;
int labelLimit=labelStart+labelLength;
// Walk backwards from the end, skipping trailing NSM characters; if only the first
// character is left, it also serves as the last one.
for(;;) {
if(i>=labelLimit) {
lastMask=firstMask;
break;
}
c=Character.codePointBefore(label, labelLimit);
labelLimit-=Character.charCount(c);
int dir=UBiDiProps.INSTANCE.getClass(c);
if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) {
lastMask=U_MASK(dir);
break;
}
}
// 3. In an RTL label, the end of the label must be a character with
// BIDI property R, AL, EN or AN, followed by zero or more
// characters with BIDI property NSM.
// 6. In an LTR label, the end of the label must be a character with
// BIDI property L or EN, followed by zero or more characters with
// BIDI property NSM.
if( (firstMask&L_MASK)!=0 ?
(lastMask&~L_EN_MASK)!=0 :
(lastMask&~R_AL_EN_AN_MASK)!=0
) {
setNotOkBiDi(info);
}
// Add the directionalities of the intervening characters.
int mask=firstMask|lastMask;
// NOTE(review): the next line is truncated; the text between "while(i" and "labelStart) {"
// is missing. The code after it reads s, c and labelStart of a different method that
// returns boolean (presumably the ASCII-only BiDi check called from process(); confirm).
while(ilabelStart) {
c=s.charAt(i-1);
if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) {
// Last character in the label is not an L or EN.
return false;
}
}
labelStart=i+1;
} else if(i==labelStart) {
if(!('a'<=c && c<='z')) {
// First character in the label is not an L.
return false;
}
} else {
if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
// Intermediate character in the label is a B, S or WS.
return false;
}
}
}
return true;
}
// Checks the CONTEXTJ rules for U+200C/U+200D within one label.
// NOTE(review): this span is damaged; the text between "i" in the for-header and "0) {" is
// missing, so the body of this method and the header of the following CONTEXTO check
// (which uses label, labelStart, labelEnd, arabicDigits and info) are not visible.
// Restore from the upstream ICU4J file.
private boolean
isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) {
// [IDNA2008-Tables]
// 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
int labelLimit=labelStart+labelLength;
for(int i=labelStart; i0) {
addLabelError(info, Error.CONTEXTO_DIGITS);
}
arabicDigits=-1;
} else if(0x6f0<=c) {
if(arabicDigits<0) {
addLabelError(info, Error.CONTEXTO_DIGITS);
}
arabicDigits=1;
}
}
} else if(c==0x30fb) {
// Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
// Rule Set:
// False;
// For All Characters:
// If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
// End For;
// Scan the whole label from its start; the rule passes at the first code point
// whose script is Hiragana, Katakana or Han, and fails if none is found.
for(int j=labelStart;; j+=Character.charCount(c)) {
if(j>labelEnd) {
addLabelError(info, Error.CONTEXTO_PUNCTUATION);
break;
}
c=Character.codePointAt(label, j);
int script=UScript.getScript(c);
if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
break;
}
}
}
}
}
// TODO: make public(?) -- in C, these are public in uchar.h
private static int U_MASK(int x) {
return 1<