com.ibm.icu.impl.IDNA2003 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and globalization support.
/*
*******************************************************************************
* Copyright (C) 2003-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.StringPrep;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UCharacterIterator;
/**
* IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
* while extending that class to support IDNA2008/UTS #46 as well.
* @author Ram Viswanadha
*/
public final class IDNA2003 {
/** IDNA ACE prefix "xn--", stored as lower-case ASCII code units. */
private static final char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;

/** Longest label that convertToASCII will return; longer results are rejected. */
private static final int MAX_LABEL_LENGTH = 63;

/** HYPHEN-MINUS; a label may not start or end with it when STD3 rules are enforced. */
private static final int HYPHEN = 0x002D;

/** First upper-case ASCII letter ('A'). */
private static final int CAPITAL_A = 0x0041;

/** Last upper-case ASCII letter ('Z'). */
private static final int CAPITAL_Z = 0x005A;

/** Offset between an upper-case ASCII letter and its lower-case form. */
private static final int LOWER_CASE_DELTA = 0x0020;

/** FULL STOP; the separator written between labels by convertIDNToASCII. */
private static final int FULL_STOP = 0x002E;

/** Longest domain name the convertIDNTo* methods will return. */
private static final int MAX_DOMAIN_NAME_LENGTH = 255;

/** The NamePrep (RFC 3491) StringPrep profile shared by all operations in this class. */
private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
private static boolean startsWithPrefix(StringBuffer src){
boolean startsWithPrefix = true;
if(src.length() < ACE_PREFIX.length){
return false;
}
for(int i=0; i0x007A){
return false;
}
//[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
if( (ch==0x002D) ||
(0x0030 <= ch && ch <= 0x0039) ||
(0x0041 <= ch && ch <= 0x005A) ||
(0x0061 <= ch && ch <= 0x007A)
){
return true;
}
return false;
}
/**
 * Tells whether a code point is one of the four label separators
 * recognized by this IDNA implementation.
 *
 * @param ch the code point to examine
 * @return true if {@code ch} separates labels, false otherwise
 */
private static boolean isLabelSeparator(int ch){
    return ch == 0x002e     // FULL STOP
        || ch == 0x3002     // IDEOGRAPHIC FULL STOP
        || ch == 0xFF0E     // FULLWIDTH FULL STOP
        || ch == 0xFF61;    // HALFWIDTH IDEOGRAPHIC FULL STOP
}
public static StringBuffer convertToASCII(UCharacterIterator src, int options)
throws StringPrepParseException{
boolean[] caseFlags = null;
// the source contains all ascii codepoints
boolean srcIsASCII = true;
// assume the source contains all LDH codepoints
boolean srcIsLDH = true;
//get the options
boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
int ch;
// step 1
while((ch = src.next())!= UCharacterIterator.DONE){
if(ch> 0x7f){
srcIsASCII = false;
}
}
int failPos = -1;
src.setToStart();
StringBuffer processOut = null;
// step 2 is performed only if the source contains non ASCII
if(!srcIsASCII){
// step 2
processOut = namePrep.prepare(src, options);
}else{
processOut = new StringBuffer(src.getText());
}
int poLen = processOut.length();
if(poLen==0){
throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
}
StringBuffer dest = new StringBuffer();
// reset the variable to verify if output of prepare is ASCII or not
srcIsASCII = true;
// step 3 & 4
for(int j=0;j 0x7F){
srcIsASCII = false;
}else if(isLDHChar(ch)==false){
// here we do not assemble surrogates
// since we know that LDH code points
// are in the ASCII range only
srcIsLDH = false;
failPos = j;
}
}
if(useSTD3ASCIIRules == true){
// verify 3a and 3b
if( srcIsLDH == false /* source contains some non-LDH characters */
|| processOut.charAt(0) == HYPHEN
|| processOut.charAt(processOut.length()-1) == HYPHEN){
/* populate the parseError struct */
if(srcIsLDH==false){
throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
StringPrepParseException.STD3_ASCII_RULES_ERROR,
processOut.toString(),
(failPos>0) ? (failPos-1) : failPos);
}else if(processOut.charAt(0) == HYPHEN){
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
}else{
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
StringPrepParseException.STD3_ASCII_RULES_ERROR,
processOut.toString(),
(poLen>0) ? poLen-1 : poLen);
}
}
}
if(srcIsASCII){
dest = processOut;
}else{
// step 5 : verify the sequence does not begin with ACE prefix
if(!startsWithPrefix(processOut)){
//step 6: encode the sequence with punycode
caseFlags = new boolean[poLen];
StringBuilder punyout = Punycode.encode(processOut,caseFlags);
// convert all codepoints to lower case ASCII
StringBuffer lowerOut = toASCIILower(punyout);
//Step 7: prepend the ACE prefix
dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
//Step 6: copy the contents in b2 into dest
dest.append(lowerOut);
}else{
throw new StringPrepParseException("The input does not start with the ACE Prefix.",
StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
}
}
if(dest.length() > MAX_LABEL_LENGTH){
throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
}
return dest;
}
/**
 * Converts a whole domain name to ASCII by applying convertToASCII to each
 * label and joining the results with FULL STOP.
 *
 * <p>An empty label at the very end of the input (a trailing separator, or
 * an empty input) is skipped rather than converted.
 *
 * @param src     the domain name; any of the recognized label separators
 *                may delimit its labels
 * @param options option bits forwarded to convertToASCII
 * @return the ASCII form of the domain name
 * @throws StringPrepParseException if a label cannot be converted or the
 *         result is longer than 255 characters
 */
public static StringBuffer convertIDNToASCII(String src,int options)
        throws StringPrepParseException{

    final char[] chars = src.toCharArray();
    final int end = chars.length;
    final StringBuffer ascii = new StringBuffer();

    int labelStart = 0;
    int cursor = 0;
    while(true){
        cursor = getSeparatorIndex(chars, cursor, end);
        final boolean atEnd = (cursor == end);
        final String label = new String(chars, labelStart, cursor - labelStart);

        // an empty final label is the root label: nothing to convert
        if(label.length() != 0 || !atEnd){
            ascii.append(convertToASCII(UCharacterIterator.getInstance(label), options));
        }
        if(atEnd){
            break;
        }

        // step over the separator and emit the normalized one
        cursor++;
        labelStart = cursor;
        ascii.append((char)FULL_STOP);
    }

    if(ascii.length() > MAX_DOMAIN_NAME_LENGTH){
        throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    }
    return ascii;
}
/**
 * Applies the IDNA2003 ToUnicode operation to a single label.
 *
 * <p>The label is decoded only when, after NamePrep (applied to non-ASCII
 * input), it starts with the ACE prefix, its remainder is valid Punycode,
 * and converting the decoded text back with convertToASCII reproduces the
 * prepared label (ignoring ASCII case). In every other case the original
 * text of {@code src} is returned. No STD3 ASCII check is made here on
 * labels that are not decoded; an earlier version of that check is disabled.
 *
 * @param src     iterator over the label, read from its current position
 * @param options option bits passed to NamePrep and to convertToASCII
 * @return the decoded label, or a copy of the original text
 * @throws StringPrepParseException if the round-trip convertToASCII call
 *         fails; failures of NamePrep and of Punycode decoding are absorbed
 */
public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
        throws StringPrepParseException{

    final boolean[] caseFlags = null;
    final int startIndex = src.getIndex();

    // step 1: find out if all the codepoints in src are ASCII
    boolean allASCII = true;
    int cp;
    while((cp = src.next()) != UCharacterIterator.DONE){
        if(cp > 0x7F){
            allASCII = false;
        }
    }

    // step 2: NamePrep is needed only for non-ASCII input
    StringBuffer prepared;
    if(allASCII){
        prepared = new StringBuffer(src.getText());
    }else{
        try{
            src.setIndex(startIndex);
            prepared = namePrep.prepare(src, options);
        }catch(StringPrepParseException ex){
            // ToUnicode hands back the input when preparation fails
            return new StringBuffer(src.getText());
        }
    }

    // TODO:
    // The RFC states that
    //
    //    ToUnicode never fails. If any step fails, then the original input
    //    is returned immediately in that step.
    //

    // step 3: only labels carrying the ACE prefix are decoded
    if(!startsWithPrefix(prepared)){
        return new StringBuffer(src.getText());
    }

    // step 4: remove the ACE prefix
    final String encoded = prepared.substring(ACE_PREFIX.length, prepared.length());

    // step 5: decode using punycode
    final StringBuffer decoded;
    try{
        decoded = new StringBuffer(Punycode.decode(encoded, caseFlags));
    }catch(StringPrepParseException e){
        return new StringBuffer(src.getText());
    }

    // step 6: apply toASCII to the decoded text
    final StringBuffer roundTrip = convertToASCII(UCharacterIterator.getInstance(decoded), options);

    // step 7: the round trip must reproduce the prepared label
    if(compareCaseInsensitiveASCII(prepared, roundTrip) != 0){
        return new StringBuffer(src.getText());
    }

    // step 8: return output of step 5
    return decoded;
}
/**
 * Converts a whole domain name to Unicode by applying convertToUnicode to
 * each label. The separators found in the input are copied to the output
 * unchanged.
 *
 * @param src     the domain name; any of the recognized label separators
 *                may delimit its labels
 * @param options option bits forwarded to convertToUnicode
 * @return the Unicode form of the domain name
 * @throws StringPrepParseException if an empty label is followed by a
 *         separator, a label cannot be converted, or the result is longer
 *         than 255 characters
 */
public static StringBuffer convertIDNToUnicode(String src, int options)
        throws StringPrepParseException{

    final char[] chars = src.toCharArray();
    final int end = chars.length;
    final StringBuffer unicode = new StringBuffer();

    int labelStart = 0;
    int cursor = 0;
    while(true){
        cursor = getSeparatorIndex(chars, cursor, end);
        final boolean atEnd = (cursor == end);
        final String label = new String(chars, labelStart, cursor - labelStart);

        // only the last label may be empty
        if(!atEnd && label.length() == 0){
            throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
        }

        unicode.append(convertToUnicode(UCharacterIterator.getInstance(label), options));
        if(atEnd){
            break;
        }

        // Unlike the ToASCII operation we don't normalize the label separators
        unicode.append(chars[cursor]);

        // step over the separator
        cursor++;
        labelStart = cursor;
    }

    if(unicode.length() > MAX_DOMAIN_NAME_LENGTH){
        throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    }
    return unicode;
}
/**
 * Compares two domain names after converting each one to its ASCII form.
 *
 * @param s1      the first domain name
 * @param s2      the second domain name
 * @param options option bits forwarded to convertIDNToASCII
 * @return the result of comparing the two ASCII forms ignoring ASCII case
 * @throws StringPrepParseException if either name cannot be converted
 */
public static int compare(String s1, String s2, int options) throws StringPrepParseException{
    // arguments are evaluated left to right, so s1 is converted before s2
    return compareCaseInsensitiveASCII(
            convertIDNToASCII(s1, options),
            convertIDNToASCII(s2, options));
}
}