com.ibm.icu.impl.Punycode Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and globalization support.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2003-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ICUInputTooLongException;
/**
* Ported code from ICU punycode.c
* @author ram
*/
public final class Punycode {
/*
 * Punycode parameters for Bootstring (names and values follow RFC 3492, section 5).
 */
// Number of distinct digits (a-z and 0-9) used to represent integers.
private static final int BASE = 36;
// Minimum and maximum values of the per-digit threshold t.
private static final int TMIN = 1;
private static final int TMAX = 26;
// Skew term used in the final step of adaptBias().
private static final int SKEW = 38;
// Divisor applied to the very first delta in adaptBias().
private static final int DAMP = 700;
// Bias value the encoder and decoder start with.
private static final int INITIAL_BIAS = 72;
// Code point value the encoder and decoder start with (first non-ASCII code point).
private static final int INITIAL_N = 0x80;
/* "Basic" Unicode/ASCII code points */
// '-'
private static final char HYPHEN = 0x2d;
// The delimiter written after the basic code points is the hyphen.
private static final char DELIMITER = HYPHEN;
// '0'
private static final int ZERO = 0x30;
//private static final int NINE = 0x39;
// 'a' and 'z'
private static final int SMALL_A = 0x61;
private static final int SMALL_Z = 0x7a;
// 'A' and 'Z'
private static final int CAPITAL_A = 0x41;
private static final int CAPITAL_Z = 0x5a;
/**
 * Bootstring bias adaptation: derives the next bias from the most recent delta.
 *
 * @param delta     the delta that was just encoded or decoded
 * @param length    the number of code points handled so far; must not be 0
 * @param firstTime true for the very first delta, which is divided by DAMP
 *                  instead of being halved
 * @return the new bias
 */
private static int adaptBias(int delta, int length, boolean firstTime){
    // The first delta is typically much larger than later ones, so it is damped harder.
    int scaled = firstTime ? delta / DAMP : delta / 2;
    scaled += scaled / length;

    final int digitSpan = BASE - TMIN;
    final int threshold = (digitSpan * TMAX) / 2;

    // Each reduction step accounts for one more full digit of BASE in the result.
    int offset = 0;
    while (scaled > threshold) {
        scaled /= digitSpan;
        offset += BASE;
    }
    return offset + (((digitSpan + 1) * scaled) / (scaled + SKEW));
}
/**
 * Maps a basic code point to the digit value it stands for.
 *
 * @param cp the code point to interpret as a digit
 * @return 0..25 for a letter (either case), 26..35 for '0'..'9',
 *         or a negative value if cp is not a valid digit.
 */
private static final int decodeDigit(int cp) {
    // Anything outside '0'..'z' is rejected outright.
    if (cp < '0' || cp > 'z') {
        return -1;
    }
    if (cp <= '9') {
        return cp - '0' + 26; // 0..9 -> 26..35
    }
    if (cp <= 'Z') {
        // A-Z -> 0..25; code points between '9' and 'A' come out negative.
        return cp - 'A';
    }
    // a..z -> 0..25; code points between 'Z' and 'a' come out negative.
    return cp - 'a';
}
///CLOVER:OFF
/**
 * Forces an ASCII letter into the requested case; any other char is returned unchanged.
 *
 * @param b         the char to map
 * @param uppercase true to map a..z to A..Z, false to map A..Z to a..z
 * @return the case-mapped char
 */
private static char asciiCaseMap(char b, boolean uppercase) {
    final int caseOffset = SMALL_A - CAPITAL_A;
    if (uppercase) {
        final boolean isLowerLetter = b >= SMALL_A && b <= SMALL_Z;
        return isLowerLetter ? (char) (b - caseOffset) : b;
    }
    final boolean isUpperLetter = b >= CAPITAL_A && b <= CAPITAL_Z;
    return isUpperLetter ? (char) (b + caseOffset) : b;
}
///CLOVER:ON
/**
 * Returns the basic code point that represents the digit value {@code digit}.
 * Values 0..25 map to ASCII letters and values 26..35 map to ASCII '0'..'9'.
 *
 * @param digit     the digit value, which must be in the range 0 to BASE-1
 * @param uppercase true to produce A..Z for letter digits, false to produce a..z;
 *                  has no effect on the digits '0'..'9'
 * @return the basic code point for the digit
 */
private static char digitToBasic(int digit, boolean uppercase) {
    if (digit >= 26) {
        /* 26..35 map to ASCII 0..9 */
        return (char) (digit + (ZERO - 26));
    }
    /* 0..25 map to ASCII a..z or A..Z */
    final int firstLetter = uppercase ? CAPITAL_A : SMALL_A;
    return (char) (firstLetter + digit);
}
// ICU-13727: Limit input length for n^2 algorithm
// where well-formed strings are at most 59 characters long.
// Longer inputs are rejected with ICUInputTooLongException before any work is done.
// Upper bound on src.length() (UTF-16 code units) accepted by encode().
private static final int ENCODE_MAX_CODE_UNITS = 1000;
// Upper bound on src.length() (chars) accepted by decode().
private static final int DECODE_MAX_CHARS = 2000;
/**
* Converts Unicode to Punycode.
* The input string must not contain single, unpaired surrogates.
* The output will be represented as an array of ASCII code points.
*
* @param src The source of the String Buffer passed.
* @param caseFlags The boolean array of case flags.
* @return An array of ASCII code points.
*/
public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{
int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
char c, c2;
int srcLength = src.length();
if (srcLength > ENCODE_MAX_CODE_UNITS) {
throw new ICUInputTooLongException(
"input too long: " + srcLength + " UTF-16 code units");
}
int[] cpBuffer = new int[srcLength];
StringBuilder dest = new StringBuilder(srcLength);
/*
* Handle the basic code points and
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
*/
srcCPCount=0;
for(j=0; j0) {
dest.append(DELIMITER);
}
/*
* handledCPCount is the number of code points that have been handled
* basicLength is the number of basic code points
* destLength is the number of chars that have been output
*/
/* Initialize the state: */
n=INITIAL_N;
delta=0;
bias=INITIAL_BIAS;
/* Main encoding loop: */
for(handledCPCount=basicLength; handledCPCount state to , but guard against overflow:
*/
if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
throw new IllegalStateException("Internal program error");
}
delta+=(m-n)*(handledCPCount+1);
n=m;
/* Encode a sequence of same code points n */
for(j=0; jTMAX) {
t=TMAX;
}
*/
t=k-bias;
if(t=(bias+TMAX)) {
t=TMAX;
}
if(q= CAPITAL_Z);
}
///CLOVER:ON
/**
 * Tests whether a code point lies in the surrogate range U+D800..U+DFFF.
 *
 * @param ch the code point to test
 * @return true if ch is a lead or trail surrogate code point
 */
private static boolean isSurrogate(int ch){
    // All surrogates share the same bits above the low 11; compare just those.
    final int surrogatePrefix = 0xd800 >>> 11;
    return (ch >>> 11) == surrogatePrefix;
}
/**
* Converts Punycode to Unicode.
* The Unicode string will be at most as long as the Punycode string.
*
* @param src The source of the string buffer being passed.
* @param caseFlags The array of boolean case flags.
* @return StringBuilder string.
*/
public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
throws StringPrepParseException{
int srcLength = src.length();
if (srcLength > DECODE_MAX_CHARS) {
throw new ICUInputTooLongException("input too long: " + srcLength + " characters");
}
StringBuilder dest = new StringBuilder(src.length());
int n, i, bias, basicLength, j, in, oldi, w, k, digit, t,
destCPCount, firstSupplementaryIndex, cpLength;
char b;
/*
* Handle the basic code points:
* Let basicLength be the number of input code points
* before the last delimiter, or 0 if there is none,
* then copy the first basicLength code points to the output.
*
* The following loop iterates backward.
*/
for(j=srcLength; j>0;) {
if(src.charAt(--j)==DELIMITER) {
break;
}
}
basicLength=destCPCount=j;
for(j=0; j0 ? basicLength+1 : 0; in=srcLength) {
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
digit=decodeDigit(src.charAt(in++));
if(digit<0) {
throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
}
if(digit>(0x7fffffff-i)/w) {
/* integer overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
i+=digit*w;
t=k-bias;
if(t=(bias+TMAX)) {
t=TMAX;
}
if(digit0x7fffffff/(BASE-t)) {
/* integer overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
w*=BASE-t;
}
/*
* Modification from sample code:
* Increments destCPCount here,
* where needed instead of in for() loop tail.
*/
++destCPCount;
bias=adaptBias(i-oldi, destCPCount, (oldi==0));
/*
* i was supposed to wrap around from (incremented) destCPCount to 0,
* incrementing n each time, so we'll fix that now:
*/
if(i/destCPCount>(0x7fffffff-n)) {
/* integer overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
n+=i/destCPCount;
i%=destCPCount;
/* not needed for Punycode: */
/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
if(n>0x10ffff || isSurrogate(n)) {
/* Unicode code point overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
/* Insert n at position i of the output: */
cpLength=Character.charCount(n);
int codeUnitIndex;
/*
* Handle indexes when supplementary code points are present.
*
* In almost all cases, there will be only BMP code points before i
* and even in the entire string.
* This is handled with the same efficiency as with UTF-32.
*
* Only the rare cases with supplementary code points are handled
* more slowly - but not too bad since this is an insertion anyway.
*/
if(i<=firstSupplementaryIndex) {
codeUnitIndex=i;
if(cpLength>1) {
firstSupplementaryIndex=codeUnitIndex;
} else {
++firstSupplementaryIndex;
}
} else {
codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex);
}
/* use the UChar index codeUnitIndex instead of the code point index i */
if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) {
if(codeUnitIndex