// com.ibm.icu.impl.Punycode Maven / Gradle / Ivy
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2003-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
/**
* Ported code from ICU punycode.c
* @author ram
*/
public final class Punycode {
/* Bootstring parameters as used by Punycode */
/* Number of digit values; digits run from 0 to BASE-1. */
private static final int BASE = 36;
/* Lower and upper threshold parameters. */
private static final int TMIN = 1;
private static final int TMAX = 26;
/* Added to delta in the denominator of the bias formula. */
private static final int SKEW = 38;
/* Divisor applied to delta on the first bias adaptation. */
private static final int DAMP = 700;
/* Starting values of the bias and of the code point counter n. */
private static final int INITIAL_BIAS = 72;
private static final int INITIAL_N = 128; /* 0x80 */
/* "Basic" (ASCII) code points, written as character literals */
private static final char HYPHEN = '-';
private static final char DELIMITER = HYPHEN;
private static final int ZERO = '0';
//private static final int NINE = 0x39;
private static final int SMALL_A = 'a';
private static final int SMALL_Z = 'z';
private static final int CAPITAL_A = 'A';
private static final int CAPITAL_Z = 'Z';
/**
 * Computes the next bias value from the most recent delta.
 *
 * @param delta     the delta to adapt from
 * @param length    divisor for the proportional increase of delta;
 *                  a value of 0 causes an ArithmeticException
 * @param firstTime true to scale delta down by DAMP, false to halve it
 * @return the adapted bias
 */
private static int adaptBias(int delta, int length, boolean firstTime) {
    final int range = BASE - TMIN;
    final int threshold = (range * TMAX) / 2;

    /* Scale delta down: heavily on the first adaptation, by half afterwards. */
    int scaled = delta / (firstTime ? DAMP : 2);
    scaled += scaled / length;

    /* Reduce until at or below the threshold, adding BASE per reduction step. */
    int offset = 0;
    while (scaled > threshold) {
        scaled /= range;
        offset += BASE;
    }
    return offset + ((range + 1) * scaled) / (scaled + SKEW);
}
/**
* basicToDigit[] contains the numeric value of a basic code
* point (for use in representing integers) in the range 0 to
* BASE-1, or -1 if b is does not represent a value.
*/
static final int[] basicToDigit= new int[]{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
///CLOVER:OFF
/**
 * Maps an ASCII letter to the requested case; every other char is
 * returned unchanged.
 *
 * @param b         the char to map
 * @param uppercase true to map a..z to A..Z, false to map A..Z to a..z
 * @return the case-mapped char
 */
private static char asciiCaseMap(char b, boolean uppercase) {
    final int caseOffset = SMALL_A - CAPITAL_A;
    if (uppercase && SMALL_A <= b && b <= SMALL_Z) {
        return (char) (b - caseOffset);
    }
    if (!uppercase && CAPITAL_A <= b && b <= CAPITAL_Z) {
        return (char) (b + caseOffset);
    }
    return b;
}
///CLOVER:ON
/**
 * Returns the basic code point that represents the digit value
 * {@code digit}, which must be in the range 0 to BASE-1.
 * Values 0..25 become letters, values 26..35 become the digits 0..9.
 *
 * @param digit     the digit value to convert
 * @param uppercase true to produce A..Z for letter values, false for a..z;
 *                  has no effect on values that map to 0..9
 * @return the basic code point for the digit value
 */
private static char digitToBasic(int digit, boolean uppercase) {
    if (digit >= 26) {
        /* 26..35 map to ASCII 0..9 */
        return (char) (ZERO + (digit - 26));
    }
    /* 0..25 map to ASCII a..z or A..Z */
    final int firstLetter = uppercase ? CAPITAL_A : SMALL_A;
    return (char) (firstLetter + digit);
}
/**
* Converts Unicode to Punycode.
* The input string must not contain single, unpaired surrogates.
* The output is returned as a StringBuilder of ASCII code points.
*
* @param src The Unicode character sequence to encode.
* @param caseFlags The boolean array of case flags.
* @return A StringBuilder holding the ASCII code points of the encoded output.
*/
public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{
int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
char c, c2;
int srcLength = src.length();
int[] cpBuffer = new int[srcLength];
StringBuilder dest = new StringBuilder(srcLength);
/*
* Handle the basic code points and
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
*/
srcCPCount=0;
for(j=0; j0) {
dest.append(DELIMITER);
}
/*
* handledCPCount is the number of code points that have been handled
* basicLength is the number of basic code points
* destLength is the number of chars that have been output
*/
/* Initialize the state: */
n=INITIAL_N;
delta=0;
bias=INITIAL_BIAS;
/* Main encoding loop: */
for(handledCPCount=basicLength; handledCPCount state to , but guard against overflow:
*/
if(m-n>(0x7fffffff-delta)/(handledCPCount+1)) {
throw new IllegalStateException("Internal program error");
}
delta+=(m-n)*(handledCPCount+1);
n=m;
/* Encode a sequence of same code points n */
for(j=0; jTMAX) {
t=TMAX;
}
*/
t=k-bias;
if(t=(bias+TMAX)) {
t=TMAX;
}
if(q= CAPITAL_Z);
}
///CLOVER:ON
/**
 * Tells whether a code point value lies in the surrogate range
 * 0xd800..0xdfff.
 *
 * @param ch the code point value to test
 * @return true if ch is a surrogate value
 */
private static boolean isSurrogate(int ch) {
    return 0xd800 <= ch && ch <= 0xdfff;
}
/**
* Converts Punycode to Unicode.
* The Unicode string will be at most as long as the Punycode string.
*
* @param src The Punycode character sequence to decode.
* @param caseFlags The array of boolean case flags.
* @return A StringBuilder holding the decoded Unicode string.
*/
public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
throws StringPrepParseException{
int srcLength = src.length();
StringBuilder dest = new StringBuilder(src.length());
int n, i, bias, basicLength, j, in, oldi, w, k, digit, t,
destCPCount, firstSupplementaryIndex, cpLength;
char b;
/*
* Handle the basic code points:
* Let basicLength be the number of input code points
* before the last delimiter, or 0 if there is none,
* then copy the first basicLength code points to the output.
*
* The following loop iterates backward.
*/
for(j=srcLength; j>0;) {
if(src.charAt(--j)==DELIMITER) {
break;
}
}
basicLength=destCPCount=j;
for(j=0; j0 ? basicLength+1 : 0; in=srcLength) {
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
digit=basicToDigit[src.charAt(in++) & 0xFF];
if(digit<0) {
throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
}
if(digit>(0x7fffffff-i)/w) {
/* integer overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
i+=digit*w;
t=k-bias;
if(t=(bias+TMAX)) {
t=TMAX;
}
if(digit0x7fffffff/(BASE-t)) {
/* integer overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
w*=BASE-t;
}
/*
* Modification from sample code:
* Increments destCPCount here,
* where needed instead of in for() loop tail.
*/
++destCPCount;
bias=adaptBias(i-oldi, destCPCount, (oldi==0));
/*
* i was supposed to wrap around from (incremented) destCPCount to 0,
* incrementing n each time, so we'll fix that now:
*/
if(i/destCPCount>(0x7fffffff-n)) {
/* integer overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
n+=i/destCPCount;
i%=destCPCount;
/* not needed for Punycode: */
/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
if(n>0x10ffff || isSurrogate(n)) {
/* Unicode code point overflow */
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
/* Insert n at position i of the output: */
cpLength=Character.charCount(n);
int codeUnitIndex;
/*
* Handle indexes when supplementary code points are present.
*
* In almost all cases, there will be only BMP code points before i
* and even in the entire string.
* This is handled with the same efficiency as with UTF-32.
*
* Only the rare cases with supplementary code points are handled
* more slowly - but not too bad since this is an insertion anyway.
*/
if(i<=firstSupplementaryIndex) {
codeUnitIndex=i;
if(cpLength>1) {
firstSupplementaryIndex=codeUnitIndex;
} else {
++firstSupplementaryIndex;
}
} else {
codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex);
}
/* use the UChar index codeUnitIndex instead of the code point index i */
if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) {
if(codeUnitIndex
© 2015 - 2025 Weber Informatics LLC | Privacy Policy