![JAR search and dependency download from the Maven repository](/logo.png)
com.ibm.icu.impl.ImplicitCEGenerator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support.
/**
*******************************************************************************
* Copyright (C) 2004-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl;
/**
 * For generation of Implicit CEs (collation elements whose primary weight is
 * computed from the code point).
 *
 * <p>A code point is first turned into a "raw" integer
 * ({@link #getRawFromCodePoint(int)}, i.e. {@link #swapCJK(int)} plus one) and the
 * raw value is then packed into a 3-byte or 4-byte primary weight
 * ({@link #getImplicitFromRaw(int)}). The inverse direction is provided by
 * {@link #getRawFromImplicit(int)} and {@link #getCodePointFromRaw(int)}.
 *
 * <p>Cleaned up so that changes can be made more easily.
 * Old values:
 * <pre>
 * # First Implicit: E26A792D
 * # Last Implicit: E3DC70C0
 * # First CJK: E0030300
 * # Last CJK: E0A9DD00
 * # First CJK_A: E0A9DF00
 * # Last CJK_A: E0DE3100
 * </pre>
 *
 * @author Mark Davis
 * @internal
 */
public class ImplicitCEGenerator {
    /**
     * constants
     */
    static final boolean DEBUG = false;
    static final long topByte = 0xFF000000L;
    static final long bottomByte = 0xFFL;
    static final long fourBytes = 0xFFFFFFFFL;
    /** Largest raw value accepted by the raw/implicit conversions (2 * 0x110000 + 1). */
    static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2

    // public static final int CJK_BASE = 0x4E00;
    // public static final int CJK_LIMIT = 0x9FFF+1;
    // public static final int CJK_COMPAT_USED_BASE = 0xFA0E;
    // public static final int CJK_COMPAT_USED_LIMIT = 0xFA2F+1;
    // public static final int CJK_A_BASE = 0x3400;
    // public static final int CJK_A_LIMIT = 0x4DBF+1;
    // public static final int CJK_B_BASE = 0x20000;
    // public static final int CJK_B_LIMIT = 0x2A6DF+1;

    // Each range is [BASE, LIMIT): BASE is the first code point, LIMIT is one past the last.
    public static final int
        // 4E00;;Lo;0;L;;;;;N;;;;;
        // 9FCC;;Lo;0;L;;;;;N;;;;; (Unicode 6.1)
        CJK_BASE = 0x4E00,
        CJK_LIMIT = 0x9FCC+1,
        CJK_COMPAT_USED_BASE = 0xFA0E,
        CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
        //3400;;Lo;0;L;;;;;N;;;;;
        //4DB5;;Lo;0;L;;;;;N;;;;;
        CJK_A_BASE = 0x3400,
        CJK_A_LIMIT = 0x4DB5+1,
        //20000;;Lo;0;L;;;;;N;;;;;
        //2A6D6;;Lo;0;L;;;;;N;;;;;
        CJK_B_BASE = 0x20000,
        CJK_B_LIMIT = 0x2A6D6+1,
        //2A700;;Lo;0;L;;;;;N;;;;;
        //2B734;;Lo;0;L;;;;;N;;;;;
        CJK_C_BASE = 0x2A700,
        CJK_C_LIMIT = 0x2B734+1,
        //2B740;;Lo;0;L;;;;;N;;;;;
        //2B81D;;Lo;0;L;;;;;N;;;;;
        CJK_D_BASE = 0x2B740,
        CJK_D_LIMIT = 0x2B81D+1
        // when adding to this list, look for all occurrences (in project) of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
        ;

    /** Amount added by {@link #swapCJK(int)} to every code point that is not in one of the CJK ranges. */
    static final int NON_CJK_OFFSET = 0x110000;

    // private void throwError(String title, int cp) {
    // throw new IllegalArgumentException(title + "\t" + Utility.hex(cp, 6) + "\t" +
    // Utility.hex(getImplicitFromRaw(cp) & fourBytes));
    // }
    //
    // private void throwError(String title, long ce) {
    // throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes));
    // }
    //
    // private void show(int i) {
    // if (i >= 0 && i <= MAX_INPUT) {
    // System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes));
    // }
    // }

    /**
     * Precomputed by constructor
     */
    int final3Multiplier; // step between consecutive final bytes, 3-byte form (gap3 + 1)
    int final4Multiplier; // step between consecutive final bytes, 4-byte form (gap4 + 1)
    int final3Count;      // number of distinct final-byte values, 3-byte form
    int final4Count;      // number of distinct final-byte values, 4-byte form
    int medialCount;      // number of values a non-final trail byte can take
    int min3Primary;      // first lead byte (start of the 3-byte forms)
    int min4Primary;      // first lead byte of the 4-byte forms
    int max4Primary;      // last lead byte
    int minTrail;
    int maxTrail;
    int max3Trail;        // largest final byte used by the 3-byte form
    int max4Trail;        // largest final byte used by the 4-byte form
    int min4Boundary;     // first raw value that is encoded with 4 bytes

    /**
     * @return the gap left between consecutive final bytes of the 4-byte form
     *         (the 4-byte multiplier minus one)
     */
    public int getGap4() {
        return final4Multiplier - 1;
    }

    /**
     * @return the gap left between consecutive final bytes of the 3-byte form
     *         (the 3-byte multiplier minus one)
     */
    public int getGap3() {
        return final3Multiplier - 1;
    }

    // old comment
    // we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
    // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
    // we shift so that HAN all has the same first primary, for compression.
    // for the 4 byte case, we make the gap as large as we can fit.

    /**
     * Supply parameters for generating implicit CEs.
     * Uses trail bytes 0x04..0xFE, a 3-byte gap of 1 and a single 3-byte lead byte.
     *
     * @param minPrimary The minimum primary (lead byte) value.
     * @param maxPrimary The maximum primary (lead byte) value.
     * @throws IllegalArgumentException if the lead-byte range is invalid
     */
    public ImplicitCEGenerator(int minPrimary, int maxPrimary) {
        // Historical note kept from the original code (the 4-byte gap is now computed
        // by the full constructor, not passed in):
        // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
        this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
    }

    /**
     * Set up to generate implicits.
     * @param minPrimary The minimum primary value.
     * @param maxPrimary The maximum primary value.
     * @param minTrail final byte
     * @param maxTrail final byte
     * @param gap3 the gap we leave for tailoring for 3-byte forms
     * @param primaries3count number of 3-byte primarys we can use (normally 1)
     * @throws IllegalArgumentException if a byte range is empty or outside 0..0xFF,
     *         if {@code gap3} is negative, if {@code primaries3count} is less than 1 or
     *         leaves no lead byte for the 4-byte forms, or if the remaining lead bytes
     *         cannot cover all raw values
     */
    public ImplicitCEGenerator(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
        // some simple parameter checks
        if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
            throw new IllegalArgumentException("bad lead bytes");
        }
        if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) {
            throw new IllegalArgumentException("bad trail bytes");
        }
        if (primaries3count < 1) {
            throw new IllegalArgumentException("bad three-byte primaries");
        }
        // A negative gap makes the multiplier zero or negative, which would otherwise
        // surface as a divide-by-zero or as nonsensical counts below.
        if (gap3 < 0) {
            throw new IllegalArgumentException("bad gap3");
        }
        // now determine where the 3/4 boundary is.
        // we use 3 bytes below the boundary, and 4 above
        int primariesAvailable = maxPrimary - minPrimary + 1;
        int primaries4count = primariesAvailable - primaries3count;
        // At least one lead byte must remain for the 4-byte forms; otherwise the
        // per-primary division below would divide by zero (or by a negative count).
        if (primaries4count < 1) {
            throw new IllegalArgumentException("bad three-byte primaries");
        }

        this.minTrail = minTrail;
        this.maxTrail = maxTrail;
        min3Primary = minPrimary;
        max4Primary = maxPrimary;

        // compute constants for use later.
        // number of values we can use in trailing bytes
        // leave room for empty values between AND above, e.g. if gap = 2
        // range 3..7 => +3 -4 -5 -6 -7: so 1 value
        // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
        // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
        final3Multiplier = gap3 + 1;
        final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
        max3Trail = minTrail + (final3Count - 1) * final3Multiplier;

        // medials can use full range
        medialCount = (maxTrail - minTrail + 1);
        // find out how many values fit in each form
        int threeByteCount = medialCount * final3Count;

        int min3ByteCoverage = primaries3count * threeByteCount;
        min4Primary = minPrimary + primaries3count;
        min4Boundary = min3ByteCoverage;

        // Now expand out the multiplier for the 4 bytes, and redo.
        int totalNeeded = MAX_INPUT - min4Boundary;
        int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
        if (DEBUG) {
            System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
        }
        int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
        if (DEBUG) {
            System.out.println("neededPerFinalByte: " + neededPerFinalByte);
        }
        int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
        if (DEBUG) {
            System.out.println("expandedGap: " + gap4);
        }
        if (gap4 < 1) {
            throw new IllegalArgumentException("must have larger gap4s");
        }
        final4Multiplier = gap4 + 1;
        final4Count = neededPerFinalByte;
        max4Trail = minTrail + (final4Count - 1) * final4Multiplier;

        // sanity check: the 4-byte forms must be able to hold every raw value
        if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
            throw new IllegalArgumentException("internal error");
        }
        if (DEBUG) {
            System.out.println("final4Count: " + final4Count);
            for (int counter = 0; counter < final4Count; ++counter) {
                // same formula getImplicitFromRaw uses for the final byte of a 4-byte form
                int value = minTrail + counter * final4Multiplier;
                System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
            }
        }
    }

    /**
     * Integer division rounding up, computed as {@code 1 + (a - 1) / b}.
     * This is the exact ceiling of {@code a / b} for {@code a >= 1} and {@code b >= 1};
     * for {@code a <= 0} Java's truncating division means the result is not
     * necessarily the mathematical ceiling.
     *
     * @param a dividend
     * @param b divisor, must not be zero
     * @return {@code 1 + (a - 1) / b}
     */
    public static int divideAndRoundUp(int a, int b) {
        return 1 + (a-1)/b;
    }

    /**
     * Converts implicit CE into raw integer
     * @param implicit The implicit value passed.
     * @return -1 if illegal format
     */
    public int getRawFromImplicit(int implicit) {
        int result;
        // split into bytes, b0 being the lead (most significant) byte
        int b3 = implicit & 0xFF;
        implicit >>= 8;
        int b2 = implicit & 0xFF;
        implicit >>= 8;
        int b1 = implicit & 0xFF;
        implicit >>= 8;
        int b0 = implicit & 0xFF;

        // simple parameter checks
        if (b0 < min3Primary || b0 > max4Primary
                || b1 < minTrail || b1 > maxTrail) {
            return -1;
        }
        // normal offsets
        b1 -= minTrail;

        // take care of the final values, and compose
        if (b0 < min4Primary) {
            // 3-byte form: lead, medial, final, and an empty fourth byte
            if (b2 < minTrail || b2 > max3Trail || b3 != 0) {
                return -1;
            }
            b2 -= minTrail;
            int remainder = b2 % final3Multiplier;
            if (remainder != 0) {
                return -1; // final byte falls inside a tailoring gap
            }
            b0 -= min3Primary;
            b2 /= final3Multiplier;
            result = ((b0 * medialCount) + b1) * final3Count + b2;
        } else {
            // 4-byte form: lead, two medials, final
            if (b2 < minTrail || b2 > maxTrail
                    || b3 < minTrail || b3 > max4Trail) {
                return -1;
            }
            b2 -= minTrail;
            b3 -= minTrail;
            int remainder = b3 % final4Multiplier;
            if (remainder != 0) {
                return -1; // final byte falls inside a tailoring gap
            }
            b3 /= final4Multiplier;
            b0 -= min4Primary;
            result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
        }
        // final check
        if (result < 0 || result > MAX_INPUT) {
            return -1;
        }
        return result;
    }

    /**
     * Generate the implicit CE, from raw integer.
     * Left shifted to put the first byte at the top of an int.
     * @param cp code point
     * @return Primary implicit weight
     * @throws IllegalArgumentException if {@code cp} is negative or greater than
     *         {@code MAX_INPUT}, or if the computed lead byte falls outside its range
     */
    public int getImplicitFromRaw(int cp) {
        if (cp < 0 || cp > MAX_INPUT) {
            throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
        }
        int last0 = cp - min4Boundary;
        if (last0 < 0) {
            // below the 3/4 boundary: 3-byte form
            int last1 = cp / final3Count;
            last0 = cp % final3Count;

            int last2 = last1 / medialCount;
            last1 %= medialCount;

            last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
            last1 = minTrail + last1; // offset
            last2 = min3Primary + last2; // offset

            if (last2 >= min4Primary) {
                throw new IllegalArgumentException("4-byte out of range: " +
                        Utility.hex(cp) + ", " + Utility.hex(last2));
            }
            return (last2 << 24) + (last1 << 16) + (last0 << 8);
        } else {
            // at or above the boundary: 4-byte form
            int last1 = last0 / final4Count;
            last0 %= final4Count;

            int last2 = last1 / medialCount;
            last1 %= medialCount;

            int last3 = last2 / medialCount;
            last2 %= medialCount;

            last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
            last1 = minTrail + last1; // offset
            last2 = minTrail + last2; // offset
            last3 = min4Primary + last3; // offset

            if (last3 > max4Primary) {
                throw new IllegalArgumentException("4-byte out of range: " +
                        Utility.hex(cp) + ", " + Utility.hex(last3));
            }
            return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
        }
    }

    /**
     * Gets an Implicit from a code point. Internally,
     * swaps (which produces a raw value 0..220000),
     * then converts raw to implicit.
     * @param cp The code point to convert to implicit.
     * @return Primary implicit weight
     */
    public int getImplicitFromCodePoint(int cp) {
        if (DEBUG) {
            System.out.println("Incoming: " + Utility.hex(cp));
        }
        // Produce Raw value
        // note, we add 1 so that the first value is always empty!!
        cp = ImplicitCEGenerator.swapCJK(cp) + 1;
        // we now have a range of numbers from 0 to 220000.
        if (DEBUG) {
            System.out.println("CJK swapped: " + Utility.hex(cp));
        }
        return getImplicitFromRaw(cp);
    }

    /**
     * Function used to:
     * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
     * b) bump any non-CJK characters by 110000.
     * The relevant blocks are:
     * A: 4E00..9FFF; CJK Unified Ideographs
     * F900..FAFF; CJK Compatibility Ideographs
     * B: 3400..4DBF; CJK Unified Ideographs Extension A
     * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
     * As long as
     * no new B characters are allocated between 4E00 and FAFF, and
     * no new A characters are outside of this range,
     * (very high probability) this simple code will work.
     * The reordered blocks are:
     * Block1 is CJK
     * Block2 is CJK_COMPAT_USED
     * Block3 is CJK_A
     * (all contiguous)
     * Any other CJK gets its normal code point
     * Any non-CJK gets +110000
     * When we reorder Block1, we make sure that it is at the very start,
     * so that it will use a 3-byte form.
     * Warning: we only pick up the compatibility characters that are
     * NOT decomposed, so that block is smaller!
     *
     * @param i the code point
     * @return the swapped value (the raw value minus one)
     */
    public static int swapCJK(int i) {
        if (i >= CJK_BASE) {
            if (i < CJK_LIMIT) {
                return i - CJK_BASE;
            }
            if (i < CJK_COMPAT_USED_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_COMPAT_USED_LIMIT) {
                return i - CJK_COMPAT_USED_BASE
                        + (CJK_LIMIT - CJK_BASE);
            }
            if (i < CJK_B_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_B_LIMIT) {
                return i; // non-BMP-CJK
            }
            if (i < CJK_C_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_C_LIMIT) {
                return i; // non-BMP-CJK
            }
            if (i < CJK_D_BASE) {
                return i + NON_CJK_OFFSET;
            }
            if (i < CJK_D_LIMIT) {
                return i; // non-BMP-CJK
            }
            return i + NON_CJK_OFFSET; // non-CJK
        }
        if (i < CJK_A_BASE) {
            return i + NON_CJK_OFFSET;
        }
        if (i < CJK_A_LIMIT) {
            return i - CJK_A_BASE
                    + (CJK_LIMIT - CJK_BASE)
                    + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
        }
        return i + NON_CJK_OFFSET; // non-CJK
    }

    /**
     * @return Minimal trail value
     */
    public int getMinTrail() {
        return minTrail;
    }

    /**
     * @return Maximal trail value
     */
    public int getMaxTrail() {
        return maxTrail;
    }

    /**
     * Converts a raw value back to a code point, undoing the "+ 1" of
     * {@link #getRawFromCodePoint(int)} and the reordering done by {@link #swapCJK(int)}.
     *
     * @param i the raw value
     * @return the code point, or -1 if {@code i} is not positive or lies between the
     *         end of the compacted CJK / CJK_COMPAT_USED / CJK_A region and
     *         {@code CJK_B_BASE}, where {@link #swapCJK(int)} produces no values
     */
    public int getCodePointFromRaw(int i) {
        // Raw values start at 1 (0 is reserved as "empty"), and -1 is the error value of
        // getRawFromImplicit; neither corresponds to a code point.
        if (i <= 0) {
            return -1;
        }
        i--;
        if (i >= NON_CJK_OFFSET) {
            return i - NON_CJK_OFFSET;
        }
        if (i >= CJK_B_BASE) {
            return i;
        }
        // rest of CJKs, compacted: [CJK][CJK_COMPAT_USED][CJK_A], starting at 0
        final int cjkCount = CJK_LIMIT - CJK_BASE;
        final int compatCount = CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE;
        // The region ends after the *size* of CJK_A, not after CJK_A_LIMIT itself.
        final int cjkACount = CJK_A_LIMIT - CJK_A_BASE;
        if (i < cjkCount) {
            return i + CJK_BASE;
        }
        if (i < cjkCount + compatCount) {
            return i + CJK_COMPAT_USED_BASE - cjkCount;
        }
        if (i < cjkCount + compatCount + cjkACount) {
            return i + CJK_A_BASE - cjkCount - compatCount;
        }
        return -1;
    }

    /**
     * Converts a code point to its raw value.
     *
     * @param i the code point
     * @return {@code swapCJK(i) + 1}
     */
    public int getRawFromCodePoint(int i) {
        return swapCJK(i)+1;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy