com.ibm.icu.charset.UConverterAlias Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j-charset Show documentation
Show all versions of icu4j-charset Show documentation
icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2006-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.io.IOException;
import java.nio.ByteBuffer;
import com.ibm.icu.impl.ICUBinary;
final class UConverterAlias {
/*
 * Normalization markers. haveAliasData() requires the first entry of the
 * option table to equal STD_NORMALIZED; UNNORMALIZED is not referenced in
 * this class (presumably the other legal value of that entry - TODO confirm).
 */
static final int UNNORMALIZED = 0;
static final int STD_NORMALIZED = 1;
/* Bit tested by findConverter() on a gUntaggedConvArray entry; when set, the alias is reported as ambiguous. */
static final int AMBIGUOUS_ALIAS_MAP_BIT = 0x8000;
/* Not read by any live code in this class; the commented-out C code in findConverter() tests it on gUntaggedConvArray entries. */
static final int CONTAINS_OPTION_BIT = 0x4000;
/* Mask applied by findConverter() to a gUntaggedConvArray entry to obtain the index into gConverterList. */
static final int CONVERTER_INDEX_MASK = 0xFFF;
/* Not referenced in this class. */
static final int NUM_RESERVED_TAGS = 2;
/* Number of trailing gTagList entries that the tagged lookups refuse to accept as a standard. */
static final int NUM_HIDDEN_TAGS = 1;
/*
 * Tables filled in by haveAliasData(). The char[] tables hold 16-bit values;
 * entries that name a string are passed to GET_STRING()/GET_NORMALIZED_STRING().
 */
/* String-table indexes of the converter names. */
static char[] gConverterList = null;
/* String-table indexes of the tag (standard) names. */
static char[] gTagList = null;
/* String-table indexes of the aliases, probed by the binary search in findConverter(). */
static char[] gAliasList = null;
/* Parallel to gAliasList: converter index plus flag bits for each alias. */
static char[] gUntaggedConvArray = null;
/* Addressed as [tag * gConverterList.length + converter]; each entry is an offset into gTaggedAliasLists, 0 meaning "no list". */
static char[] gTaggedAliasArray = null;
/* Alias lists; the entry at a list offset is the list's count and the following entries are string-table indexes. */
static char[] gTaggedAliasLists = null;
/* Option table; only entry 0 is read in this class. */
static char[] gOptionTable = null;
/* Raw bytes of the string table; strings are zero-terminated and addressed at byte offset 2 * index. */
static byte[] gStringTable = null;
/* Raw bytes of the normalized string table, addressed the same way as gStringTable. */
static byte[] gNormalizedStringTable = null;
/**
 * Fetches the zero-terminated string that starts at the given position of
 * the main string table.
 *
 * @param idx position in 16-bit units; it is doubled to form the byte offset
 * @return the string found at that position
 */
private static final String GET_STRING(int idx) {
    final int byteOffset = idx * 2;
    return extractString(gStringTable, byteOffset);
}
/**
 * Fetches the zero-terminated string that starts at the given position of
 * the normalized string table.
 *
 * @param idx position in 16-bit units; it is doubled to form the byte offset
 * @return the string found at that position
 */
private static final String GET_NORMALIZED_STRING(int idx) {
    final int byteOffset = idx * 2;
    return extractString(gNormalizedStringTable, byteOffset);
}
/**
 * Decodes the zero-terminated byte string that begins at {@code sBegin}.
 * Every byte is widened to one char as an unsigned value (0..255).
 *
 * @param sArray the byte table to read from
 * @param sBegin byte offset of the first character
 * @return the decoded string, without the terminator
 */
private static final String extractString(byte[] sArray, int sBegin) {
    final char[] chars = new char[strlen(sArray, sBegin)];
    int source = sBegin;
    for (int target = 0; target < chars.length; target++, source++) {
        chars[target] = (char) (sArray[source] & 0xff);
    }
    return String.valueOf(chars);
}
/**
 * Counts the bytes of the string that starts at {@code sBegin}, up to but
 * not including the first zero byte.
 *
 * If the array ends before a zero byte is found, the string is taken to run
 * to the end of the array, so every remaining byte is counted. A start
 * offset at or beyond the end of the array yields 0.
 *
 * @param sArray the byte table to scan
 * @param sBegin byte offset at which the string starts
 * @return the number of bytes before the terminator (never negative)
 */
private static final int strlen(byte[] sArray, int sBegin)
{
    int end = sBegin;
    // Test the byte before advancing so that running off the end of the
    // array and hitting a terminator are not confused with each other.
    while (end < sArray.length && sArray[end] != 0) {
        end++;
    }
    return end - sBegin;
}
/*
 * Slots of the table-of-contents array that haveAliasData() obtains from
 * the data reader. Slot 0 holds the TOC length; each later slot holds the
 * size of one table (the string-table sizes are doubled to get a byte count).
 */
/*private*/ static final int tocLengthIndex = 0;
private static final int converterListIndex = 1;
private static final int tagListIndex = 2;
private static final int aliasListIndex = 3;
private static final int untaggedConvArrayIndex = 4;
private static final int taggedAliasArrayIndex = 5;
private static final int taggedAliasListsIndex = 6;
private static final int optionTableIndex = 7;
private static final int stringTableIndex = 8;
private static final int normalizedStringTableIndex = 9;
/* Smallest TOC length accepted from the file; the count excludes the tocLengthIndex slot itself. */
private static final int minTocLength = 9;
/* Number of TOC slots requested from the reader (originally the length of the swapper's temporary offsets[]). */
private static final int offsetsCount = minTocLength + 1;
/* Non-null once haveAliasData() has loaded the tables; only the null check matters, the buffer itself is empty. */
static ByteBuffer gAliasData = null;
/**
 * Tells whether the argument can be looked up as an alias at all.
 *
 * @param alias the candidate alias
 * @return {@code true} for any non-empty string, {@code false} for ""
 * @throws IllegalArgumentException if {@code alias} is {@code null}
 */
private static final boolean isAlias(String alias) {
    if (alias != null) {
        return !alias.isEmpty();
    }
    throw new IllegalArgumentException("Alias param is null!");
}
/** Name of the binary data item that holds the converter alias tables. */
private static final String CNVALIAS_DATA_FILE_NAME = "cnvalias.icu";

/**
 * Makes sure the alias tables are loaded, reading them on the first call.
 *
 * The tables are consumed from the data buffer strictly in table-of-contents
 * order, so the sequence of reads below must not be rearranged.
 *
 * @return always {@code true}; failures are reported by exception
 * @throws IOException if the data cannot be read, its table of contents is
 *         too short, or it uses an unsupported normalization
 */
private static final synchronized boolean haveAliasData()
        throws IOException {
    if (gAliasData != null) {
        return true; // already loaded
    }

    final ByteBuffer bytes = ICUBinary.getRequiredData(CNVALIAS_DATA_FILE_NAME);
    final UConverterAliasDataReader reader = new UConverterAliasDataReader(bytes);
    final int[] toc = reader.readToc(offsetsCount);
    if (toc[0] < minTocLength) {
        throw new IOException("Invalid data format.");
    }

    // 16-bit tables, in file order.
    gConverterList = ICUBinary.getChars(bytes, toc[converterListIndex], 0);
    gTagList = ICUBinary.getChars(bytes, toc[tagListIndex], 0);
    gAliasList = ICUBinary.getChars(bytes, toc[aliasListIndex], 0);
    gUntaggedConvArray = ICUBinary.getChars(bytes, toc[untaggedConvArrayIndex], 0);
    gTaggedAliasArray = ICUBinary.getChars(bytes, toc[taggedAliasArrayIndex], 0);
    gTaggedAliasLists = ICUBinary.getChars(bytes, toc[taggedAliasListsIndex], 0);
    gOptionTable = ICUBinary.getChars(bytes, toc[optionTableIndex], 0);

    // String tables are kept as raw bytes; the TOC gives their size in 16-bit units.
    gStringTable = new byte[toc[stringTableIndex] * 2];
    bytes.get(gStringTable);
    gNormalizedStringTable = new byte[toc[normalizedStringTableIndex] * 2];
    bytes.get(gNormalizedStringTable);

    if (gOptionTable[0] != STD_NORMALIZED) {
        throw new IOException("Unsupported alias normalization");
    }

    // An empty buffer stands in for the memory-mapped data object of the C
    // implementation; its only job is to be non-null from now on.
    gAliasData = ByteBuffer.allocate(0);
    return true;
}
// U_CFUNC const char * io_getConverterName(const char *alias, UErrorCode
// *pErrorCode)
// public static final String io_getConverterName(String alias)
// throws IOException{
// if (haveAliasData() && isAlias(alias)) {
// boolean[] isAmbigous = new boolean[1];
// int convNum = findConverter(alias, isAmbigous);
// if (convNum < gConverterList.length) {
// return GET_STRING(gConverterList[(int) convNum]);
// }
// /* else converter not found */
// }
// return null;
// }
/**
 * Looks an alias up in the sorted alias list by binary search and returns
 * the index of its converter in gConverterList.
 *
 * The alias is normalized with stripForCompare() and compared against the
 * normalized string table. The search stops when the probe position no
 * longer changes.
 *
 * @param alias the alias to find
 * @param isAmbigous one-element out parameter; element 0 is set to
 *        {@code true} when the matched entry carries AMBIGUOUS_ALIAS_MAP_BIT
 *        (it is never reset to {@code false} here)
 * @return the converter index, or {@code Integer.MAX_VALUE} if not found
 */
// static U_INLINE uint32_t findConverter(const char *alias, UErrorCode
// *pErrorCode)
private static final int findConverter(String alias, boolean[] isAmbigous) {
    final String key = stripForCompare(new StringBuilder(), alias).toString();

    int low = 0;
    int high = gUntaggedConvArray.length;
    int previousProbe = Integer.MAX_VALUE;

    while (true) {
        final int probe = (low + high) / 2;
        if (probe == previousProbe) {
            /* The probe did not move, so the alias is not in the list. */
            return Integer.MAX_VALUE;
        }
        previousProbe = probe;

        final int order = key.compareTo(GET_NORMALIZED_STRING(gAliasList[probe]));
        if (order == 0) {
            /*
             * The generating tool folds duplicates into one entry, so the
             * alias itself is unique, but different standards may still map
             * it to different converters; the flag bit records that.
             */
            final char entry = gUntaggedConvArray[probe];
            if ((entry & AMBIGUOUS_ALIAS_MAP_BIT) != 0) {
                isAmbigous[0] = true;
            }
            /* The "contains option" information of the C implementation is not ported. */
            return entry & CONVERTER_INDEX_MASK;
        }
        if (order < 0) {
            high = probe;
        } else {
            low = probe;
        }
    }
}
/**
 * Normalizes a converter or alias name for comparison: letters are
 * lower-cased, every character that is not an ASCII letter or digit is
 * dropped, and leading zeros of a digit run are dropped.
 *
 * @param dst the builder the normalized name is appended to
 * @param name the alias to strip
 * @return {@code dst}
 */
public static final StringBuilder stripForCompare(StringBuilder dst, String name) {
    return io_stripASCIIForCompare(dst, name);
}

/* Character classes stored in asciiTypes. */
private static final byte IGNORE = 0;
private static final byte ZERO = 1;
private static final byte NONZERO = 2;
static final byte MINLETTER = 3; /* any values from here on are lowercase letter mappings */

/* character types for ASCII 00..7F */
static final byte asciiTypes[] = new byte[] {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, NONZERO, 0, 0, 0, 0, 0, 0,
    0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0,
    0, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0, 0, 0, 0, 0
};

/**
 * Classifies one character: IGNORE, ZERO, NONZERO, or (for an ASCII letter)
 * the lower-case form of that letter. Characters outside the table are
 * classified as IGNORE.
 */
private static final char GET_CHAR_TYPE(char c) {
    return (char)((c < asciiTypes.length) ? asciiTypes[c] : (char)IGNORE);
}

/**
 * Cursor that walks a name and yields its significant characters one at a
 * time, already normalized. It is the single implementation of the
 * normalization rules shared by io_stripASCIIForCompare() and
 * compareNames(), so the two cannot drift apart.
 */
private static final class NormalizedNameReader {
    private final String name;
    private int index = 0;
    /* true while the previously delivered character was a non-zero digit or a zero following one */
    private boolean afterDigit = false;

    NormalizedNameReader(String name) {
        this.name = name;
    }

    /**
     * @return the next significant character (a digit or a lower-case
     *         letter), or 0 once the name is exhausted; 0 can never be a
     *         delivered character because U+0000 is classified as IGNORE
     */
    char next() {
        while (index < name.length()) {
            char c = name.charAt(index++);
            char type = GET_CHAR_TYPE(c);
            switch (type) {
            case IGNORE:
                afterDigit = false;
                continue; /* ignore all but letters and digits */
            case ZERO:
                if (!afterDigit && index < name.length()) {
                    char nextType = GET_CHAR_TYPE(name.charAt(index));
                    if (nextType == ZERO || nextType == NONZERO) {
                        continue; /* ignore leading zero before another digit */
                    }
                }
                return c;
            case NONZERO:
                afterDigit = true;
                return c;
            default:
                afterDigit = false;
                return type; /* lowercased letter */
            }
        }
        return 0;
    }
}

/** @see UConverterAlias#compareNames */
private static final StringBuilder io_stripASCIIForCompare(StringBuilder dst, String name) {
    NormalizedNameReader reader = new NormalizedNameReader(name);
    for (char c = reader.next(); c != 0; c = reader.next()) {
        dst.append(c);
    }
    return dst;
}

/**
 * Do a fuzzy compare of two converter/alias names. The comparison is
 * case-insensitive. It also ignores the characters '-', '_', and ' ' (dash,
 * underscore, and space) and every other character that is not an ASCII
 * letter or digit, as well as leading zeros of numbers. Thus the strings
 * "UTF-8", "utf_8", "Utf 8" and "UTF-08" are exactly equivalent.
 *
 * The result is the same as comparing the stripForCompare() forms of the
 * two names character by character, with a shorter name sorting before any
 * longer name it is a prefix of.
 *
 * Swapping the arguments only flips the sign of the result. This is an
 * important property for sorting the list (when the list is preprocessed
 * into binary form) and for performing binary searches on it at run time.
 *
 * @param name1
 *            a converter name or alias
 * @param name2
 *            a converter name or alias
 * @return 0 if the names match, or a negative value if the name1 lexically
 *         precedes name2, or a positive value if the name1 lexically
 *         follows name2.
 *
 * @see UConverterAlias#stripForCompare
 */
static int compareNames(String name1, String name2){
    NormalizedNameReader reader1 = new NormalizedNameReader(name1);
    NormalizedNameReader reader2 = new NormalizedNameReader(name2);
    for (;;) {
        /* Each reader yields 0 at its end, so an exhausted name sorts first. */
        char c1 = reader1.next();
        char c2 = reader2.next();
        if (c1 != c2) {
            return (int)c1 - (int)c2;
        }
        if (c1 == 0) {
            /* Both names ended together after matching throughout. */
            return 0;
        }
    }
}
/**
 * Counts the aliases of the converter that the given alias belongs to,
 * using the list stored under the last tag of the tag list (the ALL tag).
 *
 * @param alias any alias of the converter
 * @return the alias count, or 0 if the alias is empty, the converter is
 *         unknown, or it has no list under the ALL tag
 * @throws IOException if the alias data cannot be loaded
 * @throws IllegalArgumentException if {@code alias} is {@code null}
 */
static int io_countAliases(String alias)
        throws IOException {
    if (!haveAliasData() || !isAlias(alias)) {
        return 0;
    }
    final int convNum = findConverter(alias, new boolean[1]);
    if (convNum >= gConverterList.length) {
        return 0; /* converter not found */
    }
    /* tagListNum - 1 is the ALL tag */
    final int allTagRow = gTagList.length - 1;
    final int listOffset = gTaggedAliasArray[allTagRow * gConverterList.length + convNum];
    if (listOffset == 0) {
        return 0; /* this shouldn't happen. internal program error */
    }
    return gTaggedAliasLists[listOffset];
}
/**
* Return the number of all aliases (and converter names).
*
* @return the number of all aliases
*/
// U_CFUNC uint16_t io_countTotalAliases(UErrorCode *pErrorCode);
// static int io_countTotalAliases() throws IOException{
// if (haveAliasData()) {
// return (int) gAliasList.length;
// }
// return 0;
// }
// U_CFUNC const char * io_getAlias(const char *alias, uint16_t n,
// UErrorCode *pErrorCode)
/**
 * Returns the n-th alias of the converter that the given alias belongs to,
 * taken from the list stored under the last tag of the tag list (the ALL
 * tag).
 *
 * @param alias any alias of the converter
 * @param n zero-based position within the converter's alias list; valid
 *        values are 0 to io_countAliases(alias) - 1
 * @return the alias, or {@code null} if the alias is empty, the converter is
 *         unknown, or {@code n} is outside the list
 * @throws IOException if the alias data cannot be loaded
 * @throws IllegalArgumentException if {@code alias} is {@code null}
 */
static String io_getAlias(String alias, int n) throws IOException{
    if (haveAliasData() && isAlias(alias)) {
        boolean[] isAmbigous = new boolean[1];
        int convNum = findConverter(alias,isAmbigous);
        if (convNum < gConverterList.length) {
            /* tagListNum - 1 is the ALL tag */
            int listOffset = gTaggedAliasArray[(gTagList.length - 1)
                    * gConverterList.length + convNum];
            if (listOffset != 0) {
                int listCount = gTaggedAliasLists[listOffset];
                /*
                 * Lists are stored back to back, so an unchecked n would
                 * silently read an entry that belongs to another list.
                 */
                if (0 <= n && n < listCount) {
                    /* +1 to skip listCount */
                    int currListArrayIndex = listOffset + 1;
                    return GET_STRING(gTaggedAliasLists[currListArrayIndex + n]);
                }
                /* else n is out of bounds */
            }
            /* else this shouldn't happen. internal program error */
        }
        /* else converter not found */
    }
    return null;
}
// U_CFUNC uint16_t io_countStandards(UErrorCode *pErrorCode) {
// static int io_countStandards() throws IOException{
// if (haveAliasData()) {
// return (int) (gTagList.length - NUM_HIDDEN_TAGS);
// }
// return 0;
// }
// U_CAPI const char * U_EXPORT2getStandard(uint16_t n, UErrorCode
// *pErrorCode)
// static String getStandard(int n) throws IOException{
// if (haveAliasData()) {
// return GET_STRING(gTagList[n]);
// }
// return null;
// }
// U_CAPI const char * U_EXPORT2 getStandardName(const char *alias, const
// char *standard, UErrorCode *pErrorCode)
/**
 * Returns the name that the given standard prefers for the converter
 * identified by {@code alias}.
 *
 * @param alias any alias of the converter
 * @param standard the name of the standard (tag) to consult
 * @return the first entry of the alias list stored for that standard and
 *         converter, or {@code null} if there is none
 * @throws IOException if the alias data cannot be loaded
 * @throws IllegalArgumentException if {@code alias} is {@code null}
 */
static final String getStandardName(String alias, String standard)throws IOException {
    if (haveAliasData() && isAlias(alias)) {
        int listOffset = findTaggedAliasListsOffset(alias, standard);
        /*
         * listOffset is 0 for "no list" and Integer.MAX_VALUE for "not
         * found"; comparing against length - 1 keeps listOffset + 1 inside
         * the array without overflowing.
         */
        if (0 < listOffset && listOffset < gTaggedAliasLists.length - 1) {
            /* +1 to skip the list count; the first name is the preferred one */
            int currListArrayIndex = listOffset + 1;
            if (gTaggedAliasLists[currListArrayIndex] != 0) {
                return GET_STRING(gTaggedAliasLists[currListArrayIndex]);
            }
            /* else the list has no preferred name */
        }
    }
    return null;
}
// U_CAPI uint16_t U_EXPORT2 countAliases(const char *alias, UErrorCode
// *pErrorCode)
/**
 * Counts the aliases of a converter; delegates to io_countAliases().
 *
 * @param alias any alias of the converter
 * @return the value returned by io_countAliases()
 * @throws IOException if the alias data cannot be loaded
 */
static int countAliases(String alias) throws IOException {
    final int count = io_countAliases(alias);
    return count;
}
// U_CAPI const char* U_EXPORT2 getAlias(const char *alias, uint16_t n,
// UErrorCode *pErrorCode)
/**
 * Returns one alias of a converter; delegates to io_getAlias().
 *
 * @param alias any alias of the converter
 * @param n zero-based position of the wanted alias
 * @return the value returned by io_getAlias()
 * @throws IOException if the alias data cannot be loaded
 */
static String getAlias(String alias, int n) throws IOException {
    final String found = io_getAlias(alias, n);
    return found;
}
// U_CFUNC uint16_t countStandards(void)
// static int countStandards()throws IOException{
// return io_countStandards();
// }
/**
 * Returns a single name from the list of available converters.
 *
 * @param n zero-based position in the list; values outside 0..0xffff are
 *        rejected without consulting the list
 * @return the converter name, or {@code null} if {@code n} is out of bounds
 *         or the alias data could not be loaded
 */
static String getAvailableName (int n){
    if (n < 0 || n > 0xffff) {
        return null;
    }
    try {
        return bld_getAvailableConverter(n);
    } catch (IOException ignored) {
        // Deliberately dropped: a load failure is reported as "no name".
        return null;
    }
}
// U_CAPI const char * U_EXPORT2 getCanonicalName(const char *alias, const
// char *standard, UErrorCode *pErrorCode) {
/**
 * Returns the converter name that the given alias resolves to under the
 * given standard.
 *
 * @param alias any alias of the converter
 * @param standard the name of the standard (tag) to consult
 * @return the converter name, or {@code null} if the alias is empty or no
 *         converter is found for that alias and standard
 * @throws IOException if the alias data cannot be loaded
 * @throws IllegalArgumentException if {@code alias} is {@code null}
 */
static String getCanonicalName(String alias, String standard) throws IOException {
    if (!haveAliasData() || !isAlias(alias)) {
        return null;
    }
    final int convNum = findTaggedConverterNum(alias, standard);
    return (convNum < gConverterList.length)
            ? GET_STRING(gConverterList[convNum])
            : null;
}
/**
 * Counts the available converters.
 *
 * @return the number of converters, or -1 if the alias data could not be
 *         loaded
 */
static int countAvailable (){
    int count = -1;
    try {
        count = bld_countAvailableConverters();
    } catch (IOException ignored) {
        // Deliberately dropped: a load failure is reported as -1.
    }
    return count;
}
// U_CAPI UEnumeration * U_EXPORT2 openStandardNames(const char *convName,
// const char *standard, UErrorCode *pErrorCode)
/* static final UConverterAliasesEnumeration openStandardNames(String convName, String standard)throws IOException {
UConverterAliasesEnumeration aliasEnum = null;
if (haveAliasData() && isAlias(convName)) {
int listOffset = findTaggedAliasListsOffset(convName, standard);
* When listOffset == 0, we want to acknowledge that the converter
* name and standard are okay, but there is nothing to enumerate.
if (listOffset < gTaggedAliasLists.length) {
UConverterAliasesEnumeration.UAliasContext context = new UConverterAliasesEnumeration.UAliasContext(listOffset, 0);
aliasEnum = new UConverterAliasesEnumeration();
aliasEnum.setContext(context);
}
else converter or tag not found
}
return aliasEnum;
}*/
// static uint32_t getTagNumber(const char *tagname)
/**
 * Finds the position of a tag (standard) name in the tag list. The match is
 * exact and case-sensitive.
 *
 * @param tagName the tag name to look for
 * @return the tag's index in gTagList, or {@code Integer.MAX_VALUE} if the
 *         tag list is not loaded or does not contain the name
 */
private static int getTagNumber(String tagName) {
    if (gTagList == null) {
        return Integer.MAX_VALUE;
    }
    int tagNum = 0;
    while (tagNum < gTagList.length) {
        if (tagName.equals(GET_STRING(gTagList[tagNum]))) {
            return tagNum;
        }
        tagNum++;
    }
    return Integer.MAX_VALUE;
}
// static uint32_t findTaggedAliasListsOffset(const char *alias, const char
// *standard, UErrorCode *pErrorCode)
/**
 * Finds the alias list that the given standard keeps for the converter
 * identified by {@code alias}.
 *
 * @param alias any alias of the converter
 * @param standard the name of the standard (tag) to consult
 * @return the offset of the list in gTaggedAliasLists; 0 if the converter
 *         and tag exist but the standard has no usable list for the alias;
 *         {@code Integer.MAX_VALUE} if the converter or tag is not found
 */
private static int findTaggedAliasListsOffset(String alias, String standard) {
    final int tagNum = getTagNumber(standard);
    final boolean[] isAmbigous = new boolean[1];
    /* Make a quick guess. Hopefully they used a TR22 canonical alias. */
    final int convNum = findConverter(alias, isAmbigous);

    if (tagNum >= (gTagList.length - NUM_HIDDEN_TAGS)
            || convNum >= gConverterList.length) {
        /* converter or tag not found */
        return Integer.MAX_VALUE;
    }

    final int convCount = gConverterList.length;
    final int direct = gTaggedAliasArray[tagNum * convCount + convNum];
    if (direct != 0 && gTaggedAliasLists[direct + 1] != 0) {
        return direct;
    }
    if (!isAmbigous[0]) {
        /* no default name */
        return 0;
    }

    /*
     * The alias is ambiguous, so every list of every tag has to be checked
     * for it. Whenever a list contains the alias, the converter owning that
     * list is tried under the requested tag.
     */
    for (int cell = 0; cell < gTaggedAliasArray.length; cell++) {
        final int candidate = gTaggedAliasArray[cell];
        if (candidate == 0 || !isAliasInList(alias, candidate)) {
            continue;
        }
        /* cells are laid out as tag * convCount + converter */
        final int owner = cell % convCount;
        final int wanted = gTaggedAliasArray[tagNum * convCount + owner];
        if (wanted != 0 && gTaggedAliasLists[wanted + 1] != 0) {
            return wanted;
        }
        /*
         * else keep on looking. This could be sped up by skipping to the
         * next row while an alias stays unique per row.
         */
    }
    /* The standard doesn't know about the alias */
    return 0;
}
/* Return the canonical name */
// static uint32_t findTaggedConverterNum(const char *alias, const char
// *standard, UErrorCode *pErrorCode)
/**
 * Finds the converter that the given alias names under the given standard.
 *
 * @param alias any alias of the converter
 * @param standard the name of the standard (tag) to consult
 * @return the converter's index in gConverterList, or
 *         {@code Integer.MAX_VALUE} if the tag or converter is unknown or
 *         the standard does not list the alias
 */
private static int findTaggedConverterNum(String alias, String standard) {
    final int tagNum = getTagNumber(standard);
    final boolean[] isAmbigous = new boolean[1];
    /* Make a quick guess. Hopefully they used a TR22 canonical alias. */
    final int convNum = findConverter(alias, isAmbigous);

    if (tagNum >= (gTagList.length - NUM_HIDDEN_TAGS)
            || convNum >= gConverterList.length) {
        /* converter or tag not found */
        return Integer.MAX_VALUE;
    }

    final int convCount = gConverterList.length;
    final int rowStart = tagNum * convCount;
    final int direct = gTaggedAliasArray[rowStart + convNum];
    if (direct != 0 && isAliasInList(alias, direct)) {
        return convNum;
    }

    if (isAmbigous[0]) {
        /*
         * The alias is ambiguous: search the lists of the requested tag
         * only, one converter at a time.
         */
        for (int column = 0; column < convCount; column++) {
            final int candidate = gTaggedAliasArray[rowStart + column];
            if (candidate != 0 && isAliasInList(alias, candidate)) {
                return column;
            }
        }
        /* The standard doesn't know about the alias */
    }
    /* no canonical name */
    return Integer.MAX_VALUE;
}
// static U_INLINE UBool isAliasInList(const char *alias, uint32_t
// listOffset)
/**
 * Tells whether an alias list contains a name that compareNames() treats
 * as equal to the given alias.
 *
 * @param alias the alias to look for
 * @param listOffset offset of the list in gTaggedAliasLists; 0 means "no list"
 * @return {@code true} if the list contains a matching name
 */
private static boolean isAliasInList(String alias, int listOffset) {
    if (listOffset == 0) {
        return false;
    }
    /* The entry at listOffset is the count; the names follow it. */
    final int first = listOffset + 1;
    final int end = first + gTaggedAliasLists[listOffset];
    for (int slot = first; slot < end; slot++) {
        final char nameIndex = gTaggedAliasLists[slot];
        /* 0 marks an empty slot */
        if (nameIndex != 0 && compareNames(alias, GET_STRING(nameIndex)) == 0) {
            return true;
        }
    }
    return false;
}
// begin bld.c
/* Converter names, built on first use by haveAvailableConverterList(); null until then. */
static String[] gAvailableConverters = null;
/* Number of valid entries in gAvailableConverters. */
static int gAvailableConverterCount = 0;
/* Not read or written anywhere in this class. */
static String gDefaultConverterName = null;
// static UBool haveAvailableConverterList(UErrorCode *pErrorCode)
/**
 * Makes sure the list of available converter names exists, building it
 * from the converter list on the first call. Every converter in the alias
 * data is listed; none is opened to check that it really works.
 *
 * This method takes no lock of its own (the C original guarded the final
 * assignment with a mutex).
 *
 * @return {@code true} once the list exists, {@code false} if the alias
 *         data is unavailable
 * @throws IOException if the alias data cannot be loaded
 */
static boolean haveAvailableConverterList() throws IOException {
    if (gAvailableConverters != null) {
        return true;
    }
    if (!haveAliasData()) {
        return false;
    }

    /* There cannot be more names than entries in the converter list. */
    final int total = gConverterList.length;
    final String[] names = new String[total];
    for (int i = 0; i < total; i++) {
        //TODO: Fix me - the converter is not opened to verify it
        names[i] = GET_STRING(gConverterList[i]);
    }

    // agljport:todo umtx_lock(NULL);
    if (gAvailableConverters == null) {
        gAvailableConverters = names;
        gAvailableConverterCount = total;
    }
    // agljport:todo umtx_unlock(NULL);
    return true;
}
// U_CFUNC uint16_t bld_countAvailableConverters(UErrorCode *pErrorCode)
/**
 * Counts the available converters, building the list first if needed.
 *
 * @return the number of available converters, or 0 if the list could not
 *         be built
 * @throws IOException if the alias data cannot be loaded
 */
static int bld_countAvailableConverters() throws IOException {
    return haveAvailableConverterList() ? gAvailableConverterCount : 0;
}
// U_CFUNC const char * bld_getAvailableConverter(uint16_t n, UErrorCode
// *pErrorCode)
/**
 * Returns one name from the list of available converters, building the
 * list first if needed.
 *
 * @param n zero-based position in the list
 * @return the converter name, or {@code null} if {@code n} is negative, not
 *         below the number of available converters, or the list could not
 *         be built
 * @throws IOException if the alias data cannot be loaded
 */
static String bld_getAvailableConverter(int n) throws IOException{
    if (haveAvailableConverterList()) {
        /* A negative index is out of bounds just like one past the end. */
        if (0 <= n && n < gAvailableConverterCount) {
            return gAvailableConverters[n];
        }
    }
    return null;
}
}