// com.ibm.icu.charset.CharsetICU Maven / Gradle / Ivy
// Show all versions of icu4j-charset Show documentation
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 2006-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import com.ibm.icu.text.UnicodeSet;
/**
* A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
* This API is used to convert codepage or character encoded data to and
* from UTF-16. You can open a converter with {@link Charset#forName} and {@link #forNameICU}. With that
* converter, you can get its properties, set options, convert your data.
*
* <p>Since many software programs recognize different converter names for
* different types of converters, there are other functions in this API to
* iterate over the converter aliases.
*
* <p>Note that {@link #name()} cannot always return a unique charset name.
* {@link Charset} documents that,
* for charsets listed in the IANA Charset Registry,
* the {@link #name()} must be listed there,
* and it “must be the MIME-preferred name” if there are multiple names.
*
* <p>However, there are different implementations of many if not most charsets,
* ICU provides multiple variants for some of them,
* ICU provides variants of some java.nio-system-supported charsets,
* and ICU users are free to add more variants.
* This is so that applications can be compatible with multiple implementations at the same time.
*
* <p>This is in conflict with the {@link Charset#name()} requirements.
* It is not possible to offer variants of an IANA charset and
* always use the MIME-preferred name and also have those names be unique.
*
* <p>{@link #name()} returns the MIME-preferred name, or IANA name,
* so that it can always be used for the charset field in internet protocols.
*
* <p>Same-name charsets are accessible via {@link Charset#forName} or {@link #forNameICU}
* by using unique aliases (e.g., the ICU-canonical names).
*
* <p>{@link Charset} also documents that
* “Two charsets are equal if, and only if, they have the same canonical names.”
* This is not possible.
*
* <p>Unfortunately, {@link Charset#equals} is final, and
* {@link Charset#availableCharsets} returns
* “a sorted map from canonical charset names to charset objects”.
* Since {@link #name()} cannot be unique,
* {@link #equals} cannot work properly in such cases, and
* {@link Charset#availableCharsets} can only include one variant for a name.
*
* @stable ICU 3.6
*/
public abstract class CharsetICU extends Charset{
// ICU-canonical converter name; assigned from the constructor argument of the same name.
String icuCanonicalName;
// NOTE(review): options and maxCharsPerByte are not written anywhere in this class;
// presumably they are populated by subclasses or converter code elsewhere in the package — confirm.
int options;
float maxCharsPerByte;
// NOTE(review): the "+offset: size" annotations on the fields below look like byte offsets and
// sizes of the C-side converter static-data structure these fields were ported from — confirm.
// None of these fields is assigned in this class.
String name; /* +4: 60 internal name of the converter- invariant chars */
int codepage; /* +64: 4 codepage # (now IBM-$codepage) */
byte platform; /* +68: 1 platform of the converter (only IBM now) */
byte conversionType; /* +69: 1 conversion type */
int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
byte subCharLen; /* +76: 1 */
byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
byte hasFromUnicodeFallback; /* +78: 1 */
short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
//byte reserved[/*19*/]; /* +81: 19 to round out the structure */
// typedef enum UConverterUnicodeSet {
/**
* Parameter that selects the set of roundtrippable Unicode code points.
* This is the only selector value that {@link #getUnicodeSet} accepts.
* @stable ICU 4.0
*/
public static final int ROUNDTRIP_SET=0;
/**
* Selects the set of Unicode code points with roundtrip or fallback mappings.
* Not supported at this point: {@link #getUnicodeSet} rejects it with an
* IllegalArgumentException.
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
//} UConverterUnicodeSet;
/**
 * Initializes the state shared by all ICU charset implementations.
 *
 * @param icuCanonicalName the ICU-canonical converter name; stored on this instance
 * @param canonicalName the canonical charset name handed to {@link Charset}; must not be empty
 * @param aliases the alias names handed to {@link Charset}
 * @throws IllegalCharsetNameException if {@code canonicalName} is the empty string
 * @stable ICU 3.6
 */
protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
    super(canonicalName, aliases);
    // The superclass constructor has to run first, so the empty-name check can only follow it.
    if (canonicalName.isEmpty()) {
        throw new IllegalCharsetNameException(canonicalName);
    }
    this.icuCanonicalName = icuCanonicalName;
}
/**
 * Tells whether the given charset is contained in this charset.
 * Implements the abstract method of the super class. This implementation
 * reports containment only for a charset that is equal to this one.
 *
 * @param cs charset to test; may be {@code null}
 * @return {@code true} if {@code cs} is non-null and equal to this charset,
 *         {@code false} otherwise
 * @stable ICU 3.6
 */
@Override
public boolean contains(Charset cs) {
    return cs != null && this.equals(cs);
}
/**
 * Maps the ICU-canonical names of converters that have a dedicated implementation class
 * to the fully qualified name of that class. Names that are not listed here are handled
 * by {@code CharsetMBCS} (see {@link #getCharset}).
 */
private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
static {
    algorithmicCharsets.put("LMBCS-1", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-2", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-3", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-4", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-5", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-6", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-8", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-11", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-16", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-17", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-18", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("LMBCS-19", "com.ibm.icu.charset.CharsetLMBCS");
    algorithmicCharsets.put("BOCU-1", "com.ibm.icu.charset.CharsetBOCU1");
    algorithmicCharsets.put("SCSU", "com.ibm.icu.charset.CharsetSCSU");
    algorithmicCharsets.put("US-ASCII", "com.ibm.icu.charset.CharsetASCII");
    algorithmicCharsets.put("ISO-8859-1", "com.ibm.icu.charset.Charset88591");
    algorithmicCharsets.put("UTF-16", "com.ibm.icu.charset.CharsetUTF16");
    algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16BE");
    algorithmicCharsets.put("UTF-16BE,version=1", "com.ibm.icu.charset.CharsetUTF16BE");
    algorithmicCharsets.put("UTF-16LE", "com.ibm.icu.charset.CharsetUTF16LE");
    algorithmicCharsets.put("UTF-16LE,version=1", "com.ibm.icu.charset.CharsetUTF16LE");
    algorithmicCharsets.put("UTF16_OppositeEndian", "com.ibm.icu.charset.CharsetUTF16LE");
    algorithmicCharsets.put("UTF16_PlatformEndian", "com.ibm.icu.charset.CharsetUTF16");
    algorithmicCharsets.put("UTF-32", "com.ibm.icu.charset.CharsetUTF32");
    algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32BE");
    algorithmicCharsets.put("UTF-32LE", "com.ibm.icu.charset.CharsetUTF32LE");
    algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32LE");
    algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32");
    algorithmicCharsets.put("UTF-8", "com.ibm.icu.charset.CharsetUTF8");
    algorithmicCharsets.put("CESU-8", "com.ibm.icu.charset.CharsetCESU8");
    algorithmicCharsets.put("UTF-7", "com.ibm.icu.charset.CharsetUTF7");
    algorithmicCharsets.put("ISCII,version=0", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=1", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=2", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=3", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=4", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=5", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=6", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=7", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("ISCII,version=8", "com.ibm.icu.charset.CharsetISCII");
    algorithmicCharsets.put("IMAP-mailbox-name", "com.ibm.icu.charset.CharsetUTF7");
    algorithmicCharsets.put("HZ", "com.ibm.icu.charset.CharsetHZ");
    algorithmicCharsets.put("ISO_2022,locale=ja,version=0", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=ja,version=1", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=ja,version=2", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=ja,version=3", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=ja,version=4", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=zh,version=0", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=zh,version=1", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=zh,version=2", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=ko,version=0", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("ISO_2022,locale=ko,version=1", "com.ibm.icu.charset.CharsetISO2022");
    algorithmicCharsets.put("x11-compound-text", "com.ibm.icu.charset.CharsetCompoundText");
}

/**
 * Instantiates the charset implementation for an ICU converter name.
 *
 * <p>The implementation class is looked up in {@link #algorithmicCharsets}; any name that is
 * not registered there is served by {@code com.ibm.icu.charset.CharsetMBCS}. The class is
 * loaded reflectively and its public {@code (String, String, String[])} constructor is
 * invoked with the three arguments of this method.
 *
 * @param icuCanonicalName the ICU-canonical converter name used to select the implementation
 * @param javaCanonicalName the canonical name passed on to the implementation's constructor
 * @param aliases the alias names passed on to the implementation's constructor
 * @return the newly created charset
 * @throws UnsupportedCharsetException if the implementation class cannot be found, has no
 *         matching public constructor, cannot be instantiated, or its constructor throws;
 *         the underlying failure is attached as the cause
 */
/*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
    String className = algorithmicCharsets.get(icuCanonicalName);
    if (className == null) {
        // all the cnv files are loaded as MBCS
        className = "com.ibm.icu.charset.CharsetMBCS";
    }
    final String message = icuCanonicalName + ": " + "Could not load " + className;
    try {
        Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
        Class<?>[] paramTypes = new Class<?>[] { String.class, String.class, String[].class };
        final Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
        Object[] params = new Object[] { icuCanonicalName, javaCanonicalName, aliases };
        // Run constructor
        try {
            return c.newInstance(params);
        } catch (InvocationTargetException e) {
            // The constructor itself failed: report what it threw, not the reflective wrapper.
            Throwable cause = e.getCause();
            throw loadFailure(message + ". Exception: " + cause, cause);
        }
    } catch (ClassNotFoundException ex) {
        throw loadFailure(message, ex);
    } catch (NoSuchMethodException ex) {
        throw loadFailure(message, ex);
    } catch (IllegalAccessException ex) {
        throw loadFailure(message, ex);
    } catch (InstantiationException ex) {
        throw loadFailure(message, ex);
    }
}

/** Builds the exception reported by {@link #getCharset}, keeping the original failure as its cause. */
private static UnsupportedCharsetException loadFailure(String message, Throwable cause) {
    UnsupportedCharsetException failure = new UnsupportedCharsetException(message);
    failure.initCause(cause);
    return failure;
}
/**
 * Tells whether a code point value lies in the surrogate range U+D800..U+DFFF.
 *
 * @param c the value to test; values outside the range, including negative ones, yield false
 * @return true if {@code c} is a lead or trail surrogate code point
 */
static final boolean isSurrogate(int c) {
    return c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE;
}
/*
* Returns the default charset name
*/
// static final String getDefaultCharsetName(){
// String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
// return defaultEncoding;
// }
/**
 * Returns a charset object for the named charset, preferring the ICU implementation.
 * The ICU charset provider is asked first; only when it does not supply a charset
 * for the name is the lookup delegated to {@link Charset#forName}, which consults
 * the other charset providers including the standard Java charset provider.
 *
 * @param charsetName The name of the requested charset,
 *        may be either a canonical name or an alias
 * @return A charset object for the named charset
 * @throws IllegalCharsetNameException If the given charset name
 *         is illegal
 * @throws UnsupportedCharsetException If no support for the
 *         named charset is available in this instance of the Java
 *         virtual machine
 * @stable ICU 3.6
 */
public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
    final CharsetICU icuCharset = (CharsetICU) new CharsetProviderICU().charsetForName(charsetName);
    return (icuCharset != null) ? icuCharset : Charset.forName(charsetName);
}
// /**
// * @see java.lang.Comparable#compareTo(java.lang.Object)
// * @stable 3.8
// */
// public int compareTo(Object otherObj) {
// if (!(otherObj instanceof CharsetICU)) {
// return -1;
// }
// return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
// }
/**
* This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
* start of the stream for example U+FEFF (the Unicode BOM/signature
* character) that can be ignored.
*
* Detects Unicode signature byte sequences at the start of the byte stream
* and returns number of bytes of the BOM of the indicated Unicode charset.
* 0 is returned when no Unicode signature is recognized.
*
*/
// TODO This should be proposed as CharsetDecoderICU API.
// static String detectUnicodeSignature(ByteBuffer source) {
// int signatureLength = 0; // number of bytes of the signature
// final int SIG_MAX_LEN = 5;
// String sigUniCharset = null; // states what unicode charset is the BOM
// int i = 0;
//
// /*
// * initial 0xa5 bytes: make sure that if we read
// [The remainder of this commented-out draft is missing from this copy of the file.]
/**
* Fills in the set of Unicode code points that can be converted by this charset's converter.
*
* <p>The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): the set of all
* Unicode code points that can be roundtrip-converted (converted without any data loss) with the
* converter. This set will not include code points that have fallback mappings or are only the
* result of reverse fallback mappings. See UTR #22 "Character Mapping Markup Language" at
* http://www.unicode.org/reports/tr22/
*
* <p>In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
*
* <p>This is useful for example for
* <ul>
* <li>checking that a string or document can be roundtrip-converted with a converter,
* without/before actually performing the conversion</li>
* <li>testing if a converter can be used for text for typical text for a certain locale,
* by comparing its roundtrip set with the set of ExemplarCharacters from
* ICU's locale data or other sources</li>
* </ul>
*
* @param setFillIn A valid UnicodeSet. It will be cleared by this function before
* the converter's specific set is filled in.
* @param which A selector; currently ROUNDTRIP_SET is the only supported value.
* @throws IllegalArgumentException if the parameters does not match.
* @stable ICU 4.0
*/
public void getUnicodeSet(UnicodeSet setFillIn, int which) {
    // Only the roundtrip selector is accepted, and there must be a set to fill.
    final boolean supportedSelector = (which == ROUNDTRIP_SET);
    if (!supportedSelector || setFillIn == null) {
        throw new IllegalArgumentException();
    }

    // Start from an empty set, then let the implementation hook add its code points.
    // NOTE(review): getUnicodeSetImpl is not declared in the visible part of this class;
    // presumably it is a per-charset hook implemented by subclasses — confirm.
    setFillIn.clear();
    getUnicodeSetImpl(setFillIn, which);
}
/**
 * Returns whether or not the charset of the converter has a fixed number of bytes
 * per charset character.
 *
 * <p>This implementation answers {@code true} for instances of {@code CharsetASCII} and
 * {@code CharsetUTF32}, and for instances of {@code CharsetMBCS} whose static data reports
 * the same minimum and maximum number of bytes per character. Every other charset is
 * reported as not fixed-width.
 *
 * <p>Note: This method is not intended to be used to determine whether the charset has a
 * fixed ratio of bytes to Unicode code units for any particular Unicode encoding form.
 *
 * @return true if the converter is fixed-width
 * @stable ICU 4.8
 */
public boolean isFixedWidth() {
    boolean fixedWidth = this instanceof CharsetASCII || this instanceof CharsetUTF32;
    if (!fixedWidth && this instanceof CharsetMBCS) {
        final CharsetMBCS mbcs = (CharsetMBCS) this;
        fixedWidth = mbcs.sharedData.staticData.maxBytesPerChar == mbcs.sharedData.staticData.minBytesPerChar;
    }
    return fixedWidth;
}
/**
 * Adds every Unicode code point except the surrogates (U+D800..U+DFFF) to the given set.
 * The set is not cleared first.
 *
 * @param setFillIn the set to add the code points to
 */
static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn) {
    // U+0000..U+D7FF: everything below the surrogate block.
    setFillIn.add(Character.MIN_CODE_POINT, Character.MIN_SURROGATE - 1);
    // U+E000..U+10FFFF: everything above the surrogate block.
    setFillIn.add(Character.MAX_SURROGATE + 1, Character.MAX_CODE_POINT);
}
/**
 * Adds the complete Unicode code point range U+0000..U+10FFFF to the given set.
 * The set is not cleared first.
 *
 * @param setFillIn the set to add the code points to
 */
static void getCompleteUnicodeSet(UnicodeSet setFillIn) {
    setFillIn.add(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
}
}