All downloads are free. Search and download functionality uses the official Maven repository.

com.adobe.agl.charset.CharsetICU Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/**
*******************************************************************************
* Copyright (C) 2006-2008, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
*******************************************************************************
*/ 

/*
 * File: CharsetICU.java
 * ************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/

package com.adobe.agl.charset;

//import java.io.ByteArrayInputStream;
//import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;

/**
 * 

A subclass of java.nio.Charset for providing implementation of ICU's charset converters. * This API is used to convert codepage or character encoded data to and * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that * converter, you can get its properties, set options, convert your data.

* *

Since many software programs recogize different converter names for * different types of converters, there are other functions in this API to * iterate over the converter aliases. * * @stable ICU 3.6 */ public abstract class CharsetICU extends Charset{ String icuCanonicalName; String javaCanonicalName; int options; float maxCharsPerByte; String name; /* +4: 60 internal name of the converter- invariant chars */ int codepage; /* +64: 4 codepage # (now IBM-$codepage) */ byte platform; /* +68: 1 platform of the converter (only IBM now) */ byte conversionType; /* +69: 1 conversion type */ int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */ int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */ byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */ byte subCharLen; /* +76: 1 */ byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */ byte hasFromUnicodeFallback; /* +78: 1 */ short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */ byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */ //byte reserved[/*19*/]; /* +81: 19 to round out the structure */ // typedef enum UConverterUnicodeSet { /** * Parameter that select the set of roundtrippable Unicode code points. * @draft ICU 4.0 * @provisional This API might change or be removed in a future release. */ public static final int ROUNDTRIP_SET=0; /** * Select the set of Unicode code points with roundtrip or fallback mappings. * Not supported at this point. * @internal * @deprecated This API is ICU internal only. 
*/ public static final int ROUNDTRIP_AND_FALLBACK_SET =1; //} UConverterUnicodeSet; /** * * @param icuCanonicalName * @param canonicalName * @param aliases * @stable ICU 3.6 */ protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) { super(canonicalName,aliases); if(canonicalName.length() == 0){ throw new IllegalCharsetNameException(canonicalName); } this.javaCanonicalName = canonicalName; this.icuCanonicalName = icuCanonicalName; } /** * Ascertains if a charset is a sub set of this charset * Implements the abstract method of super class. * @param cs charset to test * @return true if the given charset is a subset of this charset * @stable ICU 3.6 */ public boolean contains(Charset cs){ if (null == cs) { return false; } else if (this.equals(cs)) { return true; } return false; } private static final HashMap algorithmicCharsets = new HashMap(); static{ /* algorithmicCharsets.put("lmbcs1", "com.adobe.agl.charset.CharsetLMBCS1" ); algorithmicCharsets.put("lmbcs11", "com.adobe.agl.charset.CharsetLMBCS11" ); algorithmicCharsets.put("lmbcs16", "com.adobe.agl.charset.CharsetLMBCS16" ); algorithmicCharsets.put("lmbcs17", "com.adobe.agl.charset.CharsetLMBCS17" ); algorithmicCharsets.put("lmbcs18", "com.adobe.agl.charset.CharsetLMBCS18" ); algorithmicCharsets.put("lmbcs19", "com.adobe.agl.charset.CharsetLMBCS19" ); algorithmicCharsets.put("lmbcs2", "com.adobe.agl.charset.CharsetLMBCS2" ); algorithmicCharsets.put("lmbcs3", "com.adobe.agl.charset.CharsetLMBCS3" ); algorithmicCharsets.put("lmbcs4", "com.adobe.agl.charset.CharsetLMBCS4" ); algorithmicCharsets.put("lmbcs5", "com.adobe.agl.charset.CharsetLMBCS5" ); algorithmicCharsets.put("lmbcs6", "com.adobe.agl.charset.CharsetLMBCS6" ); algorithmicCharsets.put("lmbcs8", "com.adobe.agl.charset.CharsetLMBCS8" ); */ algorithmicCharsets.put("BOCU-1", "com.adobe.agl.charset.CharsetBOCU1" ); algorithmicCharsets.put("SCSU", "com.adobe.agl.charset.CharsetSCSU" ); algorithmicCharsets.put("US-ASCII", 
"com.adobe.agl.charset.CharsetASCII" ); algorithmicCharsets.put("ISO-8859-1", "com.adobe.agl.charset.Charset88591" ); algorithmicCharsets.put("UTF-16", "com.adobe.agl.charset.CharsetUTF16" ); algorithmicCharsets.put("UTF-16BE", "com.adobe.agl.charset.CharsetUTF16BE" ); algorithmicCharsets.put("UTF-16LE", "com.adobe.agl.charset.CharsetUTF16LE" ); algorithmicCharsets.put("UTF16_OppositeEndian", "com.adobe.agl.charset.CharsetUTF16LE" ); algorithmicCharsets.put("UTF16_PlatformEndian", "com.adobe.agl.charset.CharsetUTF16" ); algorithmicCharsets.put("UTF-32", "com.adobe.agl.charset.CharsetUTF32" ); algorithmicCharsets.put("UTF-32BE", "com.adobe.agl.charset.CharsetUTF32BE" ); algorithmicCharsets.put("UTF-32LE", "com.adobe.agl.charset.CharsetUTF32LE" ); algorithmicCharsets.put("UTF32_OppositeEndian", "com.adobe.agl.charset.CharsetUTF32LE" ); algorithmicCharsets.put("UTF32_PlatformEndian", "com.adobe.agl.charset.CharsetUTF32" ); algorithmicCharsets.put("UTF-8", "com.adobe.agl.charset.CharsetUTF8" ); algorithmicCharsets.put("CESU-8", "com.adobe.agl.charset.CharsetCESU8" ); algorithmicCharsets.put("UTF-7", "com.adobe.agl.charset.CharsetUTF7" ); algorithmicCharsets.put("ISCII,version=0", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=1", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=2", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=3", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=4", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=5", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=6", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=7", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=8", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=0,mac", "com.adobe.agl.charset.CharsetISCII" ); 
algorithmicCharsets.put("ISCII,version=2,mac", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("ISCII,version=3,mac", "com.adobe.agl.charset.CharsetISCII" ); algorithmicCharsets.put("IMAP-mailbox-name", "com.adobe.agl.charset.CharsetUTF7" ); algorithmicCharsets.put("HZ", "com.adobe.agl.charset.CharsetHZ" ); algorithmicCharsets.put("machebrew", "com.adobe.agl.charset.CharsetMacHebrew" ); algorithmicCharsets.put("macarabic", "com.adobe.agl.charset.CharsetMacArabic" ); algorithmicCharsets.put("ISO_2022,locale=ja,version=0", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=ja,version=1", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=ja,version=2", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=ja,version=3", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=ja,version=4", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=zh,version=0", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=zh,version=1", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=ko,version=0", "com.adobe.agl.charset.CharsetISO2022" ); algorithmicCharsets.put("ISO_2022,locale=ko,version=1", "com.adobe.agl.charset.CharsetISO2022" ); } /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){ String className = (String) algorithmicCharsets.get(icuCanonicalName); if(className==null){ //all the cnv files are loaded as MBCS className = "com.adobe.agl.charset.CharsetMBCS"; } try{ CharsetICU conv = null; Class cs = Class.forName(className); Class[] paramTypes = new Class[]{ String.class, String.class, String[].class}; final Constructor c = cs.getConstructor(paramTypes); Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases}; // Run constructor try { Object obj = 
c.newInstance(params); if(obj!=null && obj instanceof CharsetICU){ conv = (CharsetICU)obj; return conv; } }catch (InvocationTargetException e) { throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException()); } }catch(ClassNotFoundException ex){ }catch(NoSuchMethodException ex){ }catch (IllegalAccessException ex){ }catch (InstantiationException ex){ } throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className); } static final boolean isSurrogate(int c){ return (((c)&0xfffff800)==0xd800); } /* * Returns the default charset name */ // static final String getDefaultCharsetName(){ // String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding(); // return defaultEncoding; // } /** * Returns a charset object for the named charset. * This method gurantee that ICU charset is returned when * available. If the ICU charset provider does not support * the specified charset, then try other charset providers * including the standard Java charset provider. 
* * @param charsetName The name of the requested charset, * may be either a canonical name or an alias * @return A charset object for the named charset * @throws IllegalCharsetNameException If the given charset name * is illegal * @throws UnsupportedCharsetException If no support for the * named charset is available in this instance of th Java * virtual machine * @stable ICU 3.6 */ public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException { CharsetProviderICU icuProvider = new CharsetProviderICU(); CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName); if (cs != null) { return cs; } return Charset.forName(charsetName); } // /** // * @see java.lang.Comparable#compareTo(java.lang.Object) // * @stable 3.8 // */ // public int compareTo(Object otherObj) { // if (!(otherObj instanceof CharsetICU)) { // return -1; // } // return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName); // } /** * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the * start of the stream for example U+FEFF (the Unicode BOM/signature * character) that can be ignored. * * Detects Unicode signature byte sequences at the start of the byte stream * and returns number of bytes of the BOM of the indicated Unicode charset. * 0 is returned when no Unicode signature is recognized. * */ // TODO This should be proposed as CharsetDecoderICU API. // static String detectUnicodeSignature(ByteBuffer source) { // int signatureLength = 0; // number of bytes of the signature // final int SIG_MAX_LEN = 5; // String sigUniCharset = null; // states what unicode charset is the BOM // int i = 0; // // /* // * initial 0xa5 bytes: make sure that if we read





© 2015 - 2024 Weber Informatics LLC | Privacy Policy