com.adobe.agl.charset.CharsetICU Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
There is a newer version: 2024.11.18751.20241128T090041Z-241100
/**
*******************************************************************************
* Copyright (C) 2006-2008, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
*******************************************************************************
*/ 

/*
 * File: CharsetICU.java
 * ************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * ___________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/

package com.adobe.agl.charset;

//import java.io.ByteArrayInputStream;
//import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;

/**
 * A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
 * This API is used to convert codepage or character encoded data to and
 * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that
 * converter, you can get its properties, set options, convert your data.
 *
 * Since many software programs recognize different converter names for
 * different types of converters, there are other functions in this API to
 * iterate over the converter aliases.
 * 
 * @stable ICU 3.6
 */
public abstract class CharsetICU extends Charset{

     /*
      * Package-private converter state.
      * NOTE(review): the "+offset: size" annotations in the trailing comments
      * look like they mirror the layout of a native (C) converter data
      * structure -- confirm against the upstream ICU sources.
      */

     // Assigned by the constructor from its icuCanonicalName argument.
     String icuCanonicalName;
     // Assigned by the constructor from its canonicalName argument (the same
     // value that is handed to the java.nio.charset.Charset super constructor).
     String javaCanonicalName;
     int options;

     float  maxCharsPerByte;
    
     String name; /* +4: 60  internal name of the converter- invariant chars */

     int codepage;               /* +64: 4 codepage # (now IBM-$codepage) */

     byte platform;                /* +68: 1 platform of the converter (only IBM now) */
     byte conversionType;          /* +69: 1 conversion type */

     int minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */
     int maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */

     byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4  [note:  4 and 8 byte boundary] */
     byte subCharLen;              /* +76: 1 */
    
     byte hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
     byte hasFromUnicodeFallback; /* +78: 1 */
     short unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */
     byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */
     //byte reserved[/*19*/];           /* +81: 19 to round out the structure */
     
     
    // typedef enum UConverterUnicodeSet {
    // (The two int constants below stand in for the members of that enum.)
     /** 
      * Parameter that selects the set of roundtrippable Unicode code points. 
      * @draft ICU 4.0
      * @provisional This API might change or be removed in a future release.
      */
      public static final int ROUNDTRIP_SET=0; 
      /**
       * Parameter that selects the set of Unicode code points with roundtrip
       * or fallback mappings.
       * Not supported at this point.
       * @internal
       * @deprecated This API is ICU internal only.
       */
      public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
      
    //} UConverterUnicodeSet;
     
    /**
     * 
     * @param icuCanonicalName
     * @param canonicalName
     * @param aliases
     * @stable ICU 3.6
     */
    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
        super(canonicalName,aliases);
        if(canonicalName.length() == 0){
            throw new IllegalCharsetNameException(canonicalName);
        }
        this.javaCanonicalName = canonicalName;
        this.icuCanonicalName  = icuCanonicalName;
    }
    
    /**
     * Tells whether the given charset is contained in this charset.
     * Implements the abstract method of the super class. This implementation
     * only reports containment for a charset that is equal to this one.
     *
     * @param cs charset to test; may be <code>null</code>
     * @return <code>true</code> if <code>cs</code> is non-null and equal to
     *         this charset, <code>false</code> otherwise
     * @stable ICU 3.6
     */
    public boolean contains(Charset cs){
        // A null argument is never contained; otherwise equality decides.
        return cs != null && this.equals(cs);
    }
    /**
     * Maps an ICU canonical converter name to the fully qualified name of the
     * class implementing it. Names that are absent from this map are handled
     * by the caller of the lookup.
     */
    private static final HashMap algorithmicCharsets = new HashMap();
    static{
        /*
         * The LMBCS variants (lmbcs1, lmbcs2, lmbcs3, lmbcs4, lmbcs5, lmbcs6,
         * lmbcs8, lmbcs11, lmbcs16, lmbcs17, lmbcs18, lmbcs19 ->
         * com.adobe.agl.charset.CharsetLMBCS<n>) are not registered; their
         * entries were commented out.
         */

        // Each row is { ICU canonical name, implementing class name }.
        final String[][] registrations = {
            { "BOCU-1",                       "com.adobe.agl.charset.CharsetBOCU1" },
            { "SCSU",                         "com.adobe.agl.charset.CharsetSCSU" },
            { "US-ASCII",                     "com.adobe.agl.charset.CharsetASCII" },
            { "ISO-8859-1",                   "com.adobe.agl.charset.Charset88591" },
            { "UTF-16",                       "com.adobe.agl.charset.CharsetUTF16" },
            { "UTF-16BE",                     "com.adobe.agl.charset.CharsetUTF16BE" },
            { "UTF-16LE",                     "com.adobe.agl.charset.CharsetUTF16LE" },
            { "UTF16_OppositeEndian",         "com.adobe.agl.charset.CharsetUTF16LE" },
            { "UTF16_PlatformEndian",         "com.adobe.agl.charset.CharsetUTF16" },
            { "UTF-32",                       "com.adobe.agl.charset.CharsetUTF32" },
            { "UTF-32BE",                     "com.adobe.agl.charset.CharsetUTF32BE" },
            { "UTF-32LE",                     "com.adobe.agl.charset.CharsetUTF32LE" },
            { "UTF32_OppositeEndian",         "com.adobe.agl.charset.CharsetUTF32LE" },
            { "UTF32_PlatformEndian",         "com.adobe.agl.charset.CharsetUTF32" },
            { "UTF-8",                        "com.adobe.agl.charset.CharsetUTF8" },
            { "CESU-8",                       "com.adobe.agl.charset.CharsetCESU8" },
            { "UTF-7",                        "com.adobe.agl.charset.CharsetUTF7" },
            { "ISCII,version=0",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=1",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=2",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=3",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=4",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=5",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=6",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=7",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=8",              "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=0,mac",          "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=2,mac",          "com.adobe.agl.charset.CharsetISCII" },
            { "ISCII,version=3,mac",          "com.adobe.agl.charset.CharsetISCII" },
            { "IMAP-mailbox-name",            "com.adobe.agl.charset.CharsetUTF7" },
            { "HZ",                           "com.adobe.agl.charset.CharsetHZ" },
            { "machebrew",                    "com.adobe.agl.charset.CharsetMacHebrew" },
            { "macarabic",                    "com.adobe.agl.charset.CharsetMacArabic" },
            { "ISO_2022,locale=ja,version=0", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=ja,version=1", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=ja,version=2", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=ja,version=3", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=ja,version=4", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=zh,version=0", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=zh,version=1", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=ko,version=0", "com.adobe.agl.charset.CharsetISO2022" },
            { "ISO_2022,locale=ko,version=1", "com.adobe.agl.charset.CharsetISO2022" },
        };

        // Register the rows in table order.
        for (int row = 0; row < registrations.length; row++) {
            algorithmicCharsets.put(registrations[row][0], registrations[row][1]);
        }
    }

    /**
     * Creates the charset implementation registered for an ICU converter name.
     * The implementing class name is looked up in <code>algorithmicCharsets</code>;
     * names without an entry fall back to <code>com.adobe.agl.charset.CharsetMBCS</code>.
     * The class is then loaded reflectively and instantiated through its
     * <code>(String, String, String[])</code> constructor.
     *
     * @param icuCanonicalName the ICU canonical name, used as lookup key and
     *        passed as first constructor argument
     * @param javaCanonicalName the Java canonical name, passed as second
     *        constructor argument
     * @param aliases the alias names, passed as third constructor argument
     * @return the newly created charset
     * @throws UnsupportedCharsetException if the class cannot be loaded or
     *         instantiated, or if the created object is not a
     *         <code>CharsetICU</code>; the underlying failure, when there is
     *         one, is attached as the cause
     */
    /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
       String className = (String) algorithmicCharsets.get(icuCanonicalName);
       if(className==null){
           //all the cnv files are loaded as MBCS
           className = "com.adobe.agl.charset.CharsetMBCS";
       }

       // Remembers why reflection failed so the final exception can carry it
       // instead of silently discarding the diagnostic.
       Throwable failure = null;
       try{
           Class cs = Class.forName(className);
           Class[] paramTypes = new Class[]{ String.class, String.class,  String[].class};
           final Constructor c = cs.getConstructor(paramTypes);
           Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};

           // Run constructor
           try {
               Object obj = c.newInstance(params);
               // instanceof is already false for null, no separate null check needed
               if(obj instanceof CharsetICU){
                   return (CharsetICU)obj;
               }
           }catch (InvocationTargetException e) {
               // The constructor itself threw: keep the historical message and
               // additionally chain the real exception as the cause.
               UnsupportedCharsetException wrapped = new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException());
               wrapped.initCause(e.getTargetException());
               throw wrapped;
           }
       }catch(ClassNotFoundException ex){
           failure = ex;
       }catch(NoSuchMethodException ex){
           failure = ex;
       }catch (IllegalAccessException ex){
           failure = ex;
       }catch (InstantiationException ex){
           failure = ex;
       }

       UnsupportedCharsetException unsupported = new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
       if(failure != null){
           unsupported.initCause(failure);
       }
       throw unsupported;
    }
    
    /**
     * Tells whether a code point value lies in the surrogate range
     * U+D800..U+DFFF.
     *
     * @param c the value to test
     * @return <code>true</code> if <code>c</code> is between 0xD800 and 0xDFFF
     *         inclusive
     */
    static final boolean isSurrogate(int c){
        // All surrogates share the same upper 21 bits: 0xD800 >>> 11 == 0x1b.
        final int upperBits = c >>> 11;
        return upperBits == 0x1b;
    }
    
    /*
     * Returns the default charset name 
     */
//    static final String getDefaultCharsetName(){
//        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
//        return defaultEncoding;
//    }

    /**
     * Returns a charset object for the named charset, preferring the ICU
     * implementation. A fresh ICU charset provider is asked first; if it
     * does not return a charset for the name, the lookup is delegated to
     * {@link Charset#forName(String)}, which consults the other charset
     * providers including the standard Java one.
     *
     * @param charsetName The name of the requested charset,
     * may be either a canonical name or an alias
     * @return A charset object for the named charset
     * @throws IllegalCharsetNameException If the given charset name
     * is illegal
     * @throws UnsupportedCharsetException If no support for the
     * named charset is available in this instance of the Java
     * virtual machine
     * @stable ICU 3.6
     */
    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
        // Ask ICU first; the provider answers null for names it does not know.
        final CharsetICU icuCharset = (CharsetICU) new CharsetProviderICU().charsetForName(charsetName);
        if (icuCharset == null) {
            // Not an ICU charset: let the regular lookup resolve it.
            return Charset.forName(charsetName);
        }
        return icuCharset;
    }

//    /**
//     * @see java.lang.Comparable#compareTo(java.lang.Object)
//     * @stable 3.8
//     */
//    public int compareTo(Object otherObj) {
//        if (!(otherObj instanceof CharsetICU)) {
//            return -1;
//        }
//        return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
//    }

    /**
     * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
     * start of the stream for example U+FEFF (the Unicode BOM/signature
     * character) that can be ignored.
     * 
     * Detects Unicode signature byte sequences at the start of the byte stream
     * and returns number of bytes of the BOM of the indicated Unicode charset.
     * 0 is returned when no Unicode signature is recognized.
     * 
     */
    // TODO This should be proposed as CharsetDecoderICU API.
//    static String detectUnicodeSignature(ByteBuffer source) {
//        int signatureLength = 0; // number of bytes of the signature
//        final int SIG_MAX_LEN = 5;
//        String sigUniCharset = null; // states what unicode charset is the BOM
//        int i = 0;
//
//        /*
//         * initial 0xa5 bytes: make sure that if we read