com.ibm.icu.charset.CharsetCallback Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j-charset Show documentation
icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.
There is a newer version: 76.1
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/

package com.ibm.icu.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CoderResult;

/**
 *  Callback API for CharsetICU API 
 *
 *  CharsetCallback class defines some error behaviour functions called
 *  by CharsetDecoderICU and CharsetEncoderICU. The class also provides
 *  the facility by which clients can write their own callbacks.
 *
 *  These functions, although public, should NEVER be called directly.
 *  They should be used as parameters to the onUmappableCharacter() and
 *  onMalformedInput() methods, to set the behaviour of a converter
 *  when it encounters UNMAPPED/INVALID sequences.
 *  Currently the only way to set callbacks is by using CodingErrorAction.
 *  In the future we will provide set methods on CharsetEncoder and CharsetDecoder
 *  that will accept CharsetCallback fields.
 *
 * @stable ICU 3.6
 */

public class CharsetCallback {
    /*
     * FROM_U, TO_U context options for sub callback
     */
    private static final String SUB_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U, TO_U context options for skip callback
//     */
//    private static final String SKIP_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
//     */
//    private static final String ESCAPE_ICU  = null;

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
     */
    private static final String ESCAPE_JAVA     =  "J";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
     * TO_U_CALLBACK_ESCAPE option to escape the character value according to C (\\xXXXX)
     */
    private static final String ESCAPE_C        = "C";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
     */
    private static final String ESCAPE_XML_DEC  = "D";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
     */
    private static final String ESCAPE_XML_HEX  = "X";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
     */
    private static final String ESCAPE_UNICODE  = "U";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
     */
    private static final String ESCAPE_CSS2  = "S";

    /*
     * IS_DEFAULT_IGNORABLE_CODE_POINT
     * This is to check if a code point has the default ignorable unicode property.
     * As such, this list needs to be updated if the ignorable code point list ever
     * changes.
     * To avoid dependency on other code, this list is hard coded here.
     * When an ignorable code point is found and is unmappable, the default callbacks
     * will ignore them.
     * For a list of the default ignorable code points, use this link:
     * https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
     *
     * This list should be sync with the one in ucnv_err.cpp.
     */
    private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) {
        return
            (c == 0x00AD) ||
            (c == 0x034F) ||
            (c == 0x061C) ||
            (c == 0x115F) ||
            (c == 0x1160) ||
            (0x17B4 <= c && c <= 0x17B5) ||
            (0x180B <= c && c <= 0x180E) ||
            (0x200B <= c && c <= 0x200F) ||
            (0x202A <= c && c <= 0x202E) ||
            (0x2060 <= c && c <= 0x206F) ||
            (c == 0x3164) ||
            (0xFE00 <= c && c <= 0xFE0F) ||
            (c == 0xFEFF) ||
            (c == 0xFFA0) ||
            (0xFFF0 <= c && c <= 0xFFF8) ||
            (0x1BCA0 <= c && c <= 0x1BCA3) ||
            (0x1D173 <= c && c <= 0x1D17A) ||
            (0xE0000 <= c && c <= 0xE0FFF);
    }
    /**
     * Decoder Callback interface
     * @stable ICU 3.6
     */
    public interface Decoder {
        /**
         * This function is called when the bytes in the source cannot be handled,
         * and this function is meant to handle or fix the error if possible.
         *
         * @return Result of decoding action. This returned object is set to an error
         *  if this function could not handle the conversion.
         * @stable ICU 3.6
         */
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                                char[] buffer, int length, CoderResult cr);
    }
    /**
     * Encoder Callback interface
     * @stable ICU 3.6
     */
    public interface Encoder {
        /**
         * This function is called when the Unicode characters in the source cannot be handled,
         * and this function is meant to handle or fix the error if possible.
         * @return Result of decoding action. This returned object is set to an error
         *  if this function could not handle the conversion.
         * @stable ICU 3.6
         */
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                                char[] buffer, int length, int cp, CoderResult cr);
    }
    /**
     * Skip callback
     * @stable ICU 3.6
     */
    public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr){
            if(context==null){
                return CoderResult.UNDERFLOW;
            }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
                if(!cr.isUnmappable()){
                    return cr;
                }else{
                    return CoderResult.UNDERFLOW;
                }
            }
            return cr;
        }
    };
    /**
     * Skip callback
     * @stable ICU 3.6
     */
    public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr){
            if(context==null){
                return CoderResult.UNDERFLOW;
            }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
                if(!cr.isUnmappable()){
                    return cr;
                }else{
                    return CoderResult.UNDERFLOW;
                }
            }
            return cr;
        }
    };
    /**
     * Write substitute callback
     * @stable ICU 3.6
     */
    public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr){
            if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
                return CoderResult.UNDERFLOW;
            }else if(context==null){
                return encoder.cbFromUWriteSub(encoder, source, target, offsets);
            }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
                if(!cr.isUnmappable()){
                    return cr;
                }else{
                   return encoder.cbFromUWriteSub(encoder, source, target, offsets);
                }
            }
            return cr;
        }
    };
    private static final char[] kSubstituteChar1 = new char[]{0x1A};
    private static final char[] kSubstituteChar = new char[] {0xFFFD};
    /**
     * Write substitute callback
     * @stable ICU 3.6
     */
    public static final Decoder TO_U_CALLBACK_SUBSTITUTE  = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr){

            CharsetICU cs = (CharsetICU) decoder.charset();
            /* Use the specified replacement character if it is different than the default one. */
            boolean useReplacement = true;
            char [] replacementChar = decoder.replacement().toCharArray();
            if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) {
                useReplacement = false;
            }

            /* could optimize this case, just one uchar */
            if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
                return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
            } else {
                return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
            }
        }
    };
    /**
     * Stop callback
     * @stable ICU 3.6
     */
    public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr){
            if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
                return CoderResult.UNDERFLOW;
            }
            return cr;
        }
    };
    /**
     * Stop callback
     * @stable ICU 3.6
     */
    public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr){
            return cr;
        }
    };
    private static final int VALUE_STRING_LENGTH = 32;
    private static final char UNICODE_PERCENT_SIGN_CODEPOINT    = 0x0025;
    private static final char UNICODE_U_CODEPOINT               = 0x0055;
    private static final char UNICODE_X_CODEPOINT               = 0x0058;
    private static final char UNICODE_RS_CODEPOINT              = 0x005C;
    private static final char UNICODE_U_LOW_CODEPOINT           = 0x0075;
    private static final char UNICODE_X_LOW_CODEPOINT           = 0x0078;
    private static final char UNICODE_AMP_CODEPOINT             = 0x0026;
    private static final char UNICODE_HASH_CODEPOINT            = 0x0023;
    private static final char UNICODE_SEMICOLON_CODEPOINT       = 0x003B;
    private static final char UNICODE_PLUS_CODEPOINT            = 0x002B;
    private static final char UNICODE_LEFT_CURLY_CODEPOINT      = 0x007B;
    private static final char UNICODE_RIGHT_CURLY_CODEPOINT     = 0x007D;
    private static final char UNICODE_SPACE_CODEPOINT           = 0x0020;
    /**
     * Write escape callback
     * @stable ICU 4.0
     */
    public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr){
            char[] valueString = new char[VALUE_STRING_LENGTH];
            int valueStringLength = 0;
            int i = 0;

            if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
                return CoderResult.UNDERFLOW;
            }

            if (context == null || !(context instanceof String)) {
                while (i < length) {
                    valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
                    valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
                    valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
                }
            } else {
                if (((String)context).equals(ESCAPE_JAVA)) {
                    while (i < length) {
                        valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
                        valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
                        valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
                    }
                } else if (((String)context).equals(ESCAPE_C)) {
                    valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */

                    if (length == 2) {
                        valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
                        valueStringLength = itou(valueString, valueStringLength, cp, 16, 8);
                    } else {
                        valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
                        valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
                    }
                } else if (((String)context).equals(ESCAPE_XML_DEC)) {
                    valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
                    valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
                    if (length == 2) {
                        valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
                    } else {
                        valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0);
                    }
                    valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
                } else if (((String)context).equals(ESCAPE_XML_HEX)) {
                    valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
                    valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
                    valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
                    if (length == 2) {
                        valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
                    } else {
                        valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0);
                    }
                    valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
                } else if (((String)context).equals(ESCAPE_UNICODE)) {
                    valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
                    valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
                    valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
                    if (length == 2) {
                        valueStringLength += itou(valueString, valueStringLength,cp, 16, 4);
                    } else {
                        valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
                    }
                    valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
                } else if (((String)context).equals(ESCAPE_CSS2)) {
                    valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
                    valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
                    /* Always add space character, because the next character might be whitespace,
                       which would erroneously be considered the termination of the escape sequence. */
                    valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
                } else {
                    while (i < length) {
                        valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
                        valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
                        valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
                    }
                }
            }
            return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
        }
    };
    /**
     * Write escape callback
     * @stable ICU 4.0
     */
    public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr){
            char[] uniValueString = new char[VALUE_STRING_LENGTH];
            int valueStringLength = 0;
            int i = 0;

            if (context == null || !(context instanceof String)) {
                while (i < length) {
                    uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
                    uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding U */
                    valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
                }
            } else {
                if (((String)context).equals(ESCAPE_XML_DEC)) {
                    while (i < length) {
                        uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
                        uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
                        valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
                        uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
                    }
                } else if (((String)context).equals(ESCAPE_XML_HEX)) {
                    while (i < length) {
                        uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
                        uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
                        uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;  /* adding x */
                        valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
                        uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
                    }
                } else if (((String)context).equals(ESCAPE_C)) {
                    while (i < length) {
                        uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;         /* adding \ */
                        uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;      /* adding x */
                        valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
                    }
                } else {
                    while (i < length) {
                        uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
                        uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding X */
                        itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
                        valueStringLength += 2;
                    }
                }
            }

            cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);

            return cr;
        }
    };
    /***
     * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
     * Fills in a char string with the radix-based representation of a number padded with zeroes
     * to minwidth.
     */
    private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
        int length = 0;
        int digit;
        int j;
        char temp;

        do {
            digit = i % radix;
            buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
            i = i/radix;
        } while (i != 0 && (sourceIndex + length) < buffer.length);

        while (length < minwidth) {
            buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
        }
        /* reverses the string */
        for (j = 0; j < (length / 2); j++) {
            temp = buffer[(sourceIndex + length - 1) - j];
            buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
            buffer[sourceIndex + j] = temp;
        }

        return length;
    }

    /*
     * No need to create an instance
     */
    private CharsetCallback() {
    }
}