All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.charset.CharsetBOCU1 Maven / Gradle / Ivy

Go to download

icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2008-2011, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * @author krajwade
 *
 */
class CharsetBOCU1 extends CharsetICU {
    /* BOCU constants and macros */

    /* initial value for "prev": middle of the ASCII range */
    private static final byte BOCU1_ASCII_PREV = 0x40;

    /* bounding byte values for differences */
    private static final int BOCU1_MIN = 0x21;
    private static final int BOCU1_MIDDLE = 0x90;
    //private static final int BOCU1_MAX_LEAD = 0xfe;
    private static final int BOCU1_MAX_TRAIL = 0xff;
    private static final int BOCU1_RESET = 0xff;

    /* number of lead bytes */
    //private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1);

    /* adjust trail byte counts for the use of some C0 control byte values */
    private static final int BOCU1_TRAIL_CONTROLS_COUNT =  20;
    private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT);

    /* number of trail bytes */
    private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT);

    /*
     * number of positive and negative single-byte codes
     * (counting 0==BOCU1_MIDDLE among the positive ones)
     */
    private static final int BOCU1_SINGLE = 64;

    /* number of lead bytes for positive and negative 2/3/4-byte sequences */
    private static final int BOCU1_LEAD_2 = 43;
    private static final int BOCU1_LEAD_3 = 3;
    //private static final int BOCU1_LEAD_4 = 1;

    /* The difference value range for single-byters. */
    private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1);
    private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE);

    /* The difference value range for double-byters. */
    private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);
    private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);

    /* The difference value range for 3-byters. */
    private static final int BOCU1_REACH_POS_3  =
        (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);

    private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);

    /* The lead byte start values. */
    private static final int BOCU1_START_POS_2 =  (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1);
    private static final int BOCU1_START_POS_3  = (BOCU1_START_POS_2+BOCU1_LEAD_2);
    private static final int BOCU1_START_POS_4  = (BOCU1_START_POS_3+BOCU1_LEAD_3);
         /* ==BOCU1_MAX_LEAD */

    private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1);
    private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2);
    //private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3);
         /* ==BOCU1_MIN+1 */

    /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
   /* private static int BOCU1_LENGTH_FROM_LEAD(int lead) {
       return ((BOCU1_START_NEG_2<=(lead) && (lead)>24 : 4);
    }

    /*
     * Byte value map for control codes,
     * from external byte values 0x00..0x20
     * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
     * External byte values that are illegal as trail bytes are mapped to -1.
     */
    private static final int[]
    bocu1ByteToTrail={
    /*  0     1     2     3     4     5     6     7    */
        -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

    /*  8     9     a     b     c     d     e     f    */
        -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

    /*  10    11    12    13    14    15    16    17   */
        0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

    /*  18    19    1a    1b    1c    1d    1e    1f   */
        0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

    /*  20   */
        -1
    };

    /*
     * Byte value map for control codes,
     * from trail byte values 0..19 (0..0x13) as used in the difference calculation
     * to external byte values 0x00..0x20.
     */
    private static final int[]
    bocu1TrailToByte = {
    /*  0     1     2     3     4     5     6     7    */
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

    /*  8     9     a     b     c     d     e     f    */
        0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

    /*  10    11    12    13   */
        0x1c, 0x1d, 0x1e, 0x1f
    };


    /*
     * 12 commonly used C0 control codes (and space) are only used to encode
     * themselves directly,
     * which makes BOCU-1 MIME-usable and reasonably safe for
     * ASCII-oriented software.
     *
     * These controls are
     *  0   NUL
     *
     *  7   BEL
     *  8   BS
     *
     *  9   TAB
     *  a   LF
     *  b   VT
     *  c   FF
     *  d   CR
     *
     *  e   SO
     *  f   SI
     *
     * 1a   SUB
     * 1b   ESC
     *
     * The other 20 C0 controls are also encoded directly (to preserve order)
     * but are also used as trail bytes in difference encoding
     * (for better compression).
     */
    private static int BOCU1_TRAIL_TO_BYTE(int trail) {
        return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]);
    }

    /* BOCU-1 implementation functions ------------------------------------------ */
    private static int BOCU1_SIMPLE_PREV(int c){
        return (((c)&~0x7f)+BOCU1_ASCII_PREV);
    }

    /**
     * Compute the next "previous" value for differencing
     * from the current code point.
     *
     * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
     * @return "previous code point" state value
     */
    private static  int bocu1Prev(int c) {
        /* compute new prev */
        if(/* 0x3040<=c && */ c<=0x309f) {
            /* Hiragana is not 128-aligned */
            return 0x3070;
        } else if(0x4e00<=c && c<=0x9fa5) {
            /* CJK Unihan */
            return 0x4e00-BOCU1_REACH_NEG_2;
        } else if(0xac00<=c /* && c<=0xd7a3 */) {
            /* Korean Hangul */
            return (0xd7a3+0xac00)/2;
        } else {
            /* mostly small scripts */
            return BOCU1_SIMPLE_PREV(c);
        }
    }

    /** Fast version of bocu1Prev() for most scripts. */
    private static int BOCU1_PREV(int c) {
        return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c));
    }

    protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};

    /* Faster versions of packDiff() for single-byte-encoded diff values. */

    /** Is a diff value encodable in a single byte? */
    private static boolean DIFF_IS_SINGLE(int diff){
        return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1);
    }

    /** Encode a diff value in a single byte. */
    private static int PACK_SINGLE_DIFF(int diff){
        return (BOCU1_MIDDLE+(diff));
    }

    /** Is a diff value encodable in two bytes? */
    private static boolean DIFF_IS_DOUBLE(int diff){
        return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2);
    }

    public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        super(icuCanonicalName, javaCanonicalName, aliases);
        maxBytesPerChar = 4;
        minBytesPerChar = 1;
        maxCharsPerByte = 1;
     }

    class CharsetEncoderBOCU extends CharsetEncoderICU {
        public CharsetEncoderBOCU(CharsetICU cs) {
            super(cs,fromUSubstitution);
        }

        int sourceIndex, nextSourceIndex;
        int prev, c , diff;
        boolean checkNegative;
        boolean LoopAfterTrail;
        int targetCapacity;
        CoderResult cr;

        /* label values for supporting behavior similar to goto in C */
        private static final int fastSingle=0;
        private static final int getTrail=1;
        private static final int regularLoop=2;

        private boolean LabelLoop; //used to break the while loop
        private int labelType = fastSingle; //labeType is set to fastSingle to start the code from fastSingle:

        /**
         * Integer division and modulo with negative numerators
         * yields negative modulo results and quotients that are one more than
         * what we need here.
         * This macro adjust the results so that the modulo-value m is always >=0.
         *
         * For positive n, the if() condition is always FALSE.
         *
         * @param n Number to be split into quotient and rest.
         *          Will be modified to contain the quotient.
         * @param d Divisor.
         * @param m Output variable for the rest (modulo result).
         */
        private int NEGDIVMOD(int n, int d, int m) {
            diff = n;
            (m)=(diff)%(d);
            (diff)/=(d);
            if((m)<0) {
                --(diff);
                (m)+=(d);
            }
            return m;
        }

        /**
         * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
         * and return a packed integer with them.
         *
         * The encoding favors small absolute differences with short encodings
         * to compress runs of same-script characters.
         *
         * Optimized version with unrolled loops and fewer floating-point operations
         * than the standard packDiff().
         *
         * @param diff difference value -0x10ffff..0x10ffff
         * @return
         *      0x010000zz for 1-byte sequence zz
         *      0x0200yyzz for 2-byte sequence yy zz
         *      0x03xxyyzz for 3-byte sequence xx yy zz
         *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
         */
        private int packDiff(int n) {
            int result, m = 0;
            diff = n;

            if(diff>=BOCU1_REACH_NEG_1) {
                /* mostly positive differences, and single-byte negative ones */
                if(diff<=BOCU1_REACH_POS_2) {
                    /* two bytes */
                    diff-=BOCU1_REACH_POS_1+1;
                    result=0x02000000;

                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    result|=BOCU1_TRAIL_TO_BYTE(m);

                    result|=(BOCU1_START_POS_2+diff)<<8;
                } else if(diff<=BOCU1_REACH_POS_3) {
                    /* three bytes */
                    diff-=BOCU1_REACH_POS_2+1;
                    result=0x03000000;

                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    result|=BOCU1_TRAIL_TO_BYTE(m);

                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

                    result|=(BOCU1_START_POS_3+diff)<<16;
                } else {
                    /* four bytes */
                    diff-=BOCU1_REACH_POS_3+1;

                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    result=BOCU1_TRAIL_TO_BYTE(m);

                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

                    /*
                     * We know that / and % would deliver quotient 0 and rest=diff.
                     * Avoid division and modulo for performance.
                     */
                    result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;

                    result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24;
                }
            } else {
                /* two- to four-byte negative differences */
                if(diff>=BOCU1_REACH_NEG_2) {
                    /* two bytes */
                    diff-=BOCU1_REACH_NEG_1;
                    result=0x02000000;

                    m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    result|=BOCU1_TRAIL_TO_BYTE(m);

                    result|=(BOCU1_START_NEG_2+diff)<<8;
                } else if(diff>=BOCU1_REACH_NEG_3) {
                    /* three bytes */
                    diff-=BOCU1_REACH_NEG_2;
                    result=0x03000000;

                    m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    result|=BOCU1_TRAIL_TO_BYTE(m);

                    m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

                    result|=(BOCU1_START_NEG_3+diff)<<16;
                } else {
                    /* four bytes */
                    diff-=BOCU1_REACH_NEG_3;

                    m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    result=BOCU1_TRAIL_TO_BYTE(m);

                    m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

                    /*
                     * We know that NEGDIVMOD would deliver
                     * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
                     * Avoid division and modulo for performance.
                     */
                    m=diff+BOCU1_TRAIL_COUNT;
                    result|=BOCU1_TRAIL_TO_BYTE(m)<<16;

                    result|=BOCU1_MIN<<24;
                }
            }
            return result;
        }

        @Override
        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
            cr = CoderResult.UNDERFLOW;

            LabelLoop = true; //used to break the while loop
            checkNegative = false; // its value is set to true to get out of while loop when c = -c
            LoopAfterTrail = false; // its value is set to true to ignore code before getTrail:

            /*set up the local pointers*/
            targetCapacity = target.limit() - target.position();
            c = fromUChar32;
            prev = fromUnicodeStatus;

            if(prev==0){
                prev = BOCU1_ASCII_PREV;
            }

            /*sourceIndex ==-1 if the current character began in the previous buffer*/
            sourceIndex = c == 0 ? 0: -1;
            nextSourceIndex = 0;

            /*conversion loop*/
            if(c!=0 && targetCapacity>0){
                labelType = getTrail;
            }

            while(LabelLoop){
                switch(labelType){
                    case fastSingle:
                        labelType = fastSingle(source, target, offsets);
                        break;
                    case getTrail:
                        labelType = getTrail(source, target, offsets);
                        break;
                    case regularLoop:
                        labelType = regularLoop(source, target, offsets);
                        break;
                }
            }

            return cr;
        }

        private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){
//fastSingle:
            /*fast loop for single-byte differences*/
            /*use only one loop counter variable , targetCapacity, not also source*/
            diff = source.limit() - source.position();
            if(targetCapacity>diff){
                targetCapacity = diff;
            }
            while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){
                if(c<=0x20){
                    if(c!=0x20){
                        prev = BOCU1_ASCII_PREV;
                    }
                    target.put((byte)c);
                    if(offsets!=null){
                        offsets.put(nextSourceIndex++);
                    }
                    source.position(source.position()+1);
                    --targetCapacity;
                }else {
                    diff = c-prev;
                    if(DIFF_IS_SINGLE(diff)){
                        prev = BOCU1_SIMPLE_PREV(c);
                        target.put((byte)PACK_SINGLE_DIFF(diff));
                        if(offsets!=null){
                            offsets.put(nextSourceIndex++);
                        }
                        source.position(source.position()+1);
                        --targetCapacity;
                    }else {
                        break;
                    }
                }
            }
            return regularLoop;
        }

        private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            if(source.hasRemaining()){
                /*test the following code unit*/
                char trail = source.get(source.position());
                if(UTF16.isTrailSurrogate(trail)){
                    source.position(source.position()+1);
                    ++nextSourceIndex;
                    c=UCharacter.getCodePoint((char)c, trail);
                }
            } else {
                /*no more input*/
                c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/
                checkNegative = true;
            }
            LoopAfterTrail = true;
            return regularLoop;
        }

        @SuppressWarnings("fallthrough")
        private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            if(!LoopAfterTrail){
                /*restore real values*/
                targetCapacity = target.limit()-target.position();
                sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/
            }
            /*regular loop for all classes*/
            while(LoopAfterTrail || source.hasRemaining()){
                if(LoopAfterTrail || targetCapacity>0){

                    if(!LoopAfterTrail){
                        c = source.get();
                        ++nextSourceIndex;

                        if(c<=0x20){
                            /*
                             * ISO C0 control & space:
                             * Encode directly for MIME compatibility,
                             * and reset state except for space, to not disrupt compression.
                             */
                            if(c!=0x20) {
                                prev=BOCU1_ASCII_PREV;
                            }
                            target.put((byte)c);
                            if(offsets != null){
                                offsets.put(sourceIndex++);
                            }
                            --targetCapacity;

                            sourceIndex=nextSourceIndex;
                            continue;
                        }

                        if(UTF16.isLeadSurrogate((char)c)){
                            getTrail(source, target, offsets);
                            if(checkNegative){
                                break;
                            }
                        }
                    }

                    if(LoopAfterTrail){
                        LoopAfterTrail = false;
                    }

                    /*
                     * all other Unicode code points c==U+0021..U+10ffff
                     * are encoded with the difference c-prev
                     *
                     * a new prev is computed from c,
                     * placed in the middle of a 0x80-block (for most small scripts) or
                     * in the middle of the Unihan and Hangul blocks
                     * to statistically minimize the following difference
                     */
                    diff = c- prev;
                    prev = BOCU1_PREV(c);
                    if(DIFF_IS_SINGLE(diff)){
                        target.put((byte)PACK_SINGLE_DIFF(diff));
                        if(offsets!=null){
                            offsets.put(sourceIndex++);
                        }
                        --targetCapacity;
                        sourceIndex=nextSourceIndex;
                        if(c<0x3000){
                            labelType = fastSingle;
                            return labelType;
                        }
                    } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){
                        /*optimize 2 byte case*/
                        int m = 0;
                        if(diff>=0){
                            diff -= BOCU1_REACH_POS_1 +1;
                            m = diff%BOCU1_TRAIL_COUNT;
                            diff/=BOCU1_TRAIL_COUNT;
                            diff+=BOCU1_START_POS_2;
                        } else {
                            diff -= BOCU1_REACH_NEG_1;
                            m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                            diff+=BOCU1_START_NEG_2;
                        }
                        target.put((byte)diff);
                        target.put((byte)BOCU1_TRAIL_TO_BYTE(m));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                            offsets.put(sourceIndex);
                        }
                        targetCapacity -= 2;
                        sourceIndex = nextSourceIndex;
                    } else {
                        int length; /*will be 2..4*/
                        diff = packDiff(diff);
                        length = BOCU1_LENGTH_FROM_PACKED(diff);

                        /*write the output character bytes from diff and length*/
                        /*from the first if in the loop we know that targetCapacity>0*/
                        if(length<=targetCapacity){
                            switch(length){
                                /*each branch falls through the next one*/
                                case 4:
                                    target.put((byte)(diff>>24));
                                    if(offsets!= null){
                                        offsets.put(sourceIndex);
                                    }
                                case 3:
                                    target.put((byte)(diff>>16));
                                    if(offsets!= null){
                                        offsets.put(sourceIndex);
                                    }
                                case 2:
                                    target.put((byte)(diff>>8));
                                    if(offsets!= null){
                                        offsets.put(sourceIndex);
                                    }
                                    /*case 1 handled above*/
                                    target.put((byte)diff);
                                    if(offsets!= null){
                                        offsets.put(sourceIndex);
                                    }
                                default:
                                    /*will never occur*/
                                    break;
                            }
                            targetCapacity -= length;
                            sourceIndex = nextSourceIndex;
                        } else {
                            ByteBuffer error = ByteBuffer.wrap(errorBuffer);
                            /*
                             * We actually do this backwards here:
                             * In order to save an intermediate variable, we output
                             * first to the overflow buffer what does not fit into the
                             * regular target.
                             */
                            /* we know that 1<=targetCapacity>16));
                            case 2:
                                error.put((byte)(diff>>8));
                            case 1:
                                error.put((byte)diff);
                            default:
                                /* will never occur */
                                break;
                            }
                            errorBufferLength = length;

                            /* now output what fits into the regular target */
                            diff>>=8*length; /* length was reduced by targetCapacity */
                            switch(targetCapacity) {
                                /* each branch falls through to the next one */
                            case 3:
                                target.put((byte)(diff>>16));
                                if(offsets!= null){
                                    offsets.put(sourceIndex);
                                }
                            case 2:
                                target.put((byte)(diff>>8));
                                if(offsets!= null){
                                    offsets.put(sourceIndex);
                                }
                            case 1:
                                target.put((byte)diff);
                                if(offsets!= null){
                                    offsets.put(sourceIndex);
                                }
                            default:
                                /* will never occur */
                                break;
                            }

                            /* target overflow */
                            targetCapacity=0;
                            cr = CoderResult.OVERFLOW;
                            break;
                        }
                    }
                } else{
                    /*target is full*/
                    cr = CoderResult.OVERFLOW;
                    break;
                }

            }
            /*set the converter state back into UConverter*/
            fromUChar32 = c<0 ? -c :0;
            fromUnicodeStatus = prev;
            LabelLoop = false;
            labelType = fastSingle;
            return labelType;
        }

    }

    static class CharsetDecoderBOCU extends CharsetDecoderICU{
        public CharsetDecoderBOCU(CharsetICU cs) {
            super(cs);
        }

        int byteIndex;
        int sourceIndex, nextSourceIndex;
        int prev, c , diff, count;
        byte[] bytes;
        CoderResult cr;

        /* label values for supporting behavior similar to goto in C */
        private static final int fastSingle=0;
        private static final int getTrail=1;
        private static final int regularLoop=2;
        private static final int endLoop=3;

        private boolean LabelLoop;//used to break the while loop
        private boolean afterTrail; // its value is set to true to ignore code after getTrail:
        private int labelType;
        /*
         * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
         * The UConverter fields are used as follows:
         *
         * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
         *
         * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
         * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
         */

        /* BOCU-1-from-Unicode conversion functions --------------------------------- */



        /**
         * Function for BOCU-1 decoder; handles multi-byte lead bytes.
         *
         * @param b lead byte;
         *          BOCU1_MIN<=b= BOCU1_START_NEG_2) {
                /* positive difference */
                if(b < BOCU1_START_POS_3) {
                    /* two bytes */
                    diffValue = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1;
                    countValue = 1;
                } else if(b < BOCU1_START_POS_4) {
                    /* three bytes */
                    diffValue = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
                    countValue = 2;
                } else {
                    /* four bytes */
                    diffValue = BOCU1_REACH_POS_3+1;
                    countValue = 3;
                }
            } else {
                /* negative difference */
                if(b >= BOCU1_START_NEG_3) {
                    /* two bytes */
                    diffValue=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
                    countValue=1;
                } else if(b>BOCU1_MIN) {
                    /* three bytes */
                    diffValue=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2;
                    countValue = 2;
                } else {
                    /* four bytes */
                    diffValue=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
                    countValue=3;
                }
            }

            /* return the state for decoding the trail byte(s) */
            return (diffValue<<2)|countValue;
        }

        /**
         * Function for BOCU-1 decoder; handles multi-byte trail bytes.
         *
         * @param count number of remaining trail bytes including this one
         * @param b trail byte
         * @return new delta for diff including b - <0 indicates an error
         *
         * @see decodeBocu1
         */
        private int decodeBocu1TrailByte(int countValue, int b) {
            b = b&UConverterConstants.UNSIGNED_BYTE_MASK;
            if((b)<=0x20) {
                /* skip some C0 controls and make the trail byte range contiguous */
                b = bocu1ByteToTrail[b];
                /* b<0 for an illegal trail byte value will result in return<0 below */
            } else {
                //b-= BOCU1_TRAIL_BYTE_OFFSET;
                b = b - BOCU1_TRAIL_BYTE_OFFSET;
            }

            /* add trail byte into difference and decrement count */
            if(countValue==1) {
                return b;
            } else if(countValue==2) {
                return b*BOCU1_TRAIL_COUNT;
            } else /* count==3 */ {
                return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
            }
        }

        @Override
        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
                boolean flush){
            cr = CoderResult.UNDERFLOW;

            LabelLoop = true;
            afterTrail = false;
            labelType = fastSingle; // labelType is set to fastSingle so t

            /*get the converter state*/
            prev = toUnicodeStatus;

            if(prev==0){
                prev = BOCU1_ASCII_PREV;
            }
            diff = mode;
            count = diff&3;
            diff>>=2;

            byteIndex = toULength;
            bytes = toUBytesArray;

            /* sourceIndex=-1 if the current character began in the previous buffer */
            sourceIndex=byteIndex==0 ? 0 : -1;
            nextSourceIndex=0;

            /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
            if(count>0 && byteIndex>0 && target.position()diff) {
                count = diff;
            }
            while(count>0) {
                if(BOCU1_START_NEG_2 <=(c=source.get(source.position())&UConverterConstants.UNSIGNED_BYTE_MASK) && c< BOCU1_START_POS_2) {
                    c = prev + (c-BOCU1_MIDDLE);
                    if(c<0x3000) {
                        target.put((char)c);
                        if(offsets!=null){
                            offsets.put(nextSourceIndex++);
                        }
                        prev = BOCU1_SIMPLE_PREV(c);
                    } else {
                        break;
                    }
                } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) {
                    if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) {
                        prev = BOCU1_ASCII_PREV;
                    }
                    target.put((char)c);
                    if(offsets!=null){
                        offsets.put(nextSourceIndex++);
                    }
                } else {
                    break;
                }
                source.position(source.position()+1);
                --count;
            }
            sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
            return labelType;
        }

        private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            labelType = regularLoop;
            for(;;) {
                if(source.position() >= source.limit()) {
                    labelType = endLoop;
                    return labelType;
                }
                ++nextSourceIndex;
                c = bytes[byteIndex++] = source.get();

                /* trail byte in any position */
                c = decodeBocu1TrailByte(count, c);
                if(c<0) {
                    cr = CoderResult.malformedForLength(1);
                    labelType = endLoop;
                    return labelType;
                }

                diff+=c;
                if(--count==0) {
                    /* final trail byte, deliver a code point */
                    byteIndex=0;
                    c = prev + diff;
                    if(c > 0x10ffff) {
                        cr = CoderResult.malformedForLength(1);
                        labelType = endLoop;
                        return labelType;
                    }
                    break;
                }
            }
            afterTrail = true;
            return labelType;

        }

        private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            /* decode a sequence of single and lead bytes */
            while(afterTrail || source.hasRemaining()) {
                if(!afterTrail){
                    if(target.position() >= target.limit()) {
                        /* target is full */
                        cr = CoderResult.OVERFLOW;
                        break;
                    }

                    ++nextSourceIndex;
                    c = source.get()&UConverterConstants.UNSIGNED_BYTE_MASK;
                    if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) {
                        /* Write a code point directly from a single-byte difference. */
                        c = prev + (c-BOCU1_MIDDLE);
                        if(c<0x3000) {
                            target.put((char)c);
                            if(offsets!=null){
                                offsets.put(sourceIndex);
                            }
                            prev = BOCU1_SIMPLE_PREV(c);
                            sourceIndex = nextSourceIndex;
                            labelType = fastSingle;
                            return labelType;
                        }
                    } else if(c <= 0x20) {
                        /*
                         * Direct-encoded C0 control code or space.
                         * Reset prev for C0 control codes but not for space.
                         */
                        if(c != 0x20) {
                            prev=BOCU1_ASCII_PREV;
                        }
                        target.put((char)c);
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                        sourceIndex=nextSourceIndex;
                        continue;
                    } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) {
                        /* Optimize two-byte case. */
                        if(c >= BOCU1_MIDDLE) {
                            diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
                        } else {
                            diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
                        }

                        /* trail byte */
                        ++nextSourceIndex;
                        c = decodeBocu1TrailByte(1, source.get());
                        if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) {
                            bytes[0]= source.get(source.position()-2);
                            bytes[1]= source.get(source.position()-1);
                            byteIndex = 2;
                            cr = CoderResult.malformedForLength(2);
                            break;
                        }
                    } else if(c == BOCU1_RESET) {
                        /* only reset the state, no code point */
                        prev=BOCU1_ASCII_PREV;
                        sourceIndex=nextSourceIndex;
                        continue;
                    } else {
                        /*
                         * For multi-byte difference lead bytes, set the decoder state
                         * with the partial difference value from the lead byte and
                         * with the number of trail bytes.
                         */
                        bytes[0]= (byte)c;
                        byteIndex = 1;

                        diff = decodeBocu1LeadByte(c);
                        count = diff&3;
                        diff>>=2;
                        getTrail(source, target, offsets);
                        if(labelType != regularLoop){
                            return labelType;
                        }
                    }
                }

                if(afterTrail){
                    afterTrail = false;
                }

                /* calculate the next prev and output c */
                prev = BOCU1_PREV(c);
                if(c<=0xffff) {
                    target.put((char)c);
                    if(offsets!=null){
                        offsets.put(sourceIndex);
                    }
                } else {
                    /* output surrogate pair */
                    target.put(UTF16.getLeadSurrogate(c));
                    if(target.hasRemaining()) {
                        target.put(UTF16.getTrailSurrogate(c));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                            offsets.put(sourceIndex);
                        }
                    } else {
                        /* target overflow */
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                        charErrorBufferArray[0] = UTF16.getTrailSurrogate(c);
                        charErrorBufferLength = 1;
                        cr = CoderResult.OVERFLOW;
                        break;
                }
            }
            sourceIndex=nextSourceIndex;
          }
          labelType = endLoop;
          return labelType;
        }

        private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            if(cr.isMalformed()) {
                /* set the converter state in UConverter to deal with the next character */
                toUnicodeStatus = BOCU1_ASCII_PREV;
                mode = 0;
            } else {
                /* set the converter state back into UConverter */
                toUnicodeStatus=prev;
                mode=(diff<<2)|count;
            }
            toULength=byteIndex;
            LabelLoop = false;
        }

    }


    @Override
    public CharsetDecoder newDecoder() {
        return new CharsetDecoderBOCU(this);
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderBOCU(this);
    }

    @Override
    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
        CharsetICU.getCompleteUnicodeSet(setFillIn);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy