All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.charset.CharsetSCSU Maven / Gradle / Ivy

Go to download

icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2008-2011, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * @author krajwade
 *
 */
class CharsetSCSU extends CharsetICU{
    /* SCSU definitions --------------------------------------------------------- */

    /* SCSU command byte values */
    //enum {
    private static final short SQ0=0x01; /* Quote from window pair 0 */
    private static final short SQ7=0x08; /* Quote from window pair 7 */
    private static final short SDX=0x0B; /* Define a window as extended */
    //private static final short Srs=0x0C; /* reserved */
    private static final short SQU=0x0E; /* Quote a single Unicode character */
    private static final short SCU=0x0F; /* Change to Unicode mode */
    private static final short SC0=0x10; /* Select window 0 */
    private static final short SC7=0x17; /* Select window 7 */
    private static final short SD0=0x18; /* Define and select window 0 */
    //private static final short SD7=0x1F; /* Define and select window 7 */

    private static final short UC0=0xE0; /* Select window 0 */
    private static final short UC7=0xE7; /* Select window 7 */
    private static final short UD0=0xE8; /* Define and select window 0 */
    private static final short UD7=0xEF; /* Define and select window 7 */
    private static final short UQU=0xF0; /* Quote a single Unicode character */
    private static final short UDX=0xF1; /* Define a Window as extended */
    private static final short Urs=0xF2;  /* reserved */
   // };

  //  enum {
        /*
         * Unicode code points from 3400 to E000 are not adressible by
         * dynamic window, since in these areas no short run alphabets are
         * found. Therefore add gapOffset to all values from gapThreshold.
         */
    private static final int gapThreshold=0x68;
    private static final int gapOffset = 0xAC00 ;
    /* values between reservedStart and fixedThreshold are reserved */
    private static final int reservedStart=0xA8;
    /* use table of predefined fixed offsets for values from fixedThreshold */
    private static final int fixedThreshold=0xF9;
    //};

    protected byte[] fromUSubstitution = new byte[]{(byte)0x0E,(byte)0xFF, (byte)0xFD};

    /* constant offsets for the 8 static windows */
    private static final int staticOffsets[]={
        0x0000, /* ASCII for quoted tags */
        0x0080, /* Latin - 1 Supplement (for access to punctuation) */
        0x0100, /* Latin Extended-A */
        0x0300, /* Combining Diacritical Marks */
        0x2000, /* General Punctuation */
        0x2080, /* Currency Symbols */
        0x2100, /* Letterlike Symbols and Number Forms */
        0x3000  /* CJK Symbols and punctuation */
    };

    /* initial offsets for the 8 dynamic (sliding) windows */
   private static final int initialDynamicOffsets[]={
        0x0080, /* Latin-1 */
        0x00C0, /* Latin Extended A */
        0x0400, /* Cyrillic */
        0x0600, /* Arabic */
        0x0900, /* Devanagari */
        0x3040, /* Hiragana */
        0x30A0, /* Katakana */
        0xFF00  /* Fullwidth ASCII */
    };

    /* Table of fixed predefined Offsets */
    private static final int fixedOffsets[]={
        /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
        /* 0xFA */ 0x0250, /* IPA extensions */
        /* 0xFB */ 0x0370, /* Greek */
        /* 0xFC */ 0x0530, /* Armenian */
        /* 0xFD */ 0x3040, /* Hiragana */
        /* 0xFE */ 0x30A0, /* Katakana */
        /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
    };

    /* state values */
    //enum {
    private static final int readCommand=0;
    private static final int quotePairOne=1;
    private static final int quotePairTwo=2;
    private static final int quoteOne=3;
    private static final int definePairOne=4;
    private static final int definePairTwo=5;
    private static final int defineOne=6;
  //  };

    private final static class SCSUData {
        /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
        int toUDynamicOffsets[] = new int[8] ;
        int fromUDynamicOffsets[] = new int[8] ;

        /* state machine state - toUnicode */
        boolean toUIsSingleByteMode;
        short toUState;
        byte toUQuoteWindow, toUDynamicWindow;
        short toUByteOne;

        /* state machine state - fromUnicode */
        boolean fromUIsSingleByteMode;
        byte fromUDynamicWindow;

        /*
         * windowUse[] keeps track of the use of the dynamic windows:
         * At nextWindowUseIndex there is the least recently used window,
         * and the following windows (in a wrapping manner) are more and more
         * recently used.
         * At nextWindowUseIndex-1 there is the most recently used window.
         */
        byte locale;
        byte nextWindowUseIndex;
        byte windowUse[] = new byte[8];

        SCSUData(){
            initialize();
        }

        void initialize(){
            for(int i=0;i<8;i++){
                this.toUDynamicOffsets[i] = initialDynamicOffsets[i];
            }
            this.toUIsSingleByteMode = true;
            this.toUState = readCommand;
            this.toUQuoteWindow = 0;
            this.toUDynamicWindow = 0;
            this.toUByteOne = 0;
            this.fromUIsSingleByteMode = true;
            this.fromUDynamicWindow = 0;
            for(int i=0;i<8;i++){
                this.fromUDynamicOffsets[i] = initialDynamicOffsets[i];
            }
            this.nextWindowUseIndex = 0;
            switch(this.locale){
            /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */
            /*    case l_ja:
                    for(int i=0;i<8;i++){
                        this.windowUse[i] = initialWindowUse_ja[i];
                    }
                    break; */
                default:
                    for(int i=0;i<8;i++){
                        this.windowUse[i] = initialWindowUse[i];
                    }

            }
        }
    }

    static final byte initialWindowUse[]={ 7, 0, 3, 2, 4, 5, 6, 1 };
    /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */
    // static final byte initialWindowUse_ja[]={ 3, 2, 4, 1, 0, 7, 5, 6 };

    //enum {
    //private static final int lGeneric = 0;
    /* Note being used right now because "SCSU,locale=ja" does not work in ICU4J. */
    // private static final int l_ja = 1;
    //};

    private SCSUData extraInfo = null;

    public CharsetSCSU(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        super(icuCanonicalName, javaCanonicalName, aliases);
        maxBytesPerChar = 3;
        minBytesPerChar = 1;
        maxCharsPerByte = 1;
        extraInfo = new SCSUData();
    }

    class CharsetDecoderSCSU extends CharsetDecoderICU {
        /* label values for supporting behavior similar to goto in C */
        private static final int FastSingle=0;
        private static final int SingleByteMode=1;
        private static final int EndLoop=2;

        /* Mode Type */
        private static final int ByteMode = 0;
        private static final int UnicodeMode =1;

        public CharsetDecoderSCSU(CharsetICU cs) {
            super(cs);
            implReset();
        }

        //private SCSUData data ;
        @Override
        protected void implReset(){
            super.implReset();
            toULength = 0;
            extraInfo.initialize();
        }

        short b;

        //Get the state machine state
        private boolean isSingleByteMode ;
        private short state ;
        private byte quoteWindow ;
        private byte dynamicWindow ;
        private short byteOne;


        //sourceIndex=-1 if the current character began in the previous buffer
        private int sourceIndex  ;
        private int nextSourceIndex ;

        CoderResult cr;
        SCSUData data ;
        private boolean LabelLoop;// used to break the while loop

        @Override
        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
                boolean flush){
            data = extraInfo;

            //Get the state machine state
            isSingleByteMode = data.toUIsSingleByteMode;
            state = data.toUState;
            quoteWindow = data.toUQuoteWindow;
            dynamicWindow = data.toUDynamicWindow;
            byteOne = data.toUByteOne;

            LabelLoop = true;

            //sourceIndex=-1 if the current character began in the previous buffer
            sourceIndex = data.toUState == readCommand ? 0: -1 ;
            nextSourceIndex = 0;

            cr = CoderResult.UNDERFLOW;
            int labelType = 0;
            while(LabelLoop){
                if(isSingleByteMode){
                    switch(labelType){
                        case FastSingle:
                            /*fast path for single-byte mode*/
                            labelType = fastSingle(source, target, offsets, ByteMode);
                            break;
                        case SingleByteMode:
                            /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */
                            labelType = singleByteMode(source, target, offsets, ByteMode);
                            break;
                        case EndLoop:
                            endLoop(source, target, offsets);
                            break;
                    }
                }else{
                    switch(labelType){
                        case FastSingle:
                            /*fast path for single-byte mode*/
                            labelType = fastSingle(source, target, offsets, UnicodeMode);
                            break;
                        case SingleByteMode:
                            /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */
                            labelType = singleByteMode(source, target, offsets, UnicodeMode);
                            break;
                        case EndLoop:
                            endLoop(source, target, offsets);
                            break;
                    }
                    //LabelLoop = false;
                }
            }
            return cr;
        }

        private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType){
            int label = 0;
            if(modeType==ByteMode){

                if(state==readCommand){
                    while(source.hasRemaining() && target.hasRemaining() && (b=(short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK)) >= 0x20){
                        source.position(source.position()+1);
                        ++nextSourceIndex;
                        if(b <= 0x7f){
                            /*Write US graphic character or DEL*/
                            target.put((char)b);
                            if(offsets != null){
                                offsets.put(sourceIndex);
                            }
                        }else{
                            /*Write from dynamic window*/
                            int c = data.toUDynamicOffsets[dynamicWindow] + (b&0x7f);
                            if(c <= 0xffff){
                                target.put((char)c);
                                if(offsets != null){
                                    offsets.put(sourceIndex);
                                }
                            }else{
                                /*Output surrogate pair */
                                target.put((char)(0xd7c0 + (c>>10)));
                                if(target.hasRemaining()){
                                    target.put((char)(0xdc00 | (c&0x3ff)));
                                    if(offsets != null){
                                        offsets.put(sourceIndex);
                                        offsets.put(sourceIndex);
                                    }
                                }else{
                                    /* target overflow */
                                    if(offsets != null){
                                        offsets.put(sourceIndex);
                                    }
                                    charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff));
                                    charErrorBufferLength = 1;
                                    label = EndLoop;
                                    cr = CoderResult.OVERFLOW;
                                    return label;
                                }
                            }
                        }
                        sourceIndex = nextSourceIndex;
                    }
                   // label = SingleByteMode;
                }
            }else if(modeType==UnicodeMode){
                /* fast path for unicode mode */
                if(state == readCommand){
                    while((source.position()+1)(Urs-UC0)){
                        target.put((char)((b<<8)|(source.get(source.position()+1)&UConverterConstants.UNSIGNED_BYTE_MASK)));
                        if(offsets != null){
                            offsets.put(sourceIndex);
                        }
                        sourceIndex = nextSourceIndex;
                        nextSourceIndex+=2;
                        source.position(source.position()+2);
                    }
                }
            }
            label = SingleByteMode;
            return label;
        }

        private int singleByteMode(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType){
            int label = SingleByteMode;
            if(modeType == ByteMode){
                while(source.hasRemaining()){
                    if(!target.hasRemaining()){
                        cr = CoderResult.OVERFLOW;
                        label = EndLoop;
                        return label;
                     }
                    b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
                    ++nextSourceIndex;
                    switch(state){
                    case readCommand:
                        /*redundant conditions are commented out */
                        if(((1L<>10)));
                                if(target.hasRemaining()){
                                    target.put((char)(0xdc00 | (c&0x3ff)));
                                    if(offsets != null){
                                        offsets.put(sourceIndex);
                                        offsets.put(sourceIndex);
                                    }
                                }else {
                                    /* target overflow */
                                    if(offsets != null){
                                        offsets.put(sourceIndex);
                                    }
                                    charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff));
                                    charErrorBufferLength = 1;
                                    label = EndLoop;
                                    cr = CoderResult.OVERFLOW;
                                    LabelLoop = false;
                                    return label;
                                }
                            }
                        }
                        sourceIndex = nextSourceIndex;
                        state = readCommand;
                        label = FastSingle;
                        return label;
                    case definePairOne:
                        dynamicWindow = (byte)((b>>5)&7);
                        byteOne = (byte)(b&0x1f);
                        toUBytesArray[1] = (byte)b;
                        toULength = 2;
                        state = definePairTwo;
                        break;
                    case definePairTwo:
                        data.toUDynamicOffsets[dynamicWindow] = 0x10000 + (byteOne<<15L | b<<7L);
                        sourceIndex = nextSourceIndex;
                        state = readCommand;
                        label = FastSingle;
                        return label;
                    case defineOne:
                        if(b==0){
                            /*callback (illegal)*/
                            toUBytesArray[1] = (byte)b;
                            toULength =2;
                            label = EndLoop;
                            return label;
                        }else if(b=fixedThreshold){
                            data.toUDynamicOffsets[dynamicWindow] = fixedOffsets[b-fixedThreshold];
                        }else{
                            /*callback (illegal)*/
                            toUBytesArray[1] = (byte)b;
                            toULength =2;
                            label = EndLoop;
                            return label;
                        }
                        sourceIndex = nextSourceIndex;
                        state = readCommand;
                        label = FastSingle;
                        return label;
                    }
                }

            }else if(modeType==UnicodeMode){
                while(source.hasRemaining()){
                    if(!target.hasRemaining()){
                        cr = CoderResult.OVERFLOW;
                        label = EndLoop;
                        return label;
                    }
                    b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
                    ++nextSourceIndex;
                    switch(state){
                    case readCommand:
                        if((short)((b -UC0)&UConverterConstants.UNSIGNED_BYTE_MASK)>(Urs - UC0)){
                            byteOne = b;
                            toUBytesArray[0] = (byte)b;
                            toULength = 1;
                            state = quotePairTwo;
                        }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UC7){
                            dynamicWindow = (byte)(b - UC0);
                            sourceIndex = nextSourceIndex;
                            isSingleByteMode = true;
                            label = FastSingle;
                            return label;
                        }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UD7){
                            dynamicWindow = (byte)(b - UD0);
                            isSingleByteMode = true;
                            toUBytesArray[0] = (byte)b;
                            toULength = 1;
                            state = defineOne;
                            label = SingleByteMode;
                            return label;
                        }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UDX){
                            isSingleByteMode = true;
                            toUBytesArray[0] = (byte)b;
                            toULength = 1;
                            state = definePairOne;
                            label = SingleByteMode;
                            return label;
                        }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UQU){
                            toUBytesArray[0] = (byte)b;
                            toULength = 1;
                            state = quotePairOne;
                        }else {
                            /* callback (illegal)*/
                            cr = CoderResult.malformedForLength(1);
                            toUBytesArray[0] = (byte)b;
                            toULength = 1;
                            label = EndLoop;
                            return label;
                        }
                        break;
                    case quotePairOne:
                        byteOne = b;
                        toUBytesArray[1] = (byte)b;
                        toULength = 2;
                        state = quotePairTwo;
                        break;
                    case quotePairTwo:
                        target.put((char)((byteOne<<8) | b));
                        if(offsets != null){
                            offsets.put(sourceIndex);
                        }
                        sourceIndex = nextSourceIndex;
                        state = readCommand;
                        label = FastSingle;
                        return label;
                    }
                }
            }
            label = EndLoop;
            return label;
        }

        private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            if(cr==CoderResult.OVERFLOW){
                state = readCommand;
            }else if(state == readCommand){
                toULength = 0;
            }
            data.toUIsSingleByteMode = isSingleByteMode;
            data.toUState = state;
            data.toUQuoteWindow = quoteWindow;
            data.toUDynamicWindow = dynamicWindow;
            data.toUByteOne = byteOne;
            LabelLoop = false;
        }
    }

    class CharsetEncoderSCSU extends CharsetEncoderICU{
        public CharsetEncoderSCSU(CharsetICU cs) {
            super(cs, fromUSubstitution);
            implReset();
        }

        //private SCSUData data;
        @Override
        protected void implReset() {
            super.implReset();
            extraInfo.initialize();
        }

        /* label values for supporting behavior similar to goto in C */
        private static final int Loop=0;
        private static final int GetTrailUnicode=1;
        private static final int OutputBytes=2;
        private static final int EndLoop =3;

        private int delta;
        private int length;

        ///variables of compression heuristics
        private int offset;
        private char lead, trail;
        private int code;
        private byte window;

        //Get the state machine state
        private boolean isSingleByteMode;
        private byte dynamicWindow ;
        private int currentOffset;
        int c;

        SCSUData data ;

        //sourceIndex=-1 if the current character began in the previous buffer
        private int sourceIndex ;
        private int nextSourceIndex;
        private int targetCapacity;

        private boolean LabelLoop;//used to break the while loop
        private boolean AfterGetTrail;// its value is set to true in order to ignore the code before getTrailSingle:
        private boolean AfterGetTrailUnicode;// is value is set to true in order to ignore the code before getTrailUnicode:

        CoderResult cr;

        @Override
        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
            data = extraInfo;
            cr = CoderResult.UNDERFLOW;

            //Get the state machine state
            isSingleByteMode = data.fromUIsSingleByteMode;
            dynamicWindow = data.fromUDynamicWindow;
            currentOffset = data.fromUDynamicOffsets[dynamicWindow];
            c = fromUChar32;

            sourceIndex = c== 0 ? 0: -1 ;
            nextSourceIndex = 0;


            targetCapacity = target.limit()-target.position();

            //sourceIndex=-1 if the current character began in the previous buffer
            sourceIndex = c== 0 ? 0: -1 ;
            nextSourceIndex = 0;

            int labelType = Loop; // set to Loop so that the code starts from loop:
            LabelLoop = true;
            AfterGetTrail = false;
            AfterGetTrailUnicode = false;

            while(LabelLoop){
                switch(labelType){
                case Loop:
                    labelType = loop(source, target, offsets);
                    break;
                case GetTrailUnicode:
                    labelType = getTrailUnicode(source, target, offsets);
                    break;
                case OutputBytes:
                    labelType = outputBytes(source, target, offsets);
                    break;
                case EndLoop:
                    endLoop(source, target, offsets);
                    break;
                }
            }
            return cr;
        }

        private byte getWindow(int[] offsets){
            int i;
            for (i=0;i<8;i++){
                if(((c-offsets[i]) & UConverterConstants.UNSIGNED_INT_MASK) <= 0x7f){
                    return (byte)i;
                }
            }
            return -1;
        }

        private boolean isInOffsetWindowOrDirect(int offsetValue, int a){
            return (a & UConverterConstants.UNSIGNED_INT_MASK)<=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK)+0x7f &
                    ((a & UConverterConstants.UNSIGNED_INT_MASK)>=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK) ||
                            ((a & UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && ((a & UConverterConstants.UNSIGNED_INT_MASK)>=0x20
                                    || ((1L<<(a & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0)));
        }

        private byte getNextDynamicWindow(){
            byte windowValue = data.windowUse[data.nextWindowUseIndex];
            if(++data.nextWindowUseIndex==8){
                data.nextWindowUseIndex=0;
            }
            return windowValue;
        }

        private void useDynamicWindow(byte windowValue){
            /*first find the index of the window*/
            int i,j;
            i = data.nextWindowUseIndex;
            do{
                if(--i<0){
                    i=7;
                }
            }while(data.windowUse[i]!=windowValue);

            /*now copy each window[i+1] to [i]*/
            j= i+1;
            if(j==8){
                j=0;
            }
            while(j!=data.nextWindowUseIndex){
                data.windowUse[i] = data.windowUse[j];
                i=j;
                if(++j==8){
                    j=0;
                }
            }

            /*finally, set the window into the most recently used index*/
            data.windowUse[i]= windowValue;
        }


       private int getDynamicOffset(){
            int i;
            for(i=0;i<7;++i){
                if(((c-fixedOffsets[i])&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){
                    offset = fixedOffsets[i];
                    return 0xf9+i;
                }
            }
            if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x80){
                /*No dynamic window for US-ASCII*/
                return -1;
            }else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x3400 || ((c-0x10000)&UConverterConstants.UNSIGNED_INT_MASK)<(0x14000-0x10000) ||
                    ((c-0x1d000)&UConverterConstants.UNSIGNED_INT_MASK)<=(0x1ffff-0x1d000)){
                /*This character is in the code range for a "small", i.e, reasonably windowable, script*/
                offset = c&0x7fffff80;
                return (c>>7);
            }else if(0xe000<=(c&UConverterConstants.UNSIGNED_INT_MASK) && (c&UConverterConstants.UNSIGNED_INT_MASK)!=0xfeff && (c&UConverterConstants.UNSIGNED_INT_MASK) < 0xfff0){
                /*for these characters we need to take the gapOffset into account*/
                offset=(c)&0x7fffff80;
                return ((c-gapOffset)>>7);
            }else{
                return -1;
            }
        }

        private int loop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            int label = 0;
            if(isSingleByteMode){
                if(c!=0 && targetCapacity>0 && !AfterGetTrail){
                    label = getTrail(source, target, offsets);
                    return label;
                }
                /*state machine for single byte mode*/
                while(AfterGetTrail || source.hasRemaining()){
                    if(targetCapacity<=0 && !AfterGetTrail){
                        /*target is full*/
                        cr = CoderResult.OVERFLOW;
                        label = EndLoop;
                        return label;
                    }
                    if(!AfterGetTrail){
                        c = source.get();
                        ++nextSourceIndex;

                    }
                    if(((c -0x20)&UConverterConstants.UNSIGNED_INT_MASK)<=0x5f && !AfterGetTrail){
                        /*pass US-ASCII graphic character through*/
                        target.put((byte)c);
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                        --targetCapacity;
                    }else if((c & UConverterConstants.UNSIGNED_INT_MASK)<0x20 && !AfterGetTrail){
                        if(((1L<<(c & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0){
                            /*CR/LF/TAB/NUL*/
                            target.put((byte)c);
                            if(offsets!=null){
                                offsets.put(sourceIndex);
                            }
                            --targetCapacity;
                        } else {
                            /*quote c0 control character*/
                            c|=SQ0<<8;
                            length = 2;
                            label = OutputBytes;
                            return label;
                        }
                    } else if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && !AfterGetTrail){
                        /*use the current dynamic window*/
                        target.put((byte)(delta|0x80));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                        --targetCapacity;
                    } else if(AfterGetTrail || UTF16.isSurrogate(c)){
                        if(!AfterGetTrail){
                            if(UTF16.isLeadSurrogate(c)){
                                label = getTrail(source, target, offsets);
                                if(label==EndLoop){
                                    return label;
                                }
                            } else {
                                /*this is unmatched lead code unit (2nd Surrogate)*/
                                /*callback(illegal)*/
                                cr = CoderResult.malformedForLength(1);
                                label = EndLoop;
                                return label;
                            }
                        }


                        if(AfterGetTrail){
                            AfterGetTrail = false;
                        }

                        /*Compress supplementary character U+10000...U+10ffff */
                        if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){
                            /*use the current dynamic window*/
                            target.put((byte)(delta|0x80));
                            if(offsets!=null){
                                offsets.put(sourceIndex);
                            }
                            --targetCapacity;
                        } else if((window=getWindow(data.fromUDynamicOffsets))>=0){
                            /*there is a dynamic window that contains this character, change to it*/
                            dynamicWindow = window;
                            currentOffset = data.fromUDynamicOffsets[dynamicWindow];
                            useDynamicWindow(dynamicWindow);
                            c = ((SC0+dynamicWindow)<<8 | (c-currentOffset)|0x80);
                            length = 2;
                            label  = OutputBytes;
                            return label;
                        } else if((code=getDynamicOffset())>=0){
                            /*might check if there are come character in this window to come */
                            /*define an extended window with this character*/
                            code-=0x200;
                            dynamicWindow=getNextDynamicWindow();
                            currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset;
                            useDynamicWindow(dynamicWindow);
                            c = ((SDX<<24) | (dynamicWindow<<21)|
                                 (code<<8)| (c- currentOffset) |0x80);
                           // c = (((SDX)<<25) | (dynamicWindow<<21)|
                             //           (code<<8)| (c- currentOffset) |0x80  );
                            length = 4;
                            label = OutputBytes;
                            return label;
                        } else {
                            /*change to unicode mode and output this (lead, trail) pair*/
                            isSingleByteMode = false;
                            target.put((byte)SCU);
                            if(offsets!=null){
                                offsets.put(sourceIndex);
                            }
                            --targetCapacity;
                            c = (lead<<16)|trail;
                            length = 4;
                            label = OutputBytes;
                            return label;
                        }
                    } else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0xa0){
                        /*quote C1 control character*/
                        c = (c&0x7f) | (SQ0+1)<<8; /*SQ0+1 == SQ1*/
                        length = 2;
                        label = OutputBytes;
                        return label;
                    } else if((c&UConverterConstants.UNSIGNED_INT_MASK)==0xfeff || (c&UConverterConstants.UNSIGNED_INT_MASK)>= 0xfff0){
                        /*quote signature character = byte order mark and specials*/
                        c |= SQU<<16;
                        length = 3;
                        label = OutputBytes;
                        return label;
                    } else {
                        /*compress all other BMP characters*/
                        if((window=getWindow(data.fromUDynamicOffsets))>=0){
                            /*there is a window defined that contains this character - switch to it or quote from it*/
                            if(source.position()>=source.limit() || isInOffsetWindowOrDirect(data.fromUDynamicOffsets[window], source.get(source.position()))){
                                /*change to dynamic window*/
                                dynamicWindow = window;
                                currentOffset = data.fromUDynamicOffsets[dynamicWindow];
                                useDynamicWindow(dynamicWindow);
                                c = ((SC0+window)<<8) | (c- currentOffset) | 0x80;
                                length = 2;
                                label = OutputBytes;
                                return label;
                            } else {
                                /*quote from dynamic window*/
                                c = ((SQ0+window)<<8) | (c - data.fromUDynamicOffsets[window]) |
                                    0x80;
                                length = 2;
                                label = OutputBytes;
                                return label;
                            }
                        } else if((window = getWindow(staticOffsets))>=0){
                            /*quote from static window*/
                            c = ((SQ0+window)<<8) | (c - staticOffsets[window]);
                            length = 2;
                            label = OutputBytes;
                            return label;
                        }else if((code=getDynamicOffset())>=0){
                            /*define a dynamic window with this character*/
                            dynamicWindow = getNextDynamicWindow();
                            currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset;
                            useDynamicWindow(dynamicWindow);
                            c = ((SD0+dynamicWindow)<<16) | (code<<8)|
                                (c - currentOffset) | 0x80;
                            length = 3;
                            label = OutputBytes;
                            return label;
                        } else if(((int)((c-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && (source.position()>=source.limit() ||
                                ((int)((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))< (0xd800 - 0x3400))){

                            /*
                             * this character is not compressible (a BMP ideograph of similar)
                             * switch to Unicode mode if this is the last character in the block
                             * or there is at least one more ideograph following immediately
                             */
                             isSingleByteMode = false;
                             c|=SCU<<16;
                             length =3;
                             label = OutputBytes;
                             return label;
                        } else {
                            /*quote Unicode*/
                            c|=SQU<<16;
                            length = 3;
                            label = OutputBytes;
                            return label;
                        }
                    }
                    /*normal end of conversion : prepare for new character */
                    c = 0;
                    sourceIndex = nextSourceIndex;
                }
            } else {
                if(c!=0 && targetCapacity>0 && !AfterGetTrailUnicode){
                    label = GetTrailUnicode;
                    return label;
                }

                /*state machine for Unicode*/
                /*unicodeByteMode*/
                while(AfterGetTrailUnicode || source.hasRemaining()){
                    if(targetCapacity<=0 && !AfterGetTrailUnicode){
                        /*target is full*/
                        cr = CoderResult.OVERFLOW;
                        LabelLoop = false;
                        break;
                    }
                    if(!AfterGetTrailUnicode){
                        c = source.get();
                        ++nextSourceIndex;
                    }

                    if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && !AfterGetTrailUnicode){
                        /*not compressible, write character directly */
                        if(targetCapacity>=2){
                            target.put((byte)(c>>8));
                            target.put((byte)c);
                            if(offsets!=null){
                                offsets.put(sourceIndex);
                                offsets.put(sourceIndex);
                            }
                            targetCapacity-=2;
                        } else {
                            length =2;
                            label = OutputBytes;
                            return label;
                        }
                    } else if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300*/&& !AfterGetTrailUnicode){
                        /*compress BMP character if the following one is not an uncompressible ideograph*/
                        if(!(source.hasRemaining() && (((source.get(source.position())-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400))){
                            if(((((c-0x30)&UConverterConstants.UNSIGNED_INT_MASK))<10 || (((c-0x61)&UConverterConstants.UNSIGNED_INT_MASK))<26
                                    || (((c-0x41)&UConverterConstants.UNSIGNED_INT_MASK))<26)){
                                /*ASCII digit or letter*/
                                isSingleByteMode = true;
                                c |=((UC0+dynamicWindow)<<8)|c;
                                length = 2;
                                label = OutputBytes;
                                return label;
                            } else if((window=getWindow(data.fromUDynamicOffsets))>=0){
                                /*there is a dynamic window that contains this character, change to it*/
                                isSingleByteMode = true;
                                dynamicWindow = window;
                                currentOffset = data.fromUDynamicOffsets[dynamicWindow];
                                useDynamicWindow(dynamicWindow);
                                c = ((UC0+dynamicWindow)<<8) | (c- currentOffset) | 0x80;
                                length = 2;
                                label = OutputBytes;
                                return label;
                            } else if((code=getDynamicOffset())>=0){
                                /*define a dynamic window with this character*/
                                isSingleByteMode = true;
                                dynamicWindow = getNextDynamicWindow();
                                currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset;
                                useDynamicWindow(dynamicWindow);
                                c = ((UD0+dynamicWindow)<<16) | (code<<8)
                                    |(c - currentOffset) | 0x80;
                                length = 3;
                                label = OutputBytes;
                                return label;
                            }
                        }

                        /*don't know how to compress these character, just write it directly*/
                        length = 2;
                        label = OutputBytes;
                        return label;
                    } else if(c<0xe000 && !AfterGetTrailUnicode){
                        label = GetTrailUnicode;
                        return label;
                    } else if (!AfterGetTrailUnicode){
                        /*quote to avoid SCSU tags*/
                        c|=UQU<<16;
                        length = 3;
                        label = OutputBytes;
                        return label;
                    }

                    if(AfterGetTrailUnicode){
                        AfterGetTrailUnicode = false;
                    }
                    /*normal end of conversion, prepare for a new character*/
                    c = 0;
                    sourceIndex = nextSourceIndex;
                }
            }
            label = EndLoop;
            return label;
        }

        private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            lead = (char)c;
            int label = Loop;
            if(source.hasRemaining()){
                /*test the following code unit*/
                trail = source.get(source.position());
                if(UTF16.isTrailSurrogate(trail)){
                    source.position(source.position()+1);
                    ++nextSourceIndex;
                    c = UCharacter.getCodePoint(c, trail);
                    label = Loop;
                } else {
                    /*this is unmatched lead code unit (1st Surrogate)*/
                    /*callback(illegal)*/
                    cr = CoderResult.malformedForLength(1);
                    label = EndLoop;
                }
            }else {
                /*no more input*/
                label = EndLoop;
            }
            AfterGetTrail = true;
            return label;
        }

        private int getTrailUnicode(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            int label = EndLoop;
            AfterGetTrailUnicode = true;
            /*c is surrogate*/
            if(UTF16.isLeadSurrogate(c)){
      // getTrailUnicode:
                lead = (char)c;
                if(source.hasRemaining()){
                    /*test the following code unit*/
                    trail = source.get(source.position());
                    if(UTF16.isTrailSurrogate(trail)){
                        source.get();
                        ++nextSourceIndex;
                        c = UCharacter.getCodePoint(c, trail);
                        /*convert this surrogate code point*/
                        /*exit this condition tree*/
                    } else {
                        /*this is unmatched lead code unit(1st surrogate)*/
                        /*callback(illegal)*/
                        cr = CoderResult.malformedForLength(1);
                        label = EndLoop;
                        return label;
                    }
                } else {
                    /*no more input*/
                    label = EndLoop;
                    return label;
                }
            } else {
                /*this is an unmatched trail code point (2nd surrogate)*/
                /*callback (illegal)*/
                cr = CoderResult.malformedForLength(1);
                label = EndLoop;
                return label;
            }

            /*compress supplementary character*/
            if((window=getWindow(data.fromUDynamicOffsets))>=0 &&
                    !(source.hasRemaining() && ((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK) <
                            (0xd800 - 0x3400))){
                /*
                 * this is the dynamic window that contains this character and the following
                 * character is not uncompressible,
                 * change to the window
                 */
                isSingleByteMode = true;
                dynamicWindow = window;
                currentOffset = data.fromUDynamicOffsets[dynamicWindow];
                useDynamicWindow(dynamicWindow);
                c = ((UC0+dynamicWindow)<<8 | (c-currentOffset) | 0x80);
                length = 2;
                label = OutputBytes;
                return label;
            } else if(source.hasRemaining() && lead == source.get(source.position()) && (code=getDynamicOffset())>=0){
                /*two supplementary characters in (probably) the same window - define an extended one*/
                isSingleByteMode = true;
                dynamicWindow = getNextDynamicWindow();
                currentOffset = data.fromUDynamicOffsets[dynamicWindow] = offset;
                useDynamicWindow(dynamicWindow);
                c = (UDX<<24) | (dynamicWindow<<21) |(code<<8) |(c - currentOffset) | 0x80;
                length = 4;
                label = OutputBytes;
                return label;
            } else {
                /*don't know how to compress this character, just write it directly*/
                c = (lead<<16)|trail;
                length = 4;
                label = OutputBytes;
                return label;
            }

        }

        private void endLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            /*set the converter state back to UConverter*/
            data.fromUIsSingleByteMode = isSingleByteMode;
            data.fromUDynamicWindow = dynamicWindow;
            fromUChar32 = c;
            LabelLoop = false;
        }

        @SuppressWarnings("fallthrough")
        private int outputBytes(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            int label;
            //int targetCapacity = target.limit()-target.position();
            /*write the output character byte from c and length*/
            /*from the first if in the loop we know that targetCapacity>0*/
            if(length<=targetCapacity){
                switch(length){
                /*each branch falls through the next one*/
                    case 4:
                        target.put((byte)(c>>24));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                    case 3:
                        target.put((byte)(c>>16));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                    case 2:
                        target.put((byte)(c>>8));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                    case 1:
                        target.put((byte)c);
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                    default:
                        /*will never occur*/
                        break;
                }
                targetCapacity-=length;

                /*normal end of conversion: prepare for a new character*/
                c = 0;
                sourceIndex = nextSourceIndex;
                label = Loop;
                return label;
            } else {
                ByteBuffer p = ByteBuffer.wrap(errorBuffer);
                /*
                 * We actually do this backwards here:
                 * In order to save an intermediate variable, we output
                 * first to the overflow buffer what does not fit into the
                 * regular target
                 */
                /* we know that 0<=targetCapacity>24));
                    case 3:
                        p.put((byte)(c>>16));
                    case 2:
                        p.put((byte)(c>>8));
                    case 1:
                        p.put((byte)c);
                    default:
                        /*will never occur*/
                        break;
                }
                errorBufferLength = length;

                /*now output what fits into the regular target*/
                c>>=8*length; //length was reduced by targetCapacity
                switch(targetCapacity){
                    /*each branch falls through the next one*/
                    case 3:
                        target.put((byte)(c>>16));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                    case 2:
                        target.put((byte)(c>>8));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                    case 1:
                        target.put((byte)c);
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                    default:
                        break;
                }

                /*target overflow*/
                targetCapacity = 0;
                cr = CoderResult.OVERFLOW;
                c = 0;
                label = EndLoop;
                return label;
            }
        }

    }

    @Override
    public CharsetDecoder newDecoder() {
        return new CharsetDecoderSCSU(this);
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderSCSU(this);
    }

    @Override
    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
        CharsetICU.getCompleteUnicodeSet(setFillIn);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy