com.ibm.icu.charset.CharsetUTF32
icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.
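As a quick, hedged illustration (not part of this source file): with the icu4j-charset jar on the classpath, the UTF-32 charsets are reached through the standard java.nio Charset SPI. Whether the lookup below is answered by the ICU provider or by the JDK's own built-in UTF-32 charset depends on provider ordering, so treat this as a sketch rather than a guaranteed binding to CharsetUTF32; the class and variable names here are illustrative.

import java.nio.ByteBuffer;
import java.nio.charset.Charset;

public class Utf32Demo {
    public static void main(String[] args) {
        // Resolved through the Charset SPI; the ICU provider is expected to
        // register "UTF-32", "UTF-32BE" and "UTF-32LE".
        Charset utf32 = Charset.forName("UTF-32");

        // "A" plus U+1F600, a supplementary code point (a surrogate pair in UTF-16).
        String text = "A\uD83D\uDE00";

        // Each code point becomes 4 bytes; the ICU encoder for plain "UTF-32"
        // also prepends a BOM (see NEED_TO_WRITE_BOM below), the BE/LE variants do not.
        ByteBuffer bytes = utf32.encode(text);
        String roundTrip = utf32.decode(bytes).toString();
        System.out.println(text.equals(roundTrip)); // true
    }
}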
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
 * Copyright (C) 2006-2008, International Business Machines Corporation and
 * others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Niti Hantaweepant
*/
class CharsetUTF32 extends CharsetICU {
private static final int SIGNATURE_LENGTH = 4;
private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd };
private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 };
private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff };
private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 };
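// XOR-ing a byte index 0..3 with ENDIAN_XOR_LE (3) reverses it (0<->3, 1<->2), so the
// encode/decode loops below can address bytes in either byte order with the same code.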
private static final int ENDIAN_XOR_BE = 0;
private static final int ENDIAN_XOR_LE = 3;
private static final int NEED_TO_WRITE_BOM = 1;
private boolean isEndianSpecified;
private boolean isBigEndian;
private int endianXOR;
private byte[] bom;
private byte[] fromUSubstitution;
public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
super(icuCanonicalName, javaCanonicalName, aliases);
this.isEndianSpecified = (this instanceof CharsetUTF32BE || this instanceof CharsetUTF32LE);
this.isBigEndian = !(this instanceof CharsetUTF32LE);
if (isBigEndian) {
this.bom = BOM_BE;
this.fromUSubstitution = fromUSubstitution_BE;
this.endianXOR = ENDIAN_XOR_BE;
} else {
this.bom = BOM_LE;
this.fromUSubstitution = fromUSubstitution_LE;
this.endianXOR = ENDIAN_XOR_LE;
}
maxBytesPerChar = 4;
minBytesPerChar = 4;
maxCharsPerByte = 1;
}
class CharsetDecoderUTF32 extends CharsetDecoderICU {
private boolean isBOMReadYet;
private int actualEndianXOR;
private byte[] actualBOM;
public CharsetDecoderUTF32(CharsetICU cs) {
super(cs);
}
@Override
protected void implReset() {
super.implReset();
isBOMReadYet = false;
actualBOM = null;
}
@Override
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
/*
* If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
* converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
* are in the current buffer.
*/
if (!isBOMReadYet) {
while (true) {
if (!source.hasRemaining())
return CoderResult.UNDERFLOW;
toUBytesArray[toULength++] = source.get();
if (toULength == 1) {
// on the first byte, we haven't decided whether or not it's bigEndian yet
if ((!isEndianSpecified || isBigEndian)
&& toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
actualBOM = BOM_BE;
actualEndianXOR = ENDIAN_XOR_BE;
} else if ((!isEndianSpecified || !isBigEndian)
&& toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
actualBOM = BOM_LE;
actualEndianXOR = ENDIAN_XOR_LE;
} else {
// we do not have a BOM (and we have toULength==1 bytes)
actualBOM = null;
actualEndianXOR = endianXOR;
break;
}
} else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
// we do not have a BOM (and we have toULength bytes)
actualBOM = null;
actualEndianXOR = endianXOR;
break;
} else if (toULength == SIGNATURE_LENGTH) {
// we found a BOM! at last!
// too bad we have to ignore it now (like it was unwanted or something)
toULength = 0;
break;
}
}
isBOMReadYet = true;
}
// now that we no longer need to look for a BOM, let's do some work
int char32;
while (true) {
while (toULength < 4) {
if (!source.hasRemaining())
return CoderResult.UNDERFLOW;
toUBytesArray[toULength++] = source.get();
}
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
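// assemble the code point most-significant-byte first; XOR-ing the index
// with actualEndianXOR reverses the byte order for little-endian input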
char32 = 0;
for (int i = 0; i < 4; i++)
char32 = (char32 << 8)
| (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK);
if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) {
toULength = 0;
if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
/* fits in 16 bits */
target.put((char) char32);
} else {
/* write out the surrogates */
target.put(UTF16.getLeadSurrogate(char32));
char32 = UTF16.getTrailSurrogate(char32);
if (target.hasRemaining()) {
target.put((char) char32);
} else {
/* Put in overflow buffer (not handled here) */
charErrorBufferArray[0] = (char) char32;
charErrorBufferLength = 1;
return CoderResult.OVERFLOW;
}
}
} else {
return CoderResult.malformedForLength(toULength);
}
}
}
}
class CharsetEncoderUTF32 extends CharsetEncoderICU {
private final byte[] temp = new byte[4];
public CharsetEncoderUTF32(CharsetICU cs) {
super(cs, fromUSubstitution);
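// plain "UTF-32" (no endianness in the name) writes a BOM before the first output;
// the BE/LE variants never do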
fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
}
@Override
protected void implReset() {
super.implReset();
fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
}
@Override
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
CoderResult cr;
/* write the BOM if necessary */
if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
fromUnicodeStatus = 0;
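// sourceIndex -1: the BOM bytes do not correspond to any character in the source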
cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
if (cr.isOverflow())
return cr;
}
if (fromUChar32 != 0) {
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
// a note: fromUChar32 will either be 0 or a lead surrogate
cr = encodeChar(source, target, offsets, (char) fromUChar32);
if (cr != null)
return cr;
}
while (true) {
if (!source.hasRemaining())
return CoderResult.UNDERFLOW;
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
cr = encodeChar(source, target, offsets, source.get());
if (cr != null)
return cr;
}
}
private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
int sourceIndex = source.position() - 1;
CoderResult cr;
int char32;
if (UTF16.isSurrogate(ch)) {
cr = handleSurrogates(source, ch);
if (cr != null)
return cr;
char32 = fromUChar32;
fromUChar32 = 0;
} else {
char32 = ch;
}
/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
// temp[0 ^ endianXOR] = (byte) (char32 >>> 24); // (always 0)
temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f)
temp[2 ^ endianXOR] = (byte) (char32 >>> 8);
temp[3 ^ endianXOR] = (byte) (char32);
cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
return (cr.isUnderflow() ? null : cr);
}
}
@Override
public CharsetDecoder newDecoder() {
return new CharsetDecoderUTF32(this);
}
@Override
public CharsetEncoder newEncoder() {
return new CharsetEncoderUTF32(this);
}
@Override
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
getNonSurrogateUnicodeSet(setFillIn);
}
}
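A minimal sketch of the decoder's BOM handling, under the same assumptions as the example above (icu4j-charset on the classpath; the sample byte arrays are illustrative): a big-endian or little-endian signature at the start of the input selects the byte order, and the signature itself is not emitted as a character.

import java.nio.ByteBuffer;
import java.nio.charset.Charset;

public class Utf32BomDemo {
    public static void main(String[] args) {
        Charset utf32 = Charset.forName("UTF-32");

        // U+0041 'A' preceded by a big-endian BOM (00 00 FE FF): the decoder
        // consumes the signature and reads the remaining bytes big-endian.
        byte[] be = { 0, 0, (byte) 0xFE, (byte) 0xFF, 0, 0, 0, 0x41 };

        // The same code point behind a little-endian BOM (FF FE 00 00).
        byte[] le = { (byte) 0xFF, (byte) 0xFE, 0, 0, 0x41, 0, 0, 0 };

        System.out.println(utf32.decode(ByteBuffer.wrap(be))); // A
        System.out.println(utf32.decode(ByteBuffer.wrap(le))); // A
    }
}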