com.fasterxml.aalto.in.Utf32Reader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
There is a newer version: 1.3.3
/* Woodstox Lite ("wool") XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, [email protected]
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.io.*;


import com.fasterxml.aalto.in.ReaderConfig;
import com.fasterxml.aalto.util.XmlConsts;

/**
 * Since JDK does not come with UTF-32/UCS-4, let's implement a simple
 * decoder to use.
 */
public final class Utf32Reader
    extends Reader
{
    final static char NULL_CHAR = (char) 0;

    final ReaderConfig mConfig;

    protected InputStream mIn;

    protected byte[] mBuffer;

    protected int mPtr;
    protected int mLength;

    final boolean mBigEndian;

    /**
     * Although input is fine with full Unicode set, Java still uses
     * 16-bit chars, so we may have to split high-order chars into
     * surrogate pairs.
     */
    char mSurrogate = NULL_CHAR;

    /**
     * Total read character count; used for error reporting purposes
     */
    int mCharCount = 0;

    /**
     * Total read byte count; used for error reporting purposes
     */
    int mByteCount = 0;

    /*
    ////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////
    */

    public Utf32Reader(ReaderConfig cfg, InputStream in,
                       byte[] buf, int ptr, int len,
                       boolean isBigEndian)
    {
        mConfig = cfg;
        mBigEndian = isBigEndian;
    }

    /*
    ////////////////////////////////////////
    // Reader API
    ////////////////////////////////////////
    */

    @Override
    public void close() throws IOException
    {
        InputStream in = mIn;

        if (in != null) {
            mIn = null;
            freeBuffers();
            in.close();
        }
    }

    char[] mTmpBuf = null;

    /**
     * Although this method is implemented by the base class, AND it should
     * never be called by Woodstox code, let's still implement it bit more
     * efficiently just in case
     */
    @Override
    public int read() throws IOException
    {
        if (mTmpBuf == null) {
            mTmpBuf = new char[1];
        }
        if (read(mTmpBuf, 0, 1) < 1) {
            return -1;
        }
        return mTmpBuf[0];
    }

    /*
    ////////////////////////////////////////
    // Public API
    ////////////////////////////////////////
    */

    @Override
    public int read(char[] cbuf, int start, int len) throws IOException
    {
        // Already EOF?
        if (mBuffer == null) {
            return -1;
        }
        if (len < 1) {
            return len;
        }
        // Let's then ensure there's enough room...
        if (start < 0 || (start+len) > cbuf.length) {
            reportBounds(cbuf, start, len);
        }

        len += start;
        int outPtr = start;

        // Ok, first; do we have a surrogate from last round?
        if (mSurrogate != NULL_CHAR) {
            cbuf[outPtr++] = mSurrogate;
            mSurrogate = NULL_CHAR;
            // No need to load more, already got one char
        } else {
            /* Note: we'll try to avoid blocking as much as possible. As a
             * result, we only need to get 4 bytes for a full char.
             */
            int left = (mLength - mPtr);
            if (left < 4) {
                if (!loadMore(left)) { // (legal) EOF?
                    return -1;
                }
            }
        }

        byte[] buf = mBuffer;

        main_loop:
        while (outPtr < len) {
            int ptr = mPtr;
            int ch;

            if (mBigEndian) {
                ch = (buf[ptr] << 24) | ((buf[ptr+1] & 0xFF) << 16)
                    | ((buf[ptr+2] & 0xFF) << 8) | (buf[ptr+3] & 0xFF);
            } else {
                ch = (buf[ptr] & 0xFF) | ((buf[ptr+1] & 0xFF) << 8)
                    | ((buf[ptr+2] & 0xFF) << 16) | (buf[ptr+3] << 24);
            }
            mPtr += 4;

            // Does it need to be split to surrogates?
            if (ch >= 0xD800) {
                // Illegal?
                if (ch > XmlConsts.MAX_UNICODE_CHAR) {
                    reportInvalid(ch, outPtr-start,
                                  "(above "+Integer.toHexString(XmlConsts.MAX_UNICODE_CHAR)+") ");
                }
                if (ch > 0xFFFF) { // need to split into surrogates?
                    ch -= 0x10000; // to normalize it starting with 0x0
                    cbuf[outPtr++] = (char) (0xD800 + (ch >> 10));
                    // hmmh. can this ever be 0? (not legal, at least?)
                    ch = (0xDC00 | (ch & 0x03FF));
                    // Room for second part?
                    if (outPtr >= len) { // nope
                        mSurrogate = (char) ch;
                        break main_loop;
                    }
                } else { // in 16-bit range... just need validity checks
                    if (ch < 0xE000) {
                        reportInvalid(ch, outPtr-start, "(a surrogate char) ");
                    } else if (ch >= 0xFFFE) {
                        reportInvalid(ch, outPtr-start, "");
                    }
                }
            }
            cbuf[outPtr++] = (char) ch;
            if (mPtr >= mLength) {
                break main_loop;
            }
        }
        
        len = outPtr - start;
        mCharCount += len;
        return len;
    }

    /*
    ////////////////////////////////////////
    // Internal methods
    ////////////////////////////////////////
    */


    /**
     * @param available Number of "unused" bytes in the input buffer
     *
     * @return True, if enough bytes were read to allow decoding of at least
     *   one full character; false if EOF was encountered instead.
     */
    private boolean loadMore(int available)
        throws IOException
    {
        mByteCount += (mLength - available);

        // Bytes that need to be moved to the beginning of buffer?
        if (available > 0) {
            if (mPtr > 0) {
                for (int i = 0; i < available; ++i) {
                    mBuffer[i] = mBuffer[mPtr+i];
                }
                mPtr = 0;
            }
            mLength = available;
        } else {
            /* Ok; here we can actually reasonably expect an EOF,
             * so let's do a separate read right away:
             */
            mPtr = 0;
            int count = mIn.read(mBuffer);
            if (count < 1) {
                mLength = 0;
                if (count < 0) { // -1
                    freeBuffers(); // to help GC?
                    return false;
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
            mLength = count;
        }

        /* Need at least 4 bytes; if we don't get that many, it's an
         * error.
         */
        while (mLength < 4) {
            int count = mIn.read(mBuffer, mLength, mBuffer.length - mLength);
            if (count < 1) {
                if (count < 0) { // -1, EOF... no good!
                    freeBuffers(); // to help GC?
                    reportUnexpectedEOF(mLength, 4);
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
            mLength += count;
        }
        return true;
    }

    public final void freeBuffers()
    {
        byte[] buf = mBuffer;
        if (buf != null) {
            mBuffer = null;
            if (mConfig != null) {
                mConfig.freeFullBBuffer(buf);
            }
        }
    }

    /*
    //////////////////////////////////////////
    // Error reporting
    //////////////////////////////////////////
     */

    private void reportUnexpectedEOF(int gotBytes, int needed)
        throws IOException
    {
        int bytePos = mByteCount + gotBytes;
        int charPos = mCharCount;

        throw new CharConversionException("Unexpected EOF in the middle of a 4-byte UTF-32 char: got "
                                          +gotBytes+", needed "+needed
                                          +", at char #"+charPos+", byte #"+bytePos+")");
    }

    private void reportInvalid(int value, int offset, String msg)
        throws IOException
    {
        int bytePos = mByteCount + mPtr - 1;
        int charPos = mCharCount + offset;

        throw new CharConversionException("Invalid UTF-32 character 0x"
                                          +Integer.toHexString(value)
                                          +msg+" at char #"+charPos+", byte #"+bytePos+")");
    }

    protected void reportBounds(char[] cbuf, int start, int len)
        throws IOException
    {
        throw new ArrayIndexOutOfBoundsException("read(buf,"+start+","+len+"), cbuf["+cbuf.length+"]");
    }

    protected void reportStrangeStream()
        throws IOException
    {
        throw new IOException("Strange I/O stream, returned 0 bytes on read");
    }
}