com.ctc.wstx.io.UTF8Writer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of woodstox-core-asl Show documentation
Woodstox is a high-performance XML processor that implements Stax (JSR-173) and SAX2 APIs
There is a newer version: 4.4.1
package com.ctc.wstx.io;

import java.io.*;

import com.ctc.wstx.api.WriterConfig;

/**
 * Specialized buffering UTF-8 writer used by
 * {@link com.ctc.wstx.sw.XmlWriter}.
 * The main reason for custom version is to allow for efficient
 * buffer recycling; the second benefit is that encoder has less
 * overhead for short content encoding (compared to JDK default
 * codecs).
 */
public final class UTF8Writer
    extends Writer
    implements CompletelyCloseable
{
    private final static int DEFAULT_BUF_LEN = 4000;

    final static int SURR1_FIRST = 0xD800;
    final static int SURR1_LAST = 0xDBFF;
    final static int SURR2_FIRST = 0xDC00;
    final static int SURR2_LAST = 0xDFFF;

    final WriterConfig mConfig;

    final boolean mAutoCloseOutput;

    final OutputStream mOut;

    byte[] mOutBuffer;

    final int mOutBufferLast;

    int mOutPtr;

    /**
     * When outputting chars from BMP, surrogate pairs need to be coalesced.
     * To do this, both pairs must be known first; and since it is possible
     * pairs may be split, we need temporary storage for the first half
     */
    int mSurrogate = 0;

    public UTF8Writer(WriterConfig cfg, OutputStream out, boolean autoclose)
    {
        mConfig = cfg;
        mAutoCloseOutput = autoclose;
        mOut = out;
        mOutBuffer = (mConfig == null) ? new byte[DEFAULT_BUF_LEN] : cfg.allocFullBBuffer(DEFAULT_BUF_LEN);
        /* Max. expansion for a single char (in unmodified UTF-8) is
         * 4 bytes (or 3 depending on how you view it -- 4 when recombining
         * surrogate pairs)
         */
        mOutBufferLast = mOutBuffer.length - 4;
        mOutPtr = 0;
    }

    /*
    ////////////////////////////////////////////////////////
    // CompletelyCloseable impl
    ////////////////////////////////////////////////////////
     */

    public void closeCompletely() throws IOException
    {
        _close(true);
    }

    /*
    ////////////////////////////////////////////////////////
    // java.io.Writer implementation
    ////////////////////////////////////////////////////////
     */

    /* !!! 30-Nov-2006, TSa: Due to co-variance between Appendable and
     *    Writer, this would not compile with javac 1.5, in 1.4 mode
     *    (source and target set to "1.4". Not a huge deal, but since
     *    the base impl is just fine, no point in overriding it.
     */
    /*
    public Writer append(char c)
        throws IOException
               // note: this is a JDK 1.5 method
    {
        write(c);
        return this;
    }
    */

    public void close() throws IOException
    {
        _close(mAutoCloseOutput);
    }

    public void flush()
        throws IOException
    {
        if (mOutPtr > 0 && mOutBuffer != null) {
            mOut.write(mOutBuffer, 0, mOutPtr);
            mOutPtr = 0;
        }
        mOut.flush();
    }

    public void write(char[] cbuf)
        throws IOException
    {
        write(cbuf, 0, cbuf.length);
    }

    public void write(char[] cbuf, int off, int len)
        throws IOException
    {
        if (len < 2) {
            if (len == 1) {
                write(cbuf[off]);
            }
            return;
        }

        // First: do we have a leftover surrogate to deal with?
        if (mSurrogate > 0) {
            char second = cbuf[off++];
            --len;
            write(_convertSurrogate(second));
            // will have at least one more char
        }

        int outPtr = mOutPtr;
        byte[] outBuf = mOutBuffer;
        int outBufLast = mOutBufferLast; // has 4 'spare' bytes

        // All right; can just loop it nice and easy now:
        len += off; // len will now be the end of input buffer

        output_loop:
        for (; off < len; ) {
            /* First, let's ensure we can output at least 4 bytes
             * (longest UTF-8 encoded codepoint):
             */
            if (outPtr >= outBufLast) {
                mOut.write(outBuf, 0, outPtr);
                outPtr = 0;
            }

            int c = cbuf[off++];
            // And then see if we have an Ascii char:
            if (c < 0x80) { // If so, can do a tight inner loop:
                outBuf[outPtr++] = (byte)c;
                // Let's calc how many ascii chars we can copy at most:
                int maxInCount = (len - off);
                int maxOutCount = (outBufLast - outPtr);

                if (maxInCount > maxOutCount) {
                    maxInCount = maxOutCount;
                }
                maxInCount += off;
                ascii_loop:
                while (true) {
                    if (off >= maxInCount) { // done with max. ascii seq
                        continue output_loop;
                    }
                    c = cbuf[off++];
                    if (c >= 0x80) {
                        break ascii_loop;
                    }
                    outBuf[outPtr++] = (byte) c;
                }
            }

            // Nope, multi-byte:
            if (c < 0x800) { // 2-byte
                outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
                outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (c < SURR1_FIRST || c > SURR2_LAST) {
                    outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
                    outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                    outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
                    continue;
                }
                // Yup, a surrogate:
                if (c > SURR1_LAST) { // must be from first range
                    mOutPtr = outPtr;
                    throwIllegal(c);
                }
                mSurrogate = c;
                // and if so, followed by another from next range
                if (off >= len) { // unless we hit the end?
                    break;
                }
                c = _convertSurrogate(cbuf[off++]);
                if (c > 0x10FFFF) { // illegal, as per RFC 3629
                    mOutPtr = outPtr;
                    throwIllegal(c);
                }
                outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
                outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
            }
        }
        mOutPtr = outPtr;
    }
    
    public void write(int c)
        throws IOException
    {
        // First; do we have a left over surrogate?
        if (mSurrogate > 0) {
            c = _convertSurrogate(c);
            // If not, do we start with a surrogate?
        } else if (c >= SURR1_FIRST && c <= SURR2_LAST) {
            // Illegal to get second part without first:
            if (c > SURR1_LAST) {
                throwIllegal(c);
            }
            // First part just needs to be held for now
            mSurrogate = c;
            return;
        }

        if (mOutPtr >= mOutBufferLast) { // let's require enough room, first
            mOut.write(mOutBuffer, 0, mOutPtr);
            mOutPtr = 0;
        }

        if (c < 0x80) { // ascii
            mOutBuffer[mOutPtr++] = (byte) c;
        } else {
            int ptr = mOutPtr;
            if (c < 0x800) { // 2-byte
                mOutBuffer[ptr++] = (byte) (0xc0 | (c >> 6));
                mOutBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
            } else if (c <= 0xFFFF) { // 3 bytes
                mOutBuffer[ptr++] = (byte) (0xe0 | (c >> 12));
                mOutBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                mOutBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
            } else { // 4 bytes
                if (c > 0x10FFFF) { // illegal, as per RFC 3629
                    throwIllegal(c);
                }
                mOutBuffer[ptr++] = (byte) (0xf0 | (c >> 18));
                mOutBuffer[ptr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                mOutBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                mOutBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
            }
            mOutPtr = ptr;
        }
    }

    public void write(String str)
        throws IOException
    {
        write(str, 0, str.length());
    }

    public void write(String str, int off, int len) 
        throws IOException
    {
        if (len < 2) {
            if (len == 1) {
                write(str.charAt(off));
            }
            return;
        }

        // First: do we have a leftover surrogate to deal with?
        if (mSurrogate > 0) {
            char second = str.charAt(off++);
            --len;
            write(_convertSurrogate(second));
            // will have at least one more char (case of 1 char was checked earlier on)
        }

        int outPtr = mOutPtr;
        byte[] outBuf = mOutBuffer;
        int outBufLast = mOutBufferLast; // has 4 'spare' bytes

        // All right; can just loop it nice and easy now:
        len += off; // len will now be the end of input buffer

        output_loop:
        for (; off < len; ) {
            /* First, let's ensure we can output at least 4 bytes
             * (longest UTF-8 encoded codepoint):
             */
            if (outPtr >= outBufLast) {
                mOut.write(outBuf, 0, outPtr);
                outPtr = 0;
            }

            int c = str.charAt(off++);
            // And then see if we have an Ascii char:
            if (c < 0x80) { // If so, can do a tight inner loop:
                outBuf[outPtr++] = (byte)c;
                // Let's calc how many ascii chars we can copy at most:
                int maxInCount = (len - off);
                int maxOutCount = (outBufLast - outPtr);

                if (maxInCount > maxOutCount) {
                    maxInCount = maxOutCount;
                }
                maxInCount += off;
                ascii_loop:
                while (true) {
                    if (off >= maxInCount) { // done with max. ascii seq
                        continue output_loop;
                    }
                    c = str.charAt(off++);
                    if (c >= 0x80) {
                        break ascii_loop;
                    }
                    outBuf[outPtr++] = (byte) c;
                }
            }

            // Nope, multi-byte:
            if (c < 0x800) { // 2-byte
                outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
                outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
            } else { // 3 or 4 bytes
                // Surrogates?
                if (c < SURR1_FIRST || c > SURR2_LAST) {
                    outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
                    outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                    outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
                    continue;
                }
                // Yup, a surrogate:
                if (c > SURR1_LAST) { // must be from first range
                    mOutPtr = outPtr;
                    throwIllegal(c);
                }
                mSurrogate = c;
                // and if so, followed by another from next range
                if (off >= len) { // unless we hit the end?
                    break;
                }
                c = _convertSurrogate(str.charAt(off++));
                if (c > 0x10FFFF) { // illegal, as per RFC 3629
                    mOutPtr = outPtr;
                    throwIllegal(c);
                }
                outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
                outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
            }
        }
        mOutPtr = outPtr;
    }

    /*
    ////////////////////////////////////////////////////////////
    // Internal methods
    ////////////////////////////////////////////////////////////
     */

    private final void _close(boolean forceClosing)
        throws IOException
    {
        byte[] buf = mOutBuffer;
        if (buf != null) {
            mOutBuffer = null;
            if (mOutPtr > 0) {
                mOut.write(buf, 0, mOutPtr);
                mOutPtr = 0;
            }
            if (mConfig != null) {
                mConfig.freeFullBBuffer(buf);
            }
        }

        if (forceClosing) {
            mOut.close();
        }
        
        /* Let's 'flush' orphan surrogate, no matter what; but only
         * after cleanly closing everything else.
         */
        int code = mSurrogate;
        if (code > 0) {
            mSurrogate = 0;
            throwIllegal(code);
        }
    }

    /**
     * Method called to calculate UTF codepoint, from a surrogate pair.
     */
    private final int _convertSurrogate(int secondPart)
        throws IOException
    {
        int firstPart = mSurrogate;
        mSurrogate = 0;

        // Ok, then, is the second part valid?
        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
            throw new IOException("Broken surrogate pair: first char 0x"+Integer.toHexString(firstPart)+", second 0x"+Integer.toHexString(secondPart)+"; illegal combination");
        }
        return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST);
    }

    private void throwIllegal(int code)
        throws IOException
    {
        if (code > 0x10FFFF) { // over max?
            throw new IOException("Illegal character point (0x"+Integer.toHexString(code)+") to output; max is 0x10FFFF as per RFC 3629");
        }
        if (code >= SURR1_FIRST) {
            if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
                throw new IOException("Unmatched first part of surrogate pair (0x"+Integer.toHexString(code)+")");
            }
            throw new IOException("Unmatched second part of surrogate pair (0x"+Integer.toHexString(code)+")");
        }

        // should we ever get this?
        throw new IOException("Illegal character point (0x"+Integer.toHexString(code)+") to output");
    }
}