// src.java.com.ctc.wstx.sw.ISOLatin1XmlWriter Maven / Gradle / Ivy
//
// Go to download
/* Woodstox XML processor
 *
 * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in file LICENSE, included with
 * the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ctc.wstx.sw;

import java.io.*;

import javax.xml.stream.XMLStreamException;

import com.ctc.wstx.api.WriterConfig;
import com.ctc.wstx.io.CharsetNames;

/**
 * Concrete implementation of {@link EncodingXmlWriter} used when output
 * is to be encoded using ISO-8859-1, aka ISO-Latin1 encoding.
 *
 * Regarding surrogate pair handling: most of the checks are in the base
 * class, and here we only need to worry about the <code>writeRaw</code>
 * methods.
 */
public final class ISOLatin1XmlWriter
    extends EncodingXmlWriter
{
    /**
     * Constructs a writer that produces ISO-8859-1 encoded output.
     * All arguments are passed on to the {@link EncodingXmlWriter}
     * constructor, along with the {@link CharsetNames#CS_ISO_LATIN1}
     * encoding name.
     *
     * @param out Underlying output stream; handed to the base class
     * @param cfg Writer configuration; handed to the base class
     * @param autoclose Handed to the base class as is; presumably indicates
     *   whether the stream is to be closed along with the writer
     *   (TODO: confirm against the base class)
     *
     * @throws IOException If the base class constructor fails
     */
    public ISOLatin1XmlWriter(OutputStream out, WriterConfig cfg, boolean autoclose)
        throws IOException
    {
        super(out, cfg, CharsetNames.CS_ISO_LATIN1, autoclose);
    }

    /**
     * Writes given characters without any escaping, each one as a single
     * ISO-8859-1 byte.
     *<p>
     * If content checking (<code>mCheckContent</code>) is enabled, each
     * character is verified before being written: control characters
     * below 0x20 other than tab, line feed and carriage return are passed to
     * <code>handleInvalidChar</code> (and whatever it returns is written
     * instead); characters above 0xFF can not be represented in this
     * encoding and are reported via {@link #handleInvalidLatinChar}; and in
     * xml 1.1 mode characters 0x7F - 0x9F, except for 0x85, are passed to
     * <code>handleInvalidChar</code> as well.
     * Without content checking characters are just cast to bytes, so
     * anything above 0xFF loses its high bits.
     *
     * @param cbuf Buffer that contains characters to write
     * @param offset Index of the first character to write
     * @param len Number of characters to write
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the handlers reports a problem with content (including a pending
     *   first half of a surrogate pair from an earlier write)
     */
    public void writeRaw(char[] cbuf, int offset, int len)
        throws IOException
    {
        // Raw content can not complete a surrogate pair started earlier
        if (mSurrogate != 0) {
            throwUnpairedSurrogate();
        }

        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;
        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            if (mCheckContent) {
                for (int inEnd = offset + max; offset < inEnd; ++offset) {
                    int c = cbuf[offset];
                    if (c < 32) {
                        if (c == '\n') {
                            // !!! TBI: line nr
                        } else if (c == '\r') {
                            // !!! TBI: line nr (and skipping \n that may follow)
                        } else if (c != '\t') {
                            /* Handler needs to see up-to-date pointer; and
                             * since it may have flushed the buffer before
                             * returning a replacement, the pointer has to
                             * be read back (otherwise bytes already flushed
                             * would get written for the second time)
                             */
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                        }
                    } else if (c > 0x7E) {
                        if (c > 0xFF) {
                            mOutputPtr = ptr;
                            handleInvalidLatinChar(c);
                        } else if (mXml11) {
                            // xml 1.1 restricts 0x7F - 0x9F (inclusive), NEL excepted
                            if (c <= 0x9F && c != 0x85) {
                                mOutputPtr = ptr;
                                c = handleInvalidChar(c);
                                ptr = mOutputPtr;
                            }
                        }
                    }
                    mOutputBuffer[ptr++] = (byte) c;
                }
            } else {
                for (int inEnd = offset + max; offset < inEnd; ++offset) {
                    mOutputBuffer[ptr++] = (byte) cbuf[offset];
                }
            }
            len -= max;
        }
        mOutputPtr = ptr;
    }

    /**
     * Writes a section of given String without any escaping, each character
     * as a single ISO-8859-1 byte.
     *<p>
     * If content checking (<code>mCheckContent</code>) is enabled, each
     * character is verified before being written: control characters
     * below 0x20 other than tab, line feed and carriage return are passed to
     * <code>handleInvalidChar</code> (and whatever it returns is written
     * instead); characters above 0xFF can not be represented in this
     * encoding and are reported via {@link #handleInvalidLatinChar}; and in
     * xml 1.1 mode characters 0x7F - 0x9F, except for 0x85, are passed to
     * <code>handleInvalidChar</code> as well.
     * Without content checking characters are just cast to bytes, so
     * anything above 0xFF loses its high bits.
     *
     * @param str String that contains characters to write
     * @param offset Index of the first character to write
     * @param len Number of characters to write
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the handlers reports a problem with content (including a pending
     *   first half of a surrogate pair from an earlier write)
     */
    public void writeRaw(String str, int offset, int len)
        throws IOException
    {
        // Raw content can not complete a surrogate pair started earlier
        if (mSurrogate != 0) {
            throwUnpairedSurrogate();
        }
        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;
        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            if (mCheckContent) {
                for (int inEnd = offset + max; offset < inEnd; ++offset) {
                    int c = str.charAt(offset);
                    if (c < 32) {
                        if (c == '\n') {
                            // !!! TBI: line nr
                        } else if (c == '\r') {
                            // !!! TBI: line nr (and skipping \n that may follow)
                        } else if (c != '\t') {
                            /* Handler needs to see up-to-date pointer; and
                             * since it may have flushed the buffer before
                             * returning a replacement, the pointer has to
                             * be read back (otherwise bytes already flushed
                             * would get written for the second time)
                             */
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                        }
                    } else if (c > 0x7E) {
                        if (c > 0xFF) {
                            mOutputPtr = ptr;
                            handleInvalidLatinChar(c);
                        } else if (mXml11) {
                            // xml 1.1 restricts 0x7F - 0x9F (inclusive), NEL excepted
                            if (c <= 0x9F && c != 0x85) {
                                mOutputPtr = ptr;
                                c = handleInvalidChar(c);
                                ptr = mOutputPtr;
                            }
                        }
                    }
                    mOutputBuffer[ptr++] = (byte) c;
                }
            } else {
                for (int inEnd = offset + max; offset < inEnd; ++offset) {
                    mOutputBuffer[ptr++] = (byte) str.charAt(offset);
                }
            }
            len -= max;
        }
        mOutputPtr = ptr;
    }

    /**
     * Writes contents of an attribute value, escaping characters that can
     * not be output as is.
     *<p>
     * Printable ASCII characters other than '&lt;', '&amp;' and '"', as well
     * as characters 0xA0 - 0xFF, are written directly as single bytes.
     * Everything else is written via <code>writeAsEntity</code>: this
     * includes the three characters mentioned, white space below 0x20 (to
     * preserve it when round-tripping), characters 0x7F - 0x9F, and anything
     * above 0xFF. Surrogate pairs are first combined using
     * <code>calcSurrogate</code>; if the value ends with the first half of
     * a pair, that half is left in <code>mSurrogate</code> for the next call.
     *<p>
     * If content checking (<code>mCheckContent</code>) is enabled, control
     * characters below 0x20 other than tab, line feed and carriage return
     * are passed to <code>handleInvalidChar</code> instead of being escaped;
     * in xml 1.1 mode this is only done for the null character.
     *
     * @param data Attribute value to write
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the helper methods reports a problem with content
     */
    protected void writeAttrValue(String data)
        throws IOException
    {
        int offset = 0;
        int len = data.length();
        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;

        main_loop:
        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // Do we start with a surrogate?
            if (mSurrogate != 0) {
                int sec = data.charAt(offset++);
                sec = calcSurrogate(sec);
                mOutputPtr = ptr;
                // entity writer works off mOutputPtr, returns the new pointer
                ptr = writeAsEntity(sec);
                --len;
                continue main_loop;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            inner_loop:
            for (int inEnd = offset + max; offset < inEnd; ) {
                int c = data.charAt(offset++);
                if (c < 32) {
                    /* Need to quote all white space except for regular
                     * space chars, to preserve them (round-tripping)
                     */
                    // !!! TODO: line counting
                    if (mCheckContent) {
                        if (c != '\n' && c != '\r' && c != '\t'
                            && (!mXml11 || c == 0)) {
                            /* As with other write methods, handler needs to
                             * see up-to-date pointer; and since it may have
                             * flushed the buffer before returning a
                             * replacement, pointer has to be read back.
                             */
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                            mOutputBuffer[ptr++] = (byte) c;
                            continue;
                        }
                    }
                    // fall-through to char entity output
                } else if (c < 0x7F) {
                    if (c != '<' && c != '&' && c != '"') {
                        mOutputBuffer[ptr++] = (byte) c;
                        continue;
                    }
                    // otherwise fall back on quoting
                } else if (c > 0x9F && c <= 0xFF) {
                    mOutputBuffer[ptr++] = (byte) c;
                    continue; // [WSTX-88]
                } else {
                    // Surrogate?
                    if (c >= SURR1_FIRST && c <= SURR2_LAST) {
                        mSurrogate = c;
                        // Last char needs special handling:
                        if (offset == inEnd) {
                            break inner_loop;
                        }
                        c = calcSurrogate(data.charAt(offset++));
                        // Let's fall down to entity output
                    }
                }
                /* Has to be escaped as char entity; as such, also need
                 * to re-calc max. contiguous data we can output
                 */
                mOutputPtr = ptr;
                ptr = writeAsEntity(c);
                len = data.length() - offset;
                continue main_loop;
            }
            len -= max;
        }
        mOutputPtr = ptr;
    }

    /**
     * Writes contents of an attribute value, escaping characters that can
     * not be output as is.
     *<p>
     * Printable ASCII characters other than '&lt;', '&amp;' and '"', as well
     * as characters 0xA0 - 0xFF, are written directly as single bytes.
     * Everything else is written via <code>writeAsEntity</code>: this
     * includes the three characters mentioned, white space below 0x20 (to
     * preserve it when round-tripping), characters 0x7F - 0x9F, and anything
     * above 0xFF. Surrogate pairs are first combined using
     * <code>calcSurrogate</code>; if the value ends with the first half of
     * a pair, that half is left in <code>mSurrogate</code> for the next call.
     *<p>
     * If content checking (<code>mCheckContent</code>) is enabled, control
     * characters below 0x20 other than tab, line feed and carriage return
     * are passed to <code>handleInvalidChar</code> instead of being escaped;
     * in xml 1.1 mode this is only done for the null character.
     *
     * @param data Buffer that contains the attribute value
     * @param offset Index of the first character of the value
     * @param len Number of characters in the value
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the helper methods reports a problem with content
     */
    protected void writeAttrValue(char[] data, int offset, int len)
        throws IOException
    {
        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;

        main_loop:
        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // Do we start with a surrogate?
            if (mSurrogate != 0) {
                int sec = data[offset++];
                sec = calcSurrogate(sec);
                mOutputPtr = ptr;
                // entity writer works off mOutputPtr, returns the new pointer
                ptr = writeAsEntity(sec);
                --len;
                continue main_loop;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            inner_loop:
            for (int inEnd = offset + max; offset < inEnd; ) {
                int c = data[offset++];
                if (c < 32) {
                    /* Need to quote all white space except for regular
                     * space chars, to preserve them (round-tripping)
                     */
                    // !!! TODO: line counting
                    if (mCheckContent) {
                        if (c != '\n' && c != '\r' && c != '\t'
                            && (!mXml11 || c == 0)) {
                            /* As with other write methods, handler needs to
                             * see up-to-date pointer; and since it may have
                             * flushed the buffer before returning a
                             * replacement, pointer has to be read back.
                             */
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                            mOutputBuffer[ptr++] = (byte) c;
                            continue;
                        }
                    }
                    // fall-through to char entity output
                } else if (c < 0x7F) {
                    if (c != '<' && c != '&' && c != '"') {
                        mOutputBuffer[ptr++] = (byte) c;
                        continue;
                    }
                    // otherwise fall back on quoting
                } else if (c > 0x9F && c <= 0xFF) {
                    mOutputBuffer[ptr++] = (byte) c;
                    continue; // [WSTX-88]
                } else {
                    // Surrogate?
                    if (c >= SURR1_FIRST && c <= SURR2_LAST) {
                        mSurrogate = c;
                        // Last char needs special handling:
                        if (offset == inEnd) {
                            break inner_loop;
                        }
                        c = calcSurrogate(data[offset++]);
                        // Let's fall down to entity output
                    }
                }
                /* Has to be escaped as char entity; as such, also need
                 * to re-calc max. contiguous data we can output
                 */
                mOutputPtr = ptr;
                ptr = writeAsEntity(c);
                max -= (inEnd - offset); // since we didn't loop completely
                break inner_loop;
            }
            len -= max;
        }
        mOutputPtr = ptr;
    }

    /**
     * Writes contents of a CDATA section (without the start and end
     * markers), each character as a single ISO-8859-1 byte.
     *<p>
     * If content checking (<code>mCheckContent</code>) is disabled, content
     * is simply passed to <code>writeRaw</code>. Otherwise each character
     * is verified: control characters below 0x20 other than tab, line feed
     * and carriage return (and, in xml 1.1 mode, characters 0x7F - 0x9F
     * except for 0x85) are passed to <code>handleInvalidChar</code>, and
     * characters above 0xFF are reported via {@link #handleInvalidLatinChar}.
     * An embedded "]]&gt;" is either reported to the caller or, if
     * <code>mFixContent</code> is set, worked around by ending the section
     * after "]]" and starting a new one for the '&gt;'.
     *
     * @param data Content to write
     *
     * @return -1 if all content was written; otherwise index of the first
     *   ']' of an embedded "]]&gt;" that was not fixed. In the latter case
     *   bytes added to the output buffer since <code>mOutputPtr</code> was
     *   last updated are not committed.
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the handlers reports a problem with content
     */
    protected int writeCDataContent(String data)
        throws IOException
    {
        // Note: mSurrogate can not be non-zero at this point, no need to check

        int offset = 0;
        int len = data.length();
        if (!mCheckContent) {
            writeRaw(data, offset, len);
            return -1;
        }
        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;

        main_loop:
        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            for (int inEnd = offset + max; offset < inEnd; ) {
                int c = data.charAt(offset++);
                if (c < 32) {
                    if (c == '\n') {
                        // !!! TBI: line nr
                    } else if (c == '\r') {
                        // !!! TBI: line nr (and skipping \n that may follow)
                    } else if (c != '\t') {
                        /* Handler needs to see up-to-date pointer; and since
                         * it may have flushed the buffer before returning a
                         * replacement, the pointer has to be read back
                         * (otherwise bytes already flushed would get written
                         * for the second time)
                         */
                        mOutputPtr = ptr;
                        c = handleInvalidChar(c);
                        ptr = mOutputPtr;
                    }
                } else if (c > 0x7E) {
                    if (c > 0xFF) {
                        mOutputPtr = ptr;
                        handleInvalidLatinChar(c);
                    } else if (mXml11) {
                        // xml 1.1 restricts 0x7F - 0x9F (inclusive), NEL excepted
                        if (c <= 0x9F && c != 0x85) {
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                        }
                    }
                } else if (c == '>') { // embedded "]]>"?
                    // offset is already past '>', so brackets are at -2 and -3
                    if (offset > 2 && data.charAt(offset-2) == ']'
                        && data.charAt(offset-3) == ']') {
                        if (!mFixContent) {
                            return offset-3;
                        }
                        /* Relatively easy fix; just need to close this
                         * section, and open a new one...
                         */
                        mOutputPtr = ptr;
                        writeCDataEnd();
                        writeCDataStart();
                        writeAscii(BYTE_GT);
                        ptr = mOutputPtr;
                        /* No guarantees there's as much free room in the
                         * output buffer, thus, need to restart loop:
                         */
                        len = data.length() - offset;
                        continue main_loop;
                    }
                }
                mOutputBuffer[ptr++] = (byte) c;
            }
            len -= max;
        }
        mOutputPtr = ptr;
        return -1;
    }

    /**
     * Writes contents of a CDATA section (without the start and end
     * markers), each character as a single ISO-8859-1 byte.
     *<p>
     * If content checking (<code>mCheckContent</code>) is disabled, content
     * is simply passed to <code>writeRaw</code>. Otherwise each character
     * is verified: control characters below 0x20 other than tab, line feed
     * and carriage return (and, in xml 1.1 mode, characters 0x7F - 0x9F
     * except for 0x85) are passed to <code>handleInvalidChar</code>, and
     * characters above 0xFF are reported via {@link #handleInvalidLatinChar}.
     * An embedded "]]&gt;" is either reported to the caller or, if
     * <code>mFixContent</code> is set, worked around by ending the section
     * after "]]" and starting a new one for the '&gt;'.
     *
     * @param cbuf Buffer that contains content to write
     * @param start Index of the first character to write
     * @param len Number of characters to write
     *
     * @return -1 if all content was written; otherwise index (within
     *   <code>cbuf</code>, not relative to <code>start</code>) of the first
     *   ']' of an embedded "]]&gt;" that was not fixed. In the latter case
     *   bytes added to the output buffer since <code>mOutputPtr</code> was
     *   last updated are not committed.
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the handlers reports a problem with content
     */
    protected int writeCDataContent(char[] cbuf, int start, int len)
        throws IOException
    {
        // Note: mSurrogate can not be non-zero at this point, no need to check

        if (!mCheckContent) {
            writeRaw(cbuf, start, len);
            return -1;
        }

        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;
        int offset = start;

        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            inner_loop:
            for (int inEnd = offset + max; offset < inEnd; ) {
                int c = cbuf[offset++];
                if (c < 32) {
                    if (c == '\n') {
                        // !!! TBI: line nr
                    } else if (c == '\r') {
                        // !!! TBI: line nr (and skipping \n that may follow)
                    } else if (c != '\t') {
                        /* Handler needs to see up-to-date pointer; and since
                         * it may have flushed the buffer before returning a
                         * replacement, the pointer has to be read back
                         * (otherwise bytes already flushed would get written
                         * for the second time)
                         */
                        mOutputPtr = ptr;
                        c = handleInvalidChar(c);
                        ptr = mOutputPtr;
                    }
                } else if (c > 0x7E) {
                    if (c > 0xFF) {
                        mOutputPtr = ptr;
                        handleInvalidLatinChar(c);
                    } else if (mXml11) {
                        // xml 1.1 restricts 0x7F - 0x9F (inclusive), NEL excepted
                        if (c <= 0x9F && c != 0x85) {
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                        }
                    }
                } else if (c == '>') { // embedded "]]>"?
                    // offset is already past '>', so brackets are at -2 and -3;
                    // and must not look at anything before 'start'
                    if (offset >= (start+3) && cbuf[offset-2] == ']'
                        && cbuf[offset-3] == ']') {
                        if (!mFixContent) {
                            return offset-3;
                        }
                        /* Relatively easy fix; just need to close this
                         * section, and open a new one...
                         */
                        mOutputPtr = ptr;
                        writeCDataEnd();
                        writeCDataStart();
                        writeAscii(BYTE_GT);
                        ptr = mOutputPtr;
                        /* No guarantees there's as much free room in the
                         * output buffer, thus, need to restart loop
                         * (only counting characters actually handled):
                         */
                        max -= (inEnd - offset);
                        break inner_loop;
                    }
                }
                mOutputBuffer[ptr++] = (byte) c;
            }
            len -= max;
        }
        mOutputPtr = ptr;
        return -1;
    }

    /**
     * Writes contents of a comment (without the start and end markers),
     * each character as a single ISO-8859-1 byte.
     *<p>
     * If content checking (<code>mCheckContent</code>) is disabled, content
     * is simply passed to <code>writeRaw</code>. Otherwise each character
     * is verified: control characters below 0x20 other than tab, line feed
     * and carriage return (and, in xml 1.1 mode, characters 0x7F - 0x9F
     * except for 0x85) are passed to <code>handleInvalidChar</code>, and
     * characters above 0xFF are reported via {@link #handleInvalidLatinChar}.
     * An embedded "--" is either reported to the caller or, if
     * <code>mFixContent</code> is set, broken up by writing a space
     * between the hyphens.
     *
     * @param data Content to write
     *
     * @return -1 if all content was written; otherwise index of the first
     *   hyphen of an embedded "--" that was not fixed. In the latter case
     *   bytes added to the output buffer since <code>mOutputPtr</code> was
     *   last updated are not committed.
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the handlers reports a problem with content
     */
    protected int writeCommentContent(String data)
        throws IOException
    {
        // Note: mSurrogate can not be non-zero at this point, no need to check

        int offset = 0;
        int len = data.length();
        if (!mCheckContent) {
            writeRaw(data, offset, len);
            return -1;
        }

        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;

        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            inner_loop:
            for (int inEnd = offset + max; offset < inEnd; ) {
                int c = data.charAt(offset++);
                if (c < 32) {
                    if (c == '\n') {
                        // !!! TBI: line nr
                    } else if (c == '\r') {
                        // !!! TBI: line nr (and skipping \n that may follow)
                    } else if (c != '\t') {
                        /* Handler needs to see up-to-date pointer; and since
                         * it may have flushed the buffer before returning a
                         * replacement, the pointer has to be read back
                         * (otherwise bytes already flushed would get written
                         * for the second time)
                         */
                        mOutputPtr = ptr;
                        c = handleInvalidChar(c);
                        ptr = mOutputPtr;
                    }
                } else if (c > 0x7E) {
                    if (c > 0xFF) {
                        mOutputPtr = ptr;
                        handleInvalidLatinChar(c);
                    } else if (mXml11) {
                        // xml 1.1 restricts 0x7F - 0x9F (inclusive), NEL excepted
                        if (c <= 0x9F && c != 0x85) {
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                        }
                    }
                } else if (c == '-') { // embedded "--"?
                    // offset is already past this hyphen; previous char is at -2
                    if (offset > 1 && data.charAt(offset-2) == '-') {
                        if (!mFixContent) {
                            return offset-2;
                        }
                        /* Quite easy to fix: just add an extra space
                         * in front. There will be room for that char;
                         * but need to make sure that the following '-'
                         * also fits.
                         */
                        mOutputBuffer[ptr++] = ' ';
                        if (ptr >= mOutputBuffer.length) { // whoops. need to flush
                            mOutputPtr = ptr;
                            flushBuffer();
                            ptr = 0;
                        }
                        mOutputBuffer[ptr++] = BYTE_HYPHEN;
                        /* Also, since we did output an extra char, better
                         * restart the loop (since max calculation is now
                         * off)
                         */
                        max -= (inEnd - offset);
                        break inner_loop;
                    }
                }
                mOutputBuffer[ptr++] = (byte) c;
            }
            len -= max;
        }
        mOutputPtr = ptr;
        return -1;
    }

    /**
     * Writes data part of a processing instruction, each character as
     * a single ISO-8859-1 byte.
     *<p>
     * If content checking (<code>mCheckContent</code>) is disabled, data
     * is simply passed to <code>writeRaw</code>. Otherwise each character
     * is verified: control characters below 0x20 other than tab, line feed
     * and carriage return (and, in xml 1.1 mode, characters 0x7F - 0x9F
     * except for 0x85) are passed to <code>handleInvalidChar</code>, and
     * characters above 0xFF are reported via {@link #handleInvalidLatinChar}.
     * An embedded end marker ("?&gt;") can not be fixed, and is always
     * reported to the caller.
     *
     * @param data Processing instruction data to write
     *
     * @return -1 if all data was written; otherwise index of the '?' of
     *   the first embedded "?&gt;". In the latter case bytes added to the
     *   output buffer since <code>mOutputPtr</code> was last updated are
     *   not committed.
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the handlers reports a problem with content
     * @throws XMLStreamException Declared for the benefit of callers; not
     *   thrown directly by the code of this method
     */
    protected int writePIData(String data)
        throws IOException, XMLStreamException
    {
        // Note: mSurrogate can not be non-zero at this point, no need to check

        int offset = 0;
        int len = data.length();
        if (!mCheckContent) {
            writeRaw(data, offset, len);
            return -1;
        }

        // Work on local copy of the pointer; synced back when others need it
        int ptr = mOutputPtr;
        while (len > 0) {
            int max = mOutputBuffer.length - ptr;
            if (max < 1) { // output buffer full?
                mOutputPtr = ptr;
                flushBuffer();
                ptr = 0;
                max = mOutputBuffer.length;
            }
            // How much can we output?
            if (max > len) {
                max = len;
            }
            for (int inEnd = offset + max; offset < inEnd; ++offset) {
                int c = data.charAt(offset);
                if (c < 32) {
                    if (c == '\n') {
                        // !!! TBI: line nr
                    } else if (c == '\r') {
                        // !!! TBI: line nr (and skipping \n that may follow)
                    } else if (c != '\t') {
                        /* Handler needs to see up-to-date pointer; and since
                         * it may have flushed the buffer before returning a
                         * replacement, the pointer has to be read back
                         * (otherwise bytes already flushed would get written
                         * for the second time)
                         */
                        mOutputPtr = ptr;
                        c = handleInvalidChar(c);
                        ptr = mOutputPtr;
                    }
                } else if (c > 0x7E) {
                    if (c > 0xFF) {
                        mOutputPtr = ptr;
                        handleInvalidLatinChar(c);
                    } else if (mXml11) {
                        // xml 1.1 restricts 0x7F - 0x9F (inclusive), NEL excepted
                        if (c <= 0x9F && c != 0x85) {
                            mOutputPtr = ptr;
                            c = handleInvalidChar(c);
                            ptr = mOutputPtr;
                        }
                    }
                } else if (c == '>') { // enclosed end marker ("?>")?
                    if (offset > 0 && data.charAt(offset-1) == '?') {
                        /* Unlike with loops that advance offset right after
                         * reading, offset still points to '>' here: so the
                         * marker starts at offset-1. (this matters most
                         * when data starts with the marker: result must
                         * then be 0, not the "all ok" value of -1)
                         */
                        return offset-1;
                    }
                }
                mOutputBuffer[ptr++] = (byte) c;
            }
            len -= max;
        }
        mOutputPtr = ptr;
        return -1;
    }

    /**
     * Writes textual content, escaping characters that can not be output
     * as is.
     *<p>
     * Written directly, as single bytes: tab and line feed; carriage return
     * unless <code>mEscapeCR</code> is set; printable ASCII other than
     * '&lt;' and '&amp;' (and '&gt;', when it is the first character of
     * <code>data</code> or follows a ']'); and characters 0xA0 - 0xFF.
     * Everything else goes through <code>writeAsEntity</code>, surrogate
     * pairs after having been combined with <code>calcSurrogate</code>.
     * If data ends with the first half of a surrogate pair, that half is
     * left in <code>mSurrogate</code> for the next call.
     *<p>
     * With content checking (<code>mCheckContent</code>) enabled, other
     * control characters below 0x20 are passed to
     * <code>handleInvalidChar</code> instead of being escaped; in xml 1.1
     * mode this is only done for the null character.
     *
     * @param data Text to write
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the helper methods reports a problem with content
     */
    protected void writeTextContent(String data)
        throws IOException
    {
        final int total = data.length();
        int offset = 0;
        int remaining = total;

        main_loop:
        while (remaining > 0) {
            int room = mOutputBuffer.length - mOutputPtr;
            if (room < 1) { // no space left, have to flush first
                flushBuffer();
                room = mOutputBuffer.length;
            }
            // First half of a surrogate pair pending?
            if (mSurrogate != 0) {
                int second = data.charAt(offset++);
                second = calcSurrogate(second);
                writeAsEntity(second);
                --remaining;
                continue main_loop;
            }
            // Chunk is limited by both free space and remaining input
            final int chunkLen = (room > remaining) ? remaining : room;
            final int chunkEnd = offset + chunkLen;

            while (offset < chunkEnd) {
                int c = data.charAt(offset++);
                if (c < 32) {
                    // TODO: line count
                    if (c == '\n' || c == '\t' || (c == '\r' && !mEscapeCR)) {
                        mOutputBuffer[mOutputPtr++] = (byte) c;
                        continue;
                    }
                    // other control chars are ok in xml1.1 (null excluded), as entity
                    if (c != '\r' && mCheckContent && (!mXml11 || c == 0)) {
                        c = handleInvalidChar(c);
                        mOutputBuffer[mOutputPtr++] = (byte) c;
                        continue;
                    }
                    // \r to escape, xml1.1 control char, or unchecked: entity
                } else if (c < 0x7F) {
                    // '>' only needs quoting after ']'; but is also quoted
                    // when nothing precedes it within data (conservative)
                    final boolean quote = (c == '<') || (c == '&')
                        || (c == '>'
                            && (offset <= 1 || data.charAt(offset-2) == ']'));
                    if (!quote) {
                        mOutputBuffer[mOutputPtr++] = (byte) c;
                        continue;
                    }
                } else if (c <= 0xFF && c > 0x9F) {
                    mOutputBuffer[mOutputPtr++] = (byte) c;
                    continue; // [WSTX-88]
                } else if (c >= SURR1_FIRST && c <= SURR2_LAST) {
                    mSurrogate = c;
                    // Second half is not within this chunk? Handle on next round
                    if (offset == chunkEnd) {
                        break;
                    }
                    c = calcSurrogate(data.charAt(offset++));
                }
                /* Entity takes unknown amount of space: need to start over
                 * to figure out how much contiguous data can be output
                 */
                writeAsEntity(c);
                remaining = total - offset;
                continue main_loop;
            }
            remaining -= chunkLen;
        }
    }

    /**
     * Writes textual content, escaping characters that can not be output
     * as is.
     *<p>
     * Written directly, as single bytes: tab and line feed; carriage return
     * unless <code>mEscapeCR</code> is set; printable ASCII other than
     * '&lt;' and '&amp;' (and '&gt;', when it is preceded by ']' in the
     * buffer, or is at index 0 of the buffer); and characters 0xA0 - 0xFF.
     * Everything else goes through <code>writeAsEntity</code>, surrogate
     * pairs after having been combined with <code>calcSurrogate</code>.
     * If content ends with the first half of a surrogate pair, that half is
     * left in <code>mSurrogate</code> for the next call.
     *<p>
     * With content checking (<code>mCheckContent</code>) enabled, other
     * control characters below 0x20 are passed to
     * <code>handleInvalidChar</code> instead of being escaped; in xml 1.1
     * mode this is only done for the null character.
     *
     * @param cbuf Buffer that contains text to write
     * @param offset Index of the first character to write
     * @param len Number of characters to write
     *
     * @throws IOException If flushing the output buffer fails, or if one of
     *   the helper methods reports a problem with content
     */
    protected void writeTextContent(char[] cbuf, int offset, int len)
        throws IOException
    {
        while (len > 0) {
            int room = mOutputBuffer.length - mOutputPtr;
            if (room < 1) { // no space left, have to flush first
                flushBuffer();
                room = mOutputBuffer.length;
            }
            // First half of a surrogate pair pending?
            if (mSurrogate != 0) {
                int second = cbuf[offset++];
                second = calcSurrogate(second);
                writeAsEntity(second);
                --len;
                continue;
            }
            // Chunk is limited by both free space and remaining input
            final int chunkStart = offset;
            final int chunkEnd = chunkStart + Math.min(room, len);

            while (offset < chunkEnd) {
                int c = cbuf[offset++];
                if (c < 32) {
                    // TODO: line count
                    if (c == '\n' || c == '\t' || (c == '\r' && !mEscapeCR)) {
                        mOutputBuffer[mOutputPtr++] = (byte) c;
                        continue;
                    }
                    // other control chars are ok in xml1.1 (null excluded), as entity
                    if (c != '\r' && mCheckContent && (!mXml11 || c == 0)) {
                        c = handleInvalidChar(c);
                        mOutputBuffer[mOutputPtr++] = (byte) c;
                        continue;
                    }
                    // \r to escape, xml1.1 control char, or unchecked: entity
                } else if (c < 0x7F) {
                    /* Since we can be conservative, it doesn't matter
                     * if the check for preceding ']' is not exact
                     */
                    final boolean quote = (c == '<') || (c == '&')
                        || (c == '>'
                            && (offset <= 1 || cbuf[offset-2] == ']'));
                    if (!quote) {
                        mOutputBuffer[mOutputPtr++] = (byte) c;
                        continue;
                    }
                } else if (c <= 0xFF && c > 0x9F) {
                    mOutputBuffer[mOutputPtr++] = (byte) c;
                    continue; // [WSTX-88]
                } else if (c >= SURR1_FIRST && c <= SURR2_LAST) {
                    mSurrogate = c;
                    // Second half is not within this chunk? Handle on next round
                    if (offset == chunkEnd) {
                        break;
                    }
                    c = calcSurrogate(cbuf[offset++]);
                }
                /* Entity takes unknown amount of space: need to start over
                 * to figure out how much contiguous data can be output
                 */
                writeAsEntity(c);
                break;
            }
            // Whichever way the chunk ended, this is what got consumed
            len -= (offset - chunkStart);
        }
    }

    /*
    ////////////////////////////////////////////////////
    // Internal methods
    ////////////////////////////////////////////////////
     */

    /**
     * Reports a character that can not be expressed in ISO-8859-1 output
     * (callers in this class pass characters above 0xFF). Flushes pending
     * output, and then always fails with an exception.
     *
     * @param c Character that could not be written
     *
     * @throws IOException Always (unless flushing itself fails first, in
     *   which case that failure is what gets thrown)
     */
    protected void handleInvalidLatinChar(int c)
        throws IOException
    {
        // Get everything buffered so far written out; helps with debugging
        flush();

        /* 17-May-2006, TSa: It would be better to throw an
         *   XMLStreamException here, especially to be able to indicate
         *   the actual output location. That is problematic for callers
         *   that can only throw IOExceptions (when invoked via Writer
         *   proxy), though; still unresolved.
         */
        final String hex = Integer.toHexString(c);
        final String msg = "Invalid XML character (0x" + hex
            + "); can only be output using character entity when using ISO-8859-1 encoding";
        throw new IOException(msg);
    }
}