All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fasterxml.aalto.in.Utf8Scanner Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, [email protected]
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.io.*;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;

/**
 * Scanner for tokenizing XML content from a byte stream encoding using
 * UTF-8 encoding, or something suitably close it for decoding purposes
 * (including ISO-Latin1 and US-ASCII).
 */
public final class Utf8Scanner
    extends StreamScanner
{
    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */

    public Utf8Scanner(ReaderConfig cfg, InputStream in,
                       byte[] buffer, int ptr, int last)
    {
        super(cfg, in, buffer, ptr, last);
    }

    /*
    /**********************************************************************
    /* Internal methods, secondary parsing
    /**********************************************************************
     */

    @Override
    protected final void finishToken() throws XMLStreamException
    {
        _tokenIncomplete = false;
        switch (_currToken) {
        case PROCESSING_INSTRUCTION:
            finishPI();
            break;
        case CHARACTERS:
            finishCharacters();
            break;
        case COMMENT:
            finishComment();
            break;
        case SPACE:
            finishSpace();
            break;
        case DTD:
            finishDTD(true); // true -> get text
            break;
        case CDATA:
            finishCData();
            break;
        default:
            ErrorConsts.throwInternalError();
        }
    }
    
    @Override
    protected int handleStartElement(byte b)
        throws XMLStreamException
    {
        _currToken = START_ELEMENT;
        _currNsCount = 0;
        PName elemName = parsePName(b);

        /* Ok. Need to create a qualified name. Simplest for element
         * in default ns (no extra work -- expressed as null binding);
         * otherwise need to find binding
         */
        String prefix = elemName.getPrefix();
        boolean allBound; // flag to check 'late' bindings

        if (prefix == null) { // element in default ns
            allBound = true; // which need not be bound
        } else {
            elemName = bindName(elemName, prefix);
            allBound = elemName.isBound();
        }

        _tokenName = elemName;
        _currElem = new ElementScope(elemName, _currElem);

        // And then attribute parsing loop:
        int attrPtr = 0;

        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            b = _inputBuffer[_inputPtr++];
            int c = (int) b & 0xFF;
            // Intervening space to skip?
            if (c <= INT_SPACE) {
                do {
                    if (c == INT_LF) {
                        markLF();
                    } else if (c == INT_CR) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        if (_inputBuffer[_inputPtr] == BYTE_LF) {
                            ++_inputPtr;
                        }
                        markLF();
                    } else if (c != INT_SPACE && c != INT_TAB) {
                        throwInvalidSpace(c);
                    }
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = _inputBuffer[_inputPtr++];
                    c = (int) b & 0xFF;
                } while (c <= INT_SPACE);
            } else if (c != INT_SLASH && c != INT_GT) {
                c = decodeCharForError(b);
                throwUnexpectedChar(c, " expected space, or '>' or \"/>\"");
            }

            // Ok; either need to get an attribute name, or end marker:
            if (c == INT_SLASH) {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                b = _inputBuffer[_inputPtr++];
                if (b != BYTE_GT) {
                    c = decodeCharForError(b);
                    throwUnexpectedChar(c, " expected '>'");
                }
                _isEmptyTag = true;
                break;
            } else if (c == INT_GT) {
                _isEmptyTag = false;
                break;
            } else if (c == INT_LT) {
                reportInputProblem("Unexpected '<' character in element (missing closing '>'?)");
            }

            // Ok, an attr name:
            PName attrName = parsePName(b);
            prefix = attrName.getPrefix();

            boolean isNsDecl;

            if (prefix == null) { // can be default ns decl:
                isNsDecl = (attrName.getLocalName() == "xmlns");
            } else {
                // May be a namespace decl though?
                if (prefix == "xmlns") {
                    isNsDecl = true;
                } else {
                    attrName = bindName(attrName, prefix);
                    if (allBound) {
                        allBound = attrName.isBound();
                    }
                    isNsDecl = false;
                }
            }

            // Optional space to skip again
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                b = _inputBuffer[_inputPtr++];
                c = (int) b & 0xFF;
                if (c > INT_SPACE) {
                    break;
                }
                if (c == INT_LF) {
                    markLF();
                } else if (c == INT_CR) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    if (_inputBuffer[_inputPtr] == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                } else if (c != INT_SPACE && c != INT_TAB) {
                    throwInvalidSpace(c);
                }
            }

            if (c != INT_EQ) {
                c = decodeCharForError(b);
                throwUnexpectedChar(c, " expected '='");
            }

            // Optional space to skip again
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                b = _inputBuffer[_inputPtr++];
                c = (int) b & 0xFF;
                if (c > INT_SPACE) {
                    break;
                }
                if (c == INT_LF) {
                    markLF();
                } else if (c == INT_CR) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    if (_inputBuffer[_inputPtr] == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                } else if (c != INT_SPACE && c != INT_TAB) {
                    throwInvalidSpace(c);
                }
            }

            if (c != INT_QUOTE && c != INT_APOS) {
                c = decodeCharForError(b);
                throwUnexpectedChar(c, " Expected a quote");
            }

            /* Ok, finally: value parsing. However, ns URIs are to be handled
             * different from attribute values... let's offline URIs, since
             * they should be less common than attribute values.
             */
            if (isNsDecl) { // default ns, or explicit?
                handleNsDeclaration(attrName, b);
                ++_currNsCount;
            } else { // nope, a 'real' attribute:
                attrPtr = collectValue(attrPtr, b, attrName);
            }
        }
        {
            // Note: this call also checks attribute uniqueness
            int act = _attrCollector.finishLastValue(attrPtr);
            if (act < 0) { // error, dup attr indicated by -1
                act = _attrCollector.getCount(); // let's get correct count
                reportInputProblem(_attrCollector.getErrorMsg());
            }
            _attrCount = act;
        }
        ++_depth;

        /* Was there any prefix that wasn't bound prior to use?
         * That's legal, assuming declaration was found later on...
         * let's check
         */
        if (!allBound) {
            if (!elemName.isBound()) { // element itself unbound
                reportUnboundPrefix(_tokenName, false);
            }
            for (int i = 0, len = _attrCount; i < len; ++i) {
                PName attrName = _attrCollector.getName(i);
                if (!attrName.isBound()) {
                    reportUnboundPrefix(attrName, true);
                }
            }
        }
        return START_ELEMENT;
    }

    /**
     * This method implements the tight loop for parsing attribute
     * values. It's off-lined from the main start element method to
     * simplify main method, which makes code more maintainable
     * and possibly easier for JIT/HotSpot to optimize.
     */
    private final int collectValue(int attrPtr, byte quoteByte, PName attrName)
        throws XMLStreamException
    {
        char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr);
        final int[] TYPES = _charTypes.ATTR_CHARS;
        final int quoteChar = (int) quoteByte;
        
        value_loop:
        while (true) {
            int c;

            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (attrPtr >= attrBuffer.length) {
                    attrBuffer = _attrCollector.valueBufferFull();
                }
                int max = _inputEnd;
                {
                    int max2 = ptr + (attrBuffer.length - attrPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) _inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    attrBuffer[attrPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }
            
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                // fall through
            case XmlCharTypes.CT_WS_LF:
                markLF();
                // fall through
            case XmlCharTypes.CT_WS_TAB:
                // Plus, need to convert these all to simple space
                c = INT_SPACE;
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                if (attrPtr >= attrBuffer.length) {
                    attrBuffer = _attrCollector.valueBufferFull();
                }
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_LT:
                throwUnexpectedChar(c, "'<' not allowed in attribute value");
            case XmlCharTypes.CT_AMP:
                c = handleEntityInText(false);
                if (c == 0) { // unexpanded general entity... not good
                    reportUnexpandedEntityInAttr(attrName, false);
                }
                // Ok; does it need a surrogate though? (over 16 bits)
                if ((c >> 16) != 0) {
                    c -= 0x10000;
                    attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
                    c = 0xDC00 | (c & 0x3FF);
                    if (attrPtr >= attrBuffer.length) {
                        attrBuffer = _attrCollector.valueBufferFull();
                    }
                }
                break;
            case XmlCharTypes.CT_ATTR_QUOTE:
                if (c == quoteChar) {
                    break value_loop;
                }
                
                // default:
                // Other chars are not important here...
            }
            // We know there's room for at least one char without checking
            attrBuffer[attrPtr++] = (char) c;
        }

        return attrPtr;
    }

    /**
     * Method called from the main START_ELEMENT handling loop, to
     * parse namespace URI values.
     */
    private void handleNsDeclaration(PName name, byte quoteByte)
        throws XMLStreamException
    {
        int attrPtr = 0;
        char[] attrBuffer = _nameBuffer;

        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            byte b = _inputBuffer[_inputPtr++];
            if (b == quoteByte) {
                break;
            }
            int c;
            if (b == BYTE_AMP) { // entity
                c = handleEntityInText(false);
                if (c == 0) { // general entity; should never happen
                    reportUnexpandedEntityInAttr(name, true);
                }
                // Ok; does it need a surrogate though? (over 16 bits)
                if ((c >> 16) != 0) {
                    if (attrPtr >= attrBuffer.length) {
                        _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
                    }
                    c -= 0x10000;
                    attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
                    c = 0xDC00 | (c & 0x3FF);
                }
            } else if (b == BYTE_LT) { // error
                c = (int) b;
                throwUnexpectedChar(c, "'<' not allowed in attribute value");
            } else {
                c = (int) b & 0xFF;
                if (c < INT_SPACE) {
                    if (c == INT_LF) {
                        markLF();
                    } else if (c == INT_CR) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        if (_inputBuffer[_inputPtr] == BYTE_LF) {
                            ++_inputPtr;
                        }
                        markLF();
                    } else {
                        if (c < 0) {
                            c = decodeMultiByteChar(c, _inputPtr);
                            if (c < 0) { // surrogate pair
                                c = -c;
                                // Let's add first part right away:
                                if (attrPtr >= attrBuffer.length) {
                                    _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
                                }
                                c -= 0x10000;
                                attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
                                c = 0xDC00 | (c & 0x3FF);
                            }
                        } else if (c != INT_TAB) {
                            throwInvalidSpace(c);
                        }
                    }
                }
            }
            if (attrPtr >= attrBuffer.length) {
                _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
            }
            attrBuffer[attrPtr++] = (char) c;
        }

        /* Simple optimization: for default ns removal (or, with
         * ns 1.1, any other as well), will use empty value... no
         * need to try to intern:
         */
        if (attrPtr == 0) {
            bindNs(name, "");
        } else {
            String uri = _config.canonicalizeURI(attrBuffer, attrPtr);
            bindNs(name, uri);
        }
    }

    /**
     * Method called when an ampersand is encounter in text segment.
     * Method needs to determine whether it is a pre-defined or character
     * entity (in which case it will be expanded into a single char or
     * surrogate pair), or a general
     * entity (in which case it will most likely be returned as
     * ENTITY_REFERENCE event)
     *
     * @param inAttr True, if reference is from attribute value; false
     *   if from normal text content
     *
     * @return 0 if a general parsed entity encountered; integer 
     *    value of a (valid) XML content character otherwise
     */
    @Override
    protected final int handleEntityInText(boolean inAttr)
        throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        byte b = _inputBuffer[_inputPtr++];
        if (b == BYTE_HASH) {
            return handleCharEntity();
        }
        String start;
        if (b == BYTE_a) { // amp or apos?
            b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
            if (b == BYTE_m) { // amp?
                b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                if (b == BYTE_p) {
                    b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                    if (b == BYTE_SEMICOLON) {
                        return INT_AMP;
                    }
                    start = "amp";
                } else {
                    start = "am";
                }
            } else if (b == BYTE_p) { // apos?
                b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                if (b == BYTE_o) {
                    b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                    if (b == BYTE_s) {
                        b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                        if (b == BYTE_SEMICOLON) {
                            return INT_APOS;
                        }
                        start = "apos";
                    } else {
                        start = "apo";
                    }
                } else {
                    start = "ap";
                }
            } else {
                start = "a";
            }
        } else if (b == BYTE_l) { // lt?
            b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
            if (b == BYTE_t) {
                b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                if (b == BYTE_SEMICOLON) {
                    return INT_LT;
                }
                start = "lt";
            } else {
                start = "l";
            }
        } else if (b == BYTE_g) { // gt?
            b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
            if (b == BYTE_t) {
                b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                if (b == BYTE_SEMICOLON) {
                    return INT_GT;
                }
                start = "gt";
            } else {
                start = "g";
            }
        } else if (b == BYTE_q) { // quot?
            b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
            if (b == BYTE_u) {
                b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                if (b == BYTE_o) {
                    b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                    if (b == BYTE_t) {
                        b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
                        if (b == BYTE_SEMICOLON) {
                            return INT_QUOTE;
                        }
                        start = "quot";
                    } else {
                        start = "quo";
                    }
                } else {
                    start = "qu";
                }
            } else {
                start = "q";
            }
        } else {
            start = "";
        }

        final int[] TYPES = _charTypes.NAME_CHARS;

        /* All righty: we have the beginning of the name, plus the first
         * byte too. So let's see what we can do with it.
         */
        char[] cbuf = _nameBuffer;
        int cix = 0;
        for (int len = start.length(); cix < len; ++cix) {
            cbuf[cix] = start.charAt(cix);
        }
        //int colon = -1;
        while (b != BYTE_SEMICOLON) {
            boolean ok;
            int c = (int) b & 0xFF;

            // Has to be a valid name start char though:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_NAME_NONE:
            case XmlCharTypes.CT_NAME_COLON: // not ok for entities?
            case XmlCharTypes.CT_NAME_NONFIRST:
                ok = (cix > 0);
                break;
            case XmlCharTypes.CT_NAME_ANY:
                ok = true;
                break;
            case InputCharTypes.CT_INPUT_NAME_MB_2:
                c = decodeUtf8_2(c);
                ok = XmlChars.is10NameStartChar(c);
                break;
            case InputCharTypes.CT_INPUT_NAME_MB_3:
                c = decodeUtf8_3(c);
                ok = XmlChars.is10NameStartChar(c);
                break;
            case InputCharTypes.CT_INPUT_NAME_MB_4:
                c = decodeUtf8_4(c);
                ok = XmlChars.is10NameStartChar(c);
                if (ok) {
                    if (cix >= cbuf.length) {
                        _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
                    }
                    // Let's add first part right away:
                    c -= 0x10000;
                    cbuf[cix++] = (char) (0xD800 | (c >> 10));
                    c = 0xDC00 | (c & 0x3FF);
                }
                break;
            case InputCharTypes.CT_INPUT_NAME_MB_N:
            default:
                ok = false;
                break;
            }
            if (!ok) {
                reportInvalidNameChar(c, cix);
            }
            if (cix >= cbuf.length) {
                _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
            }
            cbuf[cix++] = (char) c;
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            b = _inputBuffer[_inputPtr++];
        }

        // Ok, let's construct a (temporary) entity name, then:
        String pname = new String(cbuf, 0, cix);
        // (note: hash is dummy... not to be compared to anything etc)
        _tokenName = new PNameC(pname, null, pname, 0);

        /* One more thing: do we actually allow entities in this mode
         * and with this event?
         */
        if (_config.willExpandEntities()) {
            reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented");
        }
        if (inAttr) {
            reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it");
        }
        return 0;
    }

    /*
    /**********************************************************************
    /* Internal methods, name parsing:
    /**********************************************************************
     */

    /**
     * Parsing of public ids is bit more complicated than that of system
     * ids, since white space is to be coalesced.
     */
    @Override
    protected String parsePublicId(byte quoteChar) throws XMLStreamException
    {
        char[] outputBuffer = _nameBuffer;
        int outPtr = 0;
        final int[] TYPES = XmlCharTypes.PUBID_CHARS;
        boolean addSpace = false;

        main_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // Easier to check without char type table, first:
            byte b = _inputBuffer[_inputPtr++];
            if (b == quoteChar) {
                break main_loop;
            }
            int c = (int) b & 0xFF;
            if (TYPES[c] != XmlCharTypes.PUBID_OK) {
                throwUnexpectedChar(c, " in public identifier");
            }

            // White space? Needs to be coalecsed
            if (c <= INT_SPACE) {
                addSpace = true;
                continue;
            }
            if (addSpace) {
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                outputBuffer[outPtr++] = ' ';
                addSpace = false;
            }
            if (outPtr >= outputBuffer.length) {
                _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
                outPtr = 0;
            }
            outputBuffer[outPtr++] = (char) c;
        }
        return new String(outputBuffer, 0, outPtr);
    }

    @Override
    protected String parseSystemId(byte quoteChar) throws XMLStreamException
    {
        // caller has init'ed the buffer...
        char[] outputBuffer = _nameBuffer;
        int outPtr = 0;
        // attribute types are closest matches, so let's use them
        final int[] TYPES = _charTypes.ATTR_CHARS;
        //boolean spaceToAdd = false;

        main_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            int c = (int) _inputBuffer[_inputPtr++] & 0xFF;
            if (TYPES[c] != 0) {
                switch (TYPES[c]) {
                case XmlCharTypes.CT_INVALID:
                    c = handleInvalidXmlChar(c);
                case XmlCharTypes.CT_WS_CR:
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    if (_inputBuffer[_inputPtr] == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                    c = INT_LF;
                    break;
                case XmlCharTypes.CT_WS_LF:
                    markLF();
                    break;
                case XmlCharTypes.CT_MULTIBYTE_2:
                    c = decodeUtf8_2(c);
                    break;
                case XmlCharTypes.CT_MULTIBYTE_3:
                    c = decodeUtf8_3(c);
                    break;
                case XmlCharTypes.CT_MULTIBYTE_4:
                    c = decodeUtf8_4(c);
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    // Let's add first part right away:
                    outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                    c = 0xDC00 | (c & 0x3FF);
                    // And let the other char output down below
                    break;
                case XmlCharTypes.CT_MULTIBYTE_N:
                    reportInvalidInitial(c);

                case XmlCharTypes.CT_ATTR_QUOTE:
                    if (c == (int) quoteChar) {
                        break main_loop;
                    }
                }

            }
            
            if (outPtr >= outputBuffer.length) {
                _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
                outPtr = 0;
            }
            outputBuffer[outPtr++] = (char) c;
        }
        return new String(outputBuffer, 0, outPtr);
    }

    /*
    /**********************************************************************
    /* Content skipping
    /**********************************************************************
     */

    @Override
    protected final boolean skipCharacters() throws XMLStreamException
    {
        final int[] TYPES = _charTypes.TEXT_CHARS;
        final byte[] inputBuffer = _inputBuffer;

        while (true) {
            int c;

            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                int max = _inputEnd;
                if (ptr >= max) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                    max = _inputEnd;
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                }
                _inputPtr = ptr;
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                skipUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                skipUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                skipUtf8_4(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_LT:
                --_inputPtr;
                return false;
            case XmlCharTypes.CT_AMP:
                c = handleEntityInText(false);
                if (c == 0) { // unexpandable general parsed entity
                    return true;
                }
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // Let's then just count number of brackets --
                    // in case they are not followed by '>'
                    int count = 1;
                    byte b;
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        b = inputBuffer[_inputPtr];
                        if (b != BYTE_RBRACKET) {
                            break;
                        }
                        ++_inputPtr; // to skip past bracket
                        ++count;
                    }
                    if (b == BYTE_GT && count > 1) {
                        reportIllegalCDataEnd();
                    }
                }
                break;

                // default:
                    // Other types are not important here...
            }
        }
    }

    @Override
    protected final void skipComment() throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;

        while (true) {
            int c;

            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                int max = _inputEnd;
                if (ptr >= max) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                    max = _inputEnd;
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                }
                _inputPtr = ptr;
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                skipUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                skipUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                skipUtf8_4(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_HYPHEN: // '-->'?
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then
                    ++_inputPtr;
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    if (_inputBuffer[_inputPtr++] != BYTE_GT) {
                        reportDoubleHyphenInComments();
                    }
                    return;
                }
                break;
                
            // default:
                // Other types are not important here...
            }
        }
    }

    @Override
    protected final void skipCData() throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;

        while (true) {
            int c;

            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                int max = _inputEnd;
                if (ptr >= max) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                    max = _inputEnd;
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                }
                _inputPtr = ptr;
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                skipUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                skipUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                skipUtf8_4(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // end is nigh?
                    int count = 0;
                    byte b;

                    do {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        ++count;
                        b = _inputBuffer[_inputPtr++];
                    } while (b == BYTE_RBRACKET);

                    if (b == BYTE_GT) {
                        if (count > 1) { // gotcha
                            return;
                        }
                        // can still skip plain ']>'...
                    } else {
                        --_inputPtr; // need to push back last char
                    }
                }
                break;
                
            // default:
                // Other types are not important here...
            }
        }
    }

    @Override
    protected final void skipPI() throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;

        while (true) {
            int c;

            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                int max = _inputEnd;
                if (ptr >= max) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                    max = _inputEnd;
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                }
                _inputPtr = ptr;
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                skipUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                skipUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                skipUtf8_4(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_QMARK: // '?>'?
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == BYTE_GT) {
                    ++_inputPtr;
                    return;
                }
                break;
                
            // default:
                // Other types are not important here...
            }
        }
    }

    @Override
    protected final void skipSpace() throws XMLStreamException
    {
        // mTmpChar has a space, but it's been checked, can ignore
        int ptr = _inputPtr;

        while (true) {
            if (ptr >= _inputEnd) {
                if (!loadMore()) {
                    break;
                }
                ptr = _inputPtr;
            }
            int c = (int) _inputBuffer[ptr] & 0xFF;
            if (c > INT_SPACE) { // !!! TODO: xml 1.1 ws
                break;
            }
            ++ptr;

            if (c == INT_LF) {
                markLF(ptr);
            } else if (c == INT_CR) {
                if (ptr >= _inputEnd) {
                    if (!loadMore()) {
                        break;
                    }
                    ptr = _inputPtr;
                }
                if (_inputBuffer[ptr] == BYTE_LF) {
                    ++ptr;
                }
                markLF(ptr);
            } else if (c != INT_SPACE && c != INT_TAB) {
                _inputPtr = ptr;
                throwInvalidSpace(c);
            }
        }

        _inputPtr = ptr;
    }

    /*
    private final int skipMultiByteChar(int c, int ptr)
        throws XMLStreamException
    {
        int needed;

        // Ok; if we end here, we got multi-byte combination
        if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
            needed = 1;
        } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
            needed = 2;
        } else if ((c & 0xF8) == 0xF0) {
            // 4 bytes; double-char with surrogates and all...
            needed = 3;
        } else {
            reportInvalidInitial(c & 0xFF);
            needed = 1; // never gets here
        }
        
        if (ptr >= _inputEnd) {
            loadMoreGuaranteed();
            ptr = _inputPtr;
        }
        c = (int) _inputBuffer[ptr++];

        if ((c & 0xC0) != 0x080) {
            reportInvalidOther(c & 0xFF, ptr);
        }
        
        if (needed > 1) { // needed == 1 means 2 bytes total
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            c = (int) _inputBuffer[ptr++];

            if ((c & 0xC0) != 0x080) {
                reportInvalidOther(c & 0xFF, ptr);
            }
            if (needed > 2) { // 4 bytes? (need surrogates)
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                c = (int) _inputBuffer[ptr++];

                if ((c & 0xC0) != 0x080) {
                    reportInvalidOther(c & 0xFF, ptr);
                }
            }
        }
        return ptr;
    }

    private final int skipMultiByteChar(int c, int type, int ptr)
        throws XMLStreamException
    {
        type -= XmlCharTypes.CT_MULTIBYTE_N; // number of more bytes needed
        
        if (ptr >= _inputEnd) {
            loadMoreGuaranteed();
            ptr = _inputPtr;
        }
        c = (int) _inputBuffer[ptr++];
        if ((c & 0xC0) != 0x080) {
            reportInvalidOther(c & 0xFF, ptr);
        }
        if (type > 1) { // needed == 1 means 2 bytes total
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            c = (int) _inputBuffer[ptr++];
            if ((c & 0xC0) != 0x080) {
                reportInvalidOther(c & 0xFF, ptr);
            }
            if (type > 2) { // 4 bytes? (need surrogates)
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                c = (int) _inputBuffer[ptr++];
                if ((c & 0xC0) != 0x080) {
                    reportInvalidOther(c & 0xFF, ptr);
                }
            }
        }
        return ptr;
    }
    */

    private final void skipUtf8_2(int c)
        throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = (int) _inputBuffer[_inputPtr++];
        if ((c & 0xC0) != 0x080) {
            reportInvalidOther(c & 0xFF, _inputPtr);
        }
    }

    /* Alas, can't heavily optimize skipping, since we still have to
     * do validity checks...
     */
    private final void skipUtf8_3(int c)
        throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c &= 0x0F;
        if (c >= 0xD) { // have to check
            c <<= 6;
            int d = (int) _inputBuffer[_inputPtr++];
            if ((d & 0xC0) != 0x080) {
                reportInvalidOther(d & 0xFF, _inputPtr);
            }
            c |= (d & 0x3F);
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            d = (int) _inputBuffer[_inputPtr++];
            if ((d & 0xC0) != 0x080) {
                reportInvalidOther(d & 0xFF, _inputPtr);
            }
            c = (c << 6) | (d & 0x3F);
            // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal
            if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF
                if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) {
                    c = handleInvalidXmlChar(c);
                }
            }
        } else { // no checks, can discard
            c = (int) _inputBuffer[_inputPtr++];
            if ((c & 0xC0) != 0x080) {
                reportInvalidOther(c & 0xFF, _inputPtr);
            }
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = (int) _inputBuffer[_inputPtr++];
            if ((c & 0xC0) != 0x080) {
                reportInvalidOther(c & 0xFF, _inputPtr);
            }
        }
    }

    private final void skipUtf8_4(int c) throws XMLStreamException
    {
        if ((_inputPtr + 4) > _inputEnd) {
            skipUtf8_4Slow(c);
            return;
        }
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
    }

    private final void skipUtf8_4Slow(int c) throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
    }

    /*
    /**********************************************************************
    /* Content parsing
    /**********************************************************************
     */

    @Override
    protected final void finishCData()
        throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;
        char[] outputBuffer = _textBuilder.resetWithEmpty();
        int outPtr = 0;

        /* At this point, space (if any) has been skipped, and we are
         * to parse and store the contents
         */
        main_loop:
        while (true) {
            int c;
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }
            // And then exceptions:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
                /* Ok: let's just parse all consequtive right brackets,
                 * and see if followed by greater-than char. This because
                 * we can only push back at most one char at a time, and
                 * thus can't easily just check a subset
                 */
                int count = 0; // ignoring first one
                byte b;

                do {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = _inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr;
                    ++count;
                } while (true);

                // Was the marker found?
                boolean ok = (b == BYTE_GT && count >= 1);
                if (ok) {
                    --count;
                }
                // Brackets to copy to output?
                for (; count > 0; --count) {
                    outputBuffer[outPtr++] = ']';
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                }
                if (ok) {
                    ++_inputPtr; // to consume '>'
                    break main_loop;
                }
                break;
            }
            // Ok, can output the char; there's room for one char at least
            outputBuffer[outPtr++] = (char) c;
        }

        _textBuilder.setCurrentLength(outPtr);
        /* 03-Feb-2009, tatu: To support coalescing mode, may need to
         *   do some extra work
         */
        if (_cfgCoalescing && !_entityPending) {
            finishCoalescedText();
        }
    }

    @Override
    protected final void finishCharacters() throws XMLStreamException
    {
        int outPtr;
        int c;
        char[] outputBuffer;

        // Ok, so what was the first char / entity?
        c = _tmpChar;
        if (c < 0) { // from entity; can just copy as is
            c = -c;
            outputBuffer = _textBuilder.resetWithEmpty();
            outPtr = 0;
            if ((c >> 16) != 0) { // surrogate pair?
                c -= 0x10000;
                /* Note: after resetting the buffer, it's known to have
                 * space for more than 2 chars we need to add
                 */
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
            }
            outputBuffer[outPtr++] = (char) c;
        } else { // white space that we are interested in?
            if (c == INT_CR || c == INT_LF) {
                ++_inputPtr; // wasn't advanced yet, in this case
                outPtr = checkInTreeIndentation(c);
                if (outPtr < 0) {
                    return;
                }
                // Above call also initializes the text builder appropriately
                outputBuffer = _textBuilder.getBufferWithoutReset();
            } else {
                outputBuffer = _textBuilder.resetWithEmpty();
                outPtr = 0;
            }
        }

        final int[] TYPES = _charTypes.TEXT_CHARS;
        final byte[] inputBuffer = _inputBuffer;

        main_loop:
        while (true) {
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }
            // And then fallback for funny chars / UTF-8 multibytes:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) >= 2) {
                    c = decodeUtf8_3fast(c);
                } else {
                    c = decodeUtf8_3(c);
                }
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_LT:
                --_inputPtr;
                break main_loop;
            case XmlCharTypes.CT_AMP:
                c = handleEntityInText(false);
                if (c == 0) { // unexpandable general parsed entity
                    // _inputPtr set by entity expansion method
                    _entityPending = true;
                    break main_loop;
                }
                // Ok; does it need a surrogate though? (over 16 bits)
                if ((c >> 16) != 0) {
                    c -= 0x10000;
                    outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                    // Need to ensure room for one more char
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    c = 0xDC00 | (c & 0x3FF);
                }
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // Let's then just count number of brackets --
                    // in case they are not followed by '>'
                    int count = 1;
                    byte b;
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        b = inputBuffer[_inputPtr];
                        if (b != BYTE_RBRACKET) {
                            break;
                        }
                        ++_inputPtr; // to skip past bracket
                        ++count;
                    }
                    if (b == BYTE_GT && count > 1) {
                        reportIllegalCDataEnd();
                    }
                    // Nope. Need to output all brackets, then; except
                    // for one that can be left for normal output
                    while (count > 1) {
                        outputBuffer[outPtr++] = ']';
                        // Need to ensure room for one more char
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        --count;
                    }
                }
                // Can just output the first ']' along normal output
                break;
                
            // default:
                // Other types are not important here...
            }
            // We know there's room for one more:
            outputBuffer[outPtr++] = (char) c;
        }

        _textBuilder.setCurrentLength(outPtr);

        /* 03-Feb-2009, tatu: To support coalescing mode, may need to
         *   do some extra work
         */
        if (_cfgCoalescing && !_entityPending) {
            finishCoalescedText();
        }
    }

    @Override
    protected final void finishComment() throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;
        char[] outputBuffer = _textBuilder.resetWithEmpty();
        int outPtr = 0;

        main_loop:
        while (true) {
            int c;
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_HYPHEN: // '-->'?
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then
                    ++_inputPtr;
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    if (_inputBuffer[_inputPtr++] != BYTE_GT) {
                        reportDoubleHyphenInComments();
                    }
                    break main_loop;
                }
                break;
            // default:
                // Other types are not important here...
            }

            // Ok, can output the char (we know there's room for one more)
            outputBuffer[outPtr++] = (char) c;
        }
        _textBuilder.setCurrentLength(outPtr);
    }

    /**
     * When this method gets called we know that we have an internal subset,
     * and that the opening '[' has already been read.
     */
    @Override
    protected final void finishDTD(boolean copyContents) throws XMLStreamException
    {
        char[] outputBuffer = copyContents ? _textBuilder.resetWithEmpty() : null;
        int outPtr = 0;

        final int[] TYPES = _charTypes.DTD_CHARS;
        boolean inDecl = false; // in declaration/directive?
        int quoteChar = 0; // inside quoted string?

        main_loop:
        while (true) {
            int c;

            /* First we'll have a quickie loop for speeding through
             * uneventful chars...
             */
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                int max = _inputEnd;
                if (outputBuffer != null) {
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    {
                        int max2 = ptr + (outputBuffer.length - outPtr);
                        if (max2 < max) {
                            max = max2;
                        }
                    }
                }
                while (ptr < max) {
                    c = (int) _inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    if (outputBuffer != null) {
                        outputBuffer[outPtr++] = (char) c;
                    }
                }
                _inputPtr = ptr;
            }

            switch (TYPES[c]) {

                // First, common types

            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                if (outputBuffer != null) {
                    // Let's add first part right away:
                    outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                    c = 0xDC00 | (c & 0x3FF);
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    // And let the other char output down below
                }
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);

                // Then DTD-specific types:

            case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
                if (quoteChar == 0) {
                    quoteChar = c;
                } else {
                    if (quoteChar == c) {
                        quoteChar = 0;
                    }
                }
                break;

            case XmlCharTypes.CT_DTD_LT:
                if (!inDecl) {
                    inDecl = true;
                }
                break;
            case XmlCharTypes.CT_DTD_GT:
                if (quoteChar == 0) {
                    inDecl = false;
                }
                break;
            case XmlCharTypes.CT_DTD_RBRACKET:
                if (!inDecl && quoteChar == 0) {
                    break main_loop;
                }
                break;
                
            // default:
                // Other types are not important here...
            }

            if (outputBuffer != null) { // will have room for one more
                outputBuffer[outPtr++] = (char) c;
            }
        }
        if (outputBuffer != null) {
            _textBuilder.setCurrentLength(outPtr);
        }

        // but still need to match the '>'...
        byte b = skipInternalWs(false, null);
        if (b != BYTE_GT) {
            throwUnexpectedChar(decodeCharForError(b), " expected '>' after the internal subset");
        }
    }

    @Override
    protected final void finishPI() throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;
        char[] outputBuffer = _textBuilder.resetWithEmpty();
        int outPtr = 0;

        /* At this point, space (if any) has been skipped, and we are
         * to parse and store the contents
         */
        main_loop:
        while (true) {
            int c;
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }
            // And then exceptions:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_QMARK:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == BYTE_GT) { // ok, the end!
                    ++_inputPtr;
                    break main_loop;
                }
                // Not end mark, just need to reprocess the second char
            // default:
                // Other types are not important here...
            }
            // Ok, can output the char (we know there's room for one more)
            outputBuffer[outPtr++] = (char) c;
        }
        _textBuilder.setCurrentLength(outPtr);
    }

    /**
     * Note: this method is only called in cases where it is known
     * that only space chars are legal. Thus, encountering a non-space
     * is an error (WFC or VC). However, an end-of-input is ok.
     */
    @Override
    protected final void finishSpace() throws XMLStreamException
    {
        /* Ok: so, mTmpChar contains first space char. If it looks
         * like indentation, we can probably optimize a bit...
         */
        int tmp = _tmpChar;
        char[] outputBuffer;
        int outPtr;

        if (tmp == BYTE_CR || tmp == BYTE_LF) {
            outPtr = checkPrologIndentation(tmp);
            if (outPtr < 0) {
                return;
            }
            // Above call also initializes the text builder appropriately
            outputBuffer = _textBuilder.getBufferWithoutReset();
        } else {
            outputBuffer = _textBuilder.resetWithEmpty();
            outputBuffer[0] = (char) tmp;
            outPtr = 1;
        }

        int ptr = _inputPtr;

        while (true) {
            if (ptr >= _inputEnd) {
                if (!loadMore()) {
                    break;
                }
                ptr = _inputPtr;
            }
            int c = (int) _inputBuffer[ptr] & 0xFF;
            // !!! TODO: check for xml 1.1 whitespace?
            if (c > INT_SPACE) {
                break;
            }
            ++ptr;

            if (c == INT_LF) {
                markLF(ptr);
            } else if (c == INT_CR) {
                if (ptr >= _inputEnd) {
                    if (!loadMore()) { // still need to output the lf
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        outputBuffer[outPtr++] = '\n';
                        break;
                    }
                    ptr = _inputPtr;
                }
                if (_inputBuffer[ptr] == BYTE_LF) {
                    ++ptr;
                }
                markLF(ptr);
                c = INT_LF; // need to convert to canonical lf
            } else if (c != INT_SPACE && c != INT_TAB) {
                _inputPtr = ptr;
                throwInvalidSpace(c);
            }

            // Ok, can output the char
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            outputBuffer[outPtr++] = (char) c;
        }

        _inputPtr = ptr;
        _textBuilder.setCurrentLength(outPtr);
    }

    /*
    /**********************************************************************
    /* 2nd level parsing/skipping for coalesced text
    /**********************************************************************
     */

    /**
     * Method that gets called after a primary text segment (of type
     * CHARACTERS or CDATA, not applicable to SPACE) has been read in
     * text buffer. Method has to see if the following event would
     * be textual as well, and if so, read it (and any other following
     * textual segments).
     */
    protected final void finishCoalescedText()
        throws XMLStreamException
    {
        while (true) {
            // no matter what, will need (and can get) one char
            if (_inputPtr >= _inputEnd) {
                if (!loadMore()) { // most likely an error, will be handled later on
                    return;
                }
            }

            if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind
                /* In worst case, need 3 chars ("= _inputEnd) {
                    if (!loadAndRetain(3)) {
                        // probably an error, but will be handled later
                        return;
                    }
                }
                if (_inputBuffer[_inputPtr+1] != BYTE_EXCL
                    || _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) {
                    // can't be CDATA, we are done here
                    return;
                }
                // but let's verify it still:
                _inputPtr += 3;
                for (int i = 0; i < 6; ++i) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    byte b = _inputBuffer[_inputPtr++];
                    if (b != (byte) CDATA_STR.charAt(i)) {
                        int ch = decodeCharForError(b);
                        reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)");
                    }
                }
                finishCoalescedCData();
            } else { // textual (or entity, error etc)
                finishCoalescedCharacters();
                if (_entityPending) {
                    break;
                }
            }
        }
    }

    // note: code mostly copied from 'finishCharacters', just simplified
    // in some places
    protected final void finishCoalescedCharacters()
        throws XMLStreamException
    {
        // first char can't be from (char) entity (wrt finishCharacters)

        final int[] TYPES = _charTypes.TEXT_CHARS;
        final byte[] inputBuffer = _inputBuffer;

        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();

        int c;

        main_loop:
        while (true) {
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }
            // And then fallback for funny chars / UTF-8 multibytes:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) >= 2) {
                    c = decodeUtf8_3fast(c);
                } else {
                    c = decodeUtf8_3(c);
                }
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_LT:
                --_inputPtr;
                break main_loop;
            case XmlCharTypes.CT_AMP:
                c = handleEntityInText(false);
                if (c == 0) { // unexpandable general parsed entity
                    // _inputPtr set by entity expansion method
                    _entityPending = true;
                    break main_loop;
                }
                // Ok; does it need a surrogate though? (over 16 bits)
                if ((c >> 16) != 0) {
                    c -= 0x10000;
                    outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                    // Need to ensure room for one more char
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    c = 0xDC00 | (c & 0x3FF);
                }
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // Let's then just count number of brackets --
                    // in case they are not followed by '>'
                    int count = 1;
                    byte b;
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        b = inputBuffer[_inputPtr];
                        if (b != BYTE_RBRACKET) {
                            break;
                        }
                        ++_inputPtr; // to skip past bracket
                        ++count;
                    }
                    if (b == BYTE_GT && count > 1) {
                        reportIllegalCDataEnd();
                    }
                    // Nope. Need to output all brackets, then; except
                    // for one that can be left for normal output
                    while (count > 1) {
                        outputBuffer[outPtr++] = ']';
                        // Need to ensure room for one more char
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        --count;
                    }
                }
                // Can just output the first ']' along normal output
                break;
                
            // default:
                // Other types are not important here...
            }
            // We know there's room for one more:
            outputBuffer[outPtr++] = (char) c;
        }

        _textBuilder.setCurrentLength(outPtr);
    }

    // note: code mostly copied from 'finishCharacters', just simplified
    // in some places
    protected final void finishCoalescedCData()
        throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;

        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();

        /* At this point, space (if any) has been skipped, and we are
         * to parse and store the contents
         */
        main_loop:
        while (true) {
            int c;
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }
            // And then exceptions:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
                /* Ok: let's just parse all consequtive right brackets,
                 * and see if followed by greater-than char. This because
                 * we can only push back at most one char at a time, and
                 * thus can't easily just check a subset
                 */
                int count = 0; // ignoring first one
                byte b;

                do {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = _inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr;
                    ++count;
                } while (true);

                // Was the marker found?
                boolean ok = (b == BYTE_GT && count >= 1);
                if (ok) {
                    --count;
                }
                // Brackets to copy to output?
                for (; count > 0; --count) {
                    outputBuffer[outPtr++] = ']';
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                }
                if (ok) {
                    ++_inputPtr; // to consume '>'
                    break main_loop;
                }
                break;
            }
            // Ok, can output the char; there's room for one char at least
            outputBuffer[outPtr++] = (char) c;
        }

        _textBuilder.setCurrentLength(outPtr);
    }

    /**
     * Method that gets called after a primary text segment (of type
     * CHARACTERS or CDATA, not applicable to SPACE) has been skipped.
     * Method has to see if the following event would
     * be textual as well, and if so, skip it (and any other following
     * textual segments).
     *
     * @return True if we encountered an unexpandable entity
     */
    @Override
    protected final boolean skipCoalescedText()
        throws XMLStreamException
    {
        while (true) {
            // no matter what, will need (and can get) one char
            if (_inputPtr >= _inputEnd) {
                if (!loadMore()) { // most likely an error, will be handled later on
                    return false;
                }
            }

            if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind
                /* In worst case, need 3 chars ("= _inputEnd) {
                    if (!loadAndRetain(3)) { // probably an error, but will be handled later
                        return false;
                    }
                }
                if (_inputBuffer[_inputPtr+1] != BYTE_EXCL
                    || _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) {
                    // can't be CDATA, we are done here
                    return false;
                }
                // but let's verify it still:
                _inputPtr += 3;
                for (int i = 0; i < 6; ++i) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    byte b = _inputBuffer[_inputPtr++];
                    if (b != (byte) CDATA_STR.charAt(i)) {
                        int ch = decodeCharForError(b);
                        reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)");
                    }
                }
                skipCData();
            } else { // textual (or entity, error etc)
                if (skipCharacters()) {
                    return true;
                }
            }
        }
    }

    /*
    /**********************************************************************
    /* Other methods, utf-decoding
    /**********************************************************************
     */

    /**
     * @return Either decoded character (if positive int); or negated
     *    value of a high-order char (one that needs surrogate pair)
     */
    private final int decodeMultiByteChar(int c, int ptr)
        throws XMLStreamException
    {
        int needed;

        if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
            c &= 0x1F;
            needed = 1;
        } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
            c &= 0x0F;
            needed = 2;
        } else if ((c & 0xF8) == 0xF0) {
            // 4 bytes; double-char with surrogates and all...
            c &= 0x07;
            needed = 3;
        } else {
            reportInvalidInitial(c & 0xFF);
            needed = 1; // never gets here
        }
        
        if (ptr >= _inputEnd) {
            loadMoreGuaranteed();
            ptr = _inputPtr;
        }
        int d = (int) _inputBuffer[ptr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, ptr);
        }
        c = (c << 6) | (d & 0x3F);
        
        if (needed > 1) { // needed == 1 means 2 bytes total
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            d = (int) _inputBuffer[ptr++];
            if ((d & 0xC0) != 0x080) {
                reportInvalidOther(d & 0xFF, ptr);
            }
            c = (c << 6) | (d & 0x3F);
            if (needed > 2) { // 4 bytes? (need surrogates)
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                d = (int) _inputBuffer[ptr++];
                if ((d & 0xC0) != 0x080) {
                    reportInvalidOther(d & 0xFF, ptr);
                }
                c = (c << 6) | (d & 0x3F);
                /* Need to signal such pair differently (to make comparison
                 * easier)
                 */
                c = -c;
            }
        }
        _inputPtr = ptr;
        return c;
    }

    private final int decodeUtf8_2(int c)
        throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        return ((c & 0x1F) << 6) | (d & 0x3F);
    }

    private final int decodeUtf8_3(int c1)
        throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c1 &= 0x0F;
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        int c = (c1 << 6) | (d & 0x3F);
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        c = (c << 6) | (d & 0x3F);
        if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal
            if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF
                if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) {
                    c = handleInvalidXmlChar(c);
                }
            }
        }
        return c;
    }

    private final int decodeUtf8_3fast(int c1)
        throws XMLStreamException
    {
        c1 &= 0x0F;
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        int c = (c1 << 6) | (d & 0x3F);
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        c = (c << 6) | (d & 0x3F);
        if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal
            if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF
                if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) {
                    c = handleInvalidXmlChar(c);
                }
            }
        }
        return c;
    }

    /**
     * @return Character value minus 0x10000; this so that caller
     *    can readily expand it to actual surrogates
     */
    private final int decodeUtf8_4(int c)
        throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        c = ((c & 0x07) << 6) | (d & 0x3F);

        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        c = (c << 6) | (d & 0x3F);
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }

        /* note: won't change it to negative here, since caller
         * already knows it'll need a surrogate
         */
        return ((c << 6) | (d & 0x3F)) - 0x10000;
    }

    /*
    /**********************************************************************
    /* Internal methods, error reporting
    /**********************************************************************
     */

    /**
     * Method called called to decode a full UTF-8 characters, given
     * its first byte. Note: does not do any validity checks, since this
     * is only to be used for informational purposes (often when an error
     * has already been encountered)
     */
    @Override
    public int decodeCharForError(byte b) throws XMLStreamException
    {
        int c = (int) b;
        if (c >= 0) { // ascii? fine as is...
            return c;
        }
        int needed;

        // Ok; if we end here, we got multi-byte combination
        if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
            c &= 0x1F;
            needed = 1;
        } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
            c &= 0x0F;
            needed = 2;
        } else if ((c & 0xF8) == 0xF0) {
            // 4 bytes; double-char with surrogates and all...
            c &= 0x07;
            needed = 3;
        } else {
            reportInvalidInitial(c & 0xFF);
            needed = 1; // never gets here
        }
        
        int d = nextByte();
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF);
        }
        c = (c << 6) | (d & 0x3F);
        
        if (needed > 1) { // needed == 1 means 2 bytes total
            d = nextByte(); // 3rd byte
            if ((d & 0xC0) != 0x080) {
                reportInvalidOther(d & 0xFF);
            }
            c = (c << 6) | (d & 0x3F);
            if (needed > 2) { // 4 bytes? (need surrogates)
                d = nextByte();
                if ((d & 0xC0) != 0x080) {
                    reportInvalidOther(d & 0xFF);
                }
                c = (c << 6) | (d & 0x3F);
            }
        }
        return c;
    }

    protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException
    {
        _inputPtr = ptr;
        reportInvalidOther(mask);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy