All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fasterxml.aalto.async.AsyncByteBufferScanner Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, [email protected]
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.async;

import java.nio.ByteBuffer;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.*;
import com.fasterxml.aalto.in.*;
import com.fasterxml.aalto.util.DataUtil;
//import com.fasterxml.aalto.util.XmlConsts;
import com.fasterxml.aalto.util.XmlCharTypes;

/**
 * Asynchronous (non-blocking) XML scanner that reads its input from a
 * caller-provided {@link ByteBuffer}. Due to the basic complexity of the
 * async approach, character-based input doesn't make much sense, so only
 * byte-based input is supported.
 */
public class AsyncByteBufferScanner
    extends AsyncByteScanner
    implements AsyncByteBufferFeeder
{
    /*
    /**********************************************************************
    /* Input buffer handling
    /**********************************************************************
     */

    /**
     * This buffer is actually provided by caller
     */
    protected ByteBuffer _inputBuffer;

    /**
     * In addition to current buffer pointer, and end pointer,
     * we will also need to know number of bytes originally
     * contained. This is needed to correctly update location
     * information when the block has been completed.
     */
    protected int _origBufferLen;

    /*
    /**********************************************************************
    /* Instance construction
    /**********************************************************************
     */

    /**
     * Creates a scanner that has not yet been given any input.
     *
     * @param cfg reader configuration, passed through to the superclass
     */
    public AsyncByteBufferScanner(ReaderConfig cfg)
    {
        super(cfg);
        // No complete event exists until some input has been processed
        this._currToken = EVENT_INCOMPLETE;
        // Scanning starts in the prolog, where an XML declaration may still follow
        this._state = STATE_PROLOG_INITIAL;
    }

    /**
     * Builds a short diagnostic description listing the current token,
     * the next event and the scanner state.
     */
    @Override
    public String toString() {
        final StringBuilder desc = new StringBuilder("asyncScanner; curr=");
        desc.append(_currToken);
        desc.append(" next=").append(_nextEvent);
        desc.append(", state = ").append(_state);
        return desc.toString();
    }

    /*
    /**********************************************************************
    /* Implementation for low-level accessors
    /**********************************************************************
     */
    
    /**
     * Returns the byte at the current input position without advancing.
     */
    @Override
    protected final byte _currentByte() throws XMLStreamException {
        final int pos = _inputPtr;
        return _inputBuffer.get(pos);
    }

    /**
     * Returns the byte at the current input position and advances the
     * position by one.
     */
    @Override
    protected final byte _nextByte() throws XMLStreamException {
        final int pos = _inputPtr;
        // Advance first; the absolute read below uses the saved position
        _inputPtr = pos + 1;
        return _inputBuffer.get(pos);
    }

    /**
     * Returns the byte just before the current input position; the
     * position itself is left unchanged.
     */
    @Override
    protected final byte _prevByte() throws XMLStreamException {
        final int prevPos = _inputPtr - 1;
        return _inputBuffer.get(prevPos);
    }

    /*
    /**********************************************************************
    /* Parsing, comments
    /**********************************************************************
     */

    /**
     * Scans comment content from the current input buffer, appending the
     * decoded characters to {@code _textBuilder}, until either the closing
     * "-->" marker has been consumed or the available input runs out.
     *<p>
     * Anything that cannot be resolved without more input (a trailing CR,
     * an incomplete multi-byte sequence, or one or two trailing hyphens) is
     * recorded in {@code _pendingInput}; the next call resolves it first via
     * {@link #handleCommentPending()}.
     *
     * @return COMMENT when the end marker was consumed (state is then set to
     *   STATE_DEFAULT and next event to EVENT_INCOMPLETE); EVENT_INCOMPLETE
     *   when more input is needed; or the non-zero result of
     *   {@link #handleCommentPending()} if pending input decided the outcome
     * @throws XMLStreamException presumably raised by the report/decode
     *   helpers on malformed content (helpers are not visible here)
     */
    protected int parseCommentContents() throws XMLStreamException
    {
        // Left-overs from last input block?
        if (_pendingInput != 0) { // CR, multi-byte, or '-'?
            int result = handleCommentPending();
            // If there's not enough input, or if we completed, can leave
            if (result != 0) {
                return result;
            }
            // otherwise we should be good to continue
        }

        // Continue appending to whatever has been collected so far
        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();

        final int[] TYPES = _charTypes.OTHER_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;

        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the inner loop by both remaining input and remaining
                // output capacity, so no per-char capacity check is needed
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    // Non-zero type: needs special handling in the switch below
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // NOTE(review): no break here, so control falls through to the
                // CR case unless handleInvalidXmlChar always throws -- confirm
            case XmlCharTypes.CT_WS_CR:
                {
                    // Need to peek at the next byte to detect CR+LF
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CR;
                        break main_loop;
                    }
                    if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                // CR and CR+LF are both output as a single LF
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    // Only the lead byte is available; keep it for the next call
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // one more byte available
                        // Pack the second byte into bits 8-15 of the pending value
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    // Consume whatever continuation bytes exist, packing them
                    // into bits 8-15 and 16-23 of the pending value
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                // (first char of a surrogate pair; presumably decodeUtf8_4 returns
                // the code point with 0x10000 already subtracted -- verify)
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
                // NOTE(review): no break; falls through to the hyphen case unless
                // reportInvalidInitial always throws -- confirm
            case XmlCharTypes.CT_HYPHEN: // '-->'?
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_COMMENT_HYPHEN1;
                    break main_loop;
                }
                if (_inputBuffer.get(_inputPtr) == BYTE_HYPHEN) { // ok, must be end then
                    ++_inputPtr;
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_COMMENT_HYPHEN2;
                        break main_loop;
                    }
                    // "--" must be followed by '>'; anything else is reported
                    if (_inputBuffer.get(_inputPtr++) != BYTE_GT) {
                        reportDoubleHyphenInComments();
                    }
                    _textBuilder.setCurrentLength(outPtr);
                    _state = STATE_DEFAULT;
                    _nextEvent = EVENT_INCOMPLETE;
                    return COMMENT;
                }
                // Single hyphen: the following byte was only peeked, and the
                // hyphen itself is appended below as regular content
                break;
            // default:
                // Other types are not important here...
            }

            // Ok, can output the char (we know there's room for one more:
            // the inner loop is bounded by output capacity and the byte that
            // broke out of it was not written)
            outputBuffer[outPtr++] = (char) c;
        }

        // Ran out of input: remember how much text was collected
        _textBuilder.setCurrentLength(outPtr);
        return EVENT_INCOMPLETE;
    }

    /**
     * @return EVENT_INCOMPLETE, if there's not enough input to
     *   handle pending char, COMMENT, if we handled complete
     *   "-->" end marker, or 0 to indicate something else
     *   was successfully handled.
     */
    protected int handleCommentPending() throws XMLStreamException
    {
        // Nothing can be decided without at least one more byte
        if (_inputEnd <= _inputPtr) {
            return EVENT_INCOMPLETE;
        }
        final boolean sawOneHyphen = (_pendingInput == PENDING_STATE_COMMENT_HYPHEN1);
        if (sawOneHyphen) {
            if (_inputBuffer.get(_inputPtr) == BYTE_HYPHEN) {
                // Second hyphen found: now waiting for the closing '>'
                _inputPtr++;
                _pendingInput = PENDING_STATE_COMMENT_HYPHEN2;
                if (_inputEnd <= _inputPtr) {
                    return EVENT_INCOMPLETE;
                }
            } else {
                // Lone hyphen is plain comment content; next byte stays unread
                _pendingInput = 0;
                _textBuilder.append("-");
                return 0;
            }
        }
        if (_pendingInput != PENDING_STATE_COMMENT_HYPHEN2) {
            // Not hyphen-related: delegate to the general pending-input handler
            if (handleAndAppendPending()) {
                return 0;
            }
            return EVENT_INCOMPLETE;
        }
        // Two hyphens seen: only '>' may follow
        _pendingInput = 0;
        final byte closer = _inputBuffer.get(_inputPtr++);
        if (closer != BYTE_GT) {
            reportDoubleHyphenInComments();
        }
        _state = STATE_DEFAULT;
        _nextEvent = EVENT_INCOMPLETE;
        return COMMENT;
    }

    /*
    /**********************************************************************
    /* Parsing, PI
    /**********************************************************************
     */

    /**
     * Scans processing-instruction data from the current input buffer,
     * appending the decoded characters to {@code _textBuilder}, until either
     * the closing "?>" marker has been consumed or the available input
     * runs out.
     *<p>
     * Anything that cannot be resolved without more input (a trailing CR,
     * an incomplete multi-byte sequence, or a trailing '?') is recorded in
     * {@code _pendingInput}; the next call resolves it first via
     * {@link #handlePIPending()}.
     *
     * @return PROCESSING_INSTRUCTION when the end marker was consumed (state
     *   is then set to STATE_DEFAULT and next event to EVENT_INCOMPLETE);
     *   EVENT_INCOMPLETE when more input is needed; or the non-zero result
     *   of {@link #handlePIPending()} if pending input decided the outcome
     * @throws XMLStreamException presumably raised by the report/decode
     *   helpers on malformed content (helpers are not visible here)
     */
    protected int parsePIData() throws XMLStreamException
    {
        // Left-overs from last input block?
        if (_pendingInput != 0) { // CR, multi-byte, '?'
            int result = handlePIPending();
            // If there's not enough input, or if we completed, can leave
            if (result != 0) {
                return result;
            }
            // otherwise we should be good to continue
        }
        
        // Continue appending to whatever has been collected so far
        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();
        
        final int[] TYPES = _charTypes.OTHER_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;
        
        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the inner loop by both remaining input and remaining
                // output capacity, so no per-char capacity check is needed
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    // Non-zero type: needs special handling in the switch below
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // NOTE(review): no break here, so control falls through to the
                // CR case unless handleInvalidXmlChar always throws -- confirm
            case XmlCharTypes.CT_WS_CR:
                {
                    // Need to peek at the next byte to detect CR+LF
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CR;
                        break main_loop;
                    }
                    if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                // CR and CR+LF are both output as a single LF
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    // Only the lead byte is available; keep it for the next call
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // one more byte available
                        // Pack the second byte into bits 8-15 of the pending value
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    // Consume whatever continuation bytes exist, packing them
                    // into bits 8-15 and 16-23 of the pending value
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                // (first char of a surrogate pair; presumably decodeUtf8_4 returns
                // the code point with 0x10000 already subtracted -- verify)
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
                // NOTE(review): no break; falls through to the '?' case unless
                // reportInvalidInitial always throws -- confirm
            case XmlCharTypes.CT_QMARK:

                // Need to peek at the next byte to see whether this is "?>"
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_PI_QMARK;
                    break main_loop;
                }
                if (_inputBuffer.get(_inputPtr) == BYTE_GT) { // end
                    ++_inputPtr;
                    _textBuilder.setCurrentLength(outPtr);
                    _state = STATE_DEFAULT;
                    _nextEvent = EVENT_INCOMPLETE;
                    return PROCESSING_INSTRUCTION;
                }
                // Not end mark: the following byte was only peeked, so the next
                // iteration processes it; '?' itself is appended below
                break;
            // default:
                // Other types are not important here...
            }

            // Ok, can output the char (we know there's room for one more:
            // the inner loop is bounded by output capacity and the byte that
            // broke out of it was not written)
            outputBuffer[outPtr++] = (char) c;
        }
        // Ran out of input: remember how much text was collected
        _textBuilder.setCurrentLength(outPtr);
        return EVENT_INCOMPLETE;
    }

    /**
     * @return EVENT_INCOMPLETE, if there's not enough input to
     *   handle pending char, PROCESSING_INSTRUCTION, if we handled complete
     *   "?>" end marker, or 0 to indicate something else
     *   was successfully handled.
     */
    protected int handlePIPending() throws XMLStreamException
    {
        if (_pendingInput != PENDING_STATE_PI_QMARK) {
            // Not a pending '?': delegate to the general pending-input handler
            if (handleAndAppendPending()) {
                return 0;
            }
            return EVENT_INCOMPLETE;
        }
        // Pending '?': one more byte tells whether this is the "?>" end marker
        if (_inputEnd <= _inputPtr) {
            return EVENT_INCOMPLETE;
        }
        final byte next = _inputBuffer.get(_inputPtr);
        _pendingInput = 0;
        if (next == BYTE_GT) {
            _inputPtr++;
            _state = STATE_DEFAULT;
            _nextEvent = EVENT_INCOMPLETE;
            return PROCESSING_INSTRUCTION;
        }
        // Can't be the end marker: append the held-back '?' and leave the
        // peeked byte for the caller to process
        _textBuilder.append('?');
        return 0;
    }

    /*
    /**********************************************************************
    /* Parsing, internal DTD subset
    /**********************************************************************
     */

    /**
     * Collects the text of a DTD internal subset into {@code _textBuilder},
     * scanning until the ']' that terminates the subset or until the
     * available input runs out.
     *<p>
     * A ']' only terminates the subset when it occurs outside a quoted
     * value ({@code _elemAttrQuote == 0}) and outside a declaration
     * ({@code _inDtdDeclaration == false}); that bracket is consumed but
     * not appended to the collected text.
     *
     * @param init true on the first call for a subset: the text builder is
     *   reset and quote/declaration tracking is cleared; false when
     *   resuming, in which case any pending input is resolved first
     * @return true if the terminating ']' was found; false if more input is
     *   needed (including when pending input could not yet be completed)
     * @throws XMLStreamException presumably raised by the report/decode
     *   helpers on malformed content (helpers are not visible here)
     */
    @Override
    protected final boolean handleDTDInternalSubset(boolean init) throws XMLStreamException
    {
        char[] outputBuffer;
        int outPtr;

        if (init) { // first time around
            outputBuffer = _textBuilder.resetWithEmpty();
            outPtr = 0;
            _elemAttrQuote = 0;
            _inDtdDeclaration = false;
        } else {
            // Resuming: finish any CR / partial multi-byte char left over
            if (_pendingInput != 0) {
                if (!handleAndAppendPending()) {
                    return false;
                }
            }        
            outputBuffer = _textBuilder.getBufferWithoutReset();
            outPtr = _textBuilder.getCurrentLength();
        }

        final int[] TYPES = _charTypes.DTD_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;
        
        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the inner loop by both remaining input and remaining
                // output capacity, so no per-char capacity check is needed
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    // Non-zero type: needs special handling in the switch below
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // NOTE(review): no break here, so control falls through to the
                // CR case unless handleInvalidXmlChar always throws -- confirm
            case XmlCharTypes.CT_WS_CR:
                // Need to peek at the next byte to detect CR+LF
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CR;
                    break main_loop;
                }
                if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                // CR and CR+LF are both output as a single LF
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    // Only the lead byte is available; keep it for the next call
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // one more byte available
                        // Pack the second byte into bits 8-15 of the pending value
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    // Consume whatever continuation bytes exist, packing them
                    // into bits 8-15 and 16-23 of the pending value
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                // (first char of a surrogate pair; presumably decodeUtf8_4 returns
                // the code point with 0x10000 already subtracted -- verify)
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
                // NOTE(review): no break; falls through to the quote case unless
                // reportInvalidInitial always throws -- confirm

            case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
                if (_elemAttrQuote == 0) {
                    // Opening quote: remember which quote char was used
                    _elemAttrQuote = (byte) c;
                } else {
                    // Only the same quote char closes the value; the other
                    // kind of quote is just content
                    if (_elemAttrQuote == c) {
                        _elemAttrQuote = 0;
                    }
                }
                break;

            case XmlCharTypes.CT_DTD_LT:
                if (!_inDtdDeclaration) {
                    _inDtdDeclaration = true;
                }
                break;
            case XmlCharTypes.CT_DTD_GT:
                // '>' inside a quoted value does not end the declaration
                if (_elemAttrQuote == 0) {
                    _inDtdDeclaration = false;
                }
                break;
            case XmlCharTypes.CT_DTD_RBRACKET:
                // ']' outside any declaration and quoted value ends the subset;
                // it has been consumed but is not appended to the text
                if (!_inDtdDeclaration && _elemAttrQuote == 0) {
                    _textBuilder.setCurrentLength(outPtr);
                    return true;
                }
                break;
            // default:
                // Other types are not important here...
            }
            // Ok, can output the char (we know there's room for one more:
            // the inner loop is bounded by output capacity and the byte that
            // broke out of it was not written)
            outputBuffer[outPtr++] = (char) c;
        }
        // Ran out of input: remember how much text was collected
        _textBuilder.setCurrentLength(outPtr);
        return false;
    }
    
    /*
    /**********************************************************************
    /* Parsing, CDATA
    /**********************************************************************
     */

    /**
     * Scans CDATA section content from the current input buffer, appending
     * the decoded characters to {@code _textBuilder}, until either the
     * closing "]]>" marker has been consumed or the available input runs out.
     *<p>
     * Anything that cannot be resolved without more input (a trailing CR,
     * an incomplete multi-byte sequence, or one or more trailing ']') is
     * recorded in {@code _pendingInput}; the next call resolves it first via
     * {@link #handleCDataPending()}.
     *
     * @return CDATA when the end marker was consumed (state is then set to
     *   STATE_DEFAULT and next event to EVENT_INCOMPLETE); EVENT_INCOMPLETE
     *   when more input is needed; or the non-zero result of
     *   {@link #handleCDataPending()} if pending input decided the outcome
     * @throws XMLStreamException presumably raised by the report/decode
     *   helpers on malformed content (helpers are not visible here)
     */
    protected final int parseCDataContents() throws XMLStreamException
    {
        // Left-overs from last input block?
        if (_pendingInput != 0) { // CR, multi-byte, or ']'?
            int result = handleCDataPending();
            // If there's not enough input, or if we completed, can leave
            if (result != 0) {
                return result;
            }
            // otherwise we should be good to continue
        }
        // Continue appending to whatever has been collected so far
        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();
    
        final int[] TYPES = _charTypes.OTHER_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;
    
        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the inner loop by both remaining input and remaining
                // output capacity, so no per-char capacity check is needed
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    // Non-zero type: needs special handling in the switch below
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }
    
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // NOTE(review): no break here, so control falls through to the
                // CR case unless handleInvalidXmlChar always throws -- confirm
            case XmlCharTypes.CT_WS_CR:
                {
                    // Need to peek at the next byte to detect CR+LF
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CR;
                        break main_loop;
                    }
                    if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                // CR and CR+LF are both output as a single LF
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    // Only the lead byte is available; keep it for the next call
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // one more byte available
                        // Pack the second byte into bits 8-15 of the pending value
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    // Consume whatever continuation bytes exist, packing them
                    // into bits 8-15 and 16-23 of the pending value
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                // (first char of a surrogate pair; presumably decodeUtf8_4 returns
                // the code point with 0x10000 already subtracted -- verify)
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
                // NOTE(review): no break; falls through to the ']' case unless
                // reportInvalidInitial always throws -- confirm
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CDATA_BRACKET1;
                    break main_loop;
                }
                // Hmmh. This is more complex... so be it.
                if (_inputBuffer.get(_inputPtr) == BYTE_RBRACKET) { // end might be nigh...
                    ++_inputPtr;
                    // Two brackets are now held back (consumed, not yet output);
                    // each pass of this loop inspects the byte that follows them
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            _pendingInput = PENDING_STATE_CDATA_BRACKET2;
                            break main_loop;
                        }
                        if (_inputBuffer.get(_inputPtr) == BYTE_GT) {
                            ++_inputPtr;
                            _textBuilder.setCurrentLength(outPtr);
                            _state = STATE_DEFAULT;
                            _nextEvent = EVENT_INCOMPLETE;
                            return CDATA;
                        }
                        if (_inputBuffer.get(_inputPtr) != BYTE_RBRACKET) { // neither '>' nor ']'; push "]]" back
                            outputBuffer[outPtr++] = ']';
                            if (outPtr >= outputBuffer.length) {
                                outputBuffer = _textBuilder.finishCurrentSegment();
                                outPtr = 0;
                            }
                            outputBuffer[outPtr++] = ']';
                            // The following byte was only peeked; the main loop
                            // rescans it (and re-checks output capacity first)
                            continue main_loop;
                        }
                        // Got third bracket; push one back, keep on checking
                        ++_inputPtr;
                        outputBuffer[outPtr++] = ']';
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                    }
                }
                // Single bracket: the following byte was only peeked, and the
                // bracket itself is appended below as regular content
                break;
            // default:
                // Other types are not important here...
            }
    
            // Ok, can output the char (we know there's room for one more:
            // the inner loop is bounded by output capacity and the byte that
            // broke out of it was not written)
            outputBuffer[outPtr++] = (char) c;
        }
    
        // Ran out of input: remember how much text was collected
        _textBuilder.setCurrentLength(outPtr);
        return EVENT_INCOMPLETE;
    }

    /**
     * @return EVENT_INCOMPLETE, if there's not enough input to
     *   handle pending char, CDATA, if we handled complete
     *   "]]>" end marker, or 0 to indicate something else
     *   was successfully handled.
     */
    protected final int handleCDataPending() throws XMLStreamException
    {
        if (_pendingInput == PENDING_STATE_CDATA_BRACKET1) {
            // One ']' is held back; need a byte to see whether another follows
            if (_inputEnd <= _inputPtr) {
                return EVENT_INCOMPLETE;
            }
            if (_inputBuffer.get(_inputPtr) == BYTE_RBRACKET) {
                _inputPtr++;
                _pendingInput = PENDING_STATE_CDATA_BRACKET2;
                if (_inputEnd <= _inputPtr) { // nothing after the second bracket yet
                    return EVENT_INCOMPLETE;
                }
            } else {
                // Lone bracket is plain content; the peeked byte stays unread
                _textBuilder.append(']');
                _pendingInput = 0;
                return 0;
            }
        }
        if (_pendingInput != PENDING_STATE_CDATA_BRACKET2) {
            // Not bracket-related: delegate to the general pending-input handler
            if (handleAndAppendPending()) {
                return 0;
            }
            return EVENT_INCOMPLETE;
        }
        // Two brackets are held back; look at what follows them
        while (_inputPtr < _inputEnd) {
            final byte next = _inputBuffer.get(_inputPtr++);
            if (next == BYTE_GT) {
                _pendingInput = 0;
                _state = STATE_DEFAULT;
                _nextEvent = EVENT_INCOMPLETE;
                return CDATA;
            }
            if (next == BYTE_RBRACKET) {
                // Yet another bracket: release the oldest one, keep two held back
                _textBuilder.append(']');
                continue;
            }
            // Neither '>' nor ']': un-read the byte and emit both held brackets
            --_inputPtr;
            _textBuilder.append("]]");
            _pendingInput = 0;
            return 0;
        }
        return EVENT_INCOMPLETE;
    }
    
    /**
     * This method gets called, if the first character of a
     * CHARACTERS event could not be fully read (multi-byte,
     * split over buffer boundary). If so, there is some
     * pending data to be handled.
     *<p>
     * The pending state is read from <code>_pendingInput</code>: it is either
     * the marker <code>PENDING_STATE_CR</code>, or the already-seen bytes of a
     * multi-byte UTF-8 character packed into an int (first byte in bits 0-7,
     * second in bits 8-15, third in bits 16-23). If the character still
     * can not be completed, the bytes seen so far are packed back into
     * <code>_pendingInput</code> the same way.
     *
     * @return <code>EVENT_INCOMPLETE</code> if more input is needed to complete
     *   the first character; otherwise the value of <code>_currToken</code>
     *   after the CHARACTERS event has been started (or the result of
     *   {@link #finishCharactersCoalescing} in eager coalescing mode)
     */
    protected int startCharactersPending() throws XMLStreamException
    {
        // First, need to have at least one more byte:
        if (_inputPtr >= _inputEnd) {
            return EVENT_INCOMPLETE;
        }

        // K. So what was the type again?
        // (pending state is cleared here; re-set below if we run out of input again)
        int c = _pendingInput;
        _pendingInput = 0;

        // Possible \r\n linefeed?
        if (c == PENDING_STATE_CR) {
            // Optional LF after the CR is consumed; either way a single linefeed results
            if (_inputBuffer.get(_inputPtr) == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            _textBuilder.resetWithChar(CHAR_LF);
        } else {
            // Nah, a multi-byte UTF-8 char:
            
            // Let's just retest the first pending byte (in LSB):
            switch (_charTypes.TEXT_CHARS[c & 0xFF]) {
            case XmlCharTypes.CT_MULTIBYTE_2:
                // Easy: must have just one byte, did get another one:
                // (presumably decodeUtf8_2 reads the second byte from input itself -- confirm)
                _textBuilder.resetWithChar((char) decodeUtf8_2(c));
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                {
                    // Ok... so do we have one or two pending bytes?
                    int next = _inputBuffer.get(_inputPtr++) & 0xFF;
                    int c2 = (c >> 8);
                    if (c2 == 0) { // just one; need two more
                        if (_inputPtr >= _inputEnd) { // but got only one
                            // remember both bytes seen so far
                            _pendingInput = c | (next << 8);
                            return EVENT_INCOMPLETE;
                        }
                        int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        c = decodeUtf8_3(c, next, c3);
                    } else { // had two, got one, bueno:
                        c = decodeUtf8_3((c & 0xFF), c2, next);
                    }
                    _textBuilder.resetWithChar((char) c);
                }
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                {
                    int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                    // Only had one?
                    if ((c >> 8) == 0) { // ok, so need 3 more
                        if (_inputPtr >= _inputEnd) { // just have 1
                            _pendingInput = c | (next << 8);
                            return EVENT_INCOMPLETE;
                        }
                        int c2 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        if (_inputPtr >= _inputEnd) { // almost, got 2
                            _pendingInput = c | (next << 8) | (c2 << 16);
                            return EVENT_INCOMPLETE;
                        }
                        int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        c = decodeUtf8_4(c, next, c2, c3);
                    } else { // had two or three
                        int c2 = (c >> 8) & 0xFF;
                        int c3 = (c >> 16);
                        
                        if (c3 == 0) { // just two
                            // 'next' is then the third byte of the character
                            if (_inputPtr >= _inputEnd) { // one short
                                _pendingInput = c | (next << 16);
                                return EVENT_INCOMPLETE;
                            }
                            c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                            c = decodeUtf8_4((c & 0xFF), c2, next, c3);
                        } else { // had three, got last
                            c = decodeUtf8_4((c & 0xFF), c2, c3, next);
                        }
                    } 
                }
                // Need a surrogate pair, have to call from here:
                // (note: returns right away, bypassing the lazy/eager handling below)
                _textBuilder.resetWithSurrogate(c);
                return (_currToken = CHARACTERS);
            default: // should never occur:
                throwInternal();
            }
        }

        // Great, we got it. Is that enough?
        if (_cfgCoalescing && !_cfgLazyParsing) {
            // In eager coalescing mode, must read it all
            return finishCharactersCoalescing();
        }
        _currToken = CHARACTERS;
        if (_cfgLazyParsing) {
            // rest of the text is left unread, to be completed (or skipped) later
            _tokenIncomplete = true;
        } else {
            finishCharacters();
        }
        return _currToken;
    }

    /**
     * Placeholder for completing a CHARACTERS event in eager coalescing
     * mode: TODO, not yet implemented.
     *<p>
     * Any pending partial input is first handed to
     * <code>handleAndAppendPending()</code>; if that can not be completed,
     * <code>EVENT_INCOMPLETE</code> is returned. In all other cases the method
     * currently fails.
     *
     * @return <code>EVENT_INCOMPLETE</code> if pending input could not be handled
     *
     * @throws UnsupportedOperationException always, once there is no
     *   unhandled pending input, since coalescing is not implemented
     */
    protected final int finishCharactersCoalescing() throws XMLStreamException
    {
        // First things first: any pending partial multi-bytes?
        if (_pendingInput != 0 && !handleAndAppendPending()) {
            return EVENT_INCOMPLETE;
        }
        // !!! TBI
        throw new UnsupportedOperationException(
                "Coalescing of CHARACTERS events is not yet implemented for the async (non-blocking) scanner");
    }

    /*
    /**********************************************************************
    /* Async input, methods to feed (push) content to parse
    /**********************************************************************
     */

    /**
     * @return True if the current input block has been fully consumed
     *   and end-of-input has not been indicated; false otherwise
     */
    @Override
    public final boolean needMoreInput() {
        if (_endOfInput) {
            return false;
        }
        return _inputPtr >= _inputEnd;
    }

    /**
     * Method for feeding the next block of input to parse. Bytes between
     * the buffer's current position and its limit become the new input;
     * the buffer is retained by reference (not copied).
     *<p>
     * The argument is validated, and its bounds read, before any scanner
     * state is modified, so that a failed call leaves location information
     * untouched.
     *
     * @param buffer Buffer that contains the input to parse; must not be null
     *
     * @throws XMLStreamException if previously fed input has not yet been
     *   fully consumed, or if end-of-input has already been indicated
     * @throws NullPointerException if <code>buffer</code> is null
     */
    @Override
    public void feedInput(ByteBuffer buffer) throws XMLStreamException
    {
        // Must not have remaining input
        if (_inputPtr < _inputEnd) {
            throw new XMLStreamException("Still have "+(_inputEnd - _inputPtr)+" unread bytes");
        }
        // and shouldn't have been marked as end-of-input
        if (_endOfInput) {
            throw new XMLStreamException("Already closed, can not feed more input");
        }
        // Validate and read bounds first, so that failure can not leave
        // offsets half-updated (which would double-count the old block on retry)
        if (buffer == null) {
            throw new NullPointerException("Can not feed null buffer");
        }
        final int start = buffer.position();
        final int end = buffer.limit();

        // Time to update pointers: account for the block just completed
        _pastBytesOrChars += _origBufferLen;
        _rowStartOffset -= _origBufferLen;

        // And then update buffer settings
        _inputBuffer = buffer;
        _inputPtr = start;
        _inputEnd = end;
        _origBufferLen = end-start;
    }

    /*
    /**********************************************************************
    /* Implementation of parsing API
    /**********************************************************************
     */

    /**
     * Main entry point for advancing within the element tree. Works in
     * three stages:
     *<ol>
     * <li>If the previously returned event was complete, state is reset:
     *   an empty START_ELEMENT yields its END_ELEMENT right away; after
     *   END_ELEMENT the element scope is popped and namespace declarations
     *   of the closed level are unbound; an unfinished CHARACTERS event
     *   is skipped.
     *  </li>
     * <li>If type of the next event is not yet known, it is determined from
     *   the first byte(s) available ('&lt;', '&amp;' or text).
     *  </li>
     * <li>Once the type is known, the type-specific handler is called
     *   to (continue to) process the event.
     *  </li>
     *</ol>
     *
     * @return Type of the event that was completed; or
     *   <code>EVENT_INCOMPLETE</code> if available input ran out first
     */
    @Override
    public int nextFromTree() throws XMLStreamException
    {
        // Had a fully complete event? Need to reset state:
        if (_currToken != EVENT_INCOMPLETE) {
            /* First, need to handle some complications arising from
             * empty elements, and namespace binding/unbinding:
             */
            if (_currToken == START_ELEMENT) {
                if (_isEmptyTag) {
                    --_depth;
                    // Important: do NOT overwrite start location, same as with START_ELEMENT
                    return (_currToken = END_ELEMENT);
                }
            } else if (_currToken == END_ELEMENT) {
                _currElem = _currElem.getParent();
                // Any namespace declarations that need to be unbound?
                while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) {
                    _lastNsDecl = _lastNsDecl.unbind();
                }
            }

            // keep track of where event started
            setStartLocation();
            
            /* Only CHARACTERS can remain incomplete: this happens if
             * first character is decoded, but coalescing mode is NOT
             * set. Skip can not therefore block, nor will add pending
             * input. Can also occur when we have run out of input
             */
            if (_tokenIncomplete) {
                // note: _currToken is left as is on failure, so the next
                // call gets back here to retry skipping
                if (!skipCharacters()) { // couldn't complete skipping
                    return EVENT_INCOMPLETE;
                }
                _tokenIncomplete = false;
            }
            _currToken = _nextEvent = EVENT_INCOMPLETE;
            _state = STATE_DEFAULT;
        }
        
        // Don't yet know the type?
        if (_nextEvent == EVENT_INCOMPLETE) {
            if (_state == STATE_DEFAULT) {
                /* We can only have pending input for (incomplete)
                 * CHARACTERS event.
                 */
                if (_pendingInput != 0) { // CR, or multi-byte?
                    _nextEvent = CHARACTERS;
                    return startCharactersPending();
                }
                if (_inputPtr >= _inputEnd) { // nothing we can do?
                    return _currToken; // i.e. EVENT_INCOMPLETE
                }
                byte b = _inputBuffer.get(_inputPtr++);
                if (b == BYTE_LT) { // root element, comment, proc instr?
                    _state = STATE_TREE_SEEN_LT;
                } else if (b == BYTE_AMP) {
                    _state = STATE_TREE_SEEN_AMP;
                } else {
                    _nextEvent = CHARACTERS;
                    return startCharacters(b);
                }
            }

            // All remaining states need at least one more byte to proceed
            if (_inputPtr >= _inputEnd) {
                return _currToken; // i.e. EVENT_INCOMPLETE
            }
            if (_state == STATE_TREE_SEEN_LT) {
                // Ok, so we've just seen the less-than char...
                byte b = _inputBuffer.get(_inputPtr++);
                if (b == BYTE_EXCL) { // comment or CDATA
                    _state = STATE_TREE_SEEN_EXCL;
                } else if (b == BYTE_QMARK) {
                    _nextEvent = PROCESSING_INSTRUCTION;
                    _state = STATE_DEFAULT;
                    return handlePI();
                } else if (b == BYTE_SLASH) {
                    return handleEndElementStart();
                } else {
                    // Probably start element -- need to retain first char tho
                    return handleStartElementStart(b);
                }
            } else if (_state == STATE_TREE_SEEN_AMP) {
                return handleEntityStartingToken();
            } else if (_state == STATE_TREE_NAMED_ENTITY_START) {
                return handleNamedEntityStartingToken();
            } else if (_state == STATE_TREE_NUMERIC_ENTITY_START) {
                return handleNumericEntityStartingToken();
            }
                
            // Only "<!" can get this far; anything else is an internal error
            if (_state == STATE_TREE_SEEN_EXCL) {
                if (_inputPtr >= _inputEnd) {
                    return _currToken; // i.e. EVENT_INCOMPLETE
                }
                byte b = _inputBuffer.get(_inputPtr++);
                // Comment or CDATA?
                if (b == BYTE_HYPHEN) { // Comment
                    _nextEvent = COMMENT;
                    _state = STATE_DEFAULT;
                } else if (b == BYTE_LBRACKET) { // CDATA
                    _nextEvent = CDATA;
                    _state = STATE_DEFAULT;
                } else {
                    reportTreeUnexpChar(decodeCharForError(b), " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)");
                }
            } else {
                throwInternal();
            }
        }
        
        /* We know the type; event is usually partially processed
         * and needs to be completely read.
         */
        switch (_nextEvent) {
        case START_ELEMENT:
            return handleStartElement();
        case END_ELEMENT:
            return handleEndElement();
        case PROCESSING_INSTRUCTION:
            return handlePI();
        case COMMENT:
            return handleComment();
        case CDATA:
            return handleCData();
        case CHARACTERS:
            if (!_cfgLazyParsing) {
                // !!! TBI: how would non-lazy mode work?
                if (_cfgCoalescing) {
                    return finishCharactersCoalescing();
                }
            }
            if (_pendingInput != 0) { // multi-byte, or CR without LF
                return startCharactersPending();
            }
            // Otherwise, should not get here
            throwInternal();
//        case ENTITY_REFERENCE:
        }
        return throwInternal(); // never gets here
    }

    /*
    /**********************************************************************
    /* Second-level parsing; character content (in tree)
    /**********************************************************************
     */
    
    /**
     * Method called to start, or continue, processing of a CDATA section:
     * either continues with the content, or feeds the next available byte
     * to start marker matching.
     *
     * @return Result of the content/marker handler; or
     *   <code>EVENT_INCOMPLETE</code> if no input is available for the marker
     */
    private int handleCData() throws XMLStreamException
    {
        // Already within the content? Just continue with that
        if (_state == STATE_CDATA_CONTENT) {
            return parseCDataContents();
        }
        // Otherwise still matching the start marker, one byte needed
        if (_inputPtr < _inputEnd) {
            final byte next = _inputBuffer.get(_inputPtr++);
            return handleCDataStartMarker(next);
        }
        return EVENT_INCOMPLETE;
    }
    
    /**
     * Method that verifies, one byte at a time, the remaining "CDATA["
     * part of a CDATA section start marker. Progress is kept in
     * <code>_state</code>, so that matching can resume at any point when
     * more input becomes available. Once the whole marker has been seen,
     * text buffer is reset and content parsing started.
     *
     * @param b Next byte of the marker to verify, already consumed from input
     *
     * @return <code>EVENT_INCOMPLETE</code> if input ran out; otherwise result
     *   of parsing the contents of the section
     */
    private int handleCDataStartMarker(byte b) throws XMLStreamException
    {
        byte curr = b;

        // One marker character verified per iteration
        while (true) {
            final int nextState;

            switch (_state) {
            case STATE_DEFAULT:
                if (curr != BYTE_C) {
                    reportTreeUnexpChar(decodeCharForError(curr), " (expected 'C' for CDATA)");
                }
                nextState = STATE_CDATA_C;
                break;
            case STATE_CDATA_C:
                if (curr != BYTE_D) {
                    reportTreeUnexpChar(decodeCharForError(curr), " (expected 'D' for CDATA)");
                }
                nextState = STATE_CDATA_CD;
                break;
            case STATE_CDATA_CD:
                if (curr != BYTE_A) {
                    reportTreeUnexpChar(decodeCharForError(curr), " (expected 'A' for CDATA)");
                }
                nextState = STATE_CDATA_CDA;
                break;
            case STATE_CDATA_CDA:
                if (curr != BYTE_T) {
                    reportTreeUnexpChar(decodeCharForError(curr), " (expected 'T' for CDATA)");
                }
                nextState = STATE_CDATA_CDAT;
                break;
            case STATE_CDATA_CDAT:
                if (curr != BYTE_A) {
                    reportTreeUnexpChar(decodeCharForError(curr), " (expected 'A' for CDATA)");
                }
                nextState = STATE_CDATA_CDATA;
                break;
            case STATE_CDATA_CDATA:
                // Last one: after the bracket, actual content follows
                if (curr != BYTE_LBRACKET) {
                    reportTreeUnexpChar(decodeCharForError(curr), " (expected '[' for CDATA)");
                }
                _textBuilder.resetWithEmpty();
                _state = STATE_CDATA_CONTENT;
                return (_inputPtr < _inputEnd) ? parseCDataContents() : EVENT_INCOMPLETE;
            default:
                return throwInternal();
            }

            // Record progress before checking for more input, to allow resuming
            _state = nextState;
            if (_inputPtr >= _inputEnd) {
                return EVENT_INCOMPLETE;
            }
            curr = _inputBuffer.get(_inputPtr++);
        }
    }
    
    /*
    /**********************************************************************
    /* Second-level parsing; other (PI, Comment)
    /**********************************************************************
     */

    /**
     * Method called to start, or continue, processing of a processing
     * instruction, after the opening "&lt;?" has been seen. Progress is
     * tracked using <code>_state</code>: target name is parsed first, then
     * either the closing "?>" or white space followed by data is expected.
     *
     * @return <code>PROCESSING_INSTRUCTION</code> if the whole instruction was
     *   handled here; result of <code>parsePIData()</code> if data portion was
     *   reached; or <code>EVENT_INCOMPLETE</code> if input ran out
     */
    @Override
    protected int handlePI() throws XMLStreamException
    {
        // Most common case first:
        if (_state == STATE_PI_IN_DATA) {
            return parsePIData();
        }

        // Breaking out of this loop means the PI was completed (closing "?>" seen)
        main_loop:
        while (true) {
            // Every state handled below needs at least one byte
            if (_inputPtr >= _inputEnd) {
                return EVENT_INCOMPLETE;
            }
            switch (_state) {
            case STATE_DEFAULT:
                _tokenName = parseNewName(_inputBuffer.get(_inputPtr++));
                if (_tokenName == null) {
                    // name not complete yet; to be continued in STATE_PI_IN_TARGET
                    _state = STATE_PI_IN_TARGET;
                    return EVENT_INCOMPLETE;
                }
                _state = STATE_PI_AFTER_TARGET;
                checkPITargetName(_tokenName);
                if (_inputPtr >= _inputEnd) {
                    return EVENT_INCOMPLETE;
                }
                // fall through
            case STATE_PI_AFTER_TARGET:
                // Need ws or "?>"
                {
                    byte b = _inputBuffer.get(_inputPtr++);
                    if (b == BYTE_QMARK) {
                        // Quick check, can we see '>' as well? All done, if so
                        if (_inputPtr < _inputEnd && _inputBuffer.get(_inputPtr) == BYTE_GT) {
                            ++_inputPtr;
                            break main_loop; // means we are done
                        }
                        // If not (whatever reason), let's move to check state
                        _state = STATE_PI_AFTER_TARGET_QMARK;
                        break;
                    }
                    if (b == BYTE_SPACE || b == BYTE_CR
                               || b == BYTE_LF || b == BYTE_TAB) {
                        // NOTE(review): the white space byte consumed above is not
                        // passed to markLF() here, and asyncSkipSpace() starts from
                        // the byte after it -- confirm whether a linefeed right
                        // after the target is meant to go uncounted
                        if (!asyncSkipSpace()) { // ran out of input?
                            _state = STATE_PI_AFTER_TARGET_WS;
                            return EVENT_INCOMPLETE;
                        }
                        _textBuilder.resetWithEmpty();
                        // Quick check, perhaps we'll see end marker?
                        if ((_inputPtr+1) < _inputEnd
                            && _inputBuffer.get(_inputPtr) == BYTE_QMARK
                            && _inputBuffer.get(_inputPtr+1) == BYTE_GT) {
                            _inputPtr += 2;
                            break main_loop; // means we are done
                        }
                        // If not, we'll move to 'data' portion of PI
                        _state = STATE_PI_IN_DATA;
                        return parsePIData();
                    }
                    // Otherwise, it's an error
                    // (presumably always throws, so that fall-through is not really taken)
                    reportMissingPISpace(decodeCharForError(b));
                }
                // fall through
            case STATE_PI_AFTER_TARGET_WS:
                if (!asyncSkipSpace()) { // ran out of input?
                    return EVENT_INCOMPLETE;
                }
                // Can just move to "data" portion right away
                _state = STATE_PI_IN_DATA;
                _textBuilder.resetWithEmpty();
                return parsePIData();
            case STATE_PI_AFTER_TARGET_QMARK:
                {
                    // Must get '>' following '?' we saw right after name
                    byte b = _inputBuffer.get(_inputPtr++);
                    // Otherwise, it's an error
                    if (b != BYTE_GT) {
                        reportMissingPISpace(decodeCharForError(b));
                    }
                }
                // but if it's ok, we are done
                break main_loop;
            case STATE_PI_IN_TARGET:
                // Continuing with a target name that was split across input blocks
                _tokenName = parsePName();
                if (_tokenName == null) {
                    return EVENT_INCOMPLETE;
                }
                checkPITargetName(_tokenName);
                _state = STATE_PI_AFTER_TARGET;
                break;
                
            default:
                return throwInternal();
            }
        }
        
        // PI fully handled: reset state for the next event
        _state = STATE_DEFAULT;
        _nextEvent = EVENT_INCOMPLETE;
        return PROCESSING_INSTRUCTION;
    }

    /**
     * Method called to start, or continue, processing of a comment.
     * Depending on <code>_state</code>, this either continues with comment
     * content; verifies the second hyphen of the start marker (and then
     * proceeds to content); or verifies the '>' that must follow a double
     * hyphen within content.
     *
     * @return <code>COMMENT</code> if end of the comment was verified here;
     *   <code>EVENT_INCOMPLETE</code> if there was no byte to check; otherwise
     *   result of parsing the comment contents
     */
    @Override
    protected final int handleComment() throws XMLStreamException
    {
        // Within content already? No need to look at individual bytes here
        if (_state == STATE_COMMENT_CONTENT) {
            return parseCommentContents();
        }
        // Other states need exactly one byte to check
        if (_inputPtr >= _inputEnd) {
            return EVENT_INCOMPLETE;
        }
        final byte next = _inputBuffer.get(_inputPtr++);

        if (_state != STATE_DEFAULT) {
            if (_state != STATE_COMMENT_HYPHEN2) {
                return throwInternal();
            }
            // We are almost done, just need to get '>' at the end
            if (next != BYTE_GT) {
                reportDoubleHyphenInComments();
            }
            _state = STATE_DEFAULT;
            _nextEvent = EVENT_INCOMPLETE;
            return COMMENT;
        }

        // Start of comment: second hyphen of the start marker is required
        if (next != BYTE_HYPHEN) {
            reportTreeUnexpChar(decodeCharForError(next), " (expected '-' for COMMENT)");
        }
        _state = STATE_COMMENT_CONTENT;
        _textBuilder.resetWithEmpty();
        return parseCommentContents();
    }
    
    /*
    /**********************************************************************
    /* Second-level parsing; helper methods
    /**********************************************************************
     */

    /**
     * Method to skip whatever space can be skipped.
     *

* NOTE: if available content ends with a CR, method will set * _pendingInput to PENDING_STATE_CR. * * @return True, if was able to skip through the space and find * a non-space byte; false if reached end-of-buffer */ @Override protected boolean asyncSkipSpace() throws XMLStreamException { while (_inputPtr < _inputEnd) { byte b = _inputBuffer.get(_inputPtr); if ((b & 0xFF) > INT_SPACE) { // hmmmh. Shouldn't this be handled someplace else? if (_pendingInput == PENDING_STATE_CR) { markLF(); _pendingInput = 0; } return true; } ++_inputPtr; if (b == BYTE_LF) { markLF(); } else if (b == BYTE_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; break; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (b != BYTE_SPACE && b != BYTE_TAB) { throwInvalidSpace(b); } } return false; } /** * Method called when a new token (within tree) starts with an * entity. * * @return Type of event to return */ protected int handleEntityStartingToken() throws XMLStreamException { _textBuilder.resetWithEmpty(); byte b = _inputBuffer.get(_inputPtr++); // we know one is available if (b == BYTE_HASH) { // numeric character entity _textBuilder.resetWithEmpty(); _state = STATE_TREE_NUMERIC_ENTITY_START; _pendingInput = PENDING_STATE_ENT_SEEN_HASH; if (_inputPtr >= _inputEnd) { // but no more content to parse yet return EVENT_INCOMPLETE; } return handleNumericEntityStartingToken(); } PName n = parseNewEntityName(b); // null if incomplete; non-null otherwise if (n == null) { // Not sure if it's a char entity or general one; so we don't yet know type _state = STATE_TREE_NAMED_ENTITY_START; return EVENT_INCOMPLETE; } int ch = decodeGeneralEntity(n); if (ch == 0) { // not a character entity _tokenName = n; return (_nextEvent = _currToken = ENTITY_REFERENCE); } // character entity; initialize buffer, _textBuilder.resetWithChar((char)ch); _nextEvent = 0; _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } 
return _currToken; } /** * Method called when we see an entity that is starting a new token, * and part of its name has been decoded (but not all) */ protected int handleNamedEntityStartingToken() throws XMLStreamException { PName n = parseEntityName(); // null if incomplete; non-null otherwise if (n == null) { return _nextEvent; // i.e. EVENT_INCOMPLETE } int ch = decodeGeneralEntity(n); if (ch == 0) { // not a character entity _tokenName = n; return (_currToken = ENTITY_REFERENCE); } // character entity; initialize buffer, _textBuilder.resetWithChar((char)ch); _nextEvent = 0; _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; } /** * Method called to handle cases where we find something other than * a character entity (or one of 4 pre-defined general entities that * act like character entities) */ protected int handleNumericEntityStartingToken() throws XMLStreamException { if (_pendingInput == PENDING_STATE_ENT_SEEN_HASH) { byte b = _inputBuffer.get(_inputPtr); // we know one is available _entityValue = 0; if (b == BYTE_x) { // 'x' marks hex _pendingInput = PENDING_STATE_ENT_IN_HEX_DIGIT; if (++_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } } else { // if not 'x', must be a digit _pendingInput = PENDING_STATE_ENT_IN_DEC_DIGIT; // let's just keep byte for calculation } } if (_pendingInput == PENDING_STATE_ENT_IN_HEX_DIGIT) { if (!decodeHexEntity()) { return EVENT_INCOMPLETE; } } else { if (!decodeDecEntity()) { return EVENT_INCOMPLETE; } } // and now we have the full value verifyAndAppendEntityCharacter(_entityValue); _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } _pendingInput = 0; return _currToken; } /** * @return True if entity was decoded (and value assigned to _entityValue; * false otherwise */ protected final boolean decodeHexEntity() throws XMLStreamException { int value = _entityValue; while (_inputPtr < _inputEnd) { byte 
b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SEMICOLON) { _entityValue = value; return true; } int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } value = (value << 4) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? _entityValue = value; reportEntityOverflow(); } } _entityValue = value; return false; } /** * @return True if entity was decoded (and value assigned to _entityValue; * false otherwise */ protected final boolean decodeDecEntity() throws XMLStreamException { int value = _entityValue; while (_inputPtr < _inputEnd) { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SEMICOLON) { _entityValue = value; return true; } int ch = ((int) b) - INT_0; if (ch < 0 || ch > 9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } value = (value * 10) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? _entityValue = value; reportEntityOverflow(); } } _entityValue = value; return false; } /** * Method that verifies that given named entity is followed by * a semi-colon (meaning next byte must be available for reading); * and if so, whether it is one of pre-defined general entities. * * @return Character of the expanded pre-defined general entity * (if name matches one); zero if not. 
*/ protected final int decodeGeneralEntity(PName entityName) throws XMLStreamException { // First things first: verify that we got semicolon afterwards byte b = _inputBuffer.get(_inputPtr++); if (b != BYTE_SEMICOLON) { throwUnexpectedChar(decodeCharForError(b), " expected ';' following entity name (\""+entityName.getPrefixedName()+"\")"); } String name = entityName.getPrefixedName(); if (name == "amp") { return INT_AMP; } if (name == "lt") { return INT_LT; } if (name == "apos") { return INT_APOS; } if (name == "quot") { return INT_QUOTE; } if (name == "gt") { return INT_GT; } return 0; } /** * Method called when '<' and (what appears to be) a name * start character have been seen. */ @Override protected int handleStartElementStart(byte b) throws XMLStreamException { PName elemName = parseNewName(b); _nextEvent = START_ELEMENT; if (elemName == null) { _state = STATE_SE_ELEM_NAME; return EVENT_INCOMPLETE; } initStartElement(elemName); return handleStartElement(); } @Override protected int handleStartElement() throws XMLStreamException { main_loop: while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } byte b; int c; switch (_state) { case STATE_SE_ELEM_NAME: { PName elemName = parsePName(); if (elemName == null) { return EVENT_INCOMPLETE; } initStartElement(elemName); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } // Fall through to next state case STATE_SE_SPACE_OR_END: // obligatory space, or end if (_pendingInput != 0) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } // Ok, got a space, can move on } else { b = _inputBuffer.get(_inputPtr++); c = (int) b & 0xFF; if (c <= INT_SPACE) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } } else if (c == INT_GT) { // must be '/' or '>' return 
finishStartElement(false); } else if (c == INT_SLASH) { _state = STATE_SE_SEEN_SLASH; continue main_loop; } else { throwUnexpectedChar(decodeCharForError(b), " expected space, or '>' or \"/>\""); } } _state = STATE_SE_SPACE_OR_ATTRNAME; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } // can fall through, again: case STATE_SE_SPACE_OR_ATTRNAME: case STATE_SE_SPACE_OR_EQ: case STATE_SE_SPACE_OR_ATTRVALUE: // Common to these states is that there may be leading space(s), // so let's see if any has to be skipped if (_pendingInput != 0) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } } b = _inputBuffer.get(_inputPtr++); c = (int) b & 0xFF; while (c <= INT_SPACE) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer.get(_inputPtr++); c = (int) b & 0xFF; } switch (_state) { case STATE_SE_SPACE_OR_ATTRNAME: if (b == BYTE_SLASH) { _state = STATE_SE_SEEN_SLASH; continue main_loop; } if (b == BYTE_GT) { return finishStartElement(false); } { PName n = parseNewName(b); if (n == null) { _state = STATE_SE_ATTR_NAME; return EVENT_INCOMPLETE; } _state = STATE_SE_SPACE_OR_EQ; _elemAttrName = n; } continue main_loop; case STATE_SE_SPACE_OR_EQ: if (b != BYTE_EQ) { throwUnexpectedChar(decodeCharForError(b), " expected '='"); } _state = STATE_SE_SPACE_OR_ATTRVALUE; continue main_loop; case STATE_SE_SPACE_OR_ATTRVALUE: if (b != BYTE_QUOT && b != BYTE_APOS) { throwUnexpectedChar(decodeCharForError(b), " Expected a quote"); } initAttribute(b); continue main_loop; default: throwInternal(); } case STATE_SE_ATTR_NAME: { PName n = parsePName(); if (n == null) { return EVENT_INCOMPLETE; } _elemAttrName = n; _state = 
STATE_SE_SPACE_OR_EQ; } break; case STATE_SE_ATTR_VALUE_NORMAL: if (!handleAttrValue()) { return EVENT_INCOMPLETE; } _state = STATE_SE_SPACE_OR_END; break; case STATE_SE_ATTR_VALUE_NSDECL: if (!handleNsDecl()) { return EVENT_INCOMPLETE; } _state = STATE_SE_SPACE_OR_END; break; case STATE_SE_SEEN_SLASH: { b = _inputBuffer.get(_inputPtr++); if (b != BYTE_GT) { throwUnexpectedChar(decodeCharForError(b), " expected '>'"); } return finishStartElement(true); } default: throwInternal(); } } } private void initStartElement(PName elemName) { String prefix = elemName.getPrefix(); if (prefix == null) { // element in default ns _elemAllNsBound = true; // which need not be bound } else { elemName = bindName(elemName, prefix); _elemAllNsBound = elemName.isBound(); } _tokenName = elemName; _currElem = new ElementScope(elemName, _currElem); _attrCount = 0; _currNsCount = 0; _elemAttrPtr = 0; _state = STATE_SE_SPACE_OR_END; } private void initAttribute(byte quoteChar) { _elemAttrQuote = quoteChar; PName attrName = _elemAttrName; String prefix = attrName.getPrefix(); boolean nsDecl; if (prefix == null) { // can be default ns decl: nsDecl = (attrName.getLocalName() == "xmlns"); } else { // May be a namespace decl though? if (prefix == "xmlns") { nsDecl = true; } else { attrName = bindName(attrName, prefix); if (_elemAllNsBound) { _elemAllNsBound = attrName.isBound(); } nsDecl = false; } } if (nsDecl) { _state = STATE_SE_ATTR_VALUE_NSDECL; // Ns decls use name buffer transiently _elemNsPtr = 0; ++_currNsCount; } else { _state = STATE_SE_ATTR_VALUE_NORMAL; // Regular attributes are appended, shouldn't reset ptr _attrCollector.startNewValue(attrName, _elemAttrPtr); } } /** * Method called to wrap up settings when the whole start * (or empty) element has been parsed. 
*/ private int finishStartElement(boolean emptyTag) throws XMLStreamException { _isEmptyTag = emptyTag; // Note: this call also checks attribute uniqueness int act = _attrCollector.finishLastValue(_elemAttrPtr); if (act < 0) { // error, dup attr indicated by -1 act = _attrCollector.getCount(); // let's get correct count reportInputProblem(_attrCollector.getErrorMsg()); } _attrCount = act; ++_depth; /* Was there any prefix that wasn't bound prior to use? * That's legal, assuming declaration was found later on... * let's check */ if (!_elemAllNsBound) { if (!_tokenName.isBound()) { // element itself unbound reportUnboundPrefix(_tokenName, false); } for (int i = 0, len = _attrCount; i < len; ++i) { PName attrName = _attrCollector.getName(i); if (!attrName.isBound()) { reportUnboundPrefix(attrName, true); } } } return (_currToken = START_ELEMENT); } private int handleEndElementStart() throws XMLStreamException { --_depth; _tokenName = _currElem.getName(); /* Ok, perhaps we can do this quickly? This works, if we * are expected to have the full name (plus one more byte * to indicate name end) in the current buffer: */ int size = _tokenName.sizeInQuads(); if ((_inputEnd - _inputPtr) < ((size << 2) + 1)) { // may need to load more _nextEvent = END_ELEMENT; _state = STATE_DEFAULT; _quadCount = _currQuad = _currQuadBytes = 0; /* No, need to take it slow. Can not yet give up, though, * without reading remainder of the buffer */ return handleEndElement(); } ByteBuffer buf = _inputBuffer; // First all full chunks of 4 bytes (if any) --size; for (int qix = 0; qix < size; ++qix) { int ptr = _inputPtr; int q = (buf.get(ptr) << 24) | ((buf.get(ptr+1) & 0xFF) << 16) | ((buf.get(ptr+2) & 0xFF) << 8) | ((buf.get(ptr+3) & 0xFF)) ; _inputPtr += 4; // match? if (q != _tokenName.getQuad(qix)) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } /* After which we can deal with the last entry: it's bit * tricky as we don't actually fully know byte length... 
*/ int lastQ = _tokenName.getQuad(size); int q = buf.get(_inputPtr++) & 0xFF; if (q != lastQ) { // need second byte? q = (q << 8) | (buf.get(_inputPtr++) & 0xFF); if (q != lastQ) { // need third byte? q = (q << 8) | (buf.get(_inputPtr++) & 0xFF); if (q != lastQ) { // need full 4 bytes? q = (q << 8) | (buf.get(_inputPtr++) & 0xFF); if (q != lastQ) { // still no match? failure! reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } } } // Trailing space? int i2 = _inputBuffer.get(_inputPtr++) & 0xFF; while (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; _nextEvent = END_ELEMENT; _state = STATE_EE_NEED_GT; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } if (_inputPtr >= _inputEnd) { _nextEvent = END_ELEMENT; _state = STATE_EE_NEED_GT; return EVENT_INCOMPLETE; } i2 = _inputBuffer.get(_inputPtr++) & 0xFF; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } return (_currToken = END_ELEMENT); } /** * This method is "slow" version of above, used when name of * the end element can split input buffer boundary */ private int handleEndElement() throws XMLStreamException { if (_state == STATE_DEFAULT) { // parsing name final PName elemName = _tokenName; final int quadSize = elemName.sizeInQuads() - 1; // need to ignore last for now for (; _quadCount < quadSize; ++_quadCount) { // first, full quads for (; _currQuadBytes < 4; ++_currQuadBytes) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } _currQuad = (_currQuad << 8) | (_inputBuffer.get(_inputPtr++) & 0xFF); } // match? if (_currQuad != elemName.getQuad(_quadCount)) { reportUnexpectedEndTag(elemName.getPrefixedName()); } _currQuad = _currQuadBytes = 0; } // So far so good! 
Now need to check the last quad: int lastQ = elemName.getLastQuad(); while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } int q = (_currQuad << 8); q |= (_inputBuffer.get(_inputPtr++) & 0xFF); _currQuad = q; if (q == lastQ) { // match break; } if (++_currQuadBytes > 3) { // no match, error reportUnexpectedEndTag(elemName.getPrefixedName()); break; // never gets here } } // Bueno. How about optional space, '>'? _state = STATE_EE_NEED_GT; } else if (_state != STATE_EE_NEED_GT) { throwInternal(); } if (_pendingInput != 0) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } // it's ignorable ws } // Trailing space? while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } int i2 = _inputBuffer.get(_inputPtr++) & 0xFF; if (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } continue; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } // Hah, done! return (_currToken = END_ELEMENT); } } /* /********************************************************************** /* Implementation of parsing API, character events /********************************************************************** */ @Override protected final int startCharacters(byte b) throws XMLStreamException { dummy_loop: do { // dummy loop, to allow break int c = (int) b & 0xFF; switch (_charTypes.TEXT_CHARS[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: /* Note: can not have pending input when this method * is called. 
No need to check that (could assert) */ if (_inputPtr >= _inputEnd) { // no more input available _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_4(c); // Need a surrogate pair, have to call from here: _textBuilder.resetWithSurrogate(c); break dummy_loop; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); break; case XmlCharTypes.CT_LT: // should never get here case XmlCharTypes.CT_AMP: // - "" - throwInternal(); break; case XmlCharTypes.CT_RBRACKET: // ']]>'? // !!! TBI: check for "]]>" default: break; } _textBuilder.resetWithChar((char) c); } while (false); // dummy loop, for break if (_cfgCoalescing && !_cfgLazyParsing) { // In eager coalescing mode, must read it all return finishCharactersCoalescing(); } _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; } /** * This method only gets called in non-coalescing mode; and if so, * needs to parse as many characters of the current text segment * from the current input block as possible. 
*/ @Override protected final void finishCharacters() throws XMLStreamException { /* Now: there should not usually be any pending input (as it's * handled when CHARACTERS segment started, and this method * only gets called exactly once)... but we may want to * revisit this subject when (if) coalescing mode is to be * tackled. */ if (_pendingInput != 0) { // !!! TBI: needs to be changed for coalescing mode throwInternal(); } final int[] TYPES = _charTypes.TEXT_CHARS; final ByteBuffer inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.getBufferWithoutReset(); // Should have just one code point (one or two chars). Assert? int outPtr = _textBuilder.getCurrentLength(); main_loop: while (true) { int c; // Then the tight ASCII non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { break main_loop; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer.get(ptr++) & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; } // And then fallback for funny chars / UTF-8 multibytes: switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; break main_loop; } if (inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; break main_loop; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; break main_loop; } c = decodeUtf8_3(c); 
break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; break main_loop; } c = decodeUtf8_4(c); // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: c = handleEntityInCharacters(); if (c == 0) { // not a succesfully expanded char entity // _inputPtr set by entity expansion method --_inputPtr; break main_loop; } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? /* 09-Mar-2007, tatus: This will not give 100% coverage, * for it may be split across input buffer boundary. * For now this will have to suffice though. */ { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; byte b = BYTE_NULL; while (_inputPtr < _inputEnd) { b = inputBuffer.get(_inputPtr); if (b != BYTE_RBRACKET) { break; } ++_inputPtr; // to skip past bracket ++count; } if (b == BYTE_GT && count > 1) { reportIllegalCDataEnd(); } // Nope. 
Need to output all brackets, then; except // for one that can be left for normal output while (--count > 0) { outputBuffer[outPtr++] = ']'; // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } } // Can just output the first ']' along normal output break; // default: // Other types are not important here... } // We know there's room for one more: outputBuffer[outPtr++] = (char) c; } _textBuilder.setCurrentLength(outPtr); } /** * Method called to handle entity encountered inside * CHARACTERS segment, when trying to complete a non-coalescing text segment. *

* NOTE: unlike with generic parsing of named entities, where trailing semicolon * needs to be left in place, here we should just process it right away. * * @return Expanded (character) entity, if positive number; 0 if incomplete. */ protected int handleEntityInCharacters() throws XMLStreamException { /* Thing that simplifies processing here is that handling * is pretty much optional: if there isn't enough data, we * just return 0 and are done with it. * * Also: we need at least 3 more characters for any character entity */ int ptr = _inputPtr; if ((ptr + 3) <= _inputEnd) { byte b = _inputBuffer.get(ptr++); if (b == BYTE_HASH) { // numeric character entity if (_inputBuffer.get(ptr) == BYTE_x) { return handleHexEntityInCharacters(ptr+1); } return handleDecEntityInCharacters(ptr); } // general entity; maybe one of pre-defined ones if (b == BYTE_a) { // amp or apos? b = _inputBuffer.get(ptr++); if (b == BYTE_m) { if ((ptr + 1) < _inputPtr && _inputBuffer.get(ptr) == BYTE_p && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_AMP; } } else if (b == BYTE_p) { if ((ptr + 2) < _inputPtr && _inputBuffer.get(ptr) == BYTE_o && _inputBuffer.get(ptr+1) == BYTE_s && _inputBuffer.get(ptr+2) == BYTE_SEMICOLON) { _inputPtr = ptr + 3; return INT_APOS; } } } else if (b == BYTE_g) { // gt? if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_GT; } } else if (b == BYTE_l) { // lt? if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_LT; } } else if (b == BYTE_q) { // quot? 
if ((ptr + 3) < _inputPtr && _inputBuffer.get(ptr)== BYTE_u && _inputBuffer.get(ptr+1) == BYTE_o && _inputBuffer.get(ptr+2) == BYTE_t && _inputBuffer.get(ptr+3) == BYTE_SEMICOLON) { _inputPtr = ptr + 4; return INT_APOS; } } } // couldn't handle: return 0; } protected int handleDecEntityInCharacters(int ptr) throws XMLStreamException { byte b = _inputBuffer.get(ptr++); final int end = _inputEnd; int value = 0; do { int ch = (int) b; if (ch > INT_9 || ch < INT_0) { throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } value = (value * 10) + (ch - INT_0); if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (ptr >= end) { return 0; } b = _inputBuffer.get(ptr++); } while (b != BYTE_SEMICOLON); _inputPtr = ptr; verifyXmlChar(value); return value; } protected int handleHexEntityInCharacters(int ptr) throws XMLStreamException { byte b = _inputBuffer.get(ptr++); final int end = _inputEnd; int value = 0; do { int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } value = (value << 4) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (ptr >= end) { return 0; } b = _inputBuffer.get(ptr++); } while (b != BYTE_SEMICOLON); _inputPtr = ptr; verifyXmlChar(value); return value; } /** * Method called to handle split multi-byte character, by decoding * it and appending to the text buffer, if possible. * * @return True, if split character was completely handled; false * if not */ private final boolean handleAndAppendPending() throws XMLStreamException { // First, need to have at least one more byte: if (_inputPtr >= _inputEnd) { return false; } int c = _pendingInput; _pendingInput = 0; // Possible \r\n linefeed? 
if (c < 0) { // markers are all negative if (c == PENDING_STATE_CR) { if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); _textBuilder.append(CHAR_LF); return true; } throwInternal(); } // Nah, a multi-byte UTF-8 char: // Let's just re-test the first pending byte (in LSB): switch (_charTypes.TEXT_CHARS[c & 0xFF]) { case XmlCharTypes.CT_MULTIBYTE_2: // Easy: must have just one byte, did get another one: _textBuilder.append((char) decodeUtf8_2(c)); break; case XmlCharTypes.CT_MULTIBYTE_3: { // Ok... so do we have one or two pending bytes? int next = _inputBuffer.get(_inputPtr++) & 0xFF; int c2 = (c >> 8); if (c2 == 0) { // just one; need two more if (_inputPtr >= _inputEnd) { // but got only one _pendingInput = c | (next << 8); return false; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_3(c, next, c3); } else { // had two, got one, bueno: c = decodeUtf8_3((c & 0xFF), c2, next); } _textBuilder.append((char) c); } break; case XmlCharTypes.CT_MULTIBYTE_4: { int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF; // Only had one? 
if ((c >> 8) == 0) { // ok, so need 3 more if (_inputPtr >= _inputEnd) { // just have 1 _pendingInput = c | (next << 8); return false; } int c2 = _inputBuffer.get(_inputPtr++) & 0xFF; if (_inputPtr >= _inputEnd) { // almost, got 2 _pendingInput = c | (next << 8) | (c2 << 16); return false; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_4(c, next, c2, c3); } else { // had two or three int c2 = (c >> 8) & 0xFF; int c3 = (c >> 16); if (c3 == 0) { // just two if (_inputPtr >= _inputEnd) { // one short _pendingInput = c | (next << 16); return false; } c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_4((c & 0xFF), c2, next, c3); } else { // had three, got last c = decodeUtf8_4((c & 0xFF), c2, c3, next); } } } // Need a surrogate pair, have to call from here: _textBuilder.appendSurrogate(c); break; default: // should never occur: throwInternal(); } return true; } /* /********************************************************************** /* Implementation of parsing API, skipping remainder CHARACTERS section /********************************************************************** */ /** * Method that will be called to skip all possible characters * from the input buffer, but without blocking. Partial * characters are not to be handled (not pending input * is to be added). 
* * @return True, if skipping ending with an unexpanded * entity; false if not */ @Override protected boolean skipCharacters() throws XMLStreamException { if (_pendingInput != 0) { if (!skipPending()) { return false; } } final int[] TYPES = _charTypes.TEXT_CHARS; final ByteBuffer inputBuffer = _inputBuffer; main_loop: while (true) { int c; ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { break main_loop; } while (ptr < max) { c = (int) inputBuffer.get(ptr++) & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } // And then fallback for funny chars / UTF-8 multibytes: switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; break main_loop; } if (inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; break main_loop; } skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; break main_loop; } decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? 
d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; break main_loop; } decodeUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: --_inputPtr; return true; case XmlCharTypes.CT_AMP: c = skipEntityInCharacters(); if (c == 0) { // not a successfully expanded char entity _pendingInput = PENDING_STATE_TEXT_AMP; // but we may have input to skip nonetheless.. if (_inputPtr < _inputEnd) { if (skipPending()) { return true; } } return false; } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? /* !!! 09-Mar-2007, tatu: This will not give 100% coverage, * for it may be split across input buffer boundary. * For now this will have to suffice though. */ { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; byte b = BYTE_NULL; while (_inputPtr < _inputEnd) { b = inputBuffer.get(_inputPtr); if (b != BYTE_RBRACKET) { break; } ++_inputPtr; // to skip past bracket ++count; } if (b == BYTE_GT && count > 1) { reportIllegalCDataEnd(); } } break; // default: // Other types are not important here... } } // Ran out of input, no entity encountered return false; } private final boolean skipPending() throws XMLStreamException { // First, need to have at least one more byte: if (_inputPtr >= _inputEnd) { return false; } // Possible \r\n linefeed? 
if (_pendingInput < 0) { // markers are all negative while (true) { switch (_pendingInput) { case PENDING_STATE_CR: _pendingInput = 0; if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); return true; case PENDING_STATE_TEXT_AMP: { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_HASH) { _pendingInput = PENDING_STATE_TEXT_AMP_HASH; break; } PName n = parseNewEntityName(b); if (n == null) { _pendingInput = PENDING_STATE_TEXT_IN_ENTITY; return false; } int ch = decodeGeneralEntity(n); if (ch == 0) { _tokenName = n; _nextEvent = ENTITY_REFERENCE; } } _pendingInput = 0; return true; // no matter what, we are done case PENDING_STATE_TEXT_AMP_HASH: _entityValue = 0; if (_inputBuffer.get(_inputPtr) == BYTE_x) { ++_inputPtr; if (decodeHexEntity()) { _pendingInput = 0; return true; } _pendingInput = PENDING_STATE_TEXT_HEX_ENTITY; return false; } if (decodeDecEntity()) { _pendingInput = 0; return true; } _pendingInput = PENDING_STATE_TEXT_DEC_ENTITY; return false; case PENDING_STATE_TEXT_DEC_ENTITY: if (decodeDecEntity()) { _pendingInput = 0; return true; } return false; case PENDING_STATE_TEXT_HEX_ENTITY: if (decodeHexEntity()) { _pendingInput = 0; return true; } return false; case PENDING_STATE_TEXT_IN_ENTITY: { PName n = parseEntityName(); if (n == null) { return false; } int ch = decodeGeneralEntity(n); if (ch == 0) { _tokenName = n; _nextEvent = ENTITY_REFERENCE; } } _pendingInput = 0; return true; case PENDING_STATE_TEXT_BRACKET1: if (_inputBuffer.get(_inputPtr) != BYTE_RBRACKET) { _pendingInput = 0; return true; } ++_inputPtr; _pendingInput = PENDING_STATE_TEXT_BRACKET2; break; case PENDING_STATE_TEXT_BRACKET2: // may get sequence... { byte b = _inputBuffer.get(_inputPtr); if (b == BYTE_RBRACKET) { ++_inputPtr; break; } if (b == BYTE_GT) { // problem! 
++_inputPtr; reportInputProblem("Encountered ']]>' in text segment"); } } // nope, something else, reprocess _pendingInput = 0; return true; default: throwInternal(); } if (_inputPtr >= _inputEnd) { return false; } } } // Nah, a multi-byte UTF-8 char: // Let's just re-test the first pending byte (in LSB): int c = _pendingInput; switch (_charTypes.TEXT_CHARS[c & 0xFF]) { case XmlCharTypes.CT_MULTIBYTE_2: // Easy: must have just one byte, did get another one: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: { // Ok... so do we have one or two pending bytes? int next = _inputBuffer.get(_inputPtr++) & 0xFF; int c2 = (c >> 8); if (c2 == 0) { // just one; need two more if (_inputPtr >= _inputEnd) { // but got only one _pendingInput = c | (next << 8); return false; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; decodeUtf8_3(c, next, c3); } else { // had two, got one, bueno: decodeUtf8_3((c & 0xFF), c2, next); } } break; case XmlCharTypes.CT_MULTIBYTE_4: { int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF; // Only had one? if ((c >> 8) == 0) { // ok, so need 3 more if (_inputPtr >= _inputEnd) { // just have 1 _pendingInput = c | (next << 8); return false; } int c2 = _inputBuffer.get(_inputPtr++) & 0xFF; if (_inputPtr >= _inputEnd) { // almost, got 2 _pendingInput = c | (next << 8) | (c2 << 16); return false; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; decodeUtf8_4(c, next, c2, c3); } else { // had two or three int c2 = (c >> 8) & 0xFF; int c3 = (c >> 16); if (c3 == 0) { // just two if (_inputPtr >= _inputEnd) { // one short _pendingInput = c | (next << 16); return false; } c3 = _inputBuffer.get(_inputPtr++) & 0xFF; decodeUtf8_4((c & 0xFF), c2, next, c3); } else { // had three, got last decodeUtf8_4((c & 0xFF), c2, c3, next); } } } break; default: // should never occur: throwInternal(); } _pendingInput = 0; return true; } /** * Method called to handle entity encountered inside * CHARACTERS segment, when trying to complete a non-coalescing text segment. 
* * @return Expanded (character) entity, if positive number; 0 if incomplete. */ private int skipEntityInCharacters() throws XMLStreamException { /* Thing that simplifies processing here is that handling * is pretty much optional: if there isn't enough data, we * just return 0 and are done with it. * * Also: we need at least 3 more characters for any character entity */ int ptr = _inputPtr; if ((ptr + 3) <= _inputEnd) { byte b = _inputBuffer.get(ptr++); if (b == BYTE_HASH) { // numeric character entity if (_inputBuffer.get(ptr) == BYTE_x) { return handleHexEntityInCharacters(ptr+1); } return handleDecEntityInCharacters(ptr); } // general entity; maybe one of pre-defined ones if (b == BYTE_a) { // amp or apos? b = _inputBuffer.get(ptr++); if (b == BYTE_m) { if ((ptr + 1) < _inputPtr && _inputBuffer.get(ptr) == BYTE_p && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; // NOTE: do skip semicolon as well return INT_AMP; } } else if (b == BYTE_p) { if ((ptr + 2) < _inputPtr && _inputBuffer.get(ptr) == BYTE_o && _inputBuffer.get(ptr+1) == BYTE_s && _inputBuffer.get(ptr+2) == BYTE_SEMICOLON) { _inputPtr = ptr + 3; return INT_APOS; } } } else if (b == BYTE_g) { // gt? if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_GT; } } else if (b == BYTE_l) { // lt? if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_LT; } } else if (b == BYTE_q) { // quot? if ((ptr + 3) < _inputPtr && _inputBuffer.get(ptr) == BYTE_u && _inputBuffer.get(ptr+1) == BYTE_o && _inputBuffer.get(ptr+2) == BYTE_t && _inputBuffer.get(ptr+3) == BYTE_SEMICOLON) { _inputPtr = ptr + 4; return INT_APOS; } } } // couldn't handle: return 0; } /** * Coalescing mode is (and will) not be implemented for non-blocking * parsers, so this method should never get called. 
*/ @Override protected boolean skipCoalescedText() throws XMLStreamException { throwInternal(); return false; } /* /********************************************************************** /* Implementation of parsing API, element/attr events /********************************************************************** */ /** * @return True, if the whole value was read; false if * only part (due to buffer ending) */ @Override protected boolean handleAttrValue() throws XMLStreamException { // First; any pending input? if (_pendingInput != 0) { if (!handleAttrValuePending()) { return false; } _pendingInput = 0; } char[] attrBuffer = _attrCollector.continueValue(); final int[] TYPES = _charTypes.ATTR_CHARS; final int quoteChar = (int) _elemAttrQuote; value_loop: while (true) { int c; ascii_loop: while (true) { if (_inputPtr >= _inputEnd) { return false; } if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } int max = _inputEnd; { int max2 = _inputPtr + (attrBuffer.length - _elemAttrPtr); if (max2 < max) { max = max2; } } while (_inputPtr < max) { c = (int) _inputBuffer.get(_inputPtr++) & 0xFF; if (TYPES[c] != 0) { break ascii_loop; } attrBuffer[_elemAttrPtr++] = (char) c; } } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return false; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = INT_SPACE; break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; return false; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; return false; } c = 
decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; return false; } c = decodeUtf8_4(c); // Let's add first part right away: attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: c = handleEntityInAttributeValue(); if (c <= 0) { // general entity; should never happen if (c < 0) { // end-of-input return false; } reportUnexpandedEntityInAttr(_elemAttrName, false); } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... } // We know there's room for at least one char without checking attrBuffer[_elemAttrPtr++] = (char) c; } return true; // yeah, we're done! 
} /** * @return True if the partial information was succesfully handled; * false if not */ private final boolean handleAttrValuePending() throws XMLStreamException { if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return false; } char[] attrBuffer = _attrCollector.continueValue(); if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } // All LFs get converted to spaces, in attribute values attrBuffer[_elemAttrPtr++] = ' '; return true; } // otherwise must be related to entity handling within attribute value if (_inputPtr >= _inputEnd) { return false; } int ch; if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP) { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return false; } if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else { PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH) { if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH_X) { ch = handleHexEntityInAttribute(true); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_ENTITY_NAME) { PName entityName = parseEntityName(); if (entityName == null) { return 
false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_DEC_DIGIT) { ch = handleDecEntityInAttribute(false); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_HEX_DIGIT) { ch = handleHexEntityInAttribute(false); } else { // nope, split UTF-8 char // Nah, a multi-byte UTF-8 char. Alas, can't use shared method, as results // don't go in shared text buffer... ch = handleAttrValuePendingUTF8(); } if (ch == 0) { // wasn't resolved return false; } char[] attrBuffer = _attrCollector.continueValue(); // Ok; does it need a surrogate though? (over 16 bits) if ((ch >> 16) != 0) { ch -= 0x10000; if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (ch >> 10)); ch = 0xDC00 | (ch & 0x3FF); } if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } attrBuffer[_elemAttrPtr++] = (char) ch; return true; // done it! } private final int handleAttrValuePendingUTF8() throws XMLStreamException { // note: we know there must be at least one byte available at this point int c = _pendingInput; _pendingInput = 0; // Let's just re-test the first pending byte (in LSB): switch (_charTypes.TEXT_CHARS[c & 0xFF]) { case XmlCharTypes.CT_MULTIBYTE_2: // Easy: must have just one byte, did get another one: return decodeUtf8_2(c); case XmlCharTypes.CT_MULTIBYTE_3: { // Ok... so do we have one or two pending bytes? 
int next = _inputBuffer.get(_inputPtr++) & 0xFF; int c2 = (c >> 8); if (c2 == 0) { // just one; need two more if (_inputPtr >= _inputEnd) { // but got only one _pendingInput = c | (next << 8); return 0; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_3(c, next, c3); } else { // had two, got one, bueno: c = decodeUtf8_3((c & 0xFF), c2, next); } return c; } case XmlCharTypes.CT_MULTIBYTE_4: { int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF; // Only had one? if ((c >> 8) == 0) { // ok, so need 3 more if (_inputPtr >= _inputEnd) { // just have 1 _pendingInput = c | (next << 8); return 0; } int c2 = _inputBuffer.get(_inputPtr++) & 0xFF; if (_inputPtr >= _inputEnd) { // almost, got 2 _pendingInput = c | (next << 8) | (c2 << 16); return 0; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_4(c, next, c2, c3); } else { // had two or three int c2 = (c >> 8) & 0xFF; int c3 = (c >> 16); if (c3 == 0) { // just two if (_inputPtr >= _inputEnd) { // one short _pendingInput = c | (next << 16); return 0; } c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_4((c & 0xFF), c2, next, c3); } else { // had three, got last c = decodeUtf8_4((c & 0xFF), c2, c3, next); } } return c; } default: // should never occur: throwInternal(); return 0; // never gets here } } private final int handleDecEntityInAttribute(boolean starting) throws XMLStreamException { byte b = _inputBuffer.get(_inputPtr++); // we know one is available if (starting) { int ch = (int) b; if (ch < INT_0 || ch > INT_9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } _pendingInput = PENDING_STATE_ATTR_VALUE_DEC_DIGIT; _entityValue = ch - INT_0; if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } while (b != BYTE_SEMICOLON) { int ch = ((int) b) - INT_0; if (ch < 0 || ch > 9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } int 
value = (_entityValue * 10) + ch; _entityValue = value; if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } verifyXmlChar(_entityValue); _pendingInput = 0; return _entityValue; } private final int handleHexEntityInAttribute(boolean starting) throws XMLStreamException { byte b = _inputBuffer.get(_inputPtr++); // we know one is available if (starting) { int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } _pendingInput = PENDING_STATE_ATTR_VALUE_HEX_DIGIT; _entityValue = ch; if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } while (b != BYTE_SEMICOLON) { int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } int value = (_entityValue << 4) + ch; _entityValue = value; if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } verifyXmlChar(_entityValue); _pendingInput = 0; return _entityValue; } /** * Method called to handle entity encountered inside attribute value. 
* * @return Value of expanded character entity, if processed (which must be * 1 or above); 0 for general entity, or -1 for "not enough input" */ protected int handleEntityInAttributeValue() throws XMLStreamException { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP; return -1; } byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return -1; } int ch; if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return -1; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } if (ch == 0) { return -1; } return ch; } PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return -1; } int ch = decodeGeneralEntity(entityName); if (ch != 0) { return ch; } _tokenName = entityName; return 0; } @Override protected boolean handleNsDecl() throws XMLStreamException { final int[] TYPES = _charTypes.ATTR_CHARS; char[] attrBuffer = _nameBuffer; final int quoteChar = (int) _elemAttrQuote; // First; any pending input? 
if (_pendingInput != 0) { if (!handleNsValuePending()) { return false; } _pendingInput = 0; } value_loop: while (true) { int c; ascii_loop: while (true) { if (_inputPtr >= _inputEnd) { return false; } if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } int max = _inputEnd; { int max2 = _inputPtr + (attrBuffer.length - _elemNsPtr); if (max2 < max) { max = max2; } } while (_inputPtr < max) { c = (int) _inputBuffer.get(_inputPtr++) & 0xFF; if (TYPES[c] != 0) { break ascii_loop; } attrBuffer[_elemNsPtr++] = (char) c; } } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return false; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = INT_SPACE; break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; return false; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; return false; } c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? 
d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; return false; } c = decodeUtf8_4(c); // Let's add first part right away: attrBuffer[_elemNsPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: c = handleEntityInAttributeValue(); if (c <= 0) { // general entity; should never happen if (c < 0) { // end-of-input return false; } reportUnexpandedEntityInAttr(_elemAttrName, true); } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; attrBuffer[_elemNsPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... } // We know there's room for at least one char without checking attrBuffer[_elemNsPtr++] = (char) c; } /* Simple optimization: for default ns removal (or, with * ns 1.1, any other as well), will use empty value... 
no * need to try to intern: */ int attrPtr = _elemNsPtr; if (attrPtr == 0) { bindNs(_elemAttrName, ""); } else { String uri = _config.canonicalizeURI(attrBuffer, attrPtr); bindNs(_elemAttrName, uri); } return true; } /** * @return True if the partial information was succesfully handled; * false if not */ private final boolean handleNsValuePending() throws XMLStreamException { if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return false; } char[] attrBuffer = _nameBuffer; if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } // All lfs get converted to spaces, in attribute values attrBuffer[_elemNsPtr++] = ' '; return true; } // otherwise must be related to entity handling within attribute value if (_inputPtr >= _inputEnd) { return false; } int ch; if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP) { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return false; } if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else { PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH) { if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else if 
(_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH_X) { ch = handleHexEntityInAttribute(true); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_ENTITY_NAME) { PName entityName = parseEntityName(); if (entityName == null) { return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_DEC_DIGIT) { ch = handleDecEntityInAttribute(false); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_HEX_DIGIT) { ch = handleHexEntityInAttribute(false); } else { // 05-Aug-2012, tatu: Apparently we can end up here too... ch = handleAttrValuePendingUTF8(); } if (ch == 0) { // wasn't resolved return false; } char[] attrBuffer = _nameBuffer; // Ok; does it need a surrogate though? (over 16 bits) if ((ch >> 16) != 0) { ch -= 0x10000; if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[_elemNsPtr++] = (char) (0xD800 | (ch >> 10)); ch = 0xDC00 | (ch & 0x3FF); } if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[_elemNsPtr++] = (char) ch; return true; // done it! } /* /********************************************************************** /* Common name/entity parsing /********************************************************************** */ @Override protected final PName parseNewName(byte b) throws XMLStreamException { int q = b & 0xFF; // Let's do just quick sanity check first; a thorough check will be // done later on if necessary, now we'll just do the very cheap // check to catch extra spaces etc. 
if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode throwUnexpectedChar(q, "; expected a name start character"); } _quadCount = 0; _currQuad = q; _currQuadBytes = 1; return parsePName(); } /** * This method can (for now?) be shared between all Ascii-based * encodings, since it only does coarse validity checking -- real * checks are done in different method. *

     * <p>
     * Some notes about the assumptions this implementation makes:
     * <ul>
     *  <li>Well-formed XML content can not end with a name: as such,
     *    end-of-input is an error and we can throw an exception
     *  </li>
     * </ul>
*/ @Override protected final PName parsePName() throws XMLStreamException { int q = _currQuad; while (true) { int i; switch (_currQuadBytes) { case 0: if (_inputPtr >= _inputEnd) { return null; // all pointers have been set } q = _inputBuffer.get(_inputPtr++) & 0xFF; /* Since name char validity is checked later on, we only * need to be able to reliably see the end of the name... * and those are simple enough so that we can just * compare; lookup table won't speed things up (according * to profiler) */ if (q < 65) { // 'A' // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (q < 45 || q > 58 || q == 47) { // End of name return findPName(q, 0); } } // fall through case 1: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 1; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 1); } } q = (q << 8) | i; // fall through case 2: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 2; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 2); } } q = (q << 8) | i; // fall through case 3: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 3; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 3); } } q = (q << 8) | i; } // If we get this far, need to add full quad into result array and update state if (_quadCount == 0) { // first quad _quadBuffer[0] = q; _quadCount = 1; } else { if (_quadCount >= _quadBuffer.length) { // let's just double? 
_quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[_quadCount++] = q; } _currQuadBytes = 0; } } protected final PName parseNewEntityName(byte b) throws XMLStreamException { int q = b & 0xFF; if (q < INT_A) { throwUnexpectedChar(q, "; expected a name start character"); } _quadCount = 0; _currQuad = q; _currQuadBytes = 1; return parseEntityName(); } protected final PName parseEntityName() throws XMLStreamException { int q = _currQuad; while (true) { int i; switch (_currQuadBytes) { case 0: if (_inputPtr >= _inputEnd) { return null; // all pointers have been set } q = _inputBuffer.get(_inputPtr++) & 0xFF; /* Since name char validity is checked later on, we only * need to be able to reliably see the end of the name... * and those are simple enough so that we can just * compare; lookup table won't speed things up (according * to profiler) */ if (q < 65) { // 'A' // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (q < 45 || q > 58 || q == 47) { // apos, quot? if (_quadCount == 1) { q = _quadBuffer[0]; if (q == EntityNames.ENTITY_APOS_QUAD) { --_inputPtr; return EntityNames.ENTITY_APOS; } if (q == EntityNames.ENTITY_QUOT_QUAD) { --_inputPtr; return EntityNames.ENTITY_QUOT; } } // Nope, generic: return findPName(q, 0); } } // fall through case 1: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 1; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 1); } } q = (q << 8) | i; // fall through case 2: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 2; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { // lt or gt? 
if (_quadCount == 0) { if (q == EntityNames.ENTITY_GT_QUAD) { --_inputPtr; return EntityNames.ENTITY_GT; } if (q == EntityNames.ENTITY_LT_QUAD) { --_inputPtr; return EntityNames.ENTITY_LT; } } return findPName(q, 2); } } q = (q << 8) | i; // fall through case 3: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 3; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { // amp? if (_quadCount == 0) { if (q == EntityNames.ENTITY_AMP_QUAD) { --_inputPtr; return EntityNames.ENTITY_AMP; } } return findPName(q, 3); } } q = (q << 8) | i; } /* If we get this far, need to add full quad into * result array and update state */ if (_quadCount == 0) { // first quad _quadBuffer[0] = q; _quadCount = 1; } else { if (_quadCount >= _quadBuffer.length) { // let's just double? _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[_quadCount++] = q; } _currQuadBytes = 0; } } /* /********************************************************************** /* Internal methods, LF handling /********************************************************************** */ /** * Method called when there is a pending \r (from past buffer), * and we need to see * * @return True if the linefeed was succesfully processed (had * enough input data to do that); or false if there is no * data available to check this */ @Override protected final boolean handlePartialCR() { // sanity check if (_pendingInput != PENDING_STATE_CR) { throwInternal(); } if (_inputPtr >= _inputEnd) { return false; } _pendingInput = 0; if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } ++_currRow; _rowStartOffset = _inputPtr; return true; } /* /********************************************************************** /* Multi-byte char decoding /********************************************************************** */ /** *

* Note: caller must guarantee enough data is available before * calling the method */ protected final int decodeUtf8_2(int c) throws XMLStreamException { int d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } return ((c & 0x1F) << 6) | (d & 0x3F); } protected final void skipUtf8_2(int c) throws XMLStreamException { int d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } } /** *

* Decodes a 3-byte UTF-8 sequence: reads the two continuation bytes from
     * input and combines them with the given lead byte. Surrogate range
     * (0xD800 - 0xDFFF) and 0xFFFE/0xFFFF are passed to
     * {@code handleInvalidXmlChar}.
     * <p>
     * Note: caller must guarantee enough data is available before
     * calling the method
     *
     * @param c1 Lead byte of the sequence
     *
     * @return Decoded character
     */
    protected final int decodeUtf8_3(int c1) throws XMLStreamException
    {
        final int lead = c1 & 0x0F;
        int value = lead;

        // two continuation bytes, 6 payload bits each
        for (int i = 0; i < 2; ++i) {
            final int d = (int) _inputBuffer.get(_inputPtr++);
            if ((d & 0xC0) != 0x80) {
                reportInvalidOther(d & 0xFF, _inputPtr);
            }
            value = (value << 6) | (d & 0x3F);
        }
        // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal; only reachable with high lead nibble
        if (lead >= 0xD && value >= 0xD800
                && (value < 0xE000 || (value >= 0xFFFE && value <= 0xFFFF))) {
            value = handleInvalidXmlChar(value);
        }
        return value;
    }

    /**
     * Decodes a 3-byte UTF-8 sequence from bytes that have already been
     * read; continuation bytes are verified here, the first byte is
     * assumed to have been checked by the caller.
     *
     * @param c1 Lead byte of the sequence
     * @param c2 Second byte of the sequence
     * @param c3 Third byte of the sequence
     *
     * @return Decoded character
     */
    protected final int decodeUtf8_3(int c1, int c2, int c3) throws XMLStreamException
    {
        if ((c2 & 0xC0) != 0x80) {
            reportInvalidOther(c2 & 0xFF, _inputPtr-1);
        }
        if ((c3 & 0xC0) != 0x80) {
            reportInvalidOther(c3 & 0xFF, _inputPtr);
        }
        final int high = (c1 & 0x0F) << 12;
        final int mid = (c2 & 0x3F) << 6;
        int value = high | mid | (c3 & 0x3F);

        // surrogates illegal, as well as 0xFFFE/0xFFFF
        if (c1 >= 0xD && value >= 0xD800
                && (value < 0xE000 || (value >= 0xFFFE && value <= 0xFFFF))) {
            value = handleInvalidXmlChar(value);
        }
        return value;
    }

    /**
     * Decodes a 4-byte UTF-8 sequence: reads the three continuation bytes
     * from input and combines them with the given lead byte.
     *
     * @param c Lead byte of the sequence
     *
     * @return Character value minus 0x10000 (caller already knows it
     *    will need to produce a surrogate pair)
     */
    protected final int decodeUtf8_4(int c) throws XMLStreamException
    {
        int value = c & 0x07;

        // three continuation bytes, 6 payload bits each
        for (int i = 0; i < 3; ++i) {
            final int d = (int) _inputBuffer.get(_inputPtr++);
            if ((d & 0xC0) != 0x80) {
                reportInvalidOther(d & 0xFF, _inputPtr);
            }
            value = (value << 6) | (d & 0x3F);
        }
        return value - 0x10000;
    }

    /**
     * Decodes a 4-byte UTF-8 sequence from bytes that have already been
     * read; continuation bytes are verified here, the first byte is
     * assumed to have been checked (but not yet masked) by the caller.
     *
     * @param c1 Lead byte of the sequence
     * @param c2 Second byte of the sequence
     * @param c3 Third byte of the sequence
     * @param c4 Fourth byte of the sequence
     *
     * @return Character value minus 0x10000; this so that caller
     *    can readily expand it to actual surrogates
     */
    protected final int decodeUtf8_4(int c1, int c2, int c3, int c4) throws XMLStreamException
    {
        if ((c2 & 0xC0) != 0x80) {
            reportInvalidOther(c2 & 0xFF, _inputPtr-2);
        }
        if ((c3 & 0xC0) != 0x80) {
            reportInvalidOther(c3 & 0xFF, _inputPtr-1);
        }
        if ((c4 & 0xC0) != 0x80) {
            reportInvalidOther(c4 & 0xFF, _inputPtr);
        }
        int value = ((c1 & 0x07) << 6) | (c2 & 0x3F);
        value = (value << 6) | (c3 & 0x3F);
        value = (value << 6) | (c4 & 0x3F);
        return value - 0x10000;
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy