// com.fasterxml.aalto.in.ByteBasedScanner Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of aalto-xml Show documentation
// Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
// There is a newer version: 1.3.3
/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, [email protected]
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.io.IOException;

import javax.xml.stream.XMLStreamException;

import org.codehaus.stax2.XMLStreamLocation2;

import com.fasterxml.aalto.impl.LocationImpl;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;

/**
 * Intermediate base class used by different byte-backed scanners.
 * Specifically, used as a base by both blocking (stream) and
 * non-blocking (async) byte-based scanners (as opposed to Reader-backed,
 * character-based scanners)
 */
public abstract class ByteBasedScanner
    extends XmlScanner
{
    /*
    /**********************************************************************
    /* Byte constants
    /**********************************************************************
     */

    // White-space:

    protected static final byte BYTE_NULL = (byte) 0;
    protected static final byte BYTE_SPACE = (byte) ' ';
    protected static final byte BYTE_LF = (byte) '\n';
    protected static final byte BYTE_CR = (byte) '\r';
    protected static final byte BYTE_TAB = (byte) 9;

    // Markup punctuation:

    protected static final byte BYTE_LT = (byte) '<';
    protected static final byte BYTE_GT = (byte) '>';
    protected static final byte BYTE_AMP = (byte) '&';
    protected static final byte BYTE_HASH = (byte) '#';
    protected static final byte BYTE_EXCL = (byte) '!';
    protected static final byte BYTE_HYPHEN = (byte) '-';
    protected static final byte BYTE_QMARK = (byte) '?';
    protected static final byte BYTE_SLASH = (byte) '/';
    protected static final byte BYTE_EQ = (byte) '=';
    protected static final byte BYTE_QUOT = (byte) '"';
    protected static final byte BYTE_APOS = (byte) '\'';
    protected static final byte BYTE_LBRACKET = (byte) '[';
    protected static final byte BYTE_RBRACKET = (byte) ']';
    protected static final byte BYTE_SEMICOLON = (byte) ';';

    // Lower-case letters:

    protected static final byte BYTE_a = (byte) 'a';
    protected static final byte BYTE_g = (byte) 'g';
    protected static final byte BYTE_l = (byte) 'l';
    protected static final byte BYTE_m = (byte) 'm';
    protected static final byte BYTE_o = (byte) 'o';
    protected static final byte BYTE_p = (byte) 'p';
    protected static final byte BYTE_q = (byte) 'q';
    protected static final byte BYTE_s = (byte) 's';
    protected static final byte BYTE_t = (byte) 't';
    protected static final byte BYTE_u = (byte) 'u';
    protected static final byte BYTE_x = (byte) 'x';

    // Upper-case letters:

    protected static final byte BYTE_A = (byte) 'A';
    protected static final byte BYTE_C = (byte) 'C';
    protected static final byte BYTE_D = (byte) 'D';
    protected static final byte BYTE_P = (byte) 'P';
    protected static final byte BYTE_S = (byte) 'S';
    protected static final byte BYTE_T = (byte) 'T';

    /*
    /**********************************************************************
    /* Input buffering
    /**********************************************************************
     */

    /**
     * Pointer to the next unread byte in the input buffer.
     */
    protected int _inputPtr;

    /**
     * Pointer to the first byte after the end of valid content.
     * This may point beyond the end of the physical buffer array.
     */
    protected int _inputEnd;

    /*
    /**********************************************************************
    /* Parsing state
    /**********************************************************************
     */

    /**
     * Storage location for a single character that can not be easily
     * pushed back (for example, multi-byte char; or char entity
     * expansion). Negative, if from entity expansion; positive if
     * a singular char.
     */
    protected int _tmpChar = INT_NULL;

    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */

    /**
     * Passes the configuration to the base scanner and resets the
     * offset bookkeeping ({@code _pastBytesOrChars} and
     * {@code _rowStartOffset}) to zero.
     *
     * @param cfg reader configuration handed on to the super class
     */
    protected ByteBasedScanner(ReaderConfig cfg)
    {
        super(cfg);
        _pastBytesOrChars = 0; // should it be passed by caller?
        _rowStartOffset = 0; // should probably be passed by caller...
    }

//    @Override protected abstract void _releaseBuffers();

    @Override
    protected abstract void _closeSource() throws IOException;

    /*
    /**********************************************************************
    /* Location handling
    /**********************************************************************
     */

    /**
     * Builds a location object for the current read position: the
     * offset is {@code _pastBytesOrChars + _inputPtr}, the row is
     * {@code _currRow} and the column is the distance of the input
     * pointer from the start of the current row (all zero-based, as
     * handed to {@code LocationImpl.fromZeroBased}).
     */
    @Override
    public XMLStreamLocation2 getCurrentLocation()
    {
        return LocationImpl.fromZeroBased(_config.getPublicId(), _config.getSystemId(),
                _pastBytesOrChars + _inputPtr, _currRow, _inputPtr - _rowStartOffset);
    }

    /**
     * @return distance of the input pointer from the recorded start
     *   offset of the current row
     */
    @Override
    public int getCurrentColumnNr() {
        return _inputPtr - _rowStartOffset;
    }

    /**
     * @return raw offset recorded by the last call to
     *   {@link #setStartLocation()}
     */
    @Override
    public long getStartingByteOffset() {
        return _startRawOffset;
    }

    /**
     * @return always -1; character offsets are not tracked by
     *   byte-based scanners
     */
    @Override
    public long getStartingCharOffset() {
        // N/A for this type
        return -1L;
    }

    /**
     * Returns the offset just past the current token; if the token has
     * only been partially processed it is completed first.
     */
    @Override
    public long getEndingByteOffset() throws XMLStreamException {
        // Have to complete the token to know the ending location...
        if (_tokenIncomplete) {
            finishToken();
        }
        return _pastBytesOrChars + _inputPtr;
    }

    /**
     * @return always -1; character offsets are not tracked by
     *   byte-based scanners
     */
    @Override
    public long getEndingCharOffset() throws XMLStreamException {
        // N/A for this type
        return -1L;
    }

    /**
     * Records that a new row starts at the given buffer offset and
     * advances the row counter.
     *
     * @param offset value to store as the new row start offset
     */
    protected final void markLF(int offset) {
        _rowStartOffset = offset;
        ++_currRow;
    }

    /**
     * Records that a new row starts at the current input pointer and
     * advances the row counter.
     */
    protected final void markLF() {
        _rowStartOffset = _inputPtr;
        ++_currRow;
    }

    /**
     * Snapshots the current offset, row and column as the start
     * location ({@code _startRawOffset}, {@code _startRow},
     * {@code _startColumn}).
     */
    protected final void setStartLocation() {
        _startRawOffset = _pastBytesOrChars + _inputPtr;
        _startRow = _currRow;
        _startColumn = _inputPtr - _rowStartOffset;
    }

    /*
    /**********************************************************************
    /* Abstract methods for sub-classes to implement
    /**********************************************************************
     */

    /**
     * Method called by methods when encountering a byte that
     * can not be part of a valid character in the current context.
     * Should return the actual decoded character for error reporting
     * purposes.
     */
    protected abstract int decodeCharForError(byte b)
        throws XMLStreamException;

    /*
    /**********************************************************************
    /* And then shared functionality for sub-classes
    /**********************************************************************
     */

    /**
     * Decodes a name whose UTF-8 bytes are packed four per {@code int}
     * (first byte in the most significant position), validates it as an
     * XML name, and registers it with the given symbol table.
     *<p>
     * The last quad may hold fewer than four bytes, in which case its
     * bytes sit in the low-order positions. It is temporarily shifted
     * left so that all bytes can be read uniformly, and restored to its
     * original value before the symbol is added. Note that the restore
     * only happens on the normal path: if one of the report methods
     * throws, the last quad is left in its shifted form.
     *<p>
     * Conceptually, this method really does NOT belong here. However,
     * currently it is quite hard to refactor it, so it'll have to
     * stay here until better place is found
     *
     * @param symbols symbol table the decoded name is added to
     * @param charTypes source of the {@code NAME_CHARS} lookup table used
     *   to classify each byte
     * @param hash hash value passed through to {@code symbols.addSymbol}
     * @param quads packed name bytes
     * @param qlen number of quads in use
     * @param lastQuadBytes number of bytes stored in the last quad
     *   (less than 4 if it is only partially filled)
     *
     * @return the value returned by {@code symbols.addSymbol}
     */
    protected final PName addUTFPName(ByteBasedPNameTable symbols, XmlCharTypes charTypes,
            int hash, int[] quads, int qlen, int lastQuadBytes)
        throws XMLStreamException
    {
        // 4 bytes per quad, except last one maybe less
        int byteLen = (qlen << 2) - 4 + lastQuadBytes;

        // And last one is not correctly aligned (leading zero bytes instead
        // need to shift a bit, instead of trailing). Only need to shift it
        // for UTF-8 decoding; need revert for storage (since key will not
        // be aligned, to optimize lookup speed)
        int lastQuad;

        if (lastQuadBytes < 4) {
            lastQuad = quads[qlen-1];
            // 8/16/24 bit left shift
            quads[qlen-1] = (lastQuad << ((4 - lastQuadBytes) << 3));
        } else {
            lastQuad = 0;
        }

        // Let's handle first char separately (different validation):
        int ch = (quads[0] >>> 24);
        boolean ok;
        int ix = 1;
        char[] cbuf = _nameBuffer;
        int cix  = 0;
        final int[] TYPES = charTypes.NAME_CHARS;

        switch (TYPES[ch]) {
        case XmlCharTypes.CT_NAME_NONE:
        case XmlCharTypes.CT_NAME_COLON: // not ok as first
        case XmlCharTypes.CT_NAME_NONFIRST:
        case InputCharTypes.CT_INPUT_NAME_MB_N:
            ok = false;
            break;
        case XmlCharTypes.CT_NAME_ANY:
            ok = true;
            break;
        default: // multi-byte (UTF-8) chars:
            {
                int needed;

                if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                    ch &= 0x1F;
                    needed = 1;
                } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                    ch &= 0x0F;
                    needed = 2;
                } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                    ch &= 0x07;
                    needed = 3;
                } else { // 5- and 6-byte chars not valid xml chars
                    reportInvalidInitial(ch);
                    needed = ch = 1; // never really gets this far
                }
                if ((ix + needed) > byteLen) {
                    reportEofInName(cbuf, 0);
                }
                ix += needed;

                // The first character can never span past the first quad
                int q = quads[0];
                // Always need at least one more right away:
                int ch2 = (q >> 16) & 0xFF;
                if ((ch2 & 0xC0) != 0x080) {
                    reportInvalidOther(ch2);
                }
                ch = (ch << 6) | (ch2 & 0x3F);

                /* And then may need more. Note: here we do not do all the
                 * checks that UTF-8 text decoder might do. Reason is that
                 * name validity checking methods handle most of such checks
                 */
                if (needed > 1) {
                    ch2 = (q >> 8) & 0xFF;
                    if ((ch2 & 0xC0) != 0x080) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);
                    if (needed > 2) { // 4 bytes? (need surrogates on output)
                        ch2 = q & 0xFF;
                        if ((ch2 & 0xC0) != 0x080) {
                            reportInvalidOther(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                    }
                }
                ok = XmlChars.is10NameStartChar(ch);
                if (needed > 2) { // outside of basic 16-bit range? need surrogates
                    /* so, let's first output first char (high surrogate),
                     * let second be output by later code
                     */
                    ch -= 0x10000; // to normalize it starting with 0x0
                    cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                    ch = (0xDC00 | (ch & 0x03FF));
                }
            }
        }

        if (!ok) { // 0 to indicate it's first char, even with surrogates
            reportInvalidNameChar(ch, 0);
        }

        cbuf[cix++] = (char) ch; // the only char, or second (low) surrogate

        /* Whoa! Tons of code for just the start char. But now we get to
         * decode the name proper, at last!
         */
        int last_colon = -1;

        while (ix < byteLen) {
            ch = _utf8ByteFromQuads(quads, ix++);

            // Ascii?
            switch (TYPES[ch]) {
            case XmlCharTypes.CT_NAME_NONE:
            case XmlCharTypes.CT_MULTIBYTE_N:
                ok = false;
                break;
            case XmlCharTypes.CT_NAME_COLON: // not ok as first
                if (last_colon >= 0) {
                    reportMultipleColonsInName();
                }
                last_colon = cix;
                ok = true;
                break;
            case XmlCharTypes.CT_NAME_NONFIRST:
            case XmlCharTypes.CT_NAME_ANY:
                ok = true;
                break;
            default:
                {
                    int needed;
                    if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                        ch &= 0x1F;
                        needed = 1;
                    } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                        ch &= 0x0F;
                        needed = 2;
                    } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                        ch &= 0x07;
                        needed = 3;
                    } else { // 5- and 6-byte chars not valid xml chars
                        reportInvalidInitial(ch);
                        needed = ch = 1; // never really gets this far
                    }
                    if ((ix + needed) > byteLen) {
                        reportEofInName(cbuf, cix);
                    }

                    // Ok, always need at least one more. The helper masks
                    // to 8 bits so that an error report shows only the
                    // offending byte, not the rest of the quad.
                    int ch2 = _utf8ByteFromQuads(quads, ix++);
                    if ((ch2 & 0xC0) != 0x080) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);

                    // Once again, some of validation deferred to name char validator
                    if (needed > 1) {
                        ch2 = _utf8ByteFromQuads(quads, ix++);
                        if ((ch2 & 0xC0) != 0x080) {
                            reportInvalidOther(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                        if (needed > 2) { // 4 bytes? (need surrogates on output)
                            ch2 = _utf8ByteFromQuads(quads, ix++);
                            if ((ch2 & 0xC0) != 0x080) {
                                reportInvalidOther(ch2);
                            }
                            ch = (ch << 6) | (ch2 & 0x3F);
                        }
                    }
                    ok = XmlChars.is10NameChar(ch);
                    if (needed > 2) { // surrogate pair? once again, let's output one here, one later on
                        ch -= 0x10000; // to normalize it starting with 0x0
                        if (cix >= cbuf.length) {
                            _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
                        }
                        cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                        ch = 0xDC00 | (ch & 0x03FF);
                    }
                }
            }
            if (!ok) {
                reportInvalidNameChar(ch, cix);
            }
            if (cix >= cbuf.length) {
                _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
            }
            cbuf[cix++] = (char) ch;
        }

        /* Ok. Now we have the character array, and can construct the
         * String (as well as check proper composition of colons
         * for ns-aware mode...)
         */
        String baseName = new String(cbuf, 0, cix);
        // And finally, unalign if necessary
        if (lastQuadBytes < 4) {
            quads[qlen-1] = lastQuad;
        }
        return symbols.addSymbol(hash, baseName, last_colon, quads, qlen);
    }

    /**
     * Extracts a single byte, as an unsigned value in the range 0 - 255,
     * from an array of quads in which the first byte of each quad is
     * stored in the most significant position.
     *
     * @param quads packed bytes, four per int
     * @param ix zero-based byte index over the whole array
     */
    private static int _utf8ByteFromQuads(int[] quads, int ix) {
        // byte 0 of a quad needs a 24-bit shift, byte 3 none
        final int shift = (3 - (ix & 3)) << 3;
        return (quads[ix >> 2] >> shift) & 0xFF;
    }

    /*
    /**********************************************************************
    /* Error reporting
    /**********************************************************************
     */

    /**
     * Reports a byte that can not start a UTF-8 encoded character.
     *
     * @param mask the offending byte value, included in the message in
     *   hexadecimal
     */
    protected void reportInvalidInitial(int mask) throws XMLStreamException {
        reportInputProblem("Invalid UTF-8 start byte 0x"+Integer.toHexString(mask));
    }

    /**
     * Reports a byte that is not a valid UTF-8 continuation byte.
     *
     * @param mask the offending byte value, included in the message in
     *   hexadecimal
     */
    protected void reportInvalidOther(int mask) throws XMLStreamException {
        reportInputProblem("Invalid UTF-8 middle byte 0x"+Integer.toHexString(mask));
    }
}