com.fasterxml.aalto.in.ByteSourceBootstrapper Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
There is a newer version: 1.3.3
/* Woodstox Lite ("wool") XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, [email protected]
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.io.*;

import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.impl.IoStreamException;
import com.fasterxml.aalto.impl.LocationImpl;
import com.fasterxml.aalto.util.CharsetNames;

/**
 * Class that takes care of bootstrapping main document input from
 * a byte-oriented input source: usually either an InputStream,
 * or a block source like byte array.
 */
public final class ByteSourceBootstrapper
    extends InputBootstrapper
{
    private final static byte BYTE_NULL = (byte) 0;

    private final static byte BYTE_CR = (byte) '\r';

    private final static byte BYTE_LF = (byte) '\n';


    /*
    /**********************************************************************
    /* Configuration
    /**********************************************************************
     */

    /**
     * Underlying InputStream to use for reading content.
     */
    final InputStream _in;

    /*
    /**********************************************************************
    /* Input buffering
    /**********************************************************************
     */

    final byte[] _inputBuffer;

    private int _inputPtr;

    private int _inputLen;

    /*
    /**********************************************************************
    /* Data gathered
    /**********************************************************************
     */

    boolean mBigEndian = true;
    int mBytesPerChar = 0; // 0 means "dunno yet"

    boolean mHadBOM = false;
    boolean mByteSizeFound = false;

    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */

    private ByteSourceBootstrapper(ReaderConfig cfg, InputStream in)
    {
        super(cfg);
        _in = in;
        _inputBuffer = cfg.allocFullBBuffer(4000);
        _inputLen = _inputPtr = 0;
    }

    private ByteSourceBootstrapper(ReaderConfig cfg, byte[] inputBuffer, int inputStart, int inputLen)
    {
        super(cfg);
        _in = null;
        _inputBuffer = inputBuffer;
        _inputPtr = inputStart;
        _inputLen = (inputStart + inputLen);
        // Need to offset this, to keep location correct
        _inputProcessed = -inputStart;
    }

    public static ByteSourceBootstrapper construct(ReaderConfig cfg, InputStream in)
        throws XMLStreamException
    {
        return new ByteSourceBootstrapper(cfg, in);
    }

    public static ByteSourceBootstrapper construct(ReaderConfig cfg, byte[] inputBuffer, int inputStart, int inputLen)
        throws XMLStreamException
    {
        return new ByteSourceBootstrapper(cfg, inputBuffer, inputStart, inputLen);
    }

    @Override
    public final XmlScanner bootstrap() throws XMLStreamException
    {
        try {
            return doBootstrap();
        } catch (IOException ioe) {
            throw new IoStreamException(ioe);
        } finally {
            _config.freeSmallCBuffer(mKeyword);
        }
    }

    public XmlScanner doBootstrap() throws IOException, XMLStreamException
    {
        String normEnc = null;

        determineStreamEncoding();
        if (hasXmlDeclaration()) { // yup, has xml decl:
            readXmlDeclaration();
            if (mFoundEncoding != null) {
                normEnc = verifyXmlEncoding(mFoundEncoding);
            }
        }

        // Now, have we figured out the encoding?

        if (normEnc == null) { // not via xml declaration
            if (mBytesPerChar == 2) { // UTF-16, BE/LE
                normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
            } else if (mBytesPerChar == 4) { // UCS-4... ?
                /* 22-Mar-2005, TSa: JDK apparently has no way of dealing
                 *   with these encodings... not sure if and how it should
                 *   be dealt with, really. Name could be UCS-4xx... or
                 *   perhaps UTF-32xx
                 */
                normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
            } else {
                // Ok, default has to be UTF-8, as per XML specs
                normEnc = CharsetNames.CS_UTF8;
            }
        }

        _config.setActualEncoding(normEnc);
        _config.setXmlDeclInfo(mDeclaredXmlVersion, mFoundEncoding, mStandalone);

        // Normalized, can thus use straight equality checks now
        // UTF-8 compatible (loosely speaking) ones can use same scanner
        if (normEnc == CharsetNames.CS_UTF8
            || normEnc == CharsetNames.CS_ISO_LATIN1
            || normEnc == CharsetNames.CS_US_ASCII) {
            return new Utf8Scanner(_config,
                                   _in, _inputBuffer, _inputPtr, _inputLen);
        } else if (normEnc.startsWith(CharsetNames.CS_UTF32)) {
            /* Since this is such a rare encoding, we'll just create
             * a Reader, and dispatch it to reader scanner?
             */
            // let's augment with actual endianness info
            if (normEnc == CharsetNames.CS_UTF32) {
                normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
            }
            Reader r = new Utf32Reader(_config, _in, _inputBuffer, _inputPtr, _inputLen,
                                       mBigEndian);
            return new ReaderScanner(_config, r);
        }

        /* And finally, if all else fails, we'll also fall back to
         * using JDK-provided decoders and ReaderScanner:
         */
        InputStream in = _in;
        if (_inputPtr < _inputLen) {
            in = new MergedStream(_config, in, _inputBuffer, _inputPtr, _inputLen);
        }
        if (normEnc == CharsetNames.CS_UTF16) {
            normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
        }
        try {
            Reader r = new InputStreamReader(in, normEnc);
            return new ReaderScanner(_config, r);
        } catch (UnsupportedEncodingException usex) {
            throw new IoStreamException("Unsupported encoding: "+usex.getMessage());
        }
    }

    /*
    /**********************************************************************
    // Internal methods, main xml decl processing
    /**********************************************************************
     */

    /**
     * Method called to figure out what the physical encoding of the
     * file appears to be (in case it can be determined from BOM, or
     * xml declaration, either of which may be present)
     */
    private void determineStreamEncoding()
        throws IOException
    {
        /* Ok; first just need 4 bytes for determining bytes-per-char from
         * BOM or first char(s) of likely xml declaration:
         */
        if (ensureLoaded(4)) {
            int origPtr = _inputPtr;

            bomblock:
            do { // BOM/auto-detection block
                int quartet = (_inputBuffer[_inputPtr] << 24)
                    | ((_inputBuffer[_inputPtr+1] & 0xFF) << 16)
                    | ((_inputBuffer[_inputPtr+2] & 0xFF) << 8)
                    | (_inputBuffer[_inputPtr+3] & 0xFF);

                /* Handling of (usually) optional BOM (required for
                 * multi-byte formats); first 32-bit charsets:
                 */
                switch (quartet) {
                case 0x0000FEFF:
                    mBigEndian = true;
                    _inputPtr += 4;
                    mBytesPerChar = 4;
                    break bomblock;
                case 0xFFFE0000: // UCS-4, LE?
                    mBigEndian = false;
                    _inputPtr += 4;
                    mBytesPerChar = 4;
                    break bomblock;
                case 0x0000FFFE: // UCS-4, in-order...
                    reportWeirdUCS4("2143");
                    break bomblock;
                case 0x0FEFF0000: // UCS-4, in-order...
                    reportWeirdUCS4("3412");
                    break bomblock;
                }

                // Ok, if not, how about 16-bit encoding BOMs?
                int msw = quartet >>> 16;
                if (msw == 0xFEFF) { // UTF-16, BE
                    _inputPtr += 2;
                    mBytesPerChar = 2;
                    mBigEndian = true;
                    break;
                }
                if (msw == 0xFFFE) { // UTF-16, LE
                    _inputPtr += 2;
                    mBytesPerChar = 2;
                    mBigEndian = false;
                    break;
                }

                // And if not, then UTF-8 BOM?
                if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8
                    _inputPtr += 3;
                    mBytesPerChar = 1;
                    mBigEndian = true; // doesn't really matter
                    break;
                }

                /* And if that wasn't succesful, how about auto-detection
                 * for ' origPtr);

            /* Let's update location markers to ignore BOM when calculating
             * column positions (but not from raw byte offsets)
             */
            _inputRowStart = _inputPtr;
        }

        /* Hmmh. If we haven't figured it out, let's just assume
         * UTF-8 as per XML specs:
         */
        mByteSizeFound = (mBytesPerChar > 0);
        if (!mByteSizeFound) {
            mBytesPerChar = 1;
            mBigEndian = true; // doesn't matter
        }
    }

    protected boolean hasXmlDeclaration()
        throws IOException, XMLStreamException
    {
        // First the common case, 1-byte encoding (Ascii/ISO-Latin/UTF-8):
        if (mBytesPerChar == 1) {
            // Need 6 chars to determine for sure...
            if (ensureLoaded(6)) {
                if (_inputBuffer[_inputPtr] == '<'
                    && _inputBuffer[_inputPtr+1] == '?'
                    && _inputBuffer[_inputPtr+2] == 'x'
                    && _inputBuffer[_inputPtr+3] == 'm'
                    && _inputBuffer[_inputPtr+4] == 'l'
                    && ((_inputBuffer[_inputPtr+5] & 0xFF) <= CHAR_SPACE)) {

                    // Let's skip stuff so far:
                    _inputPtr += 6;
                    return true;
                }
            }
        } else { // ... and then for slower fixed-multibyte encodings:
            if (ensureLoaded (6 * mBytesPerChar)) { // 6 chars as well
                int start = _inputPtr; // if we have to 'unread' chars
                if (nextMultiByte() == '<'
                    && nextMultiByte() == '?'
                    && nextMultiByte() == 'x'
                    && nextMultiByte() == 'm'
                    && nextMultiByte() == 'l'
                    && nextMultiByte() <= CHAR_SPACE) {
                    return true;
                }
                _inputPtr = start; // push data back
            }
        }
        return false;
    }

    /**
     * @return Normalized encoding name
     */
    protected String verifyXmlEncoding(String enc)
        throws XMLStreamException
    {
        enc = CharsetNames.normalize(enc);

        // Let's actually verify we got matching information:
        if (enc == CharsetNames.CS_UTF8) {
            verifyEncoding(enc, 1);
        } else if (enc == CharsetNames.CS_ISO_LATIN1) {
            verifyEncoding(enc, 1);
        } else if (enc == CharsetNames.CS_US_ASCII) {
            verifyEncoding(enc, 1);
        } else if (enc == CharsetNames.CS_UTF16) {
            // BOM should be obligatory, to know the ordering?
            // For now, let's not enforce that though.
            //if (!mHadBOM) {
            //reportMissingBOM(enc);
            //}
            verifyEncoding(enc, 2);
        } else if (enc == CharsetNames.CS_UTF16LE) {
            verifyEncoding(enc, 2, false);
        } else if (enc == CharsetNames.CS_UTF16BE) {
            verifyEncoding(enc, 2, true);

        } else if (enc == CharsetNames.CS_UTF32) {
            // Do we require a BOM here? we can live without it...
            //if (!mHadBOM) {
            //    reportMissingBOM(enc);
            //}
            verifyEncoding(enc, 4);
        } else if (enc == CharsetNames.CS_UTF32LE) {
            verifyEncoding(enc, 4, false);
        } else if (enc == CharsetNames.CS_UTF32BE) {
            verifyEncoding(enc, 4, true);
        }
        return enc;
    }

    /*
    /**********************************************************************
    /* Internal methods, loading input data
    /**********************************************************************
     */
 
    protected boolean ensureLoaded(int minimum)
        throws IOException
    {
        /* Let's assume here buffer has enough room -- this will always
         * be true for the limited used this method gets
         */
        int gotten = (_inputLen - _inputPtr);
        while (gotten < minimum) {
            int count;

            if (_in == null) { // block source
                count = -1;
            } else {
                count = _in.read(_inputBuffer, _inputLen, _inputBuffer.length - _inputLen);
            }
            if (count < 1) {
                return false;
            }
            _inputLen += count;
            gotten += count;
        }
        return true;
    }

    protected void loadMore()
        throws IOException, XMLStreamException
    {
        _inputProcessed += _inputLen;
        _inputRowStart -= _inputLen;

        _inputPtr = 0;
        if (_in == null) { // block source
            _inputLen = -1;
        } else {
            _inputLen = _in.read(_inputBuffer, 0, _inputBuffer.length);
        }
        if (_inputLen < 1) {
            reportEof();
        }
    }

    /*
    /**********************************************************************
    /* Implementations of abstract parsing methods
    /**********************************************************************
     */

    @Override
    protected void pushback() {
        _inputPtr -= mBytesPerChar;
    }

    @Override
    protected int getNext()
        throws IOException, XMLStreamException
    {
        if (mBytesPerChar > 1) {
            return nextMultiByte();
        }
        byte b = (_inputPtr < _inputLen) ?
            _inputBuffer[_inputPtr++] : nextByte();
        return (b & 0xFF);
    }

    @Override
    protected int getNextAfterWs(boolean reqWs)
        throws IOException, XMLStreamException
    {
        int count;

        if (mBytesPerChar > 1) { // multi-byte
            count = skipMbWs();
        } else {
            count = skipSbWs();
        }

        if (reqWs && count == 0) {
            reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE);
        }

        // inlined getNext()
        if (mBytesPerChar > 1) {
            return nextMultiByte();
        }
        byte b = (_inputPtr < _inputLen) ?
            _inputBuffer[_inputPtr++] : nextByte();
        return (b & 0xFF);
    }

    /**
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     */
    @Override
    protected int checkKeyword(String exp)
        throws IOException, XMLStreamException
    {
        if (mBytesPerChar > 1) {
            return checkMbKeyword(exp);
        }
        return checkSbKeyword(exp);
    }

    @Override
    protected int readQuotedValue(char[] kw, int quoteChar)
        throws IOException, XMLStreamException
    {
        int i = 0;
        int len = kw.length;
        boolean mb = (mBytesPerChar > 1);

        while (i < len) {
            int c;

            if (mb) {
                c = nextMultiByte();
                if (c ==  CHAR_CR || c == CHAR_LF) {
                    skipMbLF(c);
                    c = CHAR_LF;
                }
            } else {
                byte b = (_inputPtr < _inputLen) ?
                    _inputBuffer[_inputPtr++] : nextByte();
                if (b == BYTE_NULL) {
                    reportNull();
                }
                if (b == BYTE_CR || b == BYTE_LF) {
                    skipSbLF(b);
                    b = BYTE_LF;
                }
                c = (b & 0xFF);
            }

            if (c == quoteChar) {
                return (i < len) ? i : -1;
            }

	    if (i < len) {
		kw[i++] = (char) c;
	    }
        }

        /* If we end up this far, we ran out of buffer space... let's let
         * caller figure that out, though
         */
        return -1;
    }

    @Override
    protected Location getLocation()
    {
        /* Ok; for fixed-size multi-byte encodings, need to divide numbers
         * to get character locations. For variable-length encodings the
         * good thing is that xml declaration only uses shortest codepoints,
         * ie. char count == byte count.
         */
        int total = _inputProcessed + _inputPtr;
        int col = _inputPtr - _inputRowStart;

        if (mBytesPerChar > 1) {
            total /= mBytesPerChar;
            col /= mBytesPerChar;
        }
        return LocationImpl.fromZeroBased
            (_config.getPublicId(), _config.getSystemId(),
             total, _inputRow, col);
    }

    /*
    /**********************************************************************
    /* Internal methods, single-byte access methods
    /**********************************************************************
     */

    protected byte nextByte()
        throws IOException, XMLStreamException
    {
        if (_inputPtr >= _inputLen) {
            loadMore();
        }
        return _inputBuffer[_inputPtr++];
    }

    protected int skipSbWs()
        throws IOException, XMLStreamException
    {
        int count = 0;

        while (true) {
            byte b = (_inputPtr < _inputLen) ?
                _inputBuffer[_inputPtr++] : nextByte();

            if ((b & 0xFF) > CHAR_SPACE) {
                --_inputPtr;
                break;
            }
            if (b == BYTE_CR || b == BYTE_LF) {
                skipSbLF(b);
            } else if (b == BYTE_NULL) {
                reportNull();
            }
            ++count;
        }
        return count;
    }

    protected void skipSbLF(byte lfByte)
        throws IOException, XMLStreamException
    {
        if (lfByte == BYTE_CR) {
            byte b = (_inputPtr < _inputLen) ?
                _inputBuffer[_inputPtr++] : nextByte();
            if (b != BYTE_LF) {
                --_inputPtr; // pushback if not 2-char/byte lf
            }
        }
        ++_inputRow;
        _inputRowStart = _inputPtr;
    }

    /**
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     */
    protected int checkSbKeyword(String expected)
        throws IOException, XMLStreamException
    {
        int len = expected.length();
        
        for (int ptr = 1; ptr < len; ++ptr) {
            byte b = (_inputPtr < _inputLen) ?
                _inputBuffer[_inputPtr++] : nextByte();
            
            if (b == BYTE_NULL) {
                reportNull();
            }
            if ((b & 0xFF) != expected.charAt(ptr)) {
                return (b & 0xFF);
            }
        }

        return CHAR_NULL;
    }

    /*
    /**********************************************************************
    /* Internal methods, multi-byte access/checks
    /**********************************************************************
     */

    protected int nextMultiByte()
        throws IOException, XMLStreamException
    {
        byte b = (_inputPtr < _inputLen) ?
            _inputBuffer[_inputPtr++] : nextByte();
        byte b2 = (_inputPtr < _inputLen) ?
            _inputBuffer[_inputPtr++] : nextByte();
        int c;

        if (mBytesPerChar == 2) {
            if (mBigEndian) {
                c = ((b & 0xFF) << 8) | (b2 & 0xFF);
            } else {
                c = (b & 0xFF) | ((b2 & 0xFF) << 8);
            }
        } else {
            // Has to be 4 bytes
            byte b3 = (_inputPtr < _inputLen) ?
                _inputBuffer[_inputPtr++] : nextByte();
            byte b4 = (_inputPtr < _inputLen) ?
                _inputBuffer[_inputPtr++] : nextByte();
            
            if (mBigEndian) {
                c = (b  << 24) | ((b2 & 0xFF) << 16)
                    | ((b3 & 0xFF) << 8) | (b4 & 0xFF);
            } else {
                c = (b4  << 24) | ((b3 & 0xFF) << 16)
                    | ((b2 & 0xFF) << 8) | (b & 0xFF);
            }
        }

        // Let's catch null chars early
        if (c == 0) {
            reportNull();
        }
        return c;
    }

    protected int skipMbWs()
        throws IOException, XMLStreamException
    {
        int count = 0;

        while (true) {
            int c = nextMultiByte();

            if (c > CHAR_SPACE) {
                _inputPtr -= mBytesPerChar;
                break;
            }
            if (c == CHAR_CR || c == CHAR_LF) {
                skipMbLF(c);
            } else if (c == CHAR_NULL) {
                reportNull();
            }
            ++count;
        }
        return count;
    }

    protected void skipMbLF(int lf)
        throws IOException, XMLStreamException
    {
        if (lf == CHAR_CR) {
            int c = nextMultiByte();
            if (c != CHAR_LF) {
                _inputPtr -= mBytesPerChar;
            }
        }
        ++_inputRow;
        _inputRowStart = _inputPtr;
    }

    /**
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     */
    protected int checkMbKeyword(String expected)
        throws IOException, XMLStreamException
    {
        int len = expected.length();
        
        for (int ptr = 1; ptr < len; ++ptr) {
            int c = nextMultiByte();
            if (c == BYTE_NULL) {
                reportNull();
            }
            if (c != expected.charAt(ptr)) {
              return c;
            }
        }

        return CHAR_NULL;
    }

    /*
    /**********************************************************************
    /* Other private methods:
    /**********************************************************************
     */

    private void verifyEncoding(String id, int bpc)
        throws XMLStreamException
    {
        if (mByteSizeFound) {
            /* Let's verify that if we matched an encoding, it's the same
             * as what was declared...
             */
            if (bpc != mBytesPerChar) {
                reportXmlProblem("Declared encoding '"+id+"' uses "+bpc
                                 +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode");
            }
        }
    }

    private void verifyEncoding(String id, int bpc, boolean bigEndian)
        throws XMLStreamException
    {
        if (mByteSizeFound) {
            verifyEncoding(id, bpc);

            if (bigEndian != mBigEndian) {
                String bigStr = bigEndian ? "big" : "little";
                reportXmlProblem
                    ("Declared encoding '"+id+"' has different endianness ("
                     +bigStr+" endian) than what physical ordering appeared to be; cannot decode");
            }
        }
    }

    private void reportWeirdUCS4(String type)
        throws IOException
    {
        throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");
    }

    private void reportEBCDIC()
        throws IOException
    {
        throw new CharConversionException("Unsupported encoding (EBCDIC)");
    }
}