src.java.com.ctc.wstx.io.StreamBootstrapper Maven / Gradle / Ivy

Go to download
package com.ctc.wstx.io;

import java.io.*;

import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;

import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ParsingErrorMsgs;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.exc.*;
import com.ctc.wstx.util.StringUtil;

/**
 * Input bootstrap class used with streams, when encoding is not known
 * (when encoding is specified by application, a reader is constructed,
 * and then reader-based bootstrapper is used).
 * 1) {
            total /= mBytesPerChar;
        }
        return total;
    }

    /**
     * Computes the offset of the current input pointer from the start of
     * the current row. The raw value is a byte offset; for fixed-width
     * encodings that use more than one byte per character it is divided
     * by the character width.
     *
     * @return Column offset on the current row
     */
    public int getInputColumn() {
        final int byteOffset = mInputPtr - mInputRowStart;
        return (mBytesPerChar > 1) ? (byteOffset / mBytesPerChar) : byteOffset;
    }

    /*
    ////////////////////////////////////////
    // Internal methods, parsing
    ////////////////////////////////////////
    */

    /**
     * Method called to try to figure out physical encoding the underlying
     * input stream uses.
     *<p>
     * Resets {@code mBytesPerChar} and {@code mBigEndian}, then inspects
     * the first four buffered bytes for a byte-order mark. Sets
     * {@code mByteSizeFound} to indicate whether a character width was
     * determined; if it was not, a single byte per character is assumed.
     *
     * @throws IOException If reading the initial bytes fails, or (as
     *   {@link CharConversionException}) if a UCS-4 BOM with an
     *   unsupported byte order is found
     * @throws WstxException Declared by the signature; no throw site is
     *   visible in the portion of the method shown here
     */
    protected void resolveStreamEncoding()
        throws IOException, WstxException
    {
        // Let's first set defaults:
        mBytesPerChar = 0;
        mBigEndian = true;

        /* Ok; first just need 4 bytes for determining bytes-per-char from
         * BOM or first char(s) of likely xml declaration:
         */
        if (ensureLoaded(4)) {
            bomblock:
            do { // BOM/auto-detection block
                /* Pack the first 4 bytes into one int, first byte as the
                 * most significant. The first byte needs no masking: the
                 * 24-bit left shift discards its sign-extension bits.
                 */
                int quartet = (mByteBuffer[0] << 24)
                    | ((mByteBuffer[1] & 0xFF) << 16)
                    | ((mByteBuffer[2] & 0xFF) << 8)
                    | (mByteBuffer[3] & 0xFF);

                /* Handling of (usually) optional BOM (required for
                 * multi-byte formats); first 32-bit charsets:
                 */
                switch (quartet) {
                case 0x0000FEFF: // UCS-4, BE
                    mBigEndian = true;
                    // Input pointer is moved past the 4-byte BOM
                    mInputPtr = mBytesPerChar = 4;
                    break bomblock;
                case 0xFFFE0000: // UCS-4, LE?
                    mInputPtr = mBytesPerChar = 4;
                    mBigEndian = false;
                    break bomblock;
                case 0x0000FFFE: // UCS-4, unusual byte order (2143): rejected
                    // reportWeirdUCS4 always throws; the break only satisfies the compiler
                    reportWeirdUCS4("2143");
                    break bomblock;
                case 0x0FEFF0000: // UCS-4, unusual byte order (3412): rejected
                    reportWeirdUCS4("3412");
                    break bomblock;
                }

                // Ok, if not, how about 16-bit encoding BOMs?
                // (unsigned shift leaves just the first two bytes)
                int msw = quartet >>> 16;
                if (msw == 0xFEFF) { // UTF-16, BE
                    mInputPtr = mBytesPerChar = 2;
                    mBigEndian = true;
                    break;
                }
                if (msw == 0xFFFE) { // UTF-16, LE
                    mInputPtr = mBytesPerChar = 2;
                    mBigEndian = false;
                    break;
                }

                // And if not, then UTF-8 BOM?
                // (first three bytes compared; BOM is 3 bytes long)
                if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8
                    mInputPtr = 3;
                    mBytesPerChar = 1;
                    mBigEndian = true; // doesn't really matter
                    break;
                }

                // NOTE(review): the text of this method appears to be truncated
                // at this point in this copy of the file (the comment below is
                // never properly closed); verify against the upstream source.
                /* And if that wasn't succesful, how about auto-detection
                 * for ' 0);

            // Let's update location markers to ignore BOM.
            mInputProcessed = -mInputPtr;
            mInputRowStart = mInputPtr;
        }

        /* Hmmh. If we haven't figured it out, let's just assume
         * UTF-8 as per XML specs:
         */
        mByteSizeFound = (mBytesPerChar != 0);
        if (!mByteSizeFound) {
            mBytesPerChar = 1;
            mBigEndian = true; // doesn't matter
        }
    }

    /**
     * Normalizes the given encoding name and, for the encodings this
     * class knows the character width (and possibly byte order) of,
     * cross-checks the declaration against what was detected from the
     * physical input. Unrecognized encodings are passed through without
     * verification.
     *
     * @param enc Encoding name to normalize and verify
     *
     * @return Normalized encoding name
     *
     * @throws WstxException Declared for the verification helpers, which
     *   report mismatches via {@code reportXmlProblem}
     */
    protected String verifyXmlEncoding(String enc)
        throws WstxException
    {
        final String normalized = CharsetNames.normalize(enc);

        /* Identity comparisons are retained on purpose: presumably
         * CharsetNames.normalize() hands back the canonical constant
         * instances for names it recognizes -- TODO confirm.
         */
        if (normalized == CharsetNames.CS_UTF8
            || normalized == CharsetNames.CS_ISO_LATIN1
            || normalized == CharsetNames.CS_US_ASCII) {
            // All of these are single-byte as far as xml declaration goes
            verifyEncoding(normalized, 1);
        } else if (normalized == CharsetNames.CS_UTF16) {
            /* 22-Mar-2005, TSa: a BOM would be needed to know the byte
             *   order, but since there is no custom decoder, the
             *   underlying JDK Reader may have dealt with it
             *   transparently; so absence of a BOM is not reported.
             */
            verifyEncoding(normalized, 2);
        } else if (normalized == CharsetNames.CS_UTF16LE) {
            verifyEncoding(normalized, 2, false);
        } else if (normalized == CharsetNames.CS_UTF16BE) {
            verifyEncoding(normalized, 2, true);
        } else if (normalized == CharsetNames.CS_UTF32) {
            // BOM is not required here either; we can live without it
            verifyEncoding(normalized, 4);
        } else if (normalized == CharsetNames.CS_UTF32LE) {
            verifyEncoding(normalized, 4, false);
        } else if (normalized == CharsetNames.CS_UTF32BE) {
            verifyEncoding(normalized, 4, true);
        }
        return normalized;
    }

    /*
    /////////////////////////////////////////////////////
    // Internal methods, loading input data
    /////////////////////////////////////////////////////
    */

    /**
     * Reads from the underlying stream until at least the requested
     * number of unread bytes is available in the byte buffer. New data
     * is appended after the current buffer contents; nothing is moved
     * or discarded.
     *<p>
     * Note: no attempt is made to make room in the buffer; the limited
     * use this method gets is assumed to always leave enough space.
     *
     * @param minimum Number of unread bytes that need to be available
     *
     * @return True if at least <code>minimum</code> bytes are available;
     *   false if a read returned less than one byte before that
     *
     * @throws IOException If the underlying stream read fails
     */
    protected boolean ensureLoaded(int minimum)
        throws IOException
    {
        for (int available = mInputLen - mInputPtr; available < minimum; ) {
            final int room = mByteBuffer.length - mInputLen;
            final int nread = mIn.read(mByteBuffer, mInputLen, room);
            if (nread < 1) { // end of input (or nothing could be read)
                return false;
            }
            mInputLen += nread;
            available += nread;
        }
        return true;
    }

    /**
     * Replaces the contents of the byte buffer with the next chunk of
     * input from the underlying stream, resetting the input pointer to
     * the start of the buffer.
     *
     * @throws IOException If the underlying stream read fails
     * @throws WstxException (as {@code WstxEOFException}) if the read
     *   produced less than one byte
     */
    protected void loadMore()
        throws IOException, WstxException
    {
        /* Offsets used for error reporting have to be adjusted before
         * the old buffer length is lost. Note that these are still byte
         * (not character) offsets, even for multi-byte encodings.
         */
        final int consumed = mInputLen;
        mInputRowStart -= consumed;
        mInputProcessed += consumed;

        mInputPtr = 0;
        final int nread = mIn.read(mByteBuffer, 0, mByteBuffer.length);
        mInputLen = nread;
        if (nread >= 1) {
            return;
        }
        throw new WstxEOFException(ParsingErrorMsgs.SUFFIX_IN_XML_DECL,
                                   getLocation());
    }

    /*
    /////////////////////////////////////////////////////
    // Implementations of abstract parsing methods
    /////////////////////////////////////////////////////
    */

    /**
     * Moves the input pointer back by one character. The step is the
     * absolute value of {@code mBytesPerChar}, so that the negative
     * marker value used for translated input still results in the
     * pointer moving backwards.
     */
    protected void pushback() {
        final int step = (mBytesPerChar < 0) ? -mBytesPerChar : mBytesPerChar;
        mInputPtr -= step;
    }

    /**
     * Reads the next character from input, using the access method
     * that matches the current character width setting: plain byte
     * access for width 1, translation for the marker value -1, and
     * fixed-width multi-byte decoding otherwise.
     *
     * @return Next character, as an int
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying access methods
     */
    protected int getNext()
        throws IOException, WstxException
    {
        if (mBytesPerChar == 1) {
            final byte b = (mInputPtr < mInputLen) ?
                mByteBuffer[mInputPtr++] : nextByte();
            return (b & 0xFF);
        }
        return (mBytesPerChar == -1) ? nextTranslated() : nextMultiByte();
    }


    /**
     * Skips any white space at the current position, and then reads and
     * returns the character that follows.
     *
     * @param reqWs If true, at least one white space character is
     *   expected; if none was skipped, the problem is reported via
     *   {@code reportUnexpectedChar} (with the next character)
     *
     * @return First character after the skipped white space
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying skip, read and
     *   reporting methods
     */
    protected int getNextAfterWs(boolean reqWs)
        throws IOException, WstxException
    {
        final int skipped;

        switch (mBytesPerChar) {
        case 1: // single byte
            skipped = skipSbWs();
            break;
        case -1: // translated
            skipped = skipTranslatedWs();
            break;
        default: // multi byte
            skipped = skipMbWs();
        }

        if (reqWs && skipped == 0) {
            reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE);
        }

        // Same logic as getNext(), kept inline here
        if (mBytesPerChar == 1) {
            final byte b = (mInputPtr < mInputLen) ?
                mByteBuffer[mInputPtr++] : nextByte();
            return (b & 0xFF);
        }
        return (mBytesPerChar == -1) ? nextTranslated() : nextMultiByte();
    }

    /**
     * Matches input against the given keyword, delegating to the
     * single-byte, translated or multi-byte matcher depending on the
     * current character width setting.
     *
     * @param exp Keyword to match
     *
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying matchers
     */
    protected int checkKeyword(String exp)
        throws IOException, WstxException
    {
        if (mBytesPerChar == 1) {
            return checkSbKeyword(exp);
        }
        if (mBytesPerChar == -1) {
            return checkTranslatedKeyword(exp);
        }
        return checkMbKeyword(exp);
    }

    /**
     * Reads characters into the given buffer until the closing quote
     * character is found. Linefeeds (CR, LF, and CR followed by LF) are
     * stored as a single LF, with row tracking updated by the
     * linefeed-skipping helpers.
     *
     * @param kw Buffer to store the characters of the value in
     * @param quoteChar Character that terminates the value
     *
     * @return Number of characters stored, if the closing quote was
     *   found before the buffer filled up; -1 if the buffer ran out of
     *   space first (caller is expected to deal with that)
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int readQuotedValue(char[] kw, int quoteChar)
        throws IOException, WstxException
    {
        int stored = 0;
        final int capacity = kw.length;
        final boolean singleByte = (mBytesPerChar == 1);
        final boolean multiByte = (mBytesPerChar > 1);

        while (stored < capacity) {
            int ch;

            if (singleByte) {
                byte b = (mInputPtr < mInputLen) ?
                    mByteBuffer[mInputPtr++] : nextByte();
                if (b == BYTE_NULL) {
                    reportNull();
                }
                if (b == BYTE_CR || b == BYTE_LF) {
                    skipSbLF(b);
                    b = BYTE_LF;
                }
                ch = (b & 0xFF);
            } else if (multiByte) {
                ch = nextMultiByte();
                if (ch == CHAR_CR || ch == CHAR_LF) {
                    skipMbLF(ch);
                    ch = CHAR_LF;
                }
            } else { // translated
                ch = nextTranslated();
                if (ch == CHAR_CR || ch == CHAR_LF) {
                    skipTranslatedLF(ch);
                    ch = CHAR_LF;
                }
            }

            // Loop condition guarantees there is still room at this point
            if (ch == quoteChar) {
                return stored;
            }
            kw[stored++] = (char) ch;
        }

        // Buffer filled up before closing quote was seen; caller's problem
        return -1;
    }

    /**
     * Checks whether input at the current position starts with the xml
     * declaration signature: characters '&lt;', '?', 'x', 'm', 'l',
     * followed by a character with code at or below that of space. If
     * so, the signature is consumed; if not, the input pointer is left
     * where it was.
     *
     * @return True if the signature was found (and skipped); false if
     *   not, or if there is not enough input for six characters
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read methods
     */
    protected boolean hasXmlDecl()
        throws IOException, WstxException
    {
        // Common, fast case first: single-byte access, direct buffer check
        if (mBytesPerChar == 1) {
            if (!ensureLoaded(6)) {
                return false;
            }
            final int base = mInputPtr;
            final boolean found = mByteBuffer[base] == '<'
                && mByteBuffer[base + 1] == '?'
                && mByteBuffer[base + 2] == 'x'
                && mByteBuffer[base + 3] == 'm'
                && mByteBuffer[base + 4] == 'l'
                && ((mByteBuffer[base + 5] & 0xFF) <= CHAR_SPACE);
            if (found) { // skip the signature
                mInputPtr = base + 6;
            }
            return found;
        }

        // Otherwise either translated (EBCDIC) or fixed-width multi-byte
        final boolean translated = (mBytesPerChar == -1);
        final int bytesNeeded = translated ? 6 : (6 * mBytesPerChar);
        if (!ensureLoaded(bytesNeeded)) {
            return false;
        }

        final int mark = mInputPtr; // in case chars need to be 'unread'
        final boolean found;
        if (translated) {
            found = nextTranslated() == '<'
                && nextTranslated() == '?'
                && nextTranslated() == 'x'
                && nextTranslated() == 'm'
                && nextTranslated() == 'l'
                && nextTranslated() <= CHAR_SPACE;
        } else {
            found = nextMultiByte() == '<'
                && nextMultiByte() == '?'
                && nextMultiByte() == 'x'
                && nextMultiByte() == 'm'
                && nextMultiByte() == 'l'
                && nextMultiByte() <= CHAR_SPACE;
        }
        if (!found) {
            mInputPtr = mark; // push data back
        }
        return found;
    }

    /**
     * Constructs a location object for the current input position.
     *<p>
     * Internal offsets are in bytes, so for fixed-size multi-byte
     * encodings they are divided by the character width to get
     * character offsets. For variable-length encodings no adjustment is
     * needed, since the xml declaration only uses the shortest code
     * points (char count == byte count).
     *
     * @return Location with the (0-based) total offset, current row and
     *   column
     */
    protected Location getLocation()
    {
        final int byteTotal = mInputProcessed + mInputPtr;
        final int byteCol = mInputPtr - mInputRowStart;
        final boolean fixedMultiByte = (mBytesPerChar > 1);

        final int total = fixedMultiByte ? (byteTotal / mBytesPerChar) : byteTotal;
        final int col = fixedMultiByte ? (byteCol / mBytesPerChar) : byteCol;

        return new WstxInputLocation(null, mPublicId, mSystemId,
                                     total - 1, // 0-based
                                     mInputRow, col);
    }

    /*
    /////////////////////////////////////////////////////
    // Internal methods, single-byte access methods
    /////////////////////////////////////////////////////
    */

    /**
     * Returns the next byte from the buffer, first loading more input
     * if all of the current buffer contents have been consumed.
     *
     * @return Next input byte
     *
     * @throws IOException If reading more input fails
     * @throws WstxException If {@code loadMore} signals end of input
     */
    protected byte nextByte()
        throws IOException, WstxException
    {
        if (mInputLen <= mInputPtr) {
            loadMore();
        }
        final int pos = mInputPtr;
        mInputPtr = pos + 1;
        return mByteBuffer[pos];
    }

    /**
     * Skips white space in single-byte mode: any byte with (unsigned)
     * value at or below that of space is skipped. Linefeeds are passed
     * to {@code skipSbLF}, and null bytes are reported. The first byte
     * above space is pushed back.
     *
     * @return Number of white space characters skipped; a linefeed
     *   handled by {@code skipSbLF} counts as one
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int skipSbWs()
        throws IOException, WstxException
    {
        int skipped = 0;

        for (;;) {
            final byte b = (mInputPtr < mInputLen) ?
                mByteBuffer[mInputPtr++] : nextByte();

            if ((b & 0xFF) > CHAR_SPACE) {
                --mInputPtr; // not white space; unread it
                return skipped;
            }
            if (b == BYTE_CR || b == BYTE_LF) {
                skipSbLF(b);
            } else if (b == BYTE_NULL) {
                reportNull();
            }
            ++skipped;
        }
    }

    /**
     * Completes handling of a linefeed in single-byte mode: if the
     * given byte was a CR, an immediately following LF is consumed as
     * part of the same linefeed. Row number and row start offset are
     * then updated.
     *
     * @param lfByte Linefeed byte (CR or LF) that was just read
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read method
     */
    protected void skipSbLF(byte lfByte)
        throws IOException, WstxException
    {
        if (lfByte == BYTE_CR) {
            final byte following = (mInputPtr < mInputLen) ?
                mByteBuffer[mInputPtr++] : nextByte();
            if (following != BYTE_LF) {
                --mInputPtr; // not a 2-byte linefeed; unread
            }
        }
        mInputRowStart = mInputPtr;
        ++mInputRow;
    }

    /**
     * Matches single-byte input against the given keyword. Matching
     * starts from the second character of the keyword (index 1); the
     * first character is not compared here.
     *
     * @param expected Keyword to match
     *
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int checkSbKeyword(String expected)
        throws IOException, WstxException
    {
        final int end = expected.length();
        int idx = 1;

        while (idx < end) {
            final byte b = (mInputPtr < mInputLen) ?
                mByteBuffer[mInputPtr++] : nextByte();

            if (b == BYTE_NULL) {
                reportNull();
            }
            final int actual = (b & 0xFF);
            if (actual != expected.charAt(idx)) {
                return actual;
            }
            ++idx;
        }

        return CHAR_NULL;
    }

    /*
    /////////////////////////////////////////////////////
    // Internal methods, multi-byte/translated access/checks
    /////////////////////////////////////////////////////
    */

    /**
     * Decodes the next character of a fixed-width multi-byte encoding:
     * two bytes if the character width is 2, four bytes otherwise. Byte
     * order is chosen by {@code mBigEndian}. A decoded value of zero is
     * reported via {@code reportNull}.
     *
     * @return Decoded character, as an int
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int nextMultiByte()
        throws IOException, WstxException
    {
        // Bytes are masked to unsigned values right away
        final int b0 = ((mInputPtr < mInputLen) ?
                        mByteBuffer[mInputPtr++] : nextByte()) & 0xFF;
        final int b1 = ((mInputPtr < mInputLen) ?
                        mByteBuffer[mInputPtr++] : nextByte()) & 0xFF;
        final int ch;

        if (mBytesPerChar == 2) {
            ch = mBigEndian ? ((b0 << 8) | b1) : (b0 | (b1 << 8));
        } else {
            // Has to be 4 bytes
            final int b2 = ((mInputPtr < mInputLen) ?
                            mByteBuffer[mInputPtr++] : nextByte()) & 0xFF;
            final int b3 = ((mInputPtr < mInputLen) ?
                            mByteBuffer[mInputPtr++] : nextByte()) & 0xFF;

            ch = mBigEndian
                ? ((b0 << 24) | (b1 << 16) | (b2 << 8) | b3)
                : ((b3 << 24) | (b2 << 16) | (b1 << 8) | b0);
        }

        // Let's catch null chars early
        if (ch == 0) {
            reportNull();
        }
        return ch;
    }

    /**
     * Reads the next byte and maps it to a character using the
     * single-byte translation table, indexed by the unsigned byte
     * value. Negative table entries (special characters, not treated
     * specially for now) are returned negated, that is, as positive
     * values.
     *
     * @return Translated character, as an int
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read method
     */
    protected int nextTranslated()
        throws IOException, WstxException
    {
        final int index = ((mInputPtr < mInputLen) ?
                           mByteBuffer[mInputPtr++] : nextByte()) & 0xFF;
        final int mapped = mSingleByteTranslation[index];
        return (mapped < 0) ? -mapped : mapped;
    }

    /**
     * Skips white space in fixed-width multi-byte mode: any character
     * with code at or below that of space is skipped. Linefeeds are
     * passed to {@code skipMbLF}, and null characters are reported. The
     * first character above space is pushed back.
     *
     * @return Number of white space characters skipped; a linefeed
     *   handled by {@code skipMbLF} counts as one
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int skipMbWs()
        throws IOException, WstxException
    {
        int skipped = 0;

        for (;;) {
            final int ch = nextMultiByte();

            if (ch > CHAR_SPACE) {
                mInputPtr -= mBytesPerChar; // unread the whole character
                return skipped;
            }
            if (ch == CHAR_CR || ch == CHAR_LF) {
                skipMbLF(ch);
            } else if (ch == CHAR_NULL) {
                reportNull();
            }
            ++skipped;
        }
    }

    /**
     * Skips white space in translated mode: any character with code at
     * or below that of space, as well as NEL, is skipped. Linefeeds
     * (CR, LF) are passed to {@code skipTranslatedLF}, and null
     * characters are reported. The first other character is pushed
     * back.
     *
     * @return Number of white space characters skipped; a linefeed
     *   handled by {@code skipTranslatedLF} counts as one
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int skipTranslatedWs()
        throws IOException, WstxException
    {
        int skipped = 0;

        for (;;) {
            final int ch = nextTranslated();

            // Hmmh. Are we to accept NEL (0x85)? For now it is accepted.
            final boolean isWs = (ch <= CHAR_SPACE) || (ch == CHAR_NEL);
            if (!isWs) {
                --mInputPtr;
                return skipped;
            }
            if (ch == CHAR_CR || ch == CHAR_LF) {
                skipTranslatedLF(ch);
            } else if (ch == CHAR_NULL) {
                reportNull();
            }
            ++skipped;
        }
    }

    /**
     * Completes handling of a linefeed in fixed-width multi-byte mode:
     * if the given character was a CR, an immediately following LF is
     * consumed as part of the same linefeed. Row number and row start
     * offset are then updated.
     *
     * @param lf Linefeed character (CR or LF) that was just read
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read method
     */
    protected void skipMbLF(int lf)
        throws IOException, WstxException
    {
        if (lf == CHAR_CR) {
            final int following = nextMultiByte();
            if (following != CHAR_LF) {
                mInputPtr -= mBytesPerChar; // unread the whole character
            }
        }
        mInputRowStart = mInputPtr;
        ++mInputRow;
    }

    /**
     * Completes handling of a linefeed in translated mode: if the given
     * character was a CR, an immediately following LF is consumed as
     * part of the same linefeed. Row number and row start offset are
     * then updated.
     *
     * @param lf Linefeed character (CR or LF) that was just read
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read method
     */
    protected void skipTranslatedLF(int lf)
        throws IOException, WstxException
    {
        if (lf == CHAR_CR) {
            final int following = nextTranslated();
            if (following != CHAR_LF) {
                --mInputPtr; // one byte per translated character
            }
        }
        mInputRowStart = mInputPtr;
        ++mInputRow;
    }

    /**
     * Matches fixed-width multi-byte input against the given keyword.
     * Matching starts from the second character of the keyword
     * (index 1); the first character is not compared here.
     *
     * @param expected Keyword to match
     *
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int checkMbKeyword(String expected)
        throws IOException, WstxException
    {
        final int end = expected.length();
        int idx = 1;

        while (idx < end) {
            final int actual = nextMultiByte();
            if (actual == BYTE_NULL) {
                reportNull();
            }
            if (actual != expected.charAt(idx)) {
                return actual;
            }
            ++idx;
        }

        return CHAR_NULL;
    }

    /**
     * Matches translated input against the given keyword. Matching
     * starts from the second character of the keyword (index 1); the
     * first character is not compared here.
     *
     * @param expected Keyword to match
     *
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     *
     * @throws IOException If more input needs to be read and that fails
     * @throws WstxException Declared for the underlying read and
     *   reporting methods
     */
    protected int checkTranslatedKeyword(String expected)
        throws IOException, WstxException
    {
        final int end = expected.length();
        int idx = 1;

        while (idx < end) {
            final int actual = nextTranslated();
            if (actual == BYTE_NULL) {
                reportNull();
            }
            if (actual != expected.charAt(idx)) {
                return actual;
            }
            ++idx;
        }

        return CHAR_NULL;
    }

    /*
    ////////////////////////////////////////
    // Other private methods:
    ////////////////////////////////////////
    */

    /**
     * Verifies that the character width of the declared encoding
     * matches the width that was detected from the physical input.
     * Nothing is checked if no width was detected.
     *
     * @param id Name of the declared encoding (used in messages)
     * @param bpc Bytes per character the declared encoding uses
     *
     * @throws WstxException Declared for {@code reportXmlProblem},
     *   through which mismatches are reported
     */
    private void verifyEncoding(String id, int bpc)
        throws WstxException
    {
        // Nothing to compare against, or declaration agrees with detection?
        if (!mByteSizeFound || bpc == mBytesPerChar) {
            return;
        }
        // [WSTX-138]: Needs to detect EBCDIC discrepancy
        if (mEBCDIC) {
            reportXmlProblem("Declared encoding '"+id+"' incompatible with auto-detected physical encoding (EBCDIC variant), can not decode input since actual code page not known");
        }
        reportXmlProblem("Declared encoding '"+id+"' uses "+bpc
                         +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode");
    }

    /**
     * Verifies that both the character width and the byte order of the
     * declared encoding match what was detected from the physical
     * input. Nothing is checked if no width was detected.
     *
     * @param id Name of the declared encoding (used in messages)
     * @param bpc Bytes per character the declared encoding uses
     * @param bigEndian Whether the declared encoding is big-endian
     *
     * @throws WstxException Declared for {@code reportXmlProblem},
     *   through which mismatches are reported
     */
    private void verifyEncoding(String id, int bpc, boolean bigEndian)
        throws WstxException
    {
        if (!mByteSizeFound) {
            return;
        }
        verifyEncoding(id, bpc);

        if (bigEndian == mBigEndian) {
            return;
        }
        final String declaredOrder = bigEndian ? "big" : "little";
        reportXmlProblem
            ("Declared encoding '"+id+"' has different endianness ("
             +declaredOrder+" endian) than what physical ordering appeared to be; cannot decode");
    }

    /**
     * Reports detection of a UCS-4 byte order that is not supported.
     * Always throws.
     *
     * @param type Byte order that was detected (included in the
     *   exception message)
     *
     * @throws IOException Always; specifically a
     *   {@link CharConversionException}
     */
    private void reportWeirdUCS4(String type)
        throws IOException
    {
        final String msg = "Unsupported UCS-4 endianness ("+type+") detected";
        throw new CharConversionException(msg);
    }

    /**
     * Reports that the BOM required by the given encoding is missing.
     * Always throws.
     *
     * @param enc Name of the encoding (included in the exception
     *   message)
     *
     * @throws WstxException Always, with the current location attached
     */
    private void reportMissingBOM(String enc)
        throws WstxException
    {
        final String msg = "Missing BOM for encoding '"+enc+"'; can not be omitted";
        throw new WstxException(msg, getLocation());
    }
}