All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ctc.wstx.io.StreamBootstrapper Maven / Gradle / Ivy

Go to download

Woodstox is a high-performance XML processor that implements Stax (JSR-173) and SAX2 APIs

There is a newer version: 4.4.1
Show newest version
package com.ctc.wstx.io;

import java.io.*;

import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;

import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ParsingErrorMsgs;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.exc.*;

/**
 * Input bootstrap class used with streams, when encoding is not known
 * (when encoding is specified by application, a reader is constructed,
 * and then reader-based bootstrapper is used).
 *

after last valid byte in the buffer */ private StreamBootstrapper(String pubId, String sysId, byte[] data, int start, int end) { super(pubId, sysId); mIn = null; mRecycleBuffer = false; mByteBuffer = data; mInputPtr = start; mInputEnd = end; } /* //////////////////////////////////////// // Public API //////////////////////////////////////// */ /** * Factory method used when the underlying data provider is an * actual stream. */ public static StreamBootstrapper getInstance(String pubId, String sysId, InputStream in) { return new StreamBootstrapper(pubId, sysId, in); } /** * Factory method used when the underlying data provider is a pre-allocated * block source, and no stream is used. * Additionally the buffer passed is not owned by the bootstrapper * or Reader that is created, so it is not to be recycled. */ public static StreamBootstrapper getInstance(String pubId, String sysId, byte[] data, int start, int end) { return new StreamBootstrapper(pubId, sysId, data, start, end); } public Reader bootstrapInput(ReaderConfig cfg, boolean mainDoc, int xmlVersion) throws IOException, XMLStreamException { String normEnc = null; // First, let's get the buffers... int bufSize = cfg.getInputBufferLength(); if (bufSize < MIN_BUF_SIZE) { bufSize = MIN_BUF_SIZE; } if (mByteBuffer == null) { // non-null if we were passed a buffer mByteBuffer = cfg.allocFullBBuffer(bufSize); } resolveStreamEncoding(); if (hasXmlDecl()) { // note: readXmlDecl will set mXml11Handling too readXmlDecl(mainDoc, xmlVersion); if (mFoundEncoding != null) { normEnc = verifyXmlEncoding(mFoundEncoding); } } else { /* We'll actually then just inherit whatever main doc had... * (or in case there was no parent, just copy the 'unknown') */ mXml11Handling = (XmlConsts.XML_V_11 == xmlVersion); } // Now, have we figured out the encoding? if (normEnc == null) { // not via xml declaration /* 21-Sep-2007, TSa: As with any non-UTF-8 encoding, declaration * isn't optional any more. Besides, we need that information * anyway to know which variant it is. */ if (mEBCDIC) { if (mFoundEncoding == null || mFoundEncoding.length() == 0) { reportXmlProblem("Missing encoding declaration: underlying encoding looks like an EBCDIC variant, but no xml encoding declaration found"); } // Hmmh. What should be the canonical name? Let's just use found encoding? normEnc = mFoundEncoding; } else if (mBytesPerChar == 2) { // UTF-16, BE/LE normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE; } else if (mBytesPerChar == 4) { // UCS-4... ? /* 22-Mar-2005, TSa: JDK apparently has no way of dealing * with these encodings... not sure if and how it should * be dealt with, really. Name could be UCS-4xx... or * perhaps UTF-32xx */ normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE; } else { // Ok, default has to be UTF-8, as per XML specs normEnc = CharsetNames.CS_UTF8; } } mInputEncoding = normEnc; /* And then the reader. Let's figure out if we can use our own fast * implementations first: */ BaseReader r; // Normalized, can thus use straight equality checks now if (normEnc == CharsetNames.CS_UTF8) { r = new UTF8Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer); } else if (normEnc == CharsetNames.CS_ISO_LATIN1) { r = new ISOLatinReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer); } else if (normEnc == CharsetNames.CS_US_ASCII) { r = new AsciiReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer); } else if (normEnc.startsWith(CharsetNames.CS_UTF32)) { // let's augment with actual endianness info if (normEnc == CharsetNames.CS_UTF32) { mInputEncoding = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE; } r = new UTF32Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer, mBigEndian); } else { // Nah, JDK needs to try it // Ok; first, do we need to merge stuff back? InputStream in = mIn; if (mInputPtr < mInputEnd) { in = new MergedStream(cfg, in, mByteBuffer, mInputPtr, mInputEnd); } /* 20-Jan-2006, TSa: Ok; although it is possible to declare * stream as 'UTF-16', JDK may need help in figuring out * the right order, so let's be explicit: */ if (normEnc == CharsetNames.CS_UTF16) { mInputEncoding = normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE; } try { return new InputStreamReader(in, normEnc); } catch (UnsupportedEncodingException usex) { throw new WstxIOException("Unsupported encoding: "+usex.getMessage()); } } if (mXml11Handling) { r.setXmlCompliancy(XmlConsts.XML_V_11); } return r; } /** * Since this class only gets used when encoding is not explicitly * passed, need use the encoding that was auto-detected... */ public String getInputEncoding() { return mInputEncoding; } public int getInputTotal() { int total = mInputProcessed + mInputPtr; if (mBytesPerChar > 1) { total /= mBytesPerChar; } return total; } public int getInputColumn() { int col = mInputPtr - mInputRowStart; if (mBytesPerChar > 1) { col /= mBytesPerChar; } return col; } /* //////////////////////////////////////// // Internal methods, parsing //////////////////////////////////////// */ /** * Method called to try to figure out physical encoding the underlying * input stream uses. */ protected void resolveStreamEncoding() throws IOException, WstxException { // Let's first set defaults: mBytesPerChar = 0; mBigEndian = true; /* Ok; first just need 4 bytes for determining bytes-per-char from * BOM or first char(s) of likely xml declaration: */ if (ensureLoaded(4)) { bomblock: do { // BOM/auto-detection block int quartet = (mByteBuffer[0] << 24) | ((mByteBuffer[1] & 0xFF) << 16) | ((mByteBuffer[2] & 0xFF) << 8) | (mByteBuffer[3] & 0xFF); /* Handling of (usually) optional BOM (required for * multi-byte formats); first 32-bit charsets: */ switch (quartet) { case 0x0000FEFF: mBigEndian = true; mInputPtr = mBytesPerChar = 4; break bomblock; case 0xFFFE0000: // UCS-4, LE? mInputPtr = mBytesPerChar = 4; mBigEndian = false; break bomblock; case 0x0000FFFE: // UCS-4, in-order... reportWeirdUCS4("2143"); break bomblock; case 0x0FEFF0000: // UCS-4, in-order... reportWeirdUCS4("3412"); break bomblock; } // Ok, if not, how about 16-bit encoding BOMs? int msw = quartet >>> 16; if (msw == 0xFEFF) { // UTF-16, BE mInputPtr = mBytesPerChar = 2; mBigEndian = true; break; } if (msw == 0xFFFE) { // UTF-16, LE mInputPtr = mBytesPerChar = 2; mBigEndian = false; break; } // And if not, then UTF-8 BOM? if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8 mInputPtr = 3; mBytesPerChar = 1; mBigEndian = true; // doesn't really matter break; } /* And if that wasn't succesful, how about auto-detection * for ' 0); // Let's update location markers to ignore BOM. mInputProcessed = -mInputPtr; mInputRowStart = mInputPtr; } /* Hmmh. If we haven't figured it out, let's just assume * UTF-8 as per XML specs: */ mByteSizeFound = (mBytesPerChar != 0); if (!mByteSizeFound) { mBytesPerChar = 1; mBigEndian = true; // doesn't matter } } /** * @return Normalized encoding name */ protected String verifyXmlEncoding(String enc) throws WstxException { enc = CharsetNames.normalize(enc); // Let's actually verify we got matching information: if (enc == CharsetNames.CS_UTF8) { verifyEncoding(enc, 1); } else if (enc == CharsetNames.CS_ISO_LATIN1) { verifyEncoding(enc, 1); } else if (enc == CharsetNames.CS_US_ASCII) { verifyEncoding(enc, 1); } else if (enc == CharsetNames.CS_UTF16) { // BOM is obligatory, to know the ordering /* 22-Mar-2005, TSa: Actually, since we don't have a * custom decoder, so the underlying JDK Reader may * have dealt with it transparently... so we can not * really throw an exception here. */ //if (!mHadBOM) { //reportMissingBOM(enc); //} verifyEncoding(enc, 2); } else if (enc == CharsetNames.CS_UTF16LE) { verifyEncoding(enc, 2, false); } else if (enc == CharsetNames.CS_UTF16BE) { verifyEncoding(enc, 2, true); } else if (enc == CharsetNames.CS_UTF32) { // Do we require a BOM here? we can live without it... //if (!mHadBOM) { // reportMissingBOM(enc); //} verifyEncoding(enc, 4); } else if (enc == CharsetNames.CS_UTF32LE) { verifyEncoding(enc, 4, false); } else if (enc == CharsetNames.CS_UTF32BE) { verifyEncoding(enc, 4, true); } return enc; } /* ///////////////////////////////////////////////////// // Internal methods, loading input data ///////////////////////////////////////////////////// */ protected boolean ensureLoaded(int minimum) throws IOException { /* Let's assume here buffer has enough room -- this will always * be true for the limited used this method gets */ int gotten = (mInputEnd - mInputPtr); while (gotten < minimum) { int count = (mIn == null) ? -1 : mIn.read(mByteBuffer, mInputEnd, mByteBuffer.length - mInputEnd); if (count < 1) { return false; } mInputEnd += count; gotten += count; } return true; } protected void loadMore() throws IOException, WstxException { /* Need to make sure offsets are properly updated for error * reporting purposes, and do this now while previous amounts * are still known. */ /* Note: at this point these are all in bytes, not chars (for multibyte * encodings) */ mInputProcessed += mInputEnd; mInputRowStart -= mInputEnd; mInputPtr = 0; mInputEnd = (mIn == null) ? -1 : mIn.read(mByteBuffer, 0, mByteBuffer.length); if (mInputEnd < 1) { throw new WstxEOFException(ParsingErrorMsgs.SUFFIX_IN_XML_DECL, getLocation()); } } /* ///////////////////////////////////////////////////// // Implementations of abstract parsing methods ///////////////////////////////////////////////////// */ protected void pushback() { if (mBytesPerChar < 0) { mInputPtr += mBytesPerChar; } else { mInputPtr -= mBytesPerChar; } } protected int getNext() throws IOException, WstxException { if (mBytesPerChar != 1) { if (mBytesPerChar == -1) { // need to translate return nextTranslated(); } return nextMultiByte(); } byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); return (b & 0xFF); } protected int getNextAfterWs(boolean reqWs) throws IOException, WstxException { int count; if (mBytesPerChar == 1) { // single byte count = skipSbWs(); } else { if (mBytesPerChar == -1) { // translated count = skipTranslatedWs(); } else { // multi byte count = skipMbWs(); } } if (reqWs && count == 0) { reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE); } // inlined getNext() if (mBytesPerChar != 1) { if (mBytesPerChar == -1) { // translated return nextTranslated(); } return nextMultiByte(); } byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); return (b & 0xFF); } /** * @return First character that does not match expected, if any; * CHAR_NULL if match succeeded */ protected int checkKeyword(String exp) throws IOException, WstxException { if (mBytesPerChar != 1) { if (mBytesPerChar == -1) { return checkTranslatedKeyword(exp); } return checkMbKeyword(exp); } return checkSbKeyword(exp); } protected int readQuotedValue(char[] kw, int quoteChar) throws IOException, WstxException { int i = 0; int len = kw.length; boolean simple = (mBytesPerChar == 1); boolean mb = !simple && (mBytesPerChar > 1); while (i < len) { int c; if (simple) { byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); if (b == BYTE_NULL) { reportNull(); } if (b == BYTE_CR || b == BYTE_LF) { skipSbLF(b); b = BYTE_LF; } c = (b & 0xFF); } else { if (mb) { c = nextMultiByte(); if (c == CHAR_CR || c == CHAR_LF) { skipMbLF(c); c = CHAR_LF; } } else { c = nextTranslated(); if (c == CHAR_CR || c == CHAR_LF) { skipTranslatedLF(c); c = CHAR_LF; } } } if (c == quoteChar) { return (i < len) ? i : -1; } if (i < len) { kw[i++] = (char) c; } } /* If we end up this far, we ran out of buffer space... let's let * caller figure that out, though */ return -1; } protected boolean hasXmlDecl() throws IOException, WstxException { /* Separate handling for common and fast case; 1/variable byte * encodings that have ASCII subset: */ if (mBytesPerChar == 1) { /* However... there has to be at least 6 bytes available; and if * so, can check the 'signature' easily: */ if (ensureLoaded(6)) { if (mByteBuffer[mInputPtr] == '<' && mByteBuffer[mInputPtr+1] == '?' && mByteBuffer[mInputPtr+2] == 'x' && mByteBuffer[mInputPtr+3] == 'm' && mByteBuffer[mInputPtr+4] == 'l' && ((mByteBuffer[mInputPtr+5] & 0xFF) <= CHAR_SPACE)) { // Let's skip stuff so far: mInputPtr += 6; return true; } } } else if (mBytesPerChar == -1) { // translated (EBCDIC) if (ensureLoaded(6)) { int start = mInputPtr; // if we have to 'unread' chars if (nextTranslated() == '<' && nextTranslated() == '?' && nextTranslated() == 'x' && nextTranslated() == 'm' && nextTranslated() == 'l' && nextTranslated() <= CHAR_SPACE) { return true; } mInputPtr = start; // push data back } } else { // ... and then for slower fixed-multibyte encodings: // Is there enough data for checks? if (ensureLoaded (6 * mBytesPerChar)) { int start = mInputPtr; // if we have to 'unread' chars if (nextMultiByte() == '<' && nextMultiByte() == '?' && nextMultiByte() == 'x' && nextMultiByte() == 'm' && nextMultiByte() == 'l' && nextMultiByte() <= CHAR_SPACE) { return true; } mInputPtr = start; // push data back } } return false; } protected Location getLocation() { /* Ok; for fixed-size multi-byte encodings, need to divide numbers * to get character locations. For variable-length encodings the * good thing is that xml declaration only uses shortest codepoints, * ie. char count == byte count. */ int total = mInputProcessed + mInputPtr; int col = mInputPtr - mInputRowStart; if (mBytesPerChar > 1) { total /= mBytesPerChar; col /= mBytesPerChar; } return new WstxInputLocation(null, mPublicId, mSystemId, total - 1, // 0-based mInputRow, col); } /* ///////////////////////////////////////////////////// // Internal methods, single-byte access methods ///////////////////////////////////////////////////// */ protected byte nextByte() throws IOException, WstxException { if (mInputPtr >= mInputEnd) { loadMore(); } return mByteBuffer[mInputPtr++]; } protected int skipSbWs() throws IOException, WstxException { int count = 0; while (true) { byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); if ((b & 0xFF) > CHAR_SPACE) { --mInputPtr; break; } if (b == BYTE_CR || b == BYTE_LF) { skipSbLF(b); } else if (b == BYTE_NULL) { reportNull(); } ++count; } return count; } protected void skipSbLF(byte lfByte) throws IOException, WstxException { if (lfByte == BYTE_CR) { byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); if (b != BYTE_LF) { --mInputPtr; // pushback if not 2-char/byte lf } } ++mInputRow; mInputRowStart = mInputPtr; } /** * @return First character that does not match expected, if any; * CHAR_NULL if match succeeded */ protected int checkSbKeyword(String expected) throws IOException, WstxException { int len = expected.length(); for (int ptr = 1; ptr < len; ++ptr) { byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); if (b == BYTE_NULL) { reportNull(); } if ((b & 0xFF) != expected.charAt(ptr)) { return (b & 0xFF); } } return CHAR_NULL; } /* ///////////////////////////////////////////////////// // Internal methods, multi-byte/translated access/checks ///////////////////////////////////////////////////// */ protected int nextMultiByte() throws IOException, WstxException { byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); byte b2 = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); int c; if (mBytesPerChar == 2) { if (mBigEndian) { c = ((b & 0xFF) << 8) | (b2 & 0xFF); } else { c = (b & 0xFF) | ((b2 & 0xFF) << 8); } } else { // Has to be 4 bytes byte b3 = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); byte b4 = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); if (mBigEndian) { c = (b << 24) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 8) | (b4 & 0xFF); } else { c = (b4 << 24) | ((b3 & 0xFF) << 16) | ((b2 & 0xFF) << 8) | (b & 0xFF); } } // Let's catch null chars early if (c == 0) { reportNull(); } return c; } protected int nextTranslated() throws IOException, WstxException { byte b = (mInputPtr < mInputEnd) ? mByteBuffer[mInputPtr++] : nextByte(); int ch = mSingleByteTranslation[b & 0xFF]; if (ch < 0) { // special char... won't care for now ch = -ch; } return ch; } protected int skipMbWs() throws IOException, WstxException { int count = 0; while (true) { int c = nextMultiByte(); if (c > CHAR_SPACE) { mInputPtr -= mBytesPerChar; break; } if (c == CHAR_CR || c == CHAR_LF) { skipMbLF(c); } else if (c == CHAR_NULL) { reportNull(); } ++count; } return count; } protected int skipTranslatedWs() throws IOException, WstxException { int count = 0; while (true) { int c = nextTranslated(); // Hmmh. Are we to accept NEL (0x85)? if (c > CHAR_SPACE && c != CHAR_NEL) { --mInputPtr; break; } if (c == CHAR_CR || c == CHAR_LF) { skipTranslatedLF(c); } else if (c == CHAR_NULL) { reportNull(); } ++count; } return count; } protected void skipMbLF(int lf) throws IOException, WstxException { if (lf == CHAR_CR) { int c = nextMultiByte(); if (c != CHAR_LF) { mInputPtr -= mBytesPerChar; } } ++mInputRow; mInputRowStart = mInputPtr; } protected void skipTranslatedLF(int lf) throws IOException, WstxException { if (lf == CHAR_CR) { int c = nextTranslated(); if (c != CHAR_LF) { mInputPtr -= 1; } } ++mInputRow; mInputRowStart = mInputPtr; } /** * @return First character that does not match expected, if any; * CHAR_NULL if match succeeded */ protected int checkMbKeyword(String expected) throws IOException, WstxException { int len = expected.length(); for (int ptr = 1; ptr < len; ++ptr) { int c = nextMultiByte(); if (c == BYTE_NULL) { reportNull(); } if (c != expected.charAt(ptr)) { return c; } } return CHAR_NULL; } protected int checkTranslatedKeyword(String expected) throws IOException, WstxException { int len = expected.length(); for (int ptr = 1; ptr < len; ++ptr) { int c = nextTranslated(); if (c == BYTE_NULL) { reportNull(); } if (c != expected.charAt(ptr)) { return c; } } return CHAR_NULL; } /* //////////////////////////////////////// // Other private methods: //////////////////////////////////////// */ private void verifyEncoding(String id, int bpc) throws WstxException { if (mByteSizeFound) { /* Let's verify that if we matched an encoding, it's the same * as what was declared... */ if (bpc != mBytesPerChar) { // [WSTX-138]: Needs to detect EBCDIC discrepancy if (mEBCDIC) { reportXmlProblem("Declared encoding '"+id+"' incompatible with auto-detected physical encoding (EBCDIC variant), can not decode input since actual code page not known"); } reportXmlProblem("Declared encoding '"+id+"' uses "+bpc +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode"); } } } private void verifyEncoding(String id, int bpc, boolean bigEndian) throws WstxException { if (mByteSizeFound) { verifyEncoding(id, bpc); if (bigEndian != mBigEndian) { String bigStr = bigEndian ? "big" : "little"; reportXmlProblem ("Declared encoding '"+id+"' has different endianness (" +bigStr+" endian) than what physical ordering appeared to be; cannot decode"); } } } private void reportWeirdUCS4(String type) throws IOException { throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected"); } private void reportMissingBOM(String enc) throws WstxException { throw new WstxException("Missing BOM for encoding '"+enc+"'; can not be omitted", getLocation()); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy