All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.java.com.ctc.wstx.io.StreamBootstrapper Maven / Gradle / Ivy

package com.ctc.wstx.io;

import java.io.*;

import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;

import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ParsingErrorMsgs;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.exc.*;
import com.ctc.wstx.util.StringUtil;

/**
 * Input bootstrap class used with streams, when encoding is not known
 * (when encoding is specified by application, a reader is constructed,
 * and then reader-based bootstrapper is used).
 *

1) { total /= mBytesPerChar; } return total; } public int getInputColumn() { int col = mInputPtr - mInputRowStart; if (mBytesPerChar > 1) { col /= mBytesPerChar; } return col; } /* //////////////////////////////////////// // Internal methods, parsing //////////////////////////////////////// */ /** * Method called to try to figure out physical encoding the underlying * input stream uses. */ protected void resolveStreamEncoding() throws IOException, WstxException { // Let's first set defaults: mBytesPerChar = 0; mBigEndian = true; /* Ok; first just need 4 bytes for determining bytes-per-char from * BOM or first char(s) of likely xml declaration: */ if (ensureLoaded(4)) { bomblock: do { // BOM/auto-detection block int quartet = (mByteBuffer[0] << 24) | ((mByteBuffer[1] & 0xFF) << 16) | ((mByteBuffer[2] & 0xFF) << 8) | (mByteBuffer[3] & 0xFF); /* Handling of (usually) optional BOM (required for * multi-byte formats); first 32-bit charsets: */ switch (quartet) { case 0x0000FEFF: mBigEndian = true; mInputPtr = mBytesPerChar = 4; break bomblock; case 0xFFFE0000: // UCS-4, LE? mInputPtr = mBytesPerChar = 4; mBigEndian = false; break bomblock; case 0x0000FFFE: // UCS-4, in-order... reportWeirdUCS4("2143"); break bomblock; case 0x0FEFF0000: // UCS-4, in-order... reportWeirdUCS4("3412"); break bomblock; } // Ok, if not, how about 16-bit encoding BOMs? int msw = quartet >>> 16; if (msw == 0xFEFF) { // UTF-16, BE mInputPtr = mBytesPerChar = 2; mBigEndian = true; break; } if (msw == 0xFFFE) { // UTF-16, LE mInputPtr = mBytesPerChar = 2; mBigEndian = false; break; } // And if not, then UTF-8 BOM? if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8 mInputPtr = 3; mBytesPerChar = 1; mBigEndian = true; // doesn't really matter break; } /* And if that wasn't succesful, how about auto-detection * for ' 0); // Let's update location markers to ignore BOM. mInputProcessed = -mInputPtr; mInputRowStart = mInputPtr; } /* Hmmh. If we haven't figured it out, let's just assume * UTF-8 as per XML specs: */ mByteSizeFound = (mBytesPerChar != 0); if (!mByteSizeFound) { mBytesPerChar = 1; mBigEndian = true; // doesn't matter } } /** * @return Normalized encoding name */ protected String verifyXmlEncoding(String enc) throws WstxException { enc = CharsetNames.normalize(enc); // Let's actually verify we got matching information: if (enc == CharsetNames.CS_UTF8) { verifyEncoding(enc, 1); } else if (enc == CharsetNames.CS_ISO_LATIN1) { verifyEncoding(enc, 1); } else if (enc == CharsetNames.CS_US_ASCII) { verifyEncoding(enc, 1); } else if (enc == CharsetNames.CS_UTF16) { // BOM is obligatory, to know the ordering /* 22-Mar-2005, TSa: Actually, since we don't have a * custom decoder, so the underlying JDK Reader may * have dealt with it transparently... so we can not * really throw an exception here. */ //if (!mHadBOM) { //reportMissingBOM(enc); //} verifyEncoding(enc, 2); } else if (enc == CharsetNames.CS_UTF16LE) { verifyEncoding(enc, 2, false); } else if (enc == CharsetNames.CS_UTF16BE) { verifyEncoding(enc, 2, true); } else if (enc == CharsetNames.CS_UTF32) { // Do we require a BOM here? we can live without it... //if (!mHadBOM) { // reportMissingBOM(enc); //} verifyEncoding(enc, 4); } else if (enc == CharsetNames.CS_UTF32LE) { verifyEncoding(enc, 4, false); } else if (enc == CharsetNames.CS_UTF32BE) { verifyEncoding(enc, 4, true); } return enc; } /* ///////////////////////////////////////////////////// // Internal methods, loading input data ///////////////////////////////////////////////////// */ protected boolean ensureLoaded(int minimum) throws IOException { /* Let's assume here buffer has enough room -- this will always * be true for the limited used this method gets */ int gotten = (mInputLen - mInputPtr); while (gotten < minimum) { int count = mIn.read(mByteBuffer, mInputLen, mByteBuffer.length - mInputLen); if (count < 1) { return false; } mInputLen += count; gotten += count; } return true; } protected void loadMore() throws IOException, WstxException { /* Need to make sure offsets are properly updated for error * reporting purposes, and do this now while previous amounts * are still known. */ /* Note: at this point these are all in bytes, not chars (for multibyte * encodings) */ mInputProcessed += mInputLen; mInputRowStart -= mInputLen; mInputPtr = 0; mInputLen = mIn.read(mByteBuffer, 0, mByteBuffer.length); if (mInputLen < 1) { throw new WstxEOFException(ParsingErrorMsgs.SUFFIX_IN_XML_DECL, getLocation()); } } /* ///////////////////////////////////////////////////// // Implementations of abstract parsing methods ///////////////////////////////////////////////////// */ protected void pushback() { if (mBytesPerChar < 0) { mInputPtr += mBytesPerChar; } else { mInputPtr -= mBytesPerChar; } } protected int getNext() throws IOException, WstxException { if (mBytesPerChar != 1) { if (mBytesPerChar == -1) { // need to translate return nextTranslated(); } return nextMultiByte(); } byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); return (b & 0xFF); } protected int getNextAfterWs(boolean reqWs) throws IOException, WstxException { int count; if (mBytesPerChar == 1) { // single byte count = skipSbWs(); } else { if (mBytesPerChar == -1) { // translated count = skipTranslatedWs(); } else { // multi byte count = skipMbWs(); } } if (reqWs && count == 0) { reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE); } // inlined getNext() if (mBytesPerChar != 1) { if (mBytesPerChar == -1) { // translated return nextTranslated(); } return nextMultiByte(); } byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); return (b & 0xFF); } /** * @return First character that does not match expected, if any; * CHAR_NULL if match succeeded */ protected int checkKeyword(String exp) throws IOException, WstxException { if (mBytesPerChar != 1) { if (mBytesPerChar == -1) { return checkTranslatedKeyword(exp); } return checkMbKeyword(exp); } return checkSbKeyword(exp); } protected int readQuotedValue(char[] kw, int quoteChar) throws IOException, WstxException { int i = 0; int len = kw.length; boolean simple = (mBytesPerChar == 1); boolean mb = !simple && (mBytesPerChar > 1); while (i < len) { int c; if (simple) { byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); if (b == BYTE_NULL) { reportNull(); } if (b == BYTE_CR || b == BYTE_LF) { skipSbLF(b); b = BYTE_LF; } c = (b & 0xFF); } else { if (mb) { c = nextMultiByte(); if (c == CHAR_CR || c == CHAR_LF) { skipMbLF(c); c = CHAR_LF; } } else { c = nextTranslated(); if (c == CHAR_CR || c == CHAR_LF) { skipTranslatedLF(c); c = CHAR_LF; } } } if (c == quoteChar) { return (i < len) ? i : -1; } if (i < len) { kw[i++] = (char) c; } } /* If we end up this far, we ran out of buffer space... let's let * caller figure that out, though */ return -1; } protected boolean hasXmlDecl() throws IOException, WstxException { /* Separate handling for common and fast case; 1/variable byte * encodings that have ASCII subset: */ if (mBytesPerChar == 1) { /* However... there has to be at least 6 bytes available; and if * so, can check the 'signature' easily: */ if (ensureLoaded(6)) { if (mByteBuffer[mInputPtr] == '<' && mByteBuffer[mInputPtr+1] == '?' && mByteBuffer[mInputPtr+2] == 'x' && mByteBuffer[mInputPtr+3] == 'm' && mByteBuffer[mInputPtr+4] == 'l' && ((mByteBuffer[mInputPtr+5] & 0xFF) <= CHAR_SPACE)) { // Let's skip stuff so far: mInputPtr += 6; return true; } } } else if (mBytesPerChar == -1) { // translated (EBCDIC) if (ensureLoaded(6)) { int start = mInputPtr; // if we have to 'unread' chars if (nextTranslated() == '<' && nextTranslated() == '?' && nextTranslated() == 'x' && nextTranslated() == 'm' && nextTranslated() == 'l' && nextTranslated() <= CHAR_SPACE) { return true; } mInputPtr = start; // push data back } } else { // ... and then for slower fixed-multibyte encodings: // Is there enough data for checks? if (ensureLoaded (6 * mBytesPerChar)) { int start = mInputPtr; // if we have to 'unread' chars if (nextMultiByte() == '<' && nextMultiByte() == '?' && nextMultiByte() == 'x' && nextMultiByte() == 'm' && nextMultiByte() == 'l' && nextMultiByte() <= CHAR_SPACE) { return true; } mInputPtr = start; // push data back } } return false; } protected Location getLocation() { /* Ok; for fixed-size multi-byte encodings, need to divide numbers * to get character locations. For variable-length encodings the * good thing is that xml declaration only uses shortest codepoints, * ie. char count == byte count. */ int total = mInputProcessed + mInputPtr; int col = mInputPtr - mInputRowStart; if (mBytesPerChar > 1) { total /= mBytesPerChar; col /= mBytesPerChar; } return new WstxInputLocation(null, mPublicId, mSystemId, total - 1, // 0-based mInputRow, col); } /* ///////////////////////////////////////////////////// // Internal methods, single-byte access methods ///////////////////////////////////////////////////// */ protected byte nextByte() throws IOException, WstxException { if (mInputPtr >= mInputLen) { loadMore(); } return mByteBuffer[mInputPtr++]; } protected int skipSbWs() throws IOException, WstxException { int count = 0; while (true) { byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); if ((b & 0xFF) > CHAR_SPACE) { --mInputPtr; break; } if (b == BYTE_CR || b == BYTE_LF) { skipSbLF(b); } else if (b == BYTE_NULL) { reportNull(); } ++count; } return count; } protected void skipSbLF(byte lfByte) throws IOException, WstxException { if (lfByte == BYTE_CR) { byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); if (b != BYTE_LF) { --mInputPtr; // pushback if not 2-char/byte lf } } ++mInputRow; mInputRowStart = mInputPtr; } /** * @return First character that does not match expected, if any; * CHAR_NULL if match succeeded */ protected int checkSbKeyword(String expected) throws IOException, WstxException { int len = expected.length(); for (int ptr = 1; ptr < len; ++ptr) { byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); if (b == BYTE_NULL) { reportNull(); } if ((b & 0xFF) != expected.charAt(ptr)) { return (b & 0xFF); } } return CHAR_NULL; } /* ///////////////////////////////////////////////////// // Internal methods, multi-byte/translated access/checks ///////////////////////////////////////////////////// */ protected int nextMultiByte() throws IOException, WstxException { byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); byte b2 = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); int c; if (mBytesPerChar == 2) { if (mBigEndian) { c = ((b & 0xFF) << 8) | (b2 & 0xFF); } else { c = (b & 0xFF) | ((b2 & 0xFF) << 8); } } else { // Has to be 4 bytes byte b3 = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); byte b4 = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); if (mBigEndian) { c = (b << 24) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 8) | (b4 & 0xFF); } else { c = (b4 << 24) | ((b3 & 0xFF) << 16) | ((b2 & 0xFF) << 8) | (b & 0xFF); } } // Let's catch null chars early if (c == 0) { reportNull(); } return c; } protected int nextTranslated() throws IOException, WstxException { byte b = (mInputPtr < mInputLen) ? mByteBuffer[mInputPtr++] : nextByte(); int ch = mSingleByteTranslation[b & 0xFF]; if (ch < 0) { // special char... won't care for now ch = -ch; } return ch; } protected int skipMbWs() throws IOException, WstxException { int count = 0; while (true) { int c = nextMultiByte(); if (c > CHAR_SPACE) { mInputPtr -= mBytesPerChar; break; } if (c == CHAR_CR || c == CHAR_LF) { skipMbLF(c); } else if (c == CHAR_NULL) { reportNull(); } ++count; } return count; } protected int skipTranslatedWs() throws IOException, WstxException { int count = 0; while (true) { int c = nextTranslated(); // Hmmh. Are we to accept NEL (0x85)? if (c > CHAR_SPACE && c != CHAR_NEL) { --mInputPtr; break; } if (c == CHAR_CR || c == CHAR_LF) { skipTranslatedLF(c); } else if (c == CHAR_NULL) { reportNull(); } ++count; } return count; } protected void skipMbLF(int lf) throws IOException, WstxException { if (lf == CHAR_CR) { int c = nextMultiByte(); if (c != CHAR_LF) { mInputPtr -= mBytesPerChar; } } ++mInputRow; mInputRowStart = mInputPtr; } protected void skipTranslatedLF(int lf) throws IOException, WstxException { if (lf == CHAR_CR) { int c = nextTranslated(); if (c != CHAR_LF) { mInputPtr -= 1; } } ++mInputRow; mInputRowStart = mInputPtr; } /** * @return First character that does not match expected, if any; * CHAR_NULL if match succeeded */ protected int checkMbKeyword(String expected) throws IOException, WstxException { int len = expected.length(); for (int ptr = 1; ptr < len; ++ptr) { int c = nextMultiByte(); if (c == BYTE_NULL) { reportNull(); } if (c != expected.charAt(ptr)) { return c; } } return CHAR_NULL; } protected int checkTranslatedKeyword(String expected) throws IOException, WstxException { int len = expected.length(); for (int ptr = 1; ptr < len; ++ptr) { int c = nextTranslated(); if (c == BYTE_NULL) { reportNull(); } if (c != expected.charAt(ptr)) { return c; } } return CHAR_NULL; } /* //////////////////////////////////////// // Other private methods: //////////////////////////////////////// */ private void verifyEncoding(String id, int bpc) throws WstxException { if (mByteSizeFound) { /* Let's verify that if we matched an encoding, it's the same * as what was declared... */ if (bpc != mBytesPerChar) { // [WSTX-138]: Needs to detect EBCDIC discrepancy if (mEBCDIC) { reportXmlProblem("Declared encoding '"+id+"' incompatible with auto-detected physical encoding (EBCDIC variant), can not decode input since actual code page not known"); } reportXmlProblem("Declared encoding '"+id+"' uses "+bpc +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode"); } } } private void verifyEncoding(String id, int bpc, boolean bigEndian) throws WstxException { if (mByteSizeFound) { verifyEncoding(id, bpc); if (bigEndian != mBigEndian) { String bigStr = bigEndian ? "big" : "little"; reportXmlProblem ("Declared encoding '"+id+"' has different endianness (" +bigStr+" endian) than what physical ordering appeared to be; cannot decode"); } } } private void reportWeirdUCS4(String type) throws IOException { throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected"); } private void reportMissingBOM(String enc) throws WstxException { throw new WstxException("Missing BOM for encoding '"+enc+"'; can not be omitted", getLocation()); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy