com.fasterxml.aalto.in.ByteSourceBootstrapper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
/* Woodstox Lite ("wool") XML processor
*
* Copyright (c) 2006- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.io.*;
import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;
import com.fasterxml.aalto.impl.IoStreamException;
import com.fasterxml.aalto.impl.LocationImpl;
import com.fasterxml.aalto.util.CharsetNames;
/**
* Class that takes care of bootstrapping main document input from
* a byte-oriented input source: usually either an {@link InputStream},
* or a block source like byte array.
*/
public final class ByteSourceBootstrapper
extends InputBootstrapper
{
// Raw byte values that single-byte code paths compare directly against
// unmasked input bytes (null byte, carriage return, line feed).
private final static byte BYTE_NULL = (byte) 0;
private final static byte BYTE_CR = (byte) '\r';
private final static byte BYTE_LF = (byte) '\n';
/*
/**********************************************************************
/* Configuration
/**********************************************************************
*/
/**
* Underlying InputStream to use for reading content; null when the
* bootstrapper was constructed over a caller-supplied byte array
* ("block source"), in which case no further data can be loaded.
*/
final InputStream _in;
/*
/**********************************************************************
/* Input buffering
/**********************************************************************
*/
/**
* Buffer holding input bytes: obtained from the reader configuration for
* stream sources, or the caller-supplied array for block sources.
*/
final byte[] _inputBuffer;
/**
* Offset within {@link #_inputBuffer} of the next byte to read.
*/
private int _inputPtr;
/**
* Offset just past the last valid byte in {@link #_inputBuffer}; despite
* the name this is an end index rather than a byte count (block sources
* initialize it to start offset + length).
*/
private int _inputLen;
/*
/**********************************************************************
/* Data gathered
/**********************************************************************
*/
/**
* Byte order used when combining bytes of 2- and 4-byte characters;
* irrelevant for single-byte input.
*/
boolean mBigEndian = true;
/**
* Width of one character in bytes (1, 2 or 4) once known.
*/
int mBytesPerChar = 0; // 0 means "dunno yet"
/**
* Presumably whether the input started with a byte-order marker;
* NOTE(review): only consulted from commented-out checks -- confirm usage.
*/
boolean mHadBOM = false;
/**
* True when the character width was actually detected from the input,
* as opposed to being defaulted to 1; encoding verification is skipped
* when this is false.
*/
boolean mByteSizeFound = false;
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
/**
* Creates a bootstrapper that pulls its bytes from a stream, using a
* buffer obtained from the given configuration.
*
* @param cfg Reader configuration; also the source of the input buffer
* @param in Stream to read document bytes from
*/
private ByteSourceBootstrapper(ReaderConfig cfg, InputStream in)
{
    super(cfg);
    _in = in;
    _inputBuffer = cfg.allocFullBBuffer(4000);
    // Nothing has been read yet: both the read position and the end
    // marker start at the beginning of the buffer
    _inputPtr = 0;
    _inputLen = 0;
}
private ByteSourceBootstrapper(ReaderConfig cfg, byte[] inputBuffer, int inputStart, int inputLen)
{
super(cfg);
_in = null;
_inputBuffer = inputBuffer;
_inputPtr = inputStart;
_inputLen = (inputStart + inputLen);
// Need to offset this, to keep location correct
_inputProcessed = -inputStart;
}
/**
* Factory method for bootstrapping from a stream.
*
* @param cfg Reader configuration to use
* @param in Stream to read document bytes from
* @return Newly constructed bootstrapper; no input is read at this point
*/
public static ByteSourceBootstrapper construct(ReaderConfig cfg, InputStream in)
throws XMLStreamException
{
return new ByteSourceBootstrapper(cfg, in);
}
/**
* Factory method for bootstrapping from a region of a byte array.
*
* @param cfg Reader configuration to use
* @param inputBuffer Array containing the document bytes
* @param inputStart Offset of the first document byte within the array
* @param inputLen Number of document bytes available from that offset
* @return Newly constructed bootstrapper
*/
public static ByteSourceBootstrapper construct(ReaderConfig cfg, byte[] inputBuffer, int inputStart, int inputLen)
throws XMLStreamException
{
return new ByteSourceBootstrapper(cfg, inputBuffer, inputStart, inputLen);
}
/**
* Bootstraps the input by delegating to {@link #doBootstrap()}.
*
* @return Scanner produced by {@link #doBootstrap()}
* @throws XMLStreamException for problems reported by doBootstrap; an
*   IOException raised there is rethrown wrapped in an IoStreamException
*/
@Override
public final XmlScanner bootstrap() throws XMLStreamException
{
try {
return doBootstrap();
} catch (IOException ioe) {
// rewrap so callers only see the exception types this method declares
throw new IoStreamException(ioe);
} finally {
// hand the keyword buffer back to the config whether or not
// bootstrapping succeeded
_config.freeSmallCBuffer(mKeyword);
}
}
/**
* Performs the actual bootstrapping: detects the physical encoding,
* reads the xml declaration if there is one, settles on the effective
* encoding, records the results in the configuration and finally
* constructs a scanner suitable for that encoding.
*
* @return Scanner positioned after the xml declaration (if any)
* @throws IOException for read problems with the underlying source
* @throws XMLStreamException for malformed or contradictory declarations,
*   or if the resolved encoding is not supported by the JDK
*/
public XmlScanner doBootstrap() throws IOException, XMLStreamException
{
    String normEnc = null;

    determineStreamEncoding();
    if (hasXmlDeclaration()) { // yup, has xml decl:
        readXmlDeclaration();
        if (mFoundEncoding != null) {
            normEnc = verifyXmlEncoding(mFoundEncoding);
        }
    }

    // Now, have we figured out the encoding?
    if (normEnc == null) { // not via xml declaration
        if (mBytesPerChar == 2) { // UTF-16, BE/LE
            normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
        } else if (mBytesPerChar == 4) { // UCS-4... ?
            /* 22-Mar-2005, TSa: JDK apparently has no way of dealing
             * with these encodings... not sure if and how it should
             * be dealt with, really. Name could be UCS-4xx... or
             * perhaps UTF-32xx
             */
            normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
        } else {
            // Ok, default has to be UTF-8, as per XML specs
            normEnc = CharsetNames.CS_UTF8;
        }
    }

    _config.setActualEncoding(normEnc);
    _config.setXmlDeclInfo(mDeclaredXmlVersion, mFoundEncoding, mStandalone);

    // Normalized, can thus use straight equality checks now

    // UTF-8 compatible (loosely speaking) ones can use same scanner
    if (normEnc == CharsetNames.CS_UTF8
        || normEnc == CharsetNames.CS_ISO_LATIN1
        || normEnc == CharsetNames.CS_US_ASCII) {
        return new Utf8Scanner(_config,
                _in, _inputBuffer, _inputPtr, _inputLen);
    } else if (normEnc.startsWith(CharsetNames.CS_UTF32)) {
        /* Since this is such a rare encoding, we'll just create
         * a Reader, and dispatch it to reader scanner?
         */
        // let's augment with actual endianness info
        if (normEnc == CharsetNames.CS_UTF32) {
            normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
        }
        Reader r = new Utf32Reader(_config, _in, _inputBuffer, _inputPtr, _inputLen,
                mBigEndian);
        return new ReaderScanner(_config, r);
    }

    /* And finally, if all else fails, we'll also fall back to
     * using JDK-provided decoders and ReaderScanner:
     */
    InputStream in = _in;
    if (_inputPtr < _inputLen) {
        // Bytes already buffered must be replayed ahead of the stream
        in = new MergedStream(_config, in, _inputBuffer, _inputPtr, _inputLen);
    } else if (in == null) {
        /* Block source whose bytes were all consumed by bootstrapping:
         * InputStreamReader rejects a null stream with a
         * NullPointerException, so hand over an empty stream instead
         * and let the scanner report end-of-input the regular way.
         */
        in = new ByteArrayInputStream(new byte[0]);
    }
    // Generic "UTF-16" needs the detected byte order spelled out
    if (normEnc == CharsetNames.CS_UTF16) {
        normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
    }
    try {
        Reader r = new InputStreamReader(in, normEnc);
        return new ReaderScanner(_config, r);
    } catch (UnsupportedEncodingException usex) {
        // Same message as before, but keep the original exception as cause
        IoStreamException wrapped = new IoStreamException("Unsupported encoding: "+usex.getMessage());
        wrapped.initCause(usex);
        throw wrapped;
    }
}
/*
/**********************************************************************
/* Internal methods, main xml decl processing
/**********************************************************************
*/
/**
* Method called to figure out what the physical encoding of the
* file appears to be (in case it can be determined from BOM, or
* xml declaration, either of which may be present)
*/
private void determineStreamEncoding()
throws IOException
{
/* Ok; first just need 4 bytes for determining bytes-per-char from
* BOM or first char(s) of likely xml declaration:
*/
if (ensureLoaded(4)) {
int origPtr = _inputPtr;
bomblock:
do { // BOM/auto-detection block
int quartet = (_inputBuffer[_inputPtr] << 24)
| ((_inputBuffer[_inputPtr+1] & 0xFF) << 16)
| ((_inputBuffer[_inputPtr+2] & 0xFF) << 8)
| (_inputBuffer[_inputPtr+3] & 0xFF);
/* Handling of (usually) optional BOM (required for
* multi-byte formats); first 32-bit charsets:
*/
switch (quartet) {
case 0x0000FEFF:
mBigEndian = true;
_inputPtr += 4;
mBytesPerChar = 4;
break bomblock;
case 0xFFFE0000: // UCS-4, LE?
mBigEndian = false;
_inputPtr += 4;
mBytesPerChar = 4;
break bomblock;
case 0x0000FFFE: // UCS-4, in-order...
reportWeirdUCS4("2143");
break bomblock;
case 0x0FEFF0000: // UCS-4, in-order...
reportWeirdUCS4("3412");
break bomblock;
}
// Ok, if not, how about 16-bit encoding BOMs?
int msw = quartet >>> 16;
if (msw == 0xFEFF) { // UTF-16, BE
_inputPtr += 2;
mBytesPerChar = 2;
mBigEndian = true;
break;
}
if (msw == 0xFFFE) { // UTF-16, LE
_inputPtr += 2;
mBytesPerChar = 2;
mBigEndian = false;
break;
}
// And if not, then UTF-8 BOM?
if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8
_inputPtr += 3;
mBytesPerChar = 1;
mBigEndian = true; // doesn't really matter
break;
}
/* And if that wasn't succesful, how about auto-detection
* for ' origPtr);
/* Let's update location markers to ignore BOM when calculating
* column positions (but not from raw byte offsets)
*/
_inputRowStart = _inputPtr;
}
/* Hmmh. If we haven't figured it out, let's just assume
* UTF-8 as per XML specs:
*/
mByteSizeFound = (mBytesPerChar > 0);
if (!mByteSizeFound) {
mBytesPerChar = 1;
mBigEndian = true; // doesn't matter
}
}
/**
* Checks whether input at the current position starts with the xml
* declaration marker: the five characters {@code <?xml} followed by a
* character at or below the space character. On a match the six
* characters are consumed; otherwise the read position is left where
* it was.
*
* @return True if the marker was found (and skipped); false if not, or
*   if fewer than six characters could be loaded
*/
protected boolean hasXmlDeclaration()
    throws IOException, XMLStreamException
{
    if (mBytesPerChar != 1) {
        // Slower path for fixed-width multi-byte encodings: 6 chars needed
        if (!ensureLoaded(6 * mBytesPerChar)) {
            return false;
        }
        final int mark = _inputPtr; // where to rewind to on mismatch
        final boolean matched = nextMultiByte() == '<'
            && nextMultiByte() == '?'
            && nextMultiByte() == 'x'
            && nextMultiByte() == 'm'
            && nextMultiByte() == 'l'
            && nextMultiByte() <= CHAR_SPACE;
        if (!matched) {
            _inputPtr = mark; // push data back
        }
        return matched;
    }

    // Common case, 1-byte encoding (Ascii/ISO-Latin/UTF-8)
    if (!ensureLoaded(6)) {
        return false;
    }
    final byte[] buf = _inputBuffer;
    final int base = _inputPtr;
    final boolean matched = buf[base] == '<'
        && buf[base+1] == '?'
        && buf[base+2] == 'x'
        && buf[base+3] == 'm'
        && buf[base+4] == 'l'
        && ((buf[base+5] & 0xFF) <= CHAR_SPACE);
    if (matched) {
        // Skip the marker and the trailing white space character
        _inputPtr = base + 6;
    }
    return matched;
}
/**
* Normalizes the encoding name found in the xml declaration and checks
* that it is compatible with the character width (and, for names that
* carry a byte order, the endianness) detected from the input bytes.
* Names that are not recognized are returned without any check.
*
* @param enc Encoding name as it appeared in the xml declaration
* @return Normalized encoding name
*/
protected String verifyXmlEncoding(String enc)
    throws XMLStreamException
{
    final String norm = CharsetNames.normalize(enc);

    // Identity comparisons are intentional: names were just normalized
    if (norm == CharsetNames.CS_UTF8
        || norm == CharsetNames.CS_ISO_LATIN1
        || norm == CharsetNames.CS_US_ASCII) {
        // Single-byte (or ascii-compatible variable length) encodings
        verifyEncoding(norm, 1);
    } else if (norm == CharsetNames.CS_UTF16) {
        // A BOM would tell the ordering, but its presence is not enforced
        verifyEncoding(norm, 2);
    } else if (norm == CharsetNames.CS_UTF16LE) {
        verifyEncoding(norm, 2, false);
    } else if (norm == CharsetNames.CS_UTF16BE) {
        verifyEncoding(norm, 2, true);
    } else if (norm == CharsetNames.CS_UTF32) {
        // Likewise no BOM requirement here
        verifyEncoding(norm, 4);
    } else if (norm == CharsetNames.CS_UTF32LE) {
        verifyEncoding(norm, 4, false);
    } else if (norm == CharsetNames.CS_UTF32BE) {
        verifyEncoding(norm, 4, true);
    }
    return norm;
}
/*
/**********************************************************************
/* Internal methods, loading input data
/**********************************************************************
*/
/**
* Makes sure at least the given number of unread bytes is buffered,
* appending more data from the stream after the current content if
* needed. Assumes the buffer has room for the requested amount.
*
* @param minimum Number of unread bytes required
* @return True if that many bytes are available; false if the source is
*   a block source, or the stream could not deliver enough
*/
protected boolean ensureLoaded(int minimum)
    throws IOException
{
    while ((_inputLen - _inputPtr) < minimum) {
        if (_in == null) { // block source: nothing more to load
            return false;
        }
        final int nread = _in.read(_inputBuffer, _inputLen, _inputBuffer.length - _inputLen);
        if (nread < 1) {
            return false;
        }
        _inputLen += nread;
    }
    return true;
}
/**
* Replaces buffer contents with the next chunk from the stream, adjusting
* the location bookkeeping for the bytes being discarded. Reports
* end-of-input if nothing could be read (always the case for block
* sources).
*/
protected void loadMore()
    throws IOException, XMLStreamException
{
    // Account for everything currently in the buffer before dropping it
    final int discarded = _inputLen;
    _inputProcessed += discarded;
    _inputRowStart -= discarded;
    _inputPtr = 0;

    _inputLen = (_in == null)
        ? -1 // block source
        : _in.read(_inputBuffer, 0, _inputBuffer.length);
    if (_inputLen < 1) {
        reportEof();
    }
}
/*
/**********************************************************************
/* Implementations of abstract parsing methods
/**********************************************************************
*/
/**
* Un-reads the most recently read character by moving the read position
* back by one character width.
*/
@Override
protected void pushback() {
_inputPtr -= mBytesPerChar;
}
/**
* Reads the next character, decoding according to the detected
* character width.
*
* @return Character code; for single-byte input the unsigned byte value
*/
@Override
protected int getNext()
    throws IOException, XMLStreamException
{
    if (mBytesPerChar <= 1) {
        final byte raw;
        if (_inputPtr < _inputLen) {
            raw = _inputBuffer[_inputPtr++];
        } else {
            raw = nextByte();
        }
        return raw & 0xFF;
    }
    return nextMultiByte();
}
/**
* Skips any white space and returns the character following it.
*
* @param reqWs If true, at least one white space character must be
*   present; its absence is reported as an unexpected character
* @return First character after the skipped white space
*/
@Override
protected int getNextAfterWs(boolean reqWs)
    throws IOException, XMLStreamException
{
    final boolean multiByte = (mBytesPerChar > 1);
    final int skipped = multiByte ? skipMbWs() : skipSbWs();

    if (reqWs && skipped == 0) {
        reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE);
    }

    // Same logic as getNext(), kept inline
    if (multiByte) {
        return nextMultiByte();
    }
    final byte raw;
    if (_inputPtr < _inputLen) {
        raw = _inputBuffer[_inputPtr++];
    } else {
        raw = nextByte();
    }
    return raw & 0xFF;
}
/**
* Matches input against the given keyword, dispatching on character
* width to the single- or multi-byte implementation.
*
* @return First character that does not match expected, if any;
*    CHAR_NULL if match succeeded
*/
@Override
protected int checkKeyword(String exp)
    throws IOException, XMLStreamException
{
    return (mBytesPerChar > 1) ? checkMbKeyword(exp) : checkSbKeyword(exp);
}
/**
* Reads characters into the given array until the closing quote is seen
* or the array is full. Linefeeds (CR, LF, CR+LF) are stored as a single
* LF, and row bookkeeping is updated for them.
*
* @param kw Array to store the value characters in
* @param quoteChar Character that terminates the value
* @return Number of characters stored if the closing quote was found
*   before the array filled up; -1 otherwise
*/
@Override
protected int readQuotedValue(char[] kw, int quoteChar)
    throws IOException, XMLStreamException
{
    final boolean multiByte = (mBytesPerChar > 1);
    final int capacity = kw.length;

    for (int stored = 0; stored < capacity; ) {
        int ch;
        if (multiByte) {
            ch = nextMultiByte();
            if (ch == CHAR_CR || ch == CHAR_LF) {
                skipMbLF(ch);
                ch = CHAR_LF;
            }
        } else {
            byte raw;
            if (_inputPtr < _inputLen) {
                raw = _inputBuffer[_inputPtr++];
            } else {
                raw = nextByte();
            }
            if (raw == BYTE_NULL) {
                reportNull();
            }
            if (raw == BYTE_CR || raw == BYTE_LF) {
                skipSbLF(raw);
                raw = BYTE_LF;
            }
            ch = raw & 0xFF;
        }

        if (ch == quoteChar) {
            return stored;
        }
        kw[stored++] = (char) ch;
    }

    // Ran out of buffer space before the closing quote; caller deals with it
    return -1;
}
/**
* Builds a location object for the current read position.
*
* @return Location with character offset, row and column derived from
*   the byte-based bookkeeping
*/
@Override
protected Location getLocation()
{
    /* Bookkeeping is in bytes. With fixed-width multi-byte encodings
     * that has to be scaled down to characters; for variable-length
     * encodings the xml declaration only uses shortest codepoints,
     * so byte count and char count agree.
     */
    final int width = mBytesPerChar;
    int charOffset = _inputProcessed + _inputPtr;
    int column = _inputPtr - _inputRowStart;

    if (width > 1) {
        charOffset /= width;
        column /= width;
    }
    return LocationImpl.fromZeroBased(_config.getPublicId(), _config.getSystemId(),
            charOffset, _inputRow, column);
}
/*
/**********************************************************************
/* Internal methods, single-byte access methods
/**********************************************************************
*/
/**
* Returns the next raw input byte, loading more data first if the buffer
* has been exhausted.
*
* @return Next byte, unmasked (callers apply <code>&amp; 0xFF</code> as needed)
*/
protected byte nextByte()
throws IOException, XMLStreamException
{
if (_inputPtr >= _inputLen) {
// loadMore() reports end-of-input itself when nothing can be read
loadMore();
}
return _inputBuffer[_inputPtr++];
}
/**
* Skips white space in single-byte mode, updating row bookkeeping for
* linefeeds and reporting null bytes. The first byte above the space
* character is left unread.
*
* @return Number of white space units skipped (a CR+LF pair counts once)
*/
protected int skipSbWs()
    throws IOException, XMLStreamException
{
    int skipped = 0;
    for (;;) {
        final byte cur;
        if (_inputPtr < _inputLen) {
            cur = _inputBuffer[_inputPtr++];
        } else {
            cur = nextByte();
        }
        if ((cur & 0xFF) > CHAR_SPACE) {
            // Not white space: un-read it and stop
            --_inputPtr;
            return skipped;
        }
        if (cur == BYTE_NULL) {
            reportNull();
        } else if (cur == BYTE_CR || cur == BYTE_LF) {
            skipSbLF(cur);
        }
        ++skipped;
    }
}
/**
* Completes handling of a linefeed in single-byte mode: after a CR, a
* directly following LF is consumed as part of the same linefeed; then
* the row counter and row start offset are advanced.
*
* @param lfByte The linefeed byte that was just read (CR or LF)
*/
protected void skipSbLF(byte lfByte)
    throws IOException, XMLStreamException
{
    if (lfByte == BYTE_CR) {
        final byte following;
        if (_inputPtr < _inputLen) {
            following = _inputBuffer[_inputPtr++];
        } else {
            following = nextByte();
        }
        if (following != BYTE_LF) {
            // Lone CR: the byte after it belongs to the next row
            _inputPtr -= 1;
        }
    }
    _inputRow += 1;
    _inputRowStart = _inputPtr;
}
/**
* Matches single-byte input against the given keyword, starting from the
* keyword's second character (index 1).
*
* @return First character that does not match expected, if any;
*    CHAR_NULL if match succeeded
*/
protected int checkSbKeyword(String expected)
    throws IOException, XMLStreamException
{
    final int end = expected.length();
    int ix = 1;
    while (ix < end) {
        final byte cur;
        if (_inputPtr < _inputLen) {
            cur = _inputBuffer[_inputPtr++];
        } else {
            cur = nextByte();
        }
        if (cur == BYTE_NULL) {
            reportNull();
        }
        final int actual = cur & 0xFF;
        if (actual != expected.charAt(ix)) {
            return actual;
        }
        ++ix;
    }
    return CHAR_NULL;
}
/*
/**********************************************************************
/* Internal methods, multi-byte access/checks
/**********************************************************************
*/
/**
* Reads one character in 2- or 4-byte mode, combining the bytes
* according to the detected byte order. A zero character is reported
* as a null.
*
* @return Decoded character code
*/
protected int nextMultiByte()
    throws IOException, XMLStreamException
{
    // Bytes are masked to their unsigned value as they are read
    final int b0 = 0xFF & ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte());
    final int b1 = 0xFF & ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte());

    final int code;
    if (mBytesPerChar == 2) {
        code = mBigEndian ? ((b0 << 8) | b1) : ((b1 << 8) | b0);
    } else {
        // Has to be 4 bytes
        final int b2 = 0xFF & ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte());
        final int b3 = 0xFF & ((_inputPtr < _inputLen) ? _inputBuffer[_inputPtr++] : nextByte());
        code = mBigEndian
            ? ((b0 << 24) | (b1 << 16) | (b2 << 8) | b3)
            : ((b3 << 24) | (b2 << 16) | (b1 << 8) | b0);
    }

    // Let's catch null chars early
    if (code == 0) {
        reportNull();
    }
    return code;
}
/**
* Skips white space in multi-byte mode, updating row bookkeeping for
* linefeeds. The first character above the space character is left
* unread.
*
* @return Number of white space units skipped (a CR+LF pair counts once)
*/
protected int skipMbWs()
    throws IOException, XMLStreamException
{
    int skipped = 0;
    for (;;) {
        final int ch = nextMultiByte();
        if (ch > CHAR_SPACE) {
            // Not white space: un-read the whole character and stop
            _inputPtr -= mBytesPerChar;
            return skipped;
        }
        if (ch == CHAR_CR || ch == CHAR_LF) {
            skipMbLF(ch);
        } else if (ch == CHAR_NULL) {
            reportNull();
        }
        ++skipped;
    }
}
/**
* Completes handling of a linefeed in multi-byte mode: after a CR, a
* directly following LF is consumed as part of the same linefeed; then
* the row counter and row start offset are advanced.
*
* @param lf The linefeed character that was just read (CR or LF)
*/
protected void skipMbLF(int lf)
    throws IOException, XMLStreamException
{
    if (lf == CHAR_CR) {
        final int following = nextMultiByte();
        if (following != CHAR_LF) {
            // Lone CR: un-read the character that followed it
            _inputPtr -= mBytesPerChar;
        }
    }
    _inputRow += 1;
    _inputRowStart = _inputPtr;
}
/**
* Matches multi-byte input against the given keyword, starting from the
* keyword's second character (index 1).
*
* @return First character that does not match expected, if any;
*    CHAR_NULL if match succeeded
*/
protected int checkMbKeyword(String expected)
    throws IOException, XMLStreamException
{
    final int end = expected.length();
    int ix = 1;
    while (ix < end) {
        final int actual = nextMultiByte();
        if (actual == BYTE_NULL) {
            reportNull();
        }
        if (actual != expected.charAt(ix)) {
            return actual;
        }
        ++ix;
    }
    return CHAR_NULL;
}
/*
/**********************************************************************
/* Other private methods:
/**********************************************************************
*/
/**
* Reports a problem if the declared encoding's character width differs
* from the width detected from the input bytes. Nothing is checked when
* the width was only defaulted rather than detected.
*
* @param id Declared encoding name, used in the error message
* @param bpc Bytes per character the declared encoding implies
*/
private void verifyEncoding(String id, int bpc)
    throws XMLStreamException
{
    // Only meaningful if a width was actually matched from the input
    if (mByteSizeFound && bpc != mBytesPerChar) {
        reportXmlProblem("Declared encoding '"+id+"' uses "+bpc
                +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode");
    }
}
/**
* Like the two-argument variant, but additionally reports a problem if
* the declared byte order differs from the detected one. Nothing is
* checked when the width was only defaulted rather than detected.
*
* @param id Declared encoding name, used in the error message
* @param bpc Bytes per character the declared encoding implies
* @param bigEndian Byte order the declared encoding implies
*/
private void verifyEncoding(String id, int bpc, boolean bigEndian)
    throws XMLStreamException
{
    if (!mByteSizeFound) {
        return;
    }
    verifyEncoding(id, bpc);
    if (bigEndian == mBigEndian) {
        return;
    }
    final String declaredOrder = bigEndian ? "big" : "little";
    reportXmlProblem("Declared encoding '"+id+"' has different endianness ("
            +declaredOrder+" endian) than what physical ordering appeared to be; cannot decode");
}
/**
* Always throws, to signal that a 4-byte encoding with an unsupported
* (mixed) byte order was detected.
*
* @param type Byte order label to include in the message (for example "2143")
* @throws IOException always, as a CharConversionException
*/
private void reportWeirdUCS4(String type)
throws IOException
{
throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");
}
/**
* Always throws, to signal that the input appears to use EBCDIC, which
* is not supported.
*
* @throws IOException always, as a CharConversionException
*/
private void reportEBCDIC()
throws IOException
{
throw new CharConversionException("Unsupported encoding (EBCDIC)");
}
}