com.fasterxml.aalto.in.ByteBasedScanner Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
/* Aalto XML processor
*
* Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.io.IOException;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.XMLStreamLocation2;
import com.fasterxml.aalto.impl.LocationImpl;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;
/**
* Intermediate base class used by different byte-backed scanners.
* Specifically, used as a base by both blocking (stream) and
* non-blocking (async) byte-based scanners (as opposed to Reader-backed,
* character-based scanners)
*/
public abstract class ByteBasedScanner
    extends XmlScanner
{
    /*
    /**********************************************************************
    /* Byte constants
    /**********************************************************************
     */

    // NUL and white-space:

    protected static final byte BYTE_NULL = (byte) 0;
    protected static final byte BYTE_SPACE = (byte) ' ';
    protected static final byte BYTE_LF = (byte) '\n';
    protected static final byte BYTE_CR = (byte) '\r';
    protected static final byte BYTE_TAB = (byte) '\t';

    // Punctuation:

    protected static final byte BYTE_LT = (byte) '<';
    protected static final byte BYTE_GT = (byte) '>';
    protected static final byte BYTE_AMP = (byte) '&';
    protected static final byte BYTE_HASH = (byte) '#';
    protected static final byte BYTE_EXCL = (byte) '!';
    protected static final byte BYTE_HYPHEN = (byte) '-';
    protected static final byte BYTE_QMARK = (byte) '?';
    protected static final byte BYTE_SLASH = (byte) '/';
    protected static final byte BYTE_EQ = (byte) '=';
    protected static final byte BYTE_QUOT = (byte) '"';
    protected static final byte BYTE_APOS = (byte) '\'';
    protected static final byte BYTE_LBRACKET = (byte) '[';
    protected static final byte BYTE_RBRACKET = (byte) ']';
    protected static final byte BYTE_SEMICOLON = (byte) ';';

    // Lower-case ASCII letters:

    protected static final byte BYTE_a = (byte) 'a';
    protected static final byte BYTE_g = (byte) 'g';
    protected static final byte BYTE_l = (byte) 'l';
    protected static final byte BYTE_m = (byte) 'm';
    protected static final byte BYTE_o = (byte) 'o';
    protected static final byte BYTE_p = (byte) 'p';
    protected static final byte BYTE_q = (byte) 'q';
    protected static final byte BYTE_s = (byte) 's';
    protected static final byte BYTE_t = (byte) 't';
    protected static final byte BYTE_u = (byte) 'u';
    protected static final byte BYTE_x = (byte) 'x';

    // Upper-case ASCII letters:

    protected static final byte BYTE_A = (byte) 'A';
    protected static final byte BYTE_C = (byte) 'C';
    protected static final byte BYTE_D = (byte) 'D';
    protected static final byte BYTE_P = (byte) 'P';
    protected static final byte BYTE_S = (byte) 'S';
    protected static final byte BYTE_T = (byte) 'T';

    /*
    /**********************************************************************
    /* Input buffering
    /**********************************************************************
     */

    /**
     * Pointer to the next unread byte in the input buffer.
     */
    protected int _inputPtr;

    /**
     * Pointer to the first byte after the end of valid content.
     * This may point beyond the end of the physical buffer array.
     */
    protected int _inputEnd;

    /*
    /**********************************************************************
    /* Parsing state
    /**********************************************************************
     */

    /**
     * Storage location for a single character that can not be easily
     * pushed back (for example, multi-byte char; or char entity
     * expansion). Negative, if from entity expansion; positive if
     * a singular char.
     */
    protected int _tmpChar = INT_NULL;

    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */

    /**
     * Passes the configuration on to the base class and resets the
     * consumed-input offset and the row start offset to zero.
     *
     * @param cfg Reader configuration handed to the super-class constructor
     */
    protected ByteBasedScanner(ReaderConfig cfg)
    {
        super(cfg);
        _pastBytesOrChars = 0; // should it be passed by caller?
        _rowStartOffset = 0; // should probably be passed by caller...
    }

    // @Override protected abstract void _releaseBuffers();

    @Override
    protected abstract void _closeSource() throws IOException;

    /*
    /**********************************************************************
    /* Location handling
    /**********************************************************************
     */

    /**
     * Builds a location object from the configured public and system ids,
     * the current raw offset (<code>_pastBytesOrChars + _inputPtr</code>),
     * the current row and the current column.
     */
    @Override
    public XMLStreamLocation2 getCurrentLocation()
    {
        return LocationImpl.fromZeroBased(_config.getPublicId(), _config.getSystemId(),
                _pastBytesOrChars + _inputPtr, _currRow, _inputPtr - _rowStartOffset);
    }

    /**
     * @return Distance of the input pointer from the recorded start of the
     *   current row; the same value {@link #getCurrentLocation} passes
     *   on as the column
     */
    @Override
    public int getCurrentColumnNr() {
        return _inputPtr - _rowStartOffset;
    }

    /**
     * @return Raw offset recorded by the latest call to
     *   {@link #setStartLocation}
     */
    @Override
    public long getStartingByteOffset() {
        return _startRawOffset;
    }

    /**
     * @return Always -1: character offsets are not tracked by byte-based
     *   scanners
     */
    @Override
    public long getStartingCharOffset() {
        // N/A for this type
        return -1L;
    }

    /**
     * Completes the current token first if it is still incomplete, since
     * the ending offset is only known once the input pointer has moved
     * past the token.
     *
     * @return Current raw offset, <code>_pastBytesOrChars + _inputPtr</code>
     *
     * @throws XMLStreamException if finishing the token fails
     */
    @Override
    public long getEndingByteOffset() throws XMLStreamException {
        // Have to complete the token to know the ending location...
        if (_tokenIncomplete) {
            finishToken();
        }
        return _pastBytesOrChars + _inputPtr;
    }

    /**
     * @return Always -1: character offsets are not tracked by byte-based
     *   scanners
     */
    @Override
    public long getEndingCharOffset() throws XMLStreamException {
        // N/A for this type
        return -1L;
    }

    /**
     * Records that a new row starts at the given buffer offset, and
     * advances the row counter.
     *
     * @param offset Offset to store as the start of the new row
     */
    protected final void markLF(int offset) {
        _rowStartOffset = offset;
        ++_currRow;
    }

    /**
     * Records that a new row starts at the current input pointer, and
     * advances the row counter.
     */
    protected final void markLF() {
        _rowStartOffset = _inputPtr;
        ++_currRow;
    }

    /**
     * Snapshots the current raw offset, row and column as the start
     * location (the values later returned by, for example,
     * {@link #getStartingByteOffset}).
     */
    protected final void setStartLocation() {
        _startRawOffset = _pastBytesOrChars + _inputPtr;
        _startRow = _currRow;
        _startColumn = _inputPtr - _rowStartOffset;
    }

    /*
    /**********************************************************************
    /* Abstract methods for sub-classes to implement
    /**********************************************************************
     */

    /**
     * Method called by methods when encountering a byte that
     * can not be part of a valid character in the current context.
     * Should return the actual decoded character for error reporting
     * purposes.
     */
    protected abstract int decodeCharForError(byte b)
        throws XMLStreamException;

    /*
    /**********************************************************************
    /* And then shared functionality for sub-classes
    /**********************************************************************
     */

    /**
     * Decodes an UTF-8 encoded name that is held as packed "quads"
     * (4 bytes per int, first byte in the most significant position),
     * validates its characters, and adds it to the given symbol table.
     *<p>
     * Conceptually, this method really does NOT belong here. However,
     * currently it is quite hard to refactor it, so it'll have to
     * stay here until better place is found
     *<p>
     * NOTE(review): the report methods called from here are expected to
     * always throw; the code following such calls relies on that.
     *
     * @param symbols Symbol table the decoded name is added to
     * @param charTypes Provider of the <code>NAME_CHARS</code> lookup table
     *   used for classifying each byte
     * @param hash Hash value handed over as-is to the symbol table
     * @param quads Name bytes, 4 per int; the last entry holds its
     *   <code>lastQuadBytes</code> bytes in the low-order positions.
     *   The last entry is modified temporarily during decoding and
     *   restored before the symbol is added.
     * @param qlen Number of entries of <code>quads</code> in use
     * @param lastQuadBytes Number of bytes contained in the last quad
     *
     * @return Symbol returned by the symbol table for the decoded name
     *
     * @throws XMLStreamException for invalid UTF-8 sequences, invalid name
     *   characters, multiple colons, or a name that ends in the middle
     *   of a multi-byte character
     */
    protected final PName addUTFPName(ByteBasedPNameTable symbols, XmlCharTypes charTypes,
            int hash, int[] quads, int qlen, int lastQuadBytes)
        throws XMLStreamException
    {
        // 4 bytes per quad, except last one maybe less
        int byteLen = (qlen << 2) - 4 + lastQuadBytes;

        // And last one is not correctly aligned (leading zero bytes instead
        // need to shift a bit, instead of trailing). Only need to shift it
        // for UTF-8 decoding; need revert for storage (since key will not
        // be aligned, to optimize lookup speed)
        int lastQuad;

        if (lastQuadBytes < 4) {
            lastQuad = quads[qlen-1];
            // 8/16/24 bit left shift
            quads[qlen-1] = (lastQuad << ((4 - lastQuadBytes) << 3));
        } else {
            lastQuad = 0;
        }

        // Let's handle first char separately (different validation):
        int ch = (quads[0] >>> 24);
        boolean ok;
        int ix = 1;
        char[] cbuf = _nameBuffer;
        int cix = 0;
        final int[] TYPES = charTypes.NAME_CHARS;

        switch (TYPES[ch]) {
        case XmlCharTypes.CT_NAME_NONE:
        case XmlCharTypes.CT_NAME_COLON: // not ok as first
        case XmlCharTypes.CT_NAME_NONFIRST:
        case InputCharTypes.CT_INPUT_NAME_MB_N:
            ok = false;
            break;
        case XmlCharTypes.CT_NAME_ANY:
            ok = true;
            break;
        default: // multi-byte (UTF-8) chars:
            {
                int needed;

                if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                    ch &= 0x1F;
                    needed = 1;
                } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                    ch &= 0x0F;
                    needed = 2;
                } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                    ch &= 0x07;
                    needed = 3;
                } else { // 5- and 6-byte chars not valid xml chars
                    reportInvalidInitial(ch);
                    needed = ch = 1; // never really gets this far
                }
                if ((ix + needed) > byteLen) {
                    reportEofInName(cbuf, 0);
                }

                // Always need at least one more right away; all of the
                // continuation bytes of the first char are within first quad
                int ch2 = _byteFromQuads(quads, ix++);
                if ((ch2 & 0xC0) != 0x080) {
                    reportInvalidOther(ch2);
                }
                ch = (ch << 6) | (ch2 & 0x3F);

                /* And then may need more. Note: here we do not do all the
                 * checks that UTF-8 text decoder might do. Reason is that
                 * name validity checking methods handle most of such checks
                 */
                if (needed > 1) {
                    ch2 = _byteFromQuads(quads, ix++);
                    if ((ch2 & 0xC0) != 0x080) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);
                    if (needed > 2) { // 4 bytes? (need surrogates on output)
                        ch2 = _byteFromQuads(quads, ix++);
                        if ((ch2 & 0xC0) != 0x080) {
                            reportInvalidOther(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                    }
                }
                ok = XmlChars.is10NameStartChar(ch);
                if (needed > 2) { // outside of basic 16-bit range? need surrogates
                    // Must report before splitting, so that the problem
                    // description has the actual code point and not just
                    // the low surrogate
                    if (!ok) {
                        reportInvalidNameChar(ch, 0);
                    }
                    /* so, let's first output first char (high surrogate),
                     * let second be output by later code
                     */
                    // NOTE(review): over-long 4-byte encodings (decoding
                    // to values below 0x10000) are not rejected here
                    ch -= 0x10000; // to normalize it starting with 0x0
                    cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                    ch = (0xDC00 | (ch & 0x03FF));
                }
            }
        }

        if (!ok) { // 0 to indicate it's first char, even with surrogates
            reportInvalidNameChar(ch, 0);
        }

        cbuf[cix++] = (char) ch; // the only char, or second (low) surrogate

        /* Whoa! Tons of code for just the start char. But now we get to
         * decode the name proper, at last!
         */
        int last_colon = -1;

        while (ix < byteLen) {
            ch = _byteFromQuads(quads, ix++);

            // Ascii?
            switch (TYPES[ch]) {
            case XmlCharTypes.CT_NAME_NONE:
            case XmlCharTypes.CT_MULTIBYTE_N:
                ok = false;
                break;
            case XmlCharTypes.CT_NAME_COLON: // fine after first char, but only once
                if (last_colon >= 0) {
                    reportMultipleColonsInName();
                }
                last_colon = cix;
                ok = true;
                break;
            case XmlCharTypes.CT_NAME_NONFIRST:
            case XmlCharTypes.CT_NAME_ANY:
                ok = true;
                break;
            default:
                {
                    int needed;
                    if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                        ch &= 0x1F;
                        needed = 1;
                    } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                        ch &= 0x0F;
                        needed = 2;
                    } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                        ch &= 0x07;
                        needed = 3;
                    } else { // 5- and 6-byte chars not valid xml chars
                        reportInvalidInitial(ch);
                        needed = ch = 1; // never really gets this far
                    }
                    if ((ix + needed) > byteLen) {
                        reportEofInName(cbuf, cix);
                    }

                    // Ok, always need at least one more:
                    int ch2 = _byteFromQuads(quads, ix++);
                    if ((ch2 & 0xC0) != 0x080) {
                        reportInvalidOther(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);

                    // Once again, some of validation deferred to name char validator
                    if (needed > 1) {
                        ch2 = _byteFromQuads(quads, ix++);
                        if ((ch2 & 0xC0) != 0x080) {
                            reportInvalidOther(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                        if (needed > 2) { // 4 bytes? (need surrogates on output)
                            ch2 = _byteFromQuads(quads, ix++);
                            if ((ch2 & 0xC0) != 0x080) {
                                reportInvalidOther(ch2);
                            }
                            ch = (ch << 6) | (ch2 & 0x3F);
                        }
                    }
                    ok = XmlChars.is10NameChar(ch);
                    if (needed > 2) { // surrogate pair? once again, let's output one here, one later on
                        // As with the first char: report using the actual
                        // code point, before it gets split in two
                        if (!ok) {
                            reportInvalidNameChar(ch, cix);
                        }
                        ch -= 0x10000; // to normalize it starting with 0x0
                        if (cix >= cbuf.length) {
                            _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
                        }
                        cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                        ch = 0xDC00 | (ch & 0x03FF);
                    }
                }
            }
            if (!ok) {
                reportInvalidNameChar(ch, cix);
            }
            if (cix >= cbuf.length) {
                _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
            }
            cbuf[cix++] = (char) ch;
        }

        /* Ok. Now we have the character array, and can construct the
         * String (as well as check proper composition of colons
         * for ns-aware mode...)
         */
        String baseName = new String(cbuf, 0, cix);
        // And finally, unalign if necessary
        if (lastQuadBytes < 4) {
            quads[qlen-1] = lastQuad;
        }
        return symbols.addSymbol(hash, baseName, last_colon, quads, qlen);
    }

    /**
     * Extracts a single byte, as an unsigned value (0 - 255), from bytes
     * packed four per int with the first byte of each int in the most
     * significant position.
     *<p>
     * Masking matters: right-shift leaves higher-order bytes (and possibly
     * sign extension) in place, and callers pass the result along to
     * error reporting.
     *
     * @param quads Packed bytes
     * @param ix Index of the byte to get, counted over the whole array
     */
    private static int _byteFromQuads(int[] quads, int ix) {
        return (quads[ix >> 2] >> ((3 - (ix & 3)) << 3)) & 0xFF;
    }

    /*
    /**********************************************************************
    /* Error reporting
    /**********************************************************************
     */

    /**
     * Reports a byte that can not start an UTF-8 encoded character.
     *
     * @param mask Value of the offending byte; included in the problem
     *   description as a hexadecimal number
     */
    protected void reportInvalidInitial(int mask) throws XMLStreamException {
        reportInputProblem("Invalid UTF-8 start byte 0x"+Integer.toHexString(mask));
    }

    /**
     * Reports a byte that is not valid as the second or later byte of
     * a multi-byte UTF-8 encoded character.
     *
     * @param mask Value of the offending byte; included in the problem
     *   description as a hexadecimal number
     */
    protected void reportInvalidOther(int mask) throws XMLStreamException {
        reportInputProblem("Invalid UTF-8 middle byte 0x"+Integer.toHexString(mask));
    }
}