// com.ctc.wstx.io.WstxInputData Maven / Gradle / Ivy
/* Woodstox XML processor
*
* Copyright (c) 2004 Tatu Saloranta, [email protected]
*
* Licensed under the License specified in file LICENSE, included with
* the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ctc.wstx.io;
import com.ctc.wstx.util.XmlChars;
/**
* Base class used by readers (specifically, by
* {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
* to encapsulate input buffer portion of the class. Philosophically
* this should probably be done via containment (composition), not
* sub-classing but for performance reason, this "core" class is generally
* extended from instead.
*
* Main reason for the input data portion to be factored out of main
* class is that this way it can also be passed to nested input handling
* Objects, which can then manipulate input buffers of the caller,
* efficiently.
*/
public class WstxInputData
{
    // // // Some well-known chars:

    /**
     * Null-character is used as return value from some method(s), since
     * it is not a legal character in an XML document.
     */
    public final static char CHAR_NULL = '\u0000';

    /**
     * Same value as {@link #CHAR_NULL}. NOTE: despite the INT_ prefix this
     * constant is declared as <code>char</code>; the declared type is part
     * of the public API and is intentionally left unchanged.
     */
    public final static char INT_NULL = 0;

    /**
     * The space character (0x20).
     */
    public final static char CHAR_SPACE = (char) 0x0020;

    /**
     * Same value as {@link #CHAR_SPACE}. NOTE: despite the INT_ prefix this
     * constant is declared as <code>char</code>; the declared type is part
     * of the public API and is intentionally left unchanged.
     */
    public final static char INT_SPACE = 0x0020;

    /**
     * This constant defines the highest Unicode character allowed
     * in XML content.
     */
    public final static int MAX_UNICODE_CHAR = 0x10FFFF;

    /*
    ////////////////////////////////////////////////////
    // Character validity constants, structs
    ////////////////////////////////////////////////////
     */

    /**
     * We will only use validity array for first 256 characters, mostly
     * because after those characters it's easier to do fairly simple
     * block checks.
     */
    private final static int VALID_CHAR_COUNT = 0x100;

    // These are the same for both 1.0 and 1.1...
    // private final static int FIRST_VALID_FOR_FIRST = 0x0041; // 'A'
    // private final static int FIRST_VALID_FOR_REST = 0x002D; // '-'

    /** Marker: character can not appear anywhere in a name. */
    private final static byte NAME_CHAR_INVALID_B = (byte) 0;

    /** Marker: character is valid both as the first and as a later name character. */
    private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1;

    /** Marker: character is valid in a name, but not as its first character. */
    private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1;

    /**
     * Name-character validity markers for chars 0x00 - 0xFF, indexed by
     * character code. NOTE(review): this table is populated below but is
     * not read by any method of this class as it stands.
     */
    private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT];
    static {
        /* First, since all valid-as-first chars are also valid-as-other chars,
         * we'll initialize common chars:
         */
        sCharValidity['_'] = NAME_CHAR_ALL_VALID_B;
        for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
            sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B;
            sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B;
        }
        /* Not every char in range 0xC0 - 0xFF is fully valid, but it is
         * simplest to mark the whole range first...
         */
        for (int i = 0xC0; i < VALID_CHAR_COUNT; ++i) {
            sCharValidity[i] = NAME_CHAR_ALL_VALID_B;
        }
        // ... and then 'revert' the ones that are not valid:
        sCharValidity[0xD7] = NAME_CHAR_INVALID_B;
        sCharValidity[0xF7] = NAME_CHAR_INVALID_B;

        /* And then we can proceed with ones only valid-as-other.
         */
        sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B;
        sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B;
        sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B;
        for (int i = '0'; i <= '9'; ++i) {
            sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B;
        }
    }

    /**
     * Public identifiers only use 7-bit ascii range.
     */
    private final static int VALID_PUBID_CHAR_COUNT = 0x80;

    /**
     * Public-identifier character validity markers for chars 0x00 - 0x7F,
     * indexed by character code. NOTE(review): this table is populated
     * below but is not read by any method of this class as it stands.
     */
    private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT];

    // private final static byte PUBID_CHAR_INVALID_B = (byte) 0;

    /** Marker: character is allowed in a public identifier. */
    private final static byte PUBID_CHAR_VALID_B = (byte) 1;
    static {
        for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
            sPubidValidity['A' + i] = PUBID_CHAR_VALID_B;
            sPubidValidity['a' + i] = PUBID_CHAR_VALID_B;
        }
        for (int i = '0'; i <= '9'; ++i) {
            sPubidValidity[i] = PUBID_CHAR_VALID_B;
        }

        // 3 main white space types are valid
        sPubidValidity[0x0A] = PUBID_CHAR_VALID_B;
        sPubidValidity[0x0D] = PUBID_CHAR_VALID_B;
        sPubidValidity[0x20] = PUBID_CHAR_VALID_B;

        // And many of punctuation/separator ascii chars too:
        sPubidValidity['-'] = PUBID_CHAR_VALID_B;
        sPubidValidity['\''] = PUBID_CHAR_VALID_B;
        sPubidValidity['('] = PUBID_CHAR_VALID_B;
        sPubidValidity[')'] = PUBID_CHAR_VALID_B;
        sPubidValidity['+'] = PUBID_CHAR_VALID_B;
        sPubidValidity[','] = PUBID_CHAR_VALID_B;
        sPubidValidity['.'] = PUBID_CHAR_VALID_B;
        sPubidValidity['/'] = PUBID_CHAR_VALID_B;
        sPubidValidity[':'] = PUBID_CHAR_VALID_B;
        sPubidValidity['='] = PUBID_CHAR_VALID_B;
        sPubidValidity['?'] = PUBID_CHAR_VALID_B;
        sPubidValidity[';'] = PUBID_CHAR_VALID_B;
        sPubidValidity['!'] = PUBID_CHAR_VALID_B;
        sPubidValidity['*'] = PUBID_CHAR_VALID_B;
        sPubidValidity['#'] = PUBID_CHAR_VALID_B;
        sPubidValidity['@'] = PUBID_CHAR_VALID_B;
        sPubidValidity['$'] = PUBID_CHAR_VALID_B;
        sPubidValidity['_'] = PUBID_CHAR_VALID_B;
        sPubidValidity['%'] = PUBID_CHAR_VALID_B;
    }

    /*
    ////////////////////////////////////////////////////
    // Configuration
    ////////////////////////////////////////////////////
     */

    /**
     * Flag that indicates whether XML content is to be treated as per
     * XML 1.1 specification or not (if not, it'll use xml 1.0).
     */
    protected boolean mXml11 = false;

    /*
    ////////////////////////////////////////////////////
    // Current input data
    ////////////////////////////////////////////////////
     */

    /**
     * Current buffer from which data is read; generally data is read into
     * buffer from input source, but not always (especially when using nested
     * input contexts when expanding parsed entity references etc).
     */
    protected char[] mInputBuffer;

    /**
     * Pointer to next available character in buffer
     */
    protected int mInputPtr = 0;

    /**
     * Index of character after last available one in the buffer.
     */
    protected int mInputEnd = 0;

    /*
    ////////////////////////////////////////////////////
    // Current input location information
    ////////////////////////////////////////////////////
     */

    /**
     * Number of characters that were contained in previous blocks
     * (blocks that were already processed prior to the current buffer).
     */
    protected long mCurrInputProcessed = 0L;

    /**
     * Current row location of current point in input buffer, starting
     * from 1
     */
    protected int mCurrInputRow = 1;

    /**
     * Current index of the first character of the current row in input
     * buffer. Needed to calculate column position, if necessary; benefit
     * of not having column itself is that this only has to be updated
     * once per line.
     */
    protected int mCurrInputRowStart = 0;

    /*
    ////////////////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////////////////
     */

    protected WstxInputData() {
    }

    /**
     * Copies the buffer reference, read pointers and location counters
     * from the given instance into this one. The buffer itself is shared
     * (only the reference is copied), and the {@link #mXml11} flag is
     * not copied.
     *<p>
     * Note: Only public due to sub-classes needing to call this on
     * base class instance from different package (confusing?)
     *
     * @param src Instance to copy the buffer state from
     */
    public void copyBufferStateFrom(WstxInputData src)
    {
        mInputBuffer = src.mInputBuffer;
        mInputPtr = src.mInputPtr;
        mInputEnd = src.mInputEnd;

        mCurrInputProcessed = src.mCurrInputProcessed;
        mCurrInputRow = src.mCurrInputRow;
        mCurrInputRowStart = src.mCurrInputRowStart;
    }

    /*
    ////////////////////////////////////////////////////
    // Public/package API, character classes
    ////////////////////////////////////////////////////
     */

    /**
     * Method that can be used to check whether specified character
     * is a valid first character of an XML 1.0/1.1 name; except that
     * colon (:) is not recognized as a start char here: caller has
     * to verify it separately (since it generally affects namespace
     * mapping of a qualified name).
     *<p>
     * XML version used is determined by {@link #mXml11}.
     *
     * @param c Character to check
     *
     * @return True if the character can start a name (colon excluded)
     */
    protected final boolean isNameStartChar(char c)
    {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c < 0x41) { // nothing before 'A' is accepted here (colon is up to caller)
                return false;
            }
            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
        }
        /* Ok, otherwise need to use a big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    /**
     * Method that can be used to check whether specified character
     * is a valid character of an XML 1.0/1.1 name as any other char than
     * the first one; except that colon (:) is not recognized as valid here:
     * caller has to verify it separately (since it generally affects namespace
     * mapping of a qualified name).
     *<p>
     * XML version used is determined by {@link #mXml11}.
     *
     * @param c Character to check
     *
     * @return True if the character can appear in a name after the first
     *   character (colon excluded)
     */
    protected final boolean isNameChar(char c)
    {
        // First, let's handle 7-bit ascii range
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c <= 0x5A) {
                if (c >= 0x41) { // 'A' - 'Z' ok too
                    return true;
                }
                // As are 0-9, '.' and '-'
                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
            }
            return (c == 0x5F); // '_' is ok too
        }
        return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Static variant of the name start character check, with explicit
     * configuration.
     *
     * @param c Character to check
     * @param nsAware Whether namespace-aware rules are used; if false,
     *   colon (:) is accepted as a start character, if true it is not
     * @param xml11 If true, XML 1.1 rules are used for characters beyond
     *   the ascii range; otherwise XML 1.0 rules
     *
     * @return True if the character can start a name
     */
    public final static boolean isNameStartChar(char c, boolean nsAware, boolean xml11)
    {
        /* First, let's handle 7-bit ascii range (identical between xml
         * 1.0 and 1.1)
         */
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c < 0x41) { // before 'A' only colon may be ok (':' == 0x3A)
                return (c == ':') && !nsAware;
            }
            return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
        }
        /* Ok, otherwise need to use a big honking bit sets... which
         * differ between 1.0 and 1.1
         */
        return xml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
    }

    /**
     * Static variant of the name character (non-first) check, with
     * explicit configuration.
     *
     * @param c Character to check
     * @param nsAware Whether namespace-aware rules are used; if false,
     *   colon (:) is accepted as a name character, if true it is not
     * @param xml11 If true, XML 1.1 rules are used for characters beyond
     *   the ascii range; otherwise XML 1.0 rules
     *
     * @return True if the character can appear in a name after the first
     *   character
     */
    public final static boolean isNameChar(char c, boolean nsAware, boolean xml11)
    {
        // First, let's handle 7-bit ascii range
        if (c <= 0x7A) { // 'z' or earlier
            if (c >= 0x61) { // 'a' - 'z' are ok
                return true;
            }
            if (c <= 0x5A) {
                if (c >= 0x41) { // 'A' - 'Z' ok too
                    return true;
                }
                // As are 0-9, '.' and '-'; and colon in non-ns-aware mode
                return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-')
                    || (c == ':' && !nsAware);
            }
            return (c == 0x5F); // '_' is ok too
        }
        return xml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
    }

    /**
     * Method that can be called to check whether given String contains
     * any characters that are not legal XML names.
     *<p>
     * Note: an empty String results in -1 being returned, as there is no
     * character to point to; caller needs to check for empty names
     * separately if those are to be rejected.
     *
     * @param name String to check
     * @param nsAware Whether namespace-aware rules are used; if false,
     *   colon (:) is accepted anywhere in the name, if true it is
     *   reported as illegal
     * @param xml11 If true, XML 1.1 rules are used for characters beyond
     *   the ascii range; otherwise XML 1.0 rules
     *
     * @return Index of the first illegal xml name characters, if any;
     *   -1 if the name is completely legal
     */
    public final static int findIllegalNameChar(String name, boolean nsAware, boolean xml11)
    {
        int len = name.length();
        if (len < 1) {
            return -1;
        }
        // First char has stricter rules than the rest
        if (!isNameStartChar(name.charAt(0), nsAware, xml11)) {
            return 0;
        }
        for (int i = 1; i < len; ++i) {
            if (!isNameChar(name.charAt(i), nsAware, xml11)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Method that can be called to check whether given String contains
     * any characters that are not legal in an XML name token (Nmtoken).
     * Unlike with names, there are no additional restrictions for the
     * first character: every character, including the first one, has to
     * be a valid name character.
     *<p>
     * Note: an empty String results in -1 being returned, as there is no
     * character to point to; caller needs to check for empty tokens
     * separately if those are to be rejected.
     *
     * @param nmtoken String to check
     * @param nsAware Whether namespace-aware rules are used; if false,
     *   colon (:) is accepted, if true it is reported as illegal
     * @param xml11 If true, XML 1.1 rules are used for characters beyond
     *   the ascii range; otherwise XML 1.0 rules
     *
     * @return Index of the first illegal name token character, if any;
     *   -1 if all characters are legal
     */
    public final static int findIllegalNmtokenChar(String nmtoken, boolean nsAware, boolean xml11)
    {
        int len = nmtoken.length();
        /* No special handling for the first char, just the loop: which
         * means loop must start from index 0 so that the first char gets
         * verified as well.
         */
        for (int i = 0; i < len; ++i) {
            if (!isNameChar(nmtoken.charAt(i), nsAware, xml11)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Simple range check: any character with code at or below that of
     * space (0x20) is considered a space character by this method; this
     * includes control characters below 0x20, not just the white space
     * characters.
     *
     * @param c Character to check
     *
     * @return True if character code is less than or equal to 0x20
     */
    public final static boolean isSpaceChar(char c)
    {
        return (c <= CHAR_SPACE);
    }

    /**
     * Method for constructing a description of given character, for use
     * in messages. ISO control characters are described by code only
     * (not included as-is); other characters are included in single
     * quotes along with the decimal code, and for characters above
     * code 255, hex code as well.
     *
     * @param c Character to describe
     *
     * @return Description of the character
     */
    public static String getCharDesc(char c)
    {
        int i = (int) c;
        if (Character.isISOControl(c)) {
            return "(CTRL-CHAR, code "+i+")";
        }
        if (i > 255) {
            return "'"+c+"' (code "+i+" / 0x"+Integer.toHexString(i)+")";
        }
        return "'"+c+"' (code "+i+")";
    }
}