All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ctc.wstx.io.WstxInputData Maven / Gradle / Ivy

There is a newer version: 0.3.0
Show newest version
/* Woodstox XML processor
 *
 * Copyright (c) 2004 Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in file LICENSE, included with
 * the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ctc.wstx.io;

import com.ctc.wstx.util.XmlChars;

/**
 * Base class used by readers (specifically, by
 * {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
 * to encapsulate input buffer portion of the class. Philosophically
 * this should probably be done via containment (composition), not
 * sub-classing but for performance reason, this "core" class is generally
 * extended from instead.
 *

* Main reason for the input data portion to be factored out of main * class is that this way it can also be passed to nested input handling * Objects, which can then manipulate input buffers of the caller, * efficiently. */ public class WstxInputData { // // // Some well-known chars: /** * Null-character is used as return value from some method(s), since * it is not a legal character in an XML document. */ public final static char CHAR_NULL = '\u0000'; public final static char INT_NULL = 0; public final static char CHAR_SPACE = (char) 0x0020; public final static char INT_SPACE = 0x0020; /** * This constant defines the highest Unicode character allowed * in XML content. */ public final static int MAX_UNICODE_CHAR = 0x10FFFF; /* //////////////////////////////////////////////////// // Character validity constants, structs //////////////////////////////////////////////////// */ /** * We will only use validity array for first 256 characters, mostly * because after those characters it's easier to do fairly simple * block checks. */ private final static int VALID_CHAR_COUNT = 0x100; // These are the same for both 1.0 and 1.1... // private final static int FIRST_VALID_FOR_FIRST = 0x0041; // 'A' // private final static int FIRST_VALID_FOR_REST = 0x002D; // '.' private final static byte NAME_CHAR_INVALID_B = (byte) 0; private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1; private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1; private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT]; static { /* First, since all valid-as-first chars are also valid-as-other chars, * we'll initialize common chars: */ sCharValidity['_'] = NAME_CHAR_ALL_VALID_B; for (int i = 0, last = ('z' - 'a'); i <= last; ++i) { sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B; sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B; } // not all are fully valid, but for (int i = 0xC0; i < VALID_CHAR_COUNT; ++i) { sCharValidity[i] = NAME_CHAR_ALL_VALID_B; } // ... 
now we can 'revert' ones not fully valid: sCharValidity[0xD7] = NAME_CHAR_INVALID_B; sCharValidity[0xF7] = NAME_CHAR_INVALID_B; /* And then we can proceed with ones only valid-as-other. */ sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B; sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B; sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B; for (int i = '0'; i <= '9'; ++i) { sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B; } } /** * Public identifiers only use 7-bit ascii range. */ private final static int VALID_PUBID_CHAR_COUNT = 0x80; private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT]; // private final static byte PUBID_CHAR_INVALID_B = (byte) 0; private final static byte PUBID_CHAR_VALID_B = (byte) 1; static { for (int i = 0, last = ('z' - 'a'); i <= last; ++i) { sPubidValidity['A' + i] = PUBID_CHAR_VALID_B; sPubidValidity['a' + i] = PUBID_CHAR_VALID_B; } for (int i = '0'; i <= '9'; ++i) { sPubidValidity[i] = PUBID_CHAR_VALID_B; } // 3 main white space types are valid sPubidValidity[0x0A] = PUBID_CHAR_VALID_B; sPubidValidity[0x0D] = PUBID_CHAR_VALID_B; sPubidValidity[0x20] = PUBID_CHAR_VALID_B; // And many of punctuation/separator ascii chars too: sPubidValidity['-'] = PUBID_CHAR_VALID_B; sPubidValidity['\''] = PUBID_CHAR_VALID_B; sPubidValidity['('] = PUBID_CHAR_VALID_B; sPubidValidity[')'] = PUBID_CHAR_VALID_B; sPubidValidity['+'] = PUBID_CHAR_VALID_B; sPubidValidity[','] = PUBID_CHAR_VALID_B; sPubidValidity['.'] = PUBID_CHAR_VALID_B; sPubidValidity['/'] = PUBID_CHAR_VALID_B; sPubidValidity[':'] = PUBID_CHAR_VALID_B; sPubidValidity['='] = PUBID_CHAR_VALID_B; sPubidValidity['?'] = PUBID_CHAR_VALID_B; sPubidValidity[';'] = PUBID_CHAR_VALID_B; sPubidValidity['!'] = PUBID_CHAR_VALID_B; sPubidValidity['*'] = PUBID_CHAR_VALID_B; sPubidValidity['#'] = PUBID_CHAR_VALID_B; sPubidValidity['@'] = PUBID_CHAR_VALID_B; sPubidValidity['$'] = PUBID_CHAR_VALID_B; sPubidValidity['_'] = PUBID_CHAR_VALID_B; sPubidValidity['%'] = PUBID_CHAR_VALID_B; } /* 
//////////////////////////////////////////////////// // Configuration //////////////////////////////////////////////////// */ /** * Flag that indicates whether XML content is to be treated as per * XML 1.1 specification or not (if not, it'll use xml 1.0). */ protected boolean mXml11 = false; /* //////////////////////////////////////////////////// // Current input data //////////////////////////////////////////////////// */ /** * Current buffer from which data is read; generally data is read into * buffer from input source, but not always (especially when using nested * input contexts when expanding parsed entity references etc). */ protected char[] mInputBuffer; /** * Pointer to next available character in buffer */ protected int mInputPtr = 0; /** * Index of character after last available one in the buffer. */ protected int mInputEnd = 0; /* //////////////////////////////////////////////////// // Current input location information //////////////////////////////////////////////////// */ /** * Number of characters that were contained in previous blocks * (blocks that were already processed prior to the current buffer). */ protected long mCurrInputProcessed = 0L; /** * Current row location of current point in input buffer, starting * from 1 */ protected int mCurrInputRow = 1; /** * Current index of the first character of the current row in input * buffer. Needed to calculate column position, if necessary; benefit * of not having column itself is that this only has to be updated * once per line. */ protected int mCurrInputRowStart = 0; /* //////////////////////////////////////////////////// // Life-cycle //////////////////////////////////////////////////// */ protected WstxInputData() { } /** * Note: Only public due to sub-classes needing to call this on * base class instance from different package (confusing?) 
*/ public void copyBufferStateFrom(WstxInputData src) { mInputBuffer = src.mInputBuffer; mInputPtr = src.mInputPtr; mInputEnd = src.mInputEnd; mCurrInputProcessed = src.mCurrInputProcessed; mCurrInputRow = src.mCurrInputRow; mCurrInputRowStart = src.mCurrInputRowStart; } /* //////////////////////////////////////////////////// // Public/package API, character classes //////////////////////////////////////////////////// */ /** * Method that can be used to check whether specified character * is a valid first character of an XML 1.0/1.1 name; except that * colon (:) is not recognized as a start char here: caller has * to verify it separately (since it generally affects namespace * mapping of a qualified name). */ protected final boolean isNameStartChar(char c) { /* First, let's handle 7-bit ascii range (identical between xml * 1.0 and 1.1) */ if (c <= 0x7A) { // 'z' or earlier if (c >= 0x61) { // 'a' - 'z' are ok return true; } if (c < 0x41) { // before 'A' just white space return false; } return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok } /* Ok, otherwise need to use a big honking bit sets... which * differ between 1.0 and 1.1 */ return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c); } /** * Method that can be used to check whether specified character * is a valid character of an XML 1.0/1.1 name as any other char than * the first one; except that colon (:) is not recognized as valid here: * caller has to verify it separately (since it generally affects namespace * mapping of a qualified name). */ protected final boolean isNameChar(char c) { // First, let's handle 7-bit ascii range if (c <= 0x7A) { // 'z' or earlier if (c >= 0x61) { // 'a' - 'z' are ok return true; } if (c <= 0x5A) { if (c >= 0x41) { // 'A' - 'Z' ok too return true; } // As are 0-9, '.' and '-' return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-'); } return (c == 0x5F); // '_' is ok too } return mXml11 ? 
XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c); } public final static boolean isNameStartChar(char c, boolean nsAware, boolean xml11) { /* First, let's handle 7-bit ascii range (identical between xml * 1.0 and 1.1) */ if (c <= 0x7A) { // 'z' or earlier if (c >= 0x61) { // 'a' - 'z' are ok return true; } if (c < 0x41) { // before 'A' just white space (and colon) if (c == ':' && !nsAware) { return true; } return false; } return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok } /* Ok, otherwise need to use a big honking bit sets... which * differ between 1.0 and 1.1 */ return xml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c); } public final static boolean isNameChar(char c, boolean nsAware, boolean xml11) { // First, let's handle 7-bit ascii range if (c <= 0x7A) { // 'z' or earlier if (c >= 0x61) { // 'a' - 'z' are ok return true; } if (c <= 0x5A) { if (c >= 0x41) { // 'A' - 'Z' ok too return true; } // As are 0-9, '.' and '-' return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-') || (c == ':' && !nsAware); } return (c == 0x5F); // '_' is ok too } return xml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c); } /** * Method that can be called to check whether given String contains * any characters that are not legal XML names. * * @return Index of the first illegal xml name characters, if any; * -1 if the name is completely legal */ public final static int findIllegalNameChar(String name, boolean nsAware, boolean xml11) { int len = name.length(); if (len < 1) { return -1; } char c = name.charAt(0); // First char legal? 
if (c <= 0x7A) { // 'z' or earlier if (c < 0x61) { // 'a' - 'z' (0x61 - 0x7A) are ok if (c < 0x41) { // before 'A' just white space (except colon) if (c != ':' || nsAware) { // ':' == 0x3A return 0; } } else if ((c > 0x5A) && (c != '_')) { // 'A' - 'Z' and '_' are ok return 0; } } } else { if (xml11) { if (!XmlChars.is11NameStartChar(c)) { return 0; } } else { if (!XmlChars.is10NameStartChar(c)) { return 0; } } } for (int i = 1; i < len; ++i) { c = name.charAt(i); if (c <= 0x7A) { // 'z' or earlier if (c >= 0x61) { // 'a' - 'z' are ok continue; } if (c <= 0x5A) { if (c >= 0x41) { // 'A' - 'Z' ok too continue; } // As are 0-9, '.' and '-' if ((c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-')) { continue; } // And finally, colon, in non-ns-aware mode if (c == ':' && !nsAware) { // ':' == 0x3A continue; } } else if (c == 0x5F) { // '_' is ok too continue; } } else { if (xml11) { if (XmlChars.is11NameChar(c)) { continue; } } else { if (XmlChars.is10NameChar(c)) { continue; } } } return i; } return -1; } public final static int findIllegalNmtokenChar(String nmtoken, boolean nsAware, boolean xml11) { int len = nmtoken.length(); // No special handling for the first char, just the loop for (int i = 1; i < len; ++i) { char c = nmtoken.charAt(i); if (c <= 0x7A) { // 'z' or earlier if (c >= 0x61) { // 'a' - 'z' are ok continue; } if (c <= 0x5A) { if (c >= 0x41) { // 'A' - 'Z' ok too continue; } // As are 0-9, '.' 
and '-' if ((c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-')) { continue; } // And finally, colon, in non-ns-aware mode if (c == ':' && !nsAware) { // ':' == 0x3A continue; } } else if (c == 0x5F) { // '_' is ok too continue; } } else { if (xml11) { if (XmlChars.is11NameChar(c)) { continue; } } else { if (XmlChars.is10NameChar(c)) { continue; } } } return i; } return -1; } public final static boolean isSpaceChar(char c) { return (c <= CHAR_SPACE); } public static String getCharDesc(char c) { int i = (int) c; if (Character.isISOControl(c)) { return "(CTRL-CHAR, code "+i+")"; } if (i > 255) { return "'"+c+"' (code "+i+" / 0x"+Integer.toHexString(i)+")"; } return "'"+c+"' (code "+i+")"; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy