All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.abubusoft.kripton.xml.XMLParser Maven / Gradle / Ivy

There is a newer version: 8.2.0-rc.4
Show newest version
/* Copyright (c) 2002,2003, Stefan Haustein, Oberhausen, Rhld., Germany
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The  above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE. */

// Contributors: Paul Hackenberger (unterminated entity handling in relaxed mode)

package com.abubusoft.kripton.xml;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.Map;

import com.abubusoft.kripton.common.Base64Utils;
import com.abubusoft.kripton.common.StringUtils;
import com.abubusoft.kripton.exception.KriptonRuntimeException;
import com.abubusoft.kripton.persistence.xml.internal.StringPool;

/**
 * An XML pull parser with limited support for parsing internal DTDs.
 */
public class XMLParser implements XmlPullParser, Closeable {

	// Identifiers in the xmlpull.org property/feature URI space. The accessors
	// that consult them are outside this view — confirm usage there.

	/** Property URI for the version given in the XML declaration. */
	private static final String PROPERTY_XMLDECL_VERSION = "http://xmlpull.org/v1/doc/properties.html#xmldecl-version";
	/** Property URI for the standalone flag given in the XML declaration. */
	private static final String PROPERTY_XMLDECL_STANDALONE = "http://xmlpull.org/v1/doc/properties.html#xmldecl-standalone";
	/** Property URI for the location string of the input. */
	private static final String PROPERTY_LOCATION = "http://xmlpull.org/v1/doc/properties.html#location";
	/** Feature URI that switches the parser into relaxed (error-tolerant) mode. */
	private static final String FEATURE_RELAXED = "http://xmlpull.org/v1/doc/features.html#relaxed";

	/**
	 * The five entities predefined by XML, keyed by entity name (without the
	 * surrounding {@code &} and {@code ;}) and mapped to their replacement
	 * text. The map is typed so that lookups yield a {@code String} directly.
	 */
	private static final Map<String, String> DEFAULT_ENTITIES = new HashMap<String, String>();
	static {
		DEFAULT_ENTITIES.put("lt", "<");
		DEFAULT_ENTITIES.put("gt", ">");
		DEFAULT_ENTITIES.put("amp", "&");
		DEFAULT_ENTITIES.put("apos", "'");
		DEFAULT_ENTITIES.put("quot", "\"");
	}

	// Token types that only occur while reading the internal DTD subset.
	private static final int ELEMENTDECL = 11;
	private static final int ENTITYDECL = 12;
	private static final int ATTLISTDECL = 13;
	private static final int NOTATIONDECL = 14;
	private static final int PARAMETER_ENTITY_REF = 15;

	// Markup delimiters, kept as char arrays so they can be compared against
	// the read buffer without allocating strings.
	private static final char[] START_COMMENT = "<!--".toCharArray();
	private static final char[] END_COMMENT = "-->".toCharArray();
	private static final char[] COMMENT_DOUBLE_DASH = "--".toCharArray();
	private static final char[] START_CDATA = "<![CDATA[".toCharArray();
	private static final char[] END_CDATA = "]]>".toCharArray();
	private static final char[] START_PROCESSING_INSTRUCTION = "<?".toCharArray();
	private static final char[] END_PROCESSING_INSTRUCTION = "?>".toCharArray();
	private static final char[] START_DOCTYPE = "<!DOCTYPE".toCharArray();
	private static final char[] SYSTEM = "SYSTEM".toCharArray();
	private static final char[] PUBLIC = "PUBLIC".toCharArray();
	private static final char[] START_ELEMENT = "<!ELEMENT".toCharArray();
	private static final char[] START_ATTLIST = "<!ATTLIST".toCharArray();
	private static final char[] START_ENTITY = "<!ENTITY".toCharArray();
	private static final char[] START_NOTATION = "<!NOTATION".toCharArray();

	// Keywords that appear inside DTD declarations.
	private static final char[] EMPTY = "EMPTY".toCharArray();
	private static final char[] ANY = "ANY".toCharArray();
	private static final char[] NDATA = "NDATA".toCharArray();
	private static final char[] NOTATION = "NOTATION".toCharArray();
	private static final char[] REQUIRED = "REQUIRED".toCharArray();
	private static final char[] IMPLIED = "IMPLIED".toCharArray();
	private static final char[] FIXED = "FIXED".toCharArray();

	private static final String UNEXPECTED_EOF = "Unexpected EOF";
	private static final String ILLEGAL_TYPE = "Wrong event type";

	/** Internal-only token type reported when the input starts with an XML declaration. */
	private static final int XML_DECLARATION = 998;

	// general
	private String location;

	private String version;
	private Boolean standalone;
	private String rootElementName;
	private String systemId;
	private String publicId;

	/**
	 * True if the {@code } contents are handled. The DTD defines
	 * entity values and default attribute values. These values are parsed at
	 * inclusion time and may contain both tags and entity references.
	 *
	 * 

* If this is false, the user must {@link #defineEntityReplacementText * define entity values manually}. Such entity values are literal strings * and will not be parsed. There is no API to define default attributes * manually. */ private boolean processDocDecl; private boolean processNsp; private boolean relaxed; private boolean keepNamespaceAttributes; /** * If non-null, the contents of the read buffer must be copied into this * string builder before the read buffer is overwritten. This is used to * capture the raw DTD text while parsing the DTD. */ private StringBuilder bufferCapture; /** * Entities defined in or for this document. This map is created lazily. */ private Map documentEntities; /** * Default attributes in this document. The outer map's key is the element * name; the inner map's key is the attribute name. Both keys should be * without namespace adjustments. This map is created lazily. */ private Map> defaultAttributes; private int depth; private String[] elementStack = new String[16]; private String[] nspStack = new String[8]; private int[] nspCounts = new int[4]; // source private Reader reader; private String encoding; private ContentSource nextContentSource; private char[] buffer = new char[8192]; private int position = 0; private int limit = 0; /* * Track the number of newlines and columns preceding the current buffer. To * compute the line and column of a position in the buffer, compute the line * and column in the buffer and add the preceding values. 
*/ private int bufferStartLine; private int bufferStartColumn; // the current token private int type; private boolean isWhitespace; private String namespace; private String prefix; private String name; private String text; private boolean degenerated; private int attributeCount; /* * The current element's attributes arranged in groups of 4: i + 0 = * attribute namespace URI i + 1 = attribute namespace prefix i + 2 = * attribute qualified name (may contain ":", as in "html:h1") i + 3 = * attribute value */ private String[] attributes = new String[16]; private String error; private boolean unresolved; public final StringPool stringPool = new StringPool(); /** * Retains namespace attributes like {@code xmlns="http://foo"} or * {@code xmlns:foo="http:foo"} in pulled elements. Most applications will * only be interested in the effective namespaces of their elements, so * these attributes aren't useful. But for structure preserving wrappers * like DOM, it is necessary to keep the namespace data around. 
*/ public void keepNamespaceAttributes() { this.keepNamespaceAttributes = true; } private boolean adjustNsp() { boolean any = false; for (int i = 0; i < attributeCount << 2; i += 4) { String attrName = attributes[i + 2]; int cut = attrName.indexOf(':'); String prefix; if (cut != -1) { prefix = attrName.substring(0, cut); attrName = attrName.substring(cut + 1); } else if (attrName.equals("xmlns")) { prefix = attrName; attrName = null; } else { continue; } if (!prefix.equals("xmlns")) { any = true; } else { int j = (nspCounts[depth]++) << 1; nspStack = ensureCapacity(nspStack, j + 2); nspStack[j] = attrName; nspStack[j + 1] = attributes[i + 3]; if (attrName != null && attributes[i + 3].isEmpty()) { checkRelaxed("illegal empty namespace"); } if (keepNamespaceAttributes) { // explicitly set the namespace for unprefixed attributes // such as xmlns="http://foo" attributes[i] = "http://www.w3.org/2000/xmlns/"; any = true; } else { System.arraycopy(attributes, i + 4, attributes, i, ((--attributeCount) << 2) - i); i -= 4; } } } if (any) { for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) { String attrName = attributes[i + 2]; int cut = attrName.indexOf(':'); if (cut == 0 && !relaxed) { throw new RuntimeException("illegal attribute name: " + attrName + " at " + this); } else if (cut != -1) { String attrPrefix = attrName.substring(0, cut); attrName = attrName.substring(cut + 1); String attrNs = getNamespace(attrPrefix); if (attrNs == null && !relaxed) { throw new RuntimeException("Undefined Prefix: " + attrPrefix + " in " + this); } attributes[i] = attrNs; attributes[i + 1] = attrPrefix; attributes[i + 2] = attrName; } } } int cut = name.indexOf(':'); if (cut == 0) { checkRelaxed("illegal tag name: " + name); } if (cut != -1) { prefix = name.substring(0, cut); name = name.substring(cut + 1); } this.namespace = getNamespace(prefix); if (this.namespace == null) { if (prefix != null) { checkRelaxed("undefined prefix: " + prefix); } this.namespace = NO_NAMESPACE; } return 
any; } private String[] ensureCapacity(String[] arr, int required) { if (arr.length >= required) { return arr; } String[] bigger = new String[required + 16]; System.arraycopy(arr, 0, bigger, 0, arr.length); return bigger; } private void checkRelaxed(String errorMessage) { if (!relaxed) { throw new KriptonRuntimeException(errorMessage, this, null); } if (error == null) { error = "Error: " + errorMessage; } } @Override public int next() throws KriptonRuntimeException, IOException { return next(false); } @Override public int nextToken() throws KriptonRuntimeException, IOException { return next(true); } private int next(boolean justOneToken) throws IOException, KriptonRuntimeException { if (reader == null) { throw new KriptonRuntimeException("setInput() must be called first.", this, null); } if (type == END_TAG) { depth--; } // degenerated needs to be handled before error because of possible // processor expectations(!) if (degenerated) { degenerated = false; type = END_TAG; return type; } if (error != null) { if (justOneToken) { text = error; type = COMMENT; error = null; return type; } else { error = null; } } type = peekType(false); if (type == XML_DECLARATION) { readXmlDeclaration(); type = peekType(false); } text = null; isWhitespace = true; prefix = null; name = null; namespace = null; attributeCount = -1; boolean throwOnResolveFailure = !justOneToken; while (true) { switch (type) { /* * Return immediately after encountering a start tag, end tag, or * the end of the document. */ case START_TAG: parseStartTag(false, throwOnResolveFailure); return type; case END_TAG: readEndTag(); return type; case END_DOCUMENT: return type; /* * Return after any text token when we're looking for a single * token. Otherwise concatenate all text between tags. 
*/ case ENTITY_REF: if (justOneToken) { StringBuilder entityTextBuilder = new StringBuilder(); readEntity(entityTextBuilder, true, throwOnResolveFailure, ValueContext.TEXT); text = entityTextBuilder.toString(); break; } // fall-through case TEXT: text = readValue('<', !justOneToken, throwOnResolveFailure, ValueContext.TEXT); if (depth == 0 && isWhitespace) { type = IGNORABLE_WHITESPACE; } break; case CDSECT: read(START_CDATA); text = readUntil(END_CDATA, true); break; /* * Comments, processing instructions and declarations are returned * when we're looking for a single token. Otherwise they're skipped. */ case COMMENT: String commentText = readComment(justOneToken); if (justOneToken) { text = commentText; } break; case PROCESSING_INSTRUCTION: read(START_PROCESSING_INSTRUCTION); String processingInstruction = readUntil(END_PROCESSING_INSTRUCTION, justOneToken); if (justOneToken) { text = processingInstruction; } break; case DOCDECL: readDoctype(justOneToken); break; default: throw new KriptonRuntimeException("Unexpected token", this, null); } if (depth == 0 && (type == ENTITY_REF || type == TEXT || type == CDSECT)) { throw new KriptonRuntimeException("Unexpected token", this, null); } if (justOneToken) { return type; } if (type == IGNORABLE_WHITESPACE) { text = null; } /* * We've read all that we can of a non-empty text block. Always * report this as text, even if it was a CDATA block or entity * reference. */ int peek = peekType(false); if (text != null && !text.isEmpty() && peek < TEXT) { type = TEXT; return type; } type = peek; } } /** * Reads text until the specified delimiter is encountered. Consumes the * text and the delimiter. * * @param returnText * true to return the read text excluding the delimiter; false to * return null. 
*/ private String readUntil(char[] delimiter, boolean returnText) throws IOException, KriptonRuntimeException { int start = position; StringBuilder result = null; if (returnText && text != null) { result = new StringBuilder(); result.append(text); } search: while (true) { if (position + delimiter.length >= limit) { if (start < position && returnText) { if (result == null) { result = new StringBuilder(); } result.append(buffer, start, position - start); } if (!fillBuffer(delimiter.length)) { checkRelaxed(UNEXPECTED_EOF); type = COMMENT; return null; } start = position; } // TODO: replace with Arrays.equals(buffer, position, delimiter, 0, // delimiter.length) // when the VM has better method inlining for (int i = 0; i < delimiter.length; i++) { if (buffer[position + i] != delimiter[i]) { position++; continue search; } } break; } int end = position; position += delimiter.length; if (!returnText) { return null; } else if (result == null) { return stringPool.get(buffer, start, end - start); } else { result.append(buffer, start, end - start); return result.toString(); } } /** * Returns true if an XML declaration was read. 
*/ private void readXmlDeclaration() throws IOException, KriptonRuntimeException { if (bufferStartLine != 0 || bufferStartColumn != 0 || position != 0) { checkRelaxed("processing instructions must not start with xml"); } read(START_PROCESSING_INSTRUCTION); parseStartTag(true, true); if (attributeCount < 1 || !"version".equals(attributes[2])) { checkRelaxed("version expected"); } version = attributes[3]; int pos = 1; if (pos < attributeCount && "encoding".equals(attributes[2 + 4])) { encoding = attributes[3 + 4]; pos++; } if (pos < attributeCount && "standalone".equals(attributes[4 * pos + 2])) { String st = attributes[3 + 4 * pos]; if ("yes".equals(st)) { standalone = Boolean.TRUE; } else if ("no".equals(st)) { standalone = Boolean.FALSE; } else { checkRelaxed("illegal standalone value: " + st); } pos++; } if (pos != attributeCount) { checkRelaxed("unexpected attributes in XML declaration"); } isWhitespace = true; text = null; } private String readComment(boolean returnText) throws IOException, KriptonRuntimeException { read(START_COMMENT); if (relaxed) { return readUntil(END_COMMENT, returnText); } String commentText = readUntil(COMMENT_DOUBLE_DASH, returnText); if (peekCharacter() != '>') { throw new KriptonRuntimeException("Comments may not contain --", this, null); } position++; return commentText; } /** * Read the document's DTD. Although this parser is non-validating, the DTD * must be parsed to capture entity values and default attribute values. 
*/ private void readDoctype(boolean saveDtdText) throws IOException, KriptonRuntimeException { read(START_DOCTYPE); int startPosition = -1; if (saveDtdText) { bufferCapture = new StringBuilder(); startPosition = position; } try { skip(); rootElementName = readName(); readExternalId(true, true); skip(); if (peekCharacter() == '[') { readInternalSubset(); } skip(); } finally { if (saveDtdText) { bufferCapture.append(buffer, 0, position); bufferCapture.delete(0, startPosition); text = bufferCapture.toString(); bufferCapture = null; } } read('>'); } /** * Reads an external ID of one of these two forms: SYSTEM "quoted system * name" PUBLIC "quoted public id" "quoted system name" * * If the system name is not required, this also supports lone public IDs of * this form: PUBLIC "quoted public id" * * Returns true if any ID was read. */ private boolean readExternalId(boolean requireSystemName, boolean assignFields) throws IOException, KriptonRuntimeException { skip(); int c = peekCharacter(); if (c == 'S') { read(SYSTEM); } else if (c == 'P') { read(PUBLIC); skip(); if (assignFields) { publicId = readQuotedId(true); } else { readQuotedId(false); } } else { return false; } skip(); if (!requireSystemName) { int delimiter = peekCharacter(); if (delimiter != '"' && delimiter != '\'') { return true; // no system name! } } if (assignFields) { systemId = readQuotedId(true); } else { readQuotedId(false); } return true; } private static final char[] SINGLE_QUOTE = new char[] { '\'' }; private static final char[] DOUBLE_QUOTE = new char[] { '"' }; /** * Reads a quoted string, performing no entity escaping of the contents. 
*/ private String readQuotedId(boolean returnText) throws IOException, KriptonRuntimeException { int quote = peekCharacter(); char[] delimiter; if (quote == '"') { delimiter = DOUBLE_QUOTE; } else if (quote == '\'') { delimiter = SINGLE_QUOTE; } else { throw new KriptonRuntimeException("Expected a quoted string", this, null); } position++; return readUntil(delimiter, returnText); } private void readInternalSubset() throws IOException, KriptonRuntimeException { read('['); while (true) { skip(); if (peekCharacter() == ']') { position++; return; } int declarationType = peekType(true); switch (declarationType) { case ELEMENTDECL: readElementDeclaration(); break; case ATTLISTDECL: readAttributeListDeclaration(); break; case ENTITYDECL: readEntityDeclaration(); break; case NOTATIONDECL: readNotationDeclaration(); break; case PROCESSING_INSTRUCTION: read(START_PROCESSING_INSTRUCTION); readUntil(END_PROCESSING_INSTRUCTION, false); break; case COMMENT: readComment(false); break; case PARAMETER_ENTITY_REF: throw new KriptonRuntimeException("Parameter entity references are not supported", this, null); default: throw new KriptonRuntimeException("Unexpected token", this, null); } } } /** * Read an element declaration. This contains a name and a content spec. * */ private void readElementDeclaration() throws IOException, KriptonRuntimeException { read(START_ELEMENT); skip(); readName(); readContentSpec(); skip(); read('>'); } /** * Read an element content spec. This is a regular expression-like pattern * of names or other content specs. The following operators are supported: * sequence: (a,b,c) choice: (a|b|c) optional: a? one or more: a+ any * number: a* * * The special name '#PCDATA' is permitted but only if it is the first * element of the first group: (#PCDATA|a|b) * * The top-level element must be either a choice, a sequence, or one of the * special names EMPTY and ANY. 
*/ private void readContentSpec() throws IOException, KriptonRuntimeException { // this implementation is very lenient; it scans for balanced parens // only skip(); int c = peekCharacter(); if (c == '(') { int depth = 0; do { if (c == '(') { depth++; } else if (c == ')') { depth--; } position++; c = peekCharacter(); } while (depth > 0); if (c == '*' || c == '?' || c == '+') { position++; } } else if (c == EMPTY[0]) { read(EMPTY); } else if (c == ANY[0]) { read(ANY); } else { throw new KriptonRuntimeException("Expected element content spec", this, null); } } /** * Reads an attribute list declaration such as the following: * * Each attribute has a name, type and default. * * Types are one of the built-in types (CDATA, ID, IDREF, IDREFS, ENTITY, * ENTITIES, NMTOKEN, or NMTOKENS), an enumerated type "(list|of|options)" * or NOTATION followed by an enumerated type. * * The default is either #REQUIRED, #IMPLIED, #FIXED, a quoted value, or * #FIXED with a quoted value. */ private void readAttributeListDeclaration() throws IOException, KriptonRuntimeException { read(START_ATTLIST); skip(); String elementName = readName(); while (true) { skip(); int c = peekCharacter(); if (c == '>') { position++; return; } // attribute name String attributeName = readName(); // attribute type skip(); if (position + 1 >= limit && !fillBuffer(2)) { throw new KriptonRuntimeException("Malformed attribute list", this, null); } if (buffer[position] == NOTATION[0] && buffer[position + 1] == NOTATION[1]) { read(NOTATION); skip(); } c = peekCharacter(); if (c == '(') { position++; while (true) { skip(); readName(); skip(); c = peekCharacter(); if (c == ')') { position++; break; } else if (c == '|') { position++; } else { throw new KriptonRuntimeException("Malformed attribute type", this, null); } } } else { readName(); } // default value skip(); c = peekCharacter(); if (c == '#') { position++; c = peekCharacter(); if (c == 'R') { read(REQUIRED); } else if (c == 'I') { read(IMPLIED); } else if (c == 
'F') { read(FIXED); } else { throw new KriptonRuntimeException("Malformed attribute type", this, null); } skip(); c = peekCharacter(); } if (c == '"' || c == '\'') { position++; // TODO: does this do escaping correctly? String value = readValue((char) c, true, true, ValueContext.ATTRIBUTE); position++; defineAttributeDefault(elementName, attributeName, value); } } } private void defineAttributeDefault(String elementName, String attributeName, String value) { if (defaultAttributes == null) { defaultAttributes = new HashMap>(); } Map elementAttributes = defaultAttributes.get(elementName); if (elementAttributes == null) { elementAttributes = new HashMap(); defaultAttributes.put(elementName, elementAttributes); } elementAttributes.put(attributeName, value); } /** * Read an entity declaration. The value of internal entities are inline: * * * The values of external entities must be retrieved by URL or path: * * * Entities may be general or parameterized. Parameterized entities are * marked by a percent sign. Such entities may only be used in the DTD: * */ private void readEntityDeclaration() throws IOException, KriptonRuntimeException { read(START_ENTITY); boolean generalEntity = true; skip(); if (peekCharacter() == '%') { generalEntity = false; position++; skip(); } String name = readName(); skip(); int quote = peekCharacter(); String entityValue; if (quote == '"' || quote == '\'') { position++; entityValue = readValue((char) quote, true, false, ValueContext.ENTITY_DECLARATION); position++; } else if (readExternalId(true, false)) { /* * Map external entities to the empty string. This is dishonest, but * it's consistent with Android's Expat pull parser. 
*/ entityValue = ""; skip(); if (peekCharacter() == NDATA[0]) { read(NDATA); skip(); readName(); } } else { throw new KriptonRuntimeException("Expected entity value or external ID", this, null); } if (generalEntity && processDocDecl) { if (documentEntities == null) { documentEntities = new HashMap(); } documentEntities.put(name, entityValue.toCharArray()); } skip(); read('>'); } private void readNotationDeclaration() throws IOException, KriptonRuntimeException { read(START_NOTATION); skip(); readName(); if (!readExternalId(false, false)) { throw new KriptonRuntimeException("Expected external ID or public ID for notation", this, null); } skip(); read('>'); } private void readEndTag() throws IOException, KriptonRuntimeException { read('<'); read('/'); name = readName(); // TODO: pass the expected name in as a hint? skip(); read('>'); int sp = (depth - 1) * 4; if (depth == 0) { checkRelaxed("read end tag " + name + " with no tags open"); type = COMMENT; return; } if (name.equals(elementStack[sp + 3])) { namespace = elementStack[sp]; prefix = elementStack[sp + 1]; name = elementStack[sp + 2]; } else if (!relaxed) { throw new KriptonRuntimeException("expected: /" + elementStack[sp + 3] + " read: " + name, this, null); } } @Override public boolean hasNext() { return getEventType() != END_DOCUMENT; } /** * Returns the type of the next token. 
*/ private int peekType(boolean inDeclaration) throws IOException, KriptonRuntimeException { if (position >= limit && !fillBuffer(1)) { return END_DOCUMENT; } switch (buffer[position]) { case '&': return ENTITY_REF; // & case '<': if (position + 3 >= limit && !fillBuffer(4)) { throw new KriptonRuntimeException("Dangling <", this, null); } switch (buffer[position + 1]) { case '/': return END_TAG; // = limit && !fillBuffer(1)) { checkRelaxed(UNEXPECTED_EOF); return; } int c = buffer[position]; if (xmldecl) { if (c == '?') { position++; read('>'); return; } } else { if (c == '/') { degenerated = true; position++; skip(); read('>'); break; } else if (c == '>') { position++; break; } } String attrName = readName(); int i = (attributeCount++) * 4; attributes = ensureCapacity(attributes, i + 4); attributes[i] = ""; attributes[i + 1] = null; attributes[i + 2] = attrName; skip(); if (position >= limit && !fillBuffer(1)) { checkRelaxed(UNEXPECTED_EOF); return; } if (buffer[position] == '=') { position++; skip(); if (position >= limit && !fillBuffer(1)) { checkRelaxed(UNEXPECTED_EOF); return; } char delimiter = buffer[position]; if (delimiter == '\'' || delimiter == '"') { position++; } else if (relaxed) { delimiter = ' '; } else { throw new KriptonRuntimeException("attr value delimiter missing!", this, null); } attributes[i + 3] = readValue(delimiter, true, throwOnResolveFailure, ValueContext.ATTRIBUTE); if (delimiter != ' ') { position++; // end quote } } else if (relaxed) { attributes[i + 3] = attrName; } else { checkRelaxed("Attr.value missing f. 
" + attrName); attributes[i + 3] = attrName; } } int sp = depth++ * 4; elementStack = ensureCapacity(elementStack, sp + 4); elementStack[sp + 3] = name; if (depth >= nspCounts.length) { int[] bigger = new int[depth + 4]; System.arraycopy(nspCounts, 0, bigger, 0, nspCounts.length); nspCounts = bigger; } nspCounts[depth] = nspCounts[depth - 1]; if (processNsp) { adjustNsp(); } else { namespace = ""; } // For consistency with Expat, add default attributes after fixing // namespaces. if (defaultAttributes != null) { Map elementDefaultAttributes = defaultAttributes.get(name); if (elementDefaultAttributes != null) { for (Map.Entry entry : elementDefaultAttributes.entrySet()) { if (getAttributeValue(null, entry.getKey()) != null) { continue; // an explicit value overrides the default } int i = (attributeCount++) * 4; attributes = ensureCapacity(attributes, i + 4); attributes[i] = ""; attributes[i + 1] = null; attributes[i + 2] = entry.getKey(); attributes[i + 3] = entry.getValue(); } } } elementStack[sp] = namespace; elementStack[sp + 1] = prefix; elementStack[sp + 2] = name; } /** * Reads an entity reference from the buffer, resolves it, and writes the * resolved entity to {@code out}. If the entity cannot be read or resolved, * {@code out} will contain the partial entity reference. 
*/ private void readEntity(StringBuilder out, boolean isEntityToken, boolean throwOnResolveFailure, ValueContext valueContext) throws IOException, KriptonRuntimeException { int start = out.length(); if (buffer[position++] != '&') { throw new AssertionError(); } out.append('&'); while (true) { int c = peekCharacter(); if (c == ';') { out.append(';'); position++; break; } else if (c >= 128 || (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-' || c == '#') { position++; out.append((char) c); } else if (relaxed) { // intentionally leave the partial reference in 'out' return; } else { throw new KriptonRuntimeException("unterminated entity ref", this, null); } } String code = out.substring(start + 1, out.length() - 1); if (isEntityToken) { name = code; } if (code.startsWith("#")) { try { int c = code.startsWith("#x") ? Integer.parseInt(code.substring(2), 16) : Integer.parseInt(code.substring(1)); out.delete(start, out.length()); out.appendCodePoint(c); unresolved = false; return; } catch (NumberFormatException notANumber) { throw new KriptonRuntimeException("Invalid character reference: &" + code); } catch (IllegalArgumentException invalidCodePoint) { throw new KriptonRuntimeException("Invalid character reference: &" + code); } } if (valueContext == ValueContext.ENTITY_DECLARATION) { // keep the unresolved &code; in the text to resolve later return; } String defaultEntity = DEFAULT_ENTITIES.get(code); if (defaultEntity != null) { out.delete(start, out.length()); unresolved = false; out.append(defaultEntity); return; } char[] resolved; if (documentEntities != null && (resolved = documentEntities.get(code)) != null) { out.delete(start, out.length()); unresolved = false; if (processDocDecl) { pushContentSource(resolved); // parse the entity as XML } else { out.append(resolved); // include the entity value as text } return; } /* * The parser skipped an external DTD, and now we've encountered an * unknown entity that could have 
been declared there. Map it to the * empty string. This is dishonest, but it's consistent with Android's * old ExpatPullParser. */ if (systemId != null) { out.delete(start, out.length()); return; } // keep the unresolved entity "&code;" in the text for relaxed clients unresolved = true; if (throwOnResolveFailure) { checkRelaxed("unresolved: &" + code + ";"); } } /** * Where a value is found impacts how that value is interpreted. For * example, in attributes, "\n" must be replaced with a space character. In * text, "]]>" is forbidden. In entity declarations, named references are * not resolved. */ enum ValueContext { ATTRIBUTE, TEXT, ENTITY_DECLARATION } /** * Returns the current text or attribute value. This also has the side * effect of setting isWhitespace to false if a non-whitespace character is * encountered. * * @param delimiter * {@code <} for text, {@code "} and {@code '} for quoted * attributes, or a space for unquoted attributes. */ private String readValue(char delimiter, boolean resolveEntities, boolean throwOnResolveFailure, ValueContext valueContext) throws IOException, KriptonRuntimeException { /* * This method returns all of the characters from the current position * through to an appropriate delimiter. * * If we're lucky (which we usually are), we'll return a single slice of * the buffer. This fast path avoids allocating a string builder. * * There are 6 unlucky characters we could encounter: - "&": entities * must be resolved. - "%": parameter entities are unsupported in entity * values. - "<": this isn't permitted in attributes unless relaxed. - * "]": this requires a lookahead to defend against the forbidden CDATA * section delimiter "]]>". - "\r": If a "\r" is followed by a "\n", we * discard the "\r". If it isn't followed by "\n", we replace "\r" with * either a "\n" in text nodes or a space in attribute values. - "\n": * In attribute values, "\n" must be replaced with a space. 
* * We could also get unlucky by needing to refill the buffer midway * through the text. */ int start = position; StringBuilder result = null; // if a text section was already started, prefix the start if (valueContext == ValueContext.TEXT && text != null) { result = new StringBuilder(); result.append(text); } while (true) { /* * Make sure we have at least a single character to read from the * buffer. This mutates the buffer, so save the partial result to * the slow path string builder first. */ if (position >= limit) { if (start < position) { if (result == null) { result = new StringBuilder(); } result.append(buffer, start, position - start); } if (!fillBuffer(1)) { return result != null ? result.toString() : ""; } start = position; } char c = buffer[position]; if (c == delimiter || (delimiter == ' ' && (c <= ' ' || c == '>')) || c == '&' && !resolveEntities) { break; } if (c != '\r' && (c != '\n' || valueContext != ValueContext.ATTRIBUTE) && c != '&' && c != '<' && (c != ']' || valueContext != ValueContext.TEXT) && (c != '%' || valueContext != ValueContext.ENTITY_DECLARATION)) { isWhitespace &= (c <= ' '); position++; continue; } /* * We've encountered an unlucky character! Convert from fast path to * slow path if we haven't done so already. */ if (result == null) { result = new StringBuilder(); } result.append(buffer, start, position - start); if (c == '\r') { if ((position + 1 < limit || fillBuffer(2)) && buffer[position + 1] == '\n') { position++; } c = (valueContext == ValueContext.ATTRIBUTE) ? ' ' : '\n'; } else if (c == '\n') { c = ' '; } else if (c == '&') { isWhitespace = false; // TODO: what if the entity resolves to // whitespace? 
readEntity(result, false, throwOnResolveFailure, valueContext); start = position; continue; } else if (c == '<') { if (valueContext == ValueContext.ATTRIBUTE) { checkRelaxed("Illegal: \"<\" inside attribute value"); } isWhitespace = false; } else if (c == ']') { if ((position + 2 < limit || fillBuffer(3)) && buffer[position + 1] == ']' && buffer[position + 2] == '>') { checkRelaxed("Illegal: \"]]>\" outside CDATA section"); } isWhitespace = false; } else { // if (c == '%') throw new KriptonRuntimeException("This parser doesn't support parameter entities", this, null); } position++; result.append(c); start = position; } if (result == null) { return stringPool.get(buffer, start, position - start); } else { result.append(buffer, start, position - start); return result.toString(); } } private void read(char expected) throws IOException, KriptonRuntimeException { int c = peekCharacter(); if (c != expected) { checkRelaxed("expected: '" + expected + "' actual: '" + ((char) c) + "'"); } position++; } private void read(char[] chars) throws IOException, KriptonRuntimeException { if (position + chars.length >= limit && !fillBuffer(chars.length)) { checkRelaxed("expected: '" + new String(chars) + "' but was EOF"); return; } // TODO: replace with Arrays.equals(buffer, position, delimiter, 0, // delimiter.length) // when the VM has better method inlining for (int i = 0; i < chars.length; i++) { if (buffer[position + i] != chars[i]) { checkRelaxed("expected: \"" + new String(chars) + "\" but was \"" + new String(buffer, position, chars.length) + "...\""); } } position += chars.length; } private int peekCharacter() throws IOException, KriptonRuntimeException { if (position < limit || fillBuffer(1)) { return buffer[position]; } return -1; } /** * Returns true once {@code limit - position >= minimum}. If the data is * exhausted before that many characters are available, this returns false. 
*/ private boolean fillBuffer(int minimum) throws IOException, KriptonRuntimeException { // If we've exhausted the current content source, remove it while (nextContentSource != null) { if (position < limit) { throw new KriptonRuntimeException("Unbalanced entity!", this, null); } popContentSource(); if (limit - position >= minimum) { return true; } } // Before clobbering the old characters, update where buffer starts for (int i = 0; i < position; i++) { if (buffer[i] == '\n') { bufferStartLine++; bufferStartColumn = 0; } else { bufferStartColumn++; } } if (bufferCapture != null) { bufferCapture.append(buffer, 0, position); } if (limit != position) { limit -= position; System.arraycopy(buffer, position, buffer, 0, limit); } else { limit = 0; } position = 0; int total; while ((total = reader.read(buffer, limit, buffer.length - limit)) != -1) { limit += total; if (limit >= minimum) { return true; } } return false; } /** * Returns an element or attribute name. This is always non-empty for * non-relaxed parsers. */ private String readName() throws IOException, KriptonRuntimeException { if (position >= limit && !fillBuffer(1)) { checkRelaxed("name expected"); return ""; } int start = position; StringBuilder result = null; // read the first character char c = buffer[position]; if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':' || c >= '\u00c0' // TODO: // check // the // XML // spec || relaxed) { position++; } else { checkRelaxed("name expected"); return ""; } while (true) { /* * Make sure we have at least a single character to read from the * buffer. This mutates the buffer, so save the partial result to * the slow path string builder first. 
*/ if (position >= limit) { if (result == null) { result = new StringBuilder(); } result.append(buffer, start, position - start); if (!fillBuffer(1)) { return result.toString(); } start = position; } // read another character c = buffer[position]; if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c == ':' || c == '.' || c >= '\u00b7') { // TODO: // check // the // XML // spec position++; continue; } // we encountered a non-name character. done! if (result == null) { return stringPool.get(buffer, start, position - start); } else { result.append(buffer, start, position - start); return result.toString(); } } } private void skip() throws IOException, KriptonRuntimeException { while (position < limit || fillBuffer(1)) { int c = buffer[position]; if (c > ' ') { break; } position++; } } // public part starts here... @Override public void setInput(Reader reader) { this.reader = reader; type = START_DOCUMENT; name = null; namespace = null; degenerated = false; attributeCount = -1; encoding = null; version = null; standalone = null; if (reader == null) { return; } position = 0; limit = 0; bufferStartLine = 0; bufferStartColumn = 0; depth = 0; documentEntities = null; } @Override public void setInput(InputStream is, String charset) { position = 0; limit = 0; boolean detectCharset = (charset == null); if (is == null) { throw new IllegalArgumentException(); } try { if (detectCharset) { // read the four bytes looking for an indication of the encoding // in use int firstFourBytes = 0; while (limit < 4) { int i = is.read(); if (i == -1) { break; } firstFourBytes = (firstFourBytes << 8) | i; buffer[limit++] = (char) i; } if (limit == 4) { switch (firstFourBytes) { case 0x00000FEFF: // UTF-32BE BOM charset = "UTF-32BE"; limit = 0; break; case 0x0FFFE0000: // UTF-32LE BOM charset = "UTF-32LE"; limit = 0; break; case 0x0000003c: // '<' in UTF-32BE charset = "UTF-32BE"; buffer[0] = '<'; limit = 1; break; case 0x03c000000: // '<' 
in UTF-32LE charset = "UTF-32LE"; buffer[0] = '<'; limit = 1; break; case 0x0003c003f: // "') { String s = new String(buffer, 0, limit); int i0 = s.indexOf("encoding"); if (i0 != -1) { while (s.charAt(i0) != '"' && s.charAt(i0) != '\'') { i0++; } char deli = s.charAt(i0++); int i1 = s.indexOf(deli, i0); charset = s.substring(i0, i1); } break; } } break; default: // handle a byte order mark followed by something other // than * is still at character 0. */ if (!detectCharset && peekCharacter() == 0xfeff) { limit--; System.arraycopy(buffer, 1, buffer, 0, limit); } } catch (Exception e) { throw new KriptonRuntimeException("Invalid stream or encoding: " + e, this, e); } } @Override public void close() throws IOException { if (reader != null) { reader.close(); } } @Override public boolean getFeature(String feature) { if (XmlPullParser.FEATURE_PROCESS_NAMESPACES.equals(feature)) { return processNsp; } else if (FEATURE_RELAXED.equals(feature)) { return relaxed; } else if (FEATURE_PROCESS_DOCDECL.equals(feature)) { return processDocDecl; } else { return false; } } @Override public String getInputEncoding() { return encoding; } @Override public void defineEntityReplacementText(String entity, String value) { if (processDocDecl) { throw new IllegalStateException("Entity replacement text may not be defined with DOCTYPE processing enabled."); } if (reader == null) { throw new IllegalStateException("Entity replacement text must be defined after setInput()"); } if (documentEntities == null) { documentEntities = new HashMap(); } documentEntities.put(entity, value.toCharArray()); } @Override public Object getProperty(String property) { if (property.equals(PROPERTY_XMLDECL_VERSION)) { return version; } else if (property.equals(PROPERTY_XMLDECL_STANDALONE)) { return standalone; } else if (property.equals(PROPERTY_LOCATION)) { return location != null ? location : reader.toString(); } else { return null; } } /** * Returns the root element's name if it was declared in the DTD. 
This equals the first tag's name for valid documents.
	 */
	public String getRootElementName() {
		return rootElementName;
	}

	/**
	 * Returns the document's system ID if it was declared. This is typically a
	 * string like {@code http://www.w3.org/TR/html4/strict.dtd}.
	 */
	public String getSystemId() {
		return systemId;
	}

	/**
	 * Returns the document's public ID if it was declared. This is typically a
	 * string like {@code -//W3C//DTD HTML 4.01//EN}.
	 */
	public String getPublicId() {
		return publicId;
	}

	/**
	 * Returns the namespace count recorded for the given depth.
	 *
	 * @throws IndexOutOfBoundsException
	 *             if {@code depth} is greater than the current depth
	 */
	@Override
	public int getNamespaceCount(int depth) {
		if (depth <= this.depth) {
			return nspCounts[depth];
		}
		throw new IndexOutOfBoundsException();
	}

	/** Returns the prefix stored at the given position of the namespace stack. */
	@Override
	public String getNamespacePrefix(int pos) {
		return nspStack[pos * 2];
	}

	/** Returns the URI stored at the given position of the namespace stack. */
	@Override
	public String getNamespaceUri(int pos) {
		return nspStack[(pos * 2) + 1];
	}

	/**
	 * Resolves a prefix to its namespace URI. The {@code xml} and
	 * {@code xmlns} prefixes are answered with fixed URIs; any other prefix
	 * (including null) is searched in the namespace stack from the most recent
	 * entry backwards.
	 *
	 * @return the matching URI, or null when no entry matches
	 */
	@Override
	public String getNamespace(String prefix) {
		if ("xml".equals(prefix)) {
			return "http://www.w3.org/XML/1998/namespace";
		}
		if ("xmlns".equals(prefix)) {
			return "http://www.w3.org/2000/xmlns/";
		}

		int index = (getNamespaceCount(depth) * 2) - 2;
		while (index >= 0) {
			String declared = nspStack[index];
			boolean matches = (prefix == null) ? (declared == null) : prefix.equals(declared);
			if (matches) {
				return nspStack[index + 1];
			}
			index -= 2;
		}
		return null;
	}

	@Override
	public int getDepth() {
		return depth;
	}

	/**
	 * Builds a human readable description of the current event: the event
	 * type, the tag with its attributes or a shortened text excerpt, the
	 * line and column, and the location or reader when one is set.
	 */
	@Override
	public String getPositionDescription() {
		String typeLabel = (type < TYPES.length) ? TYPES[type] : "unknown";
		StringBuilder description = new StringBuilder(typeLabel);
		description.append(' ');

		if (type == START_TAG || type == END_TAG) {
			if (degenerated) {
				description.append("(empty) ");
			}
			description.append('<');
			if (type == END_TAG) {
				description.append('/');
			}
			if (prefix != null) {
				description.append('{').append(namespace).append('}').append(prefix).append(':');
			}
			description.append(name);

			// each attribute occupies four slots: namespace, prefix, name, value
			for (int index = 0; index < attributeCount; index++) {
				int base = index * 4;
				description.append(' ');
				if (attributes[base + 1] != null) {
					description.append('{').append(attributes[base]).append('}').append(attributes[base + 1]).append(':');
				}
				description.append(attributes[base + 2]).append("='").append(attributes[base + 3]).append('\'');
			}

			description.append('>');
		} else if (type == IGNORABLE_WHITESPACE) {
			// nothing to add for ignorable whitespace
		} else if (type != TEXT) {
			description.append(getText());
		} else if (isWhitespace) {
			description.append("(whitespace)");
		} else {
			String snippet = getText();
			if (snippet.length() > 16) {
				snippet = snippet.substring(0, 16) + "...";
			}
			description.append(snippet);
		}

		description.append('@').append(getLineNumber()).append(':').append(getColumnNumber());

		if (location != null) {
			description.append(" in ").append(location);
		} else if (reader != null) {
			description.append(" in ").append(reader.toString());
		}
		return description.toString();
	}

	/**
	 * Returns the 1-based line of the current position: the line at which the
	 * buffer starts plus the line feeds consumed from the buffer so far.
	 */
	@Override
	public int getLineNumber() {
		int line = bufferStartLine + 1; // the first line is '1'
		for (int i = 0; i < position; i++) {
			if (buffer[i] == '\n') {
				line++;
			}
		}
		return line;
	}

	/**
	 * Returns the 1-based column of the current position, restarting the
	 * count after every line feed consumed from the buffer.
	 */
	@Override
	public int getColumnNumber() {
		int column = bufferStartColumn;
		for (int i = 0; i < position; i++) {
			column = (buffer[i] == '\n') ? 0 : column + 1;
		}
		return column + 1; // the first column is '1'
	}

	/**
	 * Returns the whitespace flag of the current event.
	 *
	 * @throws KriptonRuntimeException
	 *             if the current event is not TEXT, IGNORABLE_WHITESPACE or
	 *             CDSECT
	 */
	@Override
	public boolean isWhitespace() {
		boolean textual = (type == TEXT || type == IGNORABLE_WHITESPACE || type == CDSECT);
		if (!textual) {
			throw new KriptonRuntimeException(ILLEGAL_TYPE, this, null);
		}
		return isWhitespace;
	}

	/**
	 * Returns the text of the current event: null for event types below TEXT
	 * and for unresolved entity references, otherwise the text or the empty
	 * string when none is set.
	 */
	@Override
	public String getText() {
		boolean unavailable = type < TEXT || (type == ENTITY_REF && unresolved);
		if (unavailable) {
			return null;
		}
		return (text == null) ? "" : text;
	}

	/**
	 * Returns a copy of the current text as characters and stores the start
	 * offset and length in {@code poslen}; both are set to -1, and null is
	 * returned, when there is no text.
	 */
	@Override
	public char[] getTextCharacters(int[] poslen) {
		String current = getText();
		if (current != null) {
			char[] chars = current.toCharArray();
			poslen[0] = 0;
			poslen[1] = chars.length;
			return chars;
		}
		poslen[0] = -1;
		poslen[1] = -1;
		return null;
	}

	@Override
	public String getNamespace() {
		return namespace;
	}

	@Override
	public String getName() {
		return name;
	}

	@Override
	public String getPrefix() {
		return prefix;
	}

	/**
	 * Returns whether the current start tag is an empty element tag.
	 *
	 * @throws KriptonRuntimeException
	 *             if the current event is not START_TAG
	 */
	@Override
	public boolean isEmptyElementTag() {
		if (type == START_TAG) {
			return degenerated;
		}
		throw new KriptonRuntimeException(ILLEGAL_TYPE, this, null);
	}

	@Override
	public int getAttributeCount() {
		return attributeCount;
	}

	/** Always reports {@code "CDATA"}. */
	@Override
	public String getAttributeType(int index) {
		return "CDATA";
	}

	/** Always reports false. */
	@Override
	public boolean isAttributeDefault(int index) {
		return false;
	}

	/** Returns the namespace slot of the attribute at {@code index}. */
	@Override
	public String getAttributeNamespace(int index) {
		if (index < attributeCount) {
			return attributes[index * 4];
		}
		throw new IndexOutOfBoundsException();
	}

	/** Returns the name slot of the attribute at {@code index}. */
	@Override
	public String getAttributeName(int index) {
		if (index < attributeCount) {
			return attributes[(index * 4) + 2];
		}
		throw new IndexOutOfBoundsException();
	}

	/** Returns the prefix slot of the attribute at {@code index}. */
	@Override
	public String getAttributePrefix(int index) {
		if (index < attributeCount) {
			return attributes[(index * 4) + 1];
		}
		throw new IndexOutOfBoundsException();
	}

	/** Returns the value slot of the attribute at {@code index}. */
	@Override
	public String getAttributeValue(int index) {
		if (index < attributeCount) {
			return attributes[(index * 4) + 3];
		}
		throw new IndexOutOfBoundsException();
	}

	/**
	 * Returns the value of the last attribute whose name equals {@code name}
	 * and, when {@code namespace} is not null, whose namespace equals it.
	 *
	 * @return the value, or null when no attribute matches
	 */
	@Override
	public String getAttributeValue(String namespace, String name) {
		for (int index = attributeCount - 1; index >= 0; index--) {
			int base = index * 4;
			boolean nameMatches = attributes[base + 2].equals(name);
			if (nameMatches && (namespace == null || attributes[base].equals(namespace))) {
				return attributes[base + 3];
			}
		}
		return null;
	}

	/**
	 * Returns the index of the last attribute whose name equals {@code name}
	 * and, when {@code namespace} is not null, whose namespace equals it.
	 *
	 * @return the attribute index, or -1 when no attribute matches
	 */
	@Override
	public int getAttributeIndex(String namespace, String name) {
		for (int index = attributeCount - 1; index >= 0; index--) {
			int base = index * 4;
			boolean nameMatches = attributes[base + 2].equals(name);
			if (nameMatches && (namespace == null || attributes[base].equals(namespace))) {
				return index;
			}
		}
		return -1;
	}

	@Override
	public int getEventType() {
		return type;
	}

	// utility methods to make XML parsing easier ...
@Override
	public int nextTag() throws KriptonRuntimeException, IOException {
		// Advance once, then step over a single whitespace-only TEXT event.
		next();
		if (type == TEXT && isWhitespace) {
			next();
		}

		boolean onTag = (type == START_TAG || type == END_TAG);
		if (!onTag) {
			throw new KriptonRuntimeException("unexpected type", this, null);
		}
		return type;
	}

	/**
	 * Verifies that the current event has the given type and, for each
	 * non-null argument, the given namespace and name.
	 *
	 * @throws KriptonRuntimeException
	 *             if any of the requested values does not match
	 */
	@Override
	public void require(int type, String namespace, String name) throws KriptonRuntimeException, IOException {
		boolean typeMatches = (type == this.type);
		boolean namespaceMatches = (namespace == null) || namespace.equals(getNamespace());
		boolean nameMatches = (name == null) || name.equals(getName());

		if (typeMatches && namespaceMatches && nameMatches) {
			return;
		}
		throw new KriptonRuntimeException("expected: " + TYPES[type] + " {" + namespace + "}" + name, this, null);
	}

	/**
	 * Reads the text content of the current element and moves to its end tag.
	 *
	 * @return the text, or the empty string when the start tag is not followed
	 *         by a TEXT event
	 * @throws KriptonRuntimeException
	 *             if the parser is not on a START_TAG, or if the text is not
	 *             followed by an END_TAG
	 */
	@Override
	public String nextText() throws IOException {
		if (type != START_TAG) {
			throw new KriptonRuntimeException("precondition: START_TAG", this, null);
		}

		next();

		String content = "";
		if (type == TEXT) {
			content = getText();
			next();
		}

		if (type != END_TAG) {
			throw new KriptonRuntimeException("END_TAG expected", this, null);
		}
		return content;
	}

	/**
	 * Switches one of the supported features on or off.
	 *
	 * @throws KriptonRuntimeException
	 *             if the feature is not one of the three handled here
	 */
	@Override
	public void setFeature(String feature, boolean value) {
		if (XmlPullParser.FEATURE_PROCESS_NAMESPACES.equals(feature)) {
			processNsp = value;
			return;
		}
		if (XmlPullParser.FEATURE_PROCESS_DOCDECL.equals(feature)) {
			processDocDecl = value;
			return;
		}
		if (FEATURE_RELAXED.equals(feature)) {
			relaxed = value;
			return;
		}
		throw new KriptonRuntimeException("unsupported feature: " + feature, this, null);
	}

	/**
	 * Sets the location property; every other property is rejected.
	 *
	 * @throws KriptonRuntimeException
	 *             if the property is not the location property
	 */
	@Override
	public void setProperty(String property, Object value) {
		if (!property.equals(PROPERTY_LOCATION)) {
			throw new KriptonRuntimeException("unsupported property: " + property);
		}
		location = String.valueOf(value);
	}

	/**
	 * A chain of buffers containing XML content. Each content source contains
	 * the parser's primary read buffer or the characters of entities actively
	 * being parsed.
	 *
	 * <p>
	 * For example, note the buffers needed to parse this document:
	 *
	 * <pre>
	 *    {@code
	 *   <!DOCTYPE foo [
	 *       <!ENTITY baz "ghi">
	 *       <!ENTITY bar "def &baz; jkl">
	 *   ]>
	 *   <foo>abc &bar; mno</foo>
	 * }
	 * </pre>
	 *
	 * <p>
	 * Things get interesting when the bar entity is encountered. At that point
	 * two buffers are active:
	 * <ol>
	 * <li>The value for the bar entity, containing {@code "def &baz; jkl"}
	 * <li>The parser's primary read buffer, containing {@code " mno"}
	 * </ol>
	 *
	 * <p>
	 * The parser will return the characters {@code "def "} from the bar
	 * entity's buffer, and then it will encounter the baz entity. To handle
	 * that, three buffers will be active:
	 * <ol>
	 * <li>The value for the baz entity, containing {@code "ghi"}
	 * <li>The remaining value for the bar entity, containing {@code " jkl"}
	 * <li>The parser's primary read buffer, containing {@code " mno"}
	 * </ol>
	 *
	 * <p>
	 * The parser will then return the characters {@code ghi jkl mno} in that
	 * sequence by reading each buffer in sequence.
	 */
	static class ContentSource {
		private final ContentSource next;
		private final char[] buffer;
		private final int position;
		private final int limit;

		ContentSource(ContentSource next, char[] buffer, int position, int limit) {
			this.next = next;
			this.buffer = buffer;
			this.position = position;
			this.limit = limit;
		}
	}

	/**
	 * Prepends the characters of {@code newBuffer} to be read before the
	 * current buffer.
	 */
	private void pushContentSource(char[] newBuffer) {
		// Remember the buffer being read so it can be resumed afterwards.
		ContentSource suspended = new ContentSource(nextContentSource, buffer, position, limit);
		nextContentSource = suspended;

		buffer = newBuffer;
		position = 0;
		limit = newBuffer.length;
	}

	/**
	 * Replaces the current exhausted buffer with the next buffer in the chain.
	 */
	private void popContentSource() {
		ContentSource resumed = nextContentSource;
		nextContentSource = resumed.next;

		buffer = resumed.buffer;
		position = resumed.position;
		limit = resumed.limit;
	}

	/** Reports whether {@code StringUtils.hasText} accepts the current text. */
	@Override
	public boolean hasText() {
		String current = getText();
		return StringUtils.hasText(current);
	}

	/** Same as {@link #nextText()}. */
	@Override
	public String getElementText() throws IOException {
		return nextText();
	}

	/** Reads the element text and parses it with {@link Boolean#parseBoolean(String)}. */
	@Override
	public boolean getElementAsBoolean() throws IOException {
		String raw = nextText();
		return Boolean.parseBoolean(raw);
	}

	/** Reads the element text and parses it with {@link Integer#parseInt(String)}. */
	@Override
	public int getElementAsInt() throws Exception {
		String raw = nextText();
		return Integer.parseInt(raw);
	}

	/** Reads the element text and parses it with {@link Long#parseLong(String)}. */
	@Override
	public long getElementAsLong() throws Exception {
		String raw = nextText();
		return Long.parseLong(raw);
	}

	/** Reads the element text and parses it with {@link Float#parseFloat(String)}. */
	@Override
	public float getElementAsFloat() throws Exception {
		String raw = nextText();
		return Float.parseFloat(raw);
	}

	/** Reads the element text and parses it with {@link Double#parseDouble(String)}. */
	@Override
	public double getElementAsDouble() throws Exception {
		String raw = nextText();
		return Double.parseDouble(raw);
	}

	/** Reads the element text and converts it to a {@link BigInteger}. */
	@Override
	public BigInteger getElementAsInteger() throws Exception {
		String raw = nextText();
		return new BigInteger(raw);
	}

	/** Reads the element text and converts it to a {@link BigDecimal}. */
	@Override
	public BigDecimal getElementAsDecimal() throws Exception {
		String raw = nextText();
		return new BigDecimal(raw);
	}

	/** Same as {@link #isEmptyElementTag()}. */
	@Override
	public boolean isEmptyElement() {
		return isEmptyElementTag();
	}

	/** Reads the element text and decodes it with {@code Base64Utils.decode}. */
	@Override
	public byte[] getElementAsBinary() throws IOException {
		String encoded = nextText();
		return Base64Utils.decode(encoded);
	}

	/** Converts the trimmed value of the attribute at {@code index} to a {@link BigDecimal}. */
	@Override
	public BigDecimal getAttributeAsDecimal(int index) {
		String trimmed = getAttributeValue(index).trim();
		return new BigDecimal(trimmed);
	}

	/** Converts the trimmed value of the attribute at {@code index} to a {@link BigInteger}. */
	@Override
	public BigInteger getAttributeAsInteger(int index) {
		String trimmed = getAttributeValue(index).trim();
		return new BigInteger(trimmed);
	}

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy