All Downloads are FREE. Search and download functionalities are using the official Maven repository.

to.etc.syntaxer.XmlParser Maven / Gradle / Ivy

The newest version!
package to.etc.syntaxer;


import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Stack;


/**
 * Parse XML documents and return parse events through call-backs.
 * 

 * You need to define a class implementing the XmlHandler
 * interface: an object belonging to this class will receive the
 * callbacks for the events.  (As an alternative to implementing
 * the full XmlHandler interface, you can simply extend the
 * HandlerBase convenience class.)
 *

 * Usage (assuming that MyHandler is your implementation
 * of the XmlHandler interface):
 *

 * XmlHandler handler = new MyHandler();
 * XmlParser parser = new XmlParser();
 * parser.setHandler(handler);
 * try {
 *   parser.parse("http://www.host.com/doc.xml", null, (String) null);
 * } catch (Exception e) {
 *   [do something interesting]
 * }
 * 
*

Alternatively, you can use the standard SAX interfaces * with the SAXDriver class as your entry point. * @author Copyright (c) 1997, 1998 by Microstar Software Ltd. * @author Written by David Megginson <[email protected]> * @version 1.1 * @see XmlHandler * @see HandlerBase * @see SAXDriver */ public class XmlParser { // // Use special cheats that speed up the code (currently about 50%), // but may cause problems with future maintenance and add to the // class file size (about 500 bytes). // private final static boolean USE_CHEATS = true; ////////////////////////////////////////////////////////////////////// // Constructors. //////////////////////////////////////////////////////////////////////// /** * Construct a new parser with no associated handler. * @see #setHandler * @see #parse */ public XmlParser() {} /** * Set the handler that will receive parsing events. * @param handler The handler to receive callback events. * @see #parse * @see XmlHandler */ public void setHandler(XmlHandler handler) { this.m_handler = handler; } /** * Parse an XML document from a URI. *

You may parse a document more than once, but only one thread
	 * may call this method for an object at one time.
	 * @param systemId The URI of the document.
	 * @param publicId The public identifier of the document, or null.
	 * @param encoding The suggested encoding, or null if unknown.
	 * @exception java.lang.Exception Any exception thrown by your
	 *            own handlers, or any derivation of java.io.IOException
	 *            thrown by the parser itself.
	 */
	public void parse(String systemId, String publicId, String encoding) throws java.lang.Exception {
		// Neither a character stream nor a byte stream is supplied by this overload.
		Reader noReader = null;
		InputStream noStream = null;
		doParse(systemId, publicId, noReader, noStream, encoding);
	}

	/**
	 * Parse an XML document from a byte stream.
	 *

	 * The URI that you supply will become the base URI for
	 * resolving relative links, but Ælfred will actually read
	 * the document from the supplied input stream.
	 *

You may parse a document more than once, but only one thread * may call this method for an object at one time. * @param systemId The base URI of the document, or null if not * known. * @param publicId The public identifier of the document, or null * if not known. * @param stream A byte input stream. * @param encoding The suggested encoding, or null if unknown. * @exception java.lang.Exception Any exception thrown by your * own handlers, or any derivation of java.io.IOException * thrown by the parser itself. */ public void parse(String systemId, String publicId, InputStream stream, String encoding) throws java.lang.Exception { doParse(systemId, publicId, null, stream, encoding); } /** * Parse an XML document from a character stream. *

	 * The URI that you supply will become the base URI for
	 * resolving relative links, but Ælfred will actually read
	 * the document from the supplied input stream.
	 *

You may parse a document more than once, but only one thread * may call this method for an object at one time. * @param systemId The base URI of the document, or null if not * known. * @param publicId The public identifier of the document, or null * if not known. * @param reader A character stream. * @exception java.lang.Exception Any exception thrown by your * own handlers, or any derivation of java.io.IOException * thrown by the parser itself. */ public void parse(String systemId, String publicId, Reader reader) throws java.lang.Exception { doParse(systemId, publicId, reader, null, null); } private synchronized void doParse(String systemId, String publicId, Reader reader, InputStream stream, String encoding) throws java.lang.Exception { m_basePublicId = publicId; m_baseURI = systemId; m_baseReader = reader; m_baseInputStream = stream; initializeVariables(); // Set the default entities here. setInternalEntity(intern("amp"), "&"); setInternalEntity(intern("lt"), "<"); setInternalEntity(intern("gt"), ">"); setInternalEntity(intern("apos"), "'"); setInternalEntity(intern("quot"), """); if(m_handler != null) { m_handler.startDocument(); } pushURL("[document]", m_basePublicId, m_baseURI, m_baseReader, m_baseInputStream, encoding); parseDocument(); if(m_handler != null) { m_handler.endDocument(); } cleanupVariables(); } //////////////////////////////////////////////////////////////////////// // Constants. //////////////////////////////////////////////////////////////////////// // // Constants for element content type. // /** * Constant: an element has not been declared. * @see #getElementContentType */ public final static int CONTENT_UNDECLARED = 0; /** * Constant: the element has a content model of ANY. * @see #getElementContentType */ public final static int CONTENT_ANY = 1; /** * Constant: the element has declared content of EMPTY. * @see #getElementContentType */ public final static int CONTENT_EMPTY = 2; /** * Constant: the element has mixed content. 
* @see #getElementContentType */ public final static int CONTENT_MIXED = 3; /** * Constant: the element has element content. * @see #getElementContentType */ public final static int CONTENT_ELEMENTS = 4; // // Constants for the entity type. // /** * Constant: the entity has not been declared. * @see #getEntityType */ public final static int ENTITY_UNDECLARED = 0; /** * Constant: the entity is internal. * @see #getEntityType */ public final static int ENTITY_INTERNAL = 1; /** * Constant: the entity is external, non-XML data. * @see #getEntityType */ public final static int ENTITY_NDATA = 2; /** * Constant: the entity is external XML data. * @see #getEntityType */ public final static int ENTITY_TEXT = 3; // // Constants for attribute type. // /** * Constant: the attribute has not been declared for this element type. * @see #getAttributeType */ public final static int ATTRIBUTE_UNDECLARED = 0; /** * Constant: the attribute value is a string value. * @see #getAttributeType */ public final static int ATTRIBUTE_CDATA = 1; /** * Constant: the attribute value is a unique identifier. * @see #getAttributeType */ public final static int ATTRIBUTE_ID = 2; /** * Constant: the attribute value is a reference to a unique identifier. * @see #getAttributeType */ public final static int ATTRIBUTE_IDREF = 3; /** * Constant: the attribute value is a list of ID references. * @see #getAttributeType */ public final static int ATTRIBUTE_IDREFS = 4; /** * Constant: the attribute value is the name of an entity. * @see #getAttributeType */ public final static int ATTRIBUTE_ENTITY = 5; /** * Constant: the attribute value is a list of entity names. * @see #getAttributeType */ public final static int ATTRIBUTE_ENTITIES = 6; /** * Constant: the attribute value is a name token. * @see #getAttributeType */ public final static int ATTRIBUTE_NMTOKEN = 7; /** * Constant: the attribute value is a list of name tokens. 
* @see #getAttributeType */ public final static int ATTRIBUTE_NMTOKENS = 8; /** * Constant: the attribute value is a token from an enumeration. * @see #getAttributeType */ public final static int ATTRIBUTE_ENUMERATED = 9; /** * Constant: the attribute is the name of a notation. * @see #getAttributeType */ public final static int ATTRIBUTE_NOTATION = 10; // // When the class is loaded, populate the hash table of // attribute types. // /** * Hash table of attribute types. */ private static Hashtable attributeTypeHash; static { attributeTypeHash = new Hashtable(); attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA)); attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID)); attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF)); attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS)); attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY)); attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES)); attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN)); attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS)); attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION)); } // // Constants for supported encodings. // private final static int ENCODING_UTF_8 = 1; private final static int ENCODING_ISO_8859_1 = 2; private final static int ENCODING_UCS_2_12 = 3; private final static int ENCODING_UCS_2_21 = 4; private final static int ENCODING_UCS_4_1234 = 5; private final static int ENCODING_UCS_4_4321 = 6; private final static int ENCODING_UCS_4_2143 = 7; private final static int ENCODING_UCS_4_3412 = 8; // // Constants for attribute default value. // /** * Constant: the attribute is not declared. * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0; /** * Constant: the attribute has a literal default value specified. 
* @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue */ public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1; /** * Constant: the attribute was declared #IMPLIED. * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2; /** * Constant: the attribute was declared #REQUIRED. * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3; /** * Constant: the attribute was declared #FIXED. * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue */ public final static int ATTRIBUTE_DEFAULT_FIXED = 4; // // Constants for input. // private final static int INPUT_NONE = 0; private final static int INPUT_INTERNAL = 1; private final static int INPUT_EXTERNAL = 2; private final static int INPUT_STREAM = 3; private final static int INPUT_BUFFER = 4; private final static int INPUT_READER = 5; // // Flags for reading literals. // private final static int LIT_CHAR_REF = 1; private final static int LIT_ENTITY_REF = 2; private final static int LIT_PE_REF = 4; private final static int LIT_NORMALIZE = 8; // // Flags for parsing context. // private final static int CONTEXT_NONE = 0; private final static int CONTEXT_DTD = 1; private final static int CONTEXT_ENTITYVALUE = 2; private final static int CONTEXT_ATTRIBUTEVALUE = 3; ////////////////////////////////////////////////////////////////////// // Error reporting. ////////////////////////////////////////////////////////////////////// /** * Report an error. * @param message The error message. * @param textFound The text that caused the error (or null). 
* @see XmlHandler#error * @see #m_line */ void error(String message, String textFound, String textExpected) throws java.lang.Exception { errorCount++; if(textFound != null) { message = message + " (found \"" + textFound + "\")"; } if(textExpected != null) { message = message + " (expected \"" + textExpected + "\")"; } if(m_handler != null) { String uri = null; if(m_externalEntity != null) { uri = m_externalEntity.getURL().toString(); } m_handler.error(message, uri, m_line, m_column); } } /** * Report a serious error. * @param message The error message. * @param textFound The text that caused the error (or null). */ void error(String message, char textFound, String textExpected) throws java.lang.Exception { error(message, new Character(textFound).toString(), textExpected); } ////////////////////////////////////////////////////////////////////// // Major syntactic productions. ////////////////////////////////////////////////////////////////////// /** * Parse an XML document. *

	 * [1] document ::= prolog element Misc*
	 * 
*

This is the top-level parsing function for a single XML * document. As a minimum, a well-formed document must have * a document element, and a valid document must have a prolog * as well. */ void parseDocument() throws java.lang.Exception { char c; parseProlog(); require('<'); parseElement(); try { parseMisc(); //skip all white, PIs, and comments c = readCh(); //if this doesn't throw an exception... error("unexpected characters after document end", c, null); } catch(EOFException e) { return; } } /** * Skip a comment. *

	 * [18] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
	 * 
*

(The <!-- has already been read.) */ void parseComment() throws java.lang.Exception { skipUntil("-->"); } /** * Parse a processing instruction and do a call-back. *

	 * [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? '?>'
	 * 
*

(The <? has already been read.) *

An XML processing instruction must begin with * a Name, which is the instruction's target. */ void parsePI() throws java.lang.Exception { String name; name = readNmtoken(true); if(!tryRead("?>")) { requireWhitespace(); parseUntil("?>"); } if(m_handler != null) { m_handler.processingInstruction(name, dataBufferToString()); } } /** * Parse a CDATA marked section. *

	 * [20] CDSect ::= CDStart CData CDEnd
	 * [21] CDStart ::= '<![CDATA['
	 * [22] CData ::= (Char* - (Char* ']]>' Char*))
	 * [23] CDEnd ::= ']]>'
	 * 
*

(The '<![CDATA[' has already been read.) *

Note that this just appends characters to the dataBuffer, * without actually generating an event. */ void parseCDSect() throws java.lang.Exception { parseUntil("]]>"); } /** * Parse the prolog of an XML document. *

	 * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
	 * 
*

There are a couple of tricks here. First, it is necessary to * declare the XML default attributes after the DTD (if present) * has been read. Second, it is not possible to expand general * references in attribute value literals until after the entire * DTD (if present) has been parsed. *

We do not look for the XML declaration here, because it is * handled by pushURL(). * @see pushURL */ void parseProlog() throws java.lang.Exception { parseMisc(); if(tryRead(" * [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'") * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'" * | S 'standalone' Eq '"' ("yes" | "no") '"' * [78] EncodingDecl ::= S 'encoding' Eq QEncoding *

*

([80] to [82] are also significant.) *

(The <?xml and whitespace have already been read.) *

TODO: validate value of standalone. * @see #parseTextDecl * @see #checkEncoding */ void parseXMLDecl(boolean ignoreEncoding) throws java.lang.Exception { String version; String encodingName = null; // Read the version. require("version"); parseEq(); version = readLiteral(0); if(!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } // Try reading an encoding declaration. skipWhitespace(); if(tryRead("encoding")) { parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); } // Try reading a standalone declaration skipWhitespace(); if(tryRead("standalone")) { parseEq(); readLiteral(0); } skipWhitespace(); require("?>"); } /** * Parse the Encoding PI. *

	 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
	 * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>'
	 * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
	 * [81] Encoding ::= LatinName
	 * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
	 * 
*

(The <?xml' and whitespace have already been read.) * @see #parseXMLDecl * @see #checkEncoding */ void parseTextDecl(boolean ignoreEncoding) throws java.lang.Exception { String encodingName = null; // Read an optional version. if(tryRead("version")) { String version; parseEq(); version = readLiteral(0); if(!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } requireWhitespace(); } // Read the encoding. require("encoding"); parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); skipWhitespace(); require("?>"); } /** * Check that the encoding specified makes sense. *

Compare what the author has specified in the XML declaration * or encoding PI with what we have detected. *

This is also important for distinguishing among the various * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect * those). * @param encodingName The name of the encoding specified by the user. * @see #parseXMLDecl * @see #parseTextDecl */ void checkEncoding(String encodingName, boolean ignoreEncoding) throws java.lang.Exception { encodingName = encodingName.toUpperCase(); if(ignoreEncoding) { return; } switch(m_encoding) { // 8-bit encodings case ENCODING_UTF_8: if(encodingName.equals("ISO-8859-1")) { m_encoding = ENCODING_ISO_8859_1; } else if(!encodingName.equals("UTF-8")) { error("unsupported 8-bit encoding", encodingName, "UTF-8 or ISO-8859-1"); } break; // 16-bit encodings case ENCODING_UCS_2_12: case ENCODING_UCS_2_21: if(!encodingName.equals("ISO-10646-UCS-2") && !encodingName.equals("UTF-16")) { error("unsupported 16-bit encoding", encodingName, "ISO-10646-UCS-2"); } break; // 32-bit encodings case ENCODING_UCS_4_1234: case ENCODING_UCS_4_4321: case ENCODING_UCS_4_2143: case ENCODING_UCS_4_3412: if(!encodingName.equals("ISO-10646-UCS-4")) { error("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); } } } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. *

	 * [27] Misc ::= Comment | PI | S
	 * 
*/ void parseMisc() throws java.lang.Exception { while(true) { skipWhitespace(); if(tryRead(" * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? * ('[' %markupdecl* ']' S?)? '>' * *

(The <!DOCTYPE has already been read.) */ void parseDoctypedecl() throws java.lang.Exception { String doctypeName, ids[]; // Read the document type name. requireWhitespace(); doctypeName = readNmtoken(true); // Read the ExternalIDs. skipWhitespace(); ids = readExternalIds(false); // Look for a declaration subset. skipWhitespace(); if(tryRead('[')) { // loop until the subset ends while(true) { m_context = CONTEXT_DTD; skipWhitespace(); m_context = CONTEXT_NONE; if(tryRead(']')) { break; // end of subset } else { m_context = CONTEXT_DTD; parseMarkupdecl(); m_context = CONTEXT_NONE; } } } // Read the external subset, if any if(ids[1] != null) { pushURL("[external subset]", ids[0], ids[1], null, null, null); // Loop until we end up back at '>' while(true) { m_context = CONTEXT_DTD; skipWhitespace(); m_context = CONTEXT_NONE; if(tryRead('>')) { break; } else { m_context = CONTEXT_DTD; parseMarkupdecl(); m_context = CONTEXT_NONE; } } } else { // No external subset. skipWhitespace(); require('>'); } if(m_handler != null) { m_handler.doctypeDecl(doctypeName, ids[0], ids[1]); } // Expand general entities in // default values of attributes. // (Do this after the doctypeDecl // event!). // expandAttributeDefaultValues(); } /** * Parse a markup declaration in the internal or external DTD subset. *

	 * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
	 *                       %NotationDecl | %PI | %S | %Comment |
	 *                       InternalPERef )
	 * [30] InternalPERef ::= PEReference
	 * [31] extSubset ::= (%markupdecl | %conditionalSect)*
	 * 
*/ void parseMarkupdecl() throws java.lang.Exception { if(tryRead(" * [33] STag ::= '<' Name (S Attribute)* S? '>' [WFC: unique Att spec] * [38] element ::= EmptyElement | STag content ETag * [39] EmptyElement ::= '<' Name (S Attribute)* S? '/>' * [WFC: unique Att spec] * *

(The '<' has already been read.) *

NOTE: this method actually chains onto parseContent(), if necessary, * and parseContent() will take care of calling parseETag(). */ void parseElement() throws java.lang.Exception { String gi; char c; int oldElementContent = m_currentElementContent; String oldElement = m_currentElement; // This is the (global) counter for the // array of specified attributes. m_tagAttributePos = 0; // Read the element type name. gi = readNmtoken(true); // Determine the current content type. m_currentElement = gi; m_currentElementContent = getElementContentType(gi); if(m_currentElementContent == CONTENT_UNDECLARED) { m_currentElementContent = CONTENT_ANY; } // Read the attributes, if any. // After this loop, we should be just // in front of the closing delimiter. skipWhitespace(); c = readCh(); while(c != '/' && c != '>') { unread(c); parseAttribute(gi); skipWhitespace(); c = readCh(); } unread(c); // Supply any defaulted attributes. Enumeration atts = declaredAttributes(gi); if(atts != null) { String aname; loop : while(atts.hasMoreElements()) { aname = (String) atts.nextElement(); // See if it was specified. for(int i = 0; i < m_tagAttributePos; i++) { if(m_tagAttributes[i] == aname) { continue loop; } } // I guess not... if(m_handler != null) { m_handler.attribute(aname, getAttributeExpandedValue(gi, aname), false); } } } // Figure out if this is a start tag // or an empty element, and dispatch an // event accordingly. c = readCh(); switch(c) { case '>': if(m_handler != null) { m_handler.startElement(gi); } parseContent(); break; case '/': require('>'); if(m_handler != null) { m_handler.startElement(gi); m_handler.endElement(gi); } break; } // Restore the previous state. m_currentElement = oldElement; m_currentElementContent = oldElementContent; } /** * Parse an attribute assignment. *

	 * [34] Attribute ::= Name Eq AttValue
	 * 
* @param name The name of the attribute's element. * @see XmlHandler#attribute */ void parseAttribute(String name) throws java.lang.Exception { String aname; int type; String value; // Read the attribute name. aname = readNmtoken(true).intern(); type = getAttributeDefaultValueType(name, aname); // Parse '=' parseEq(); // Read the value, normalizing whitespace // if it is not CDATA. if(type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF); } else { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE); } // Inform the handler about the // attribute. if(m_handler != null) { m_handler.attribute(aname, value, true); } m_dataBufferPos = 0; // Note that the attribute has been // specified. if(m_tagAttributePos == m_tagAttributes.length) { String newAttrib[] = new String[m_tagAttributes.length * 2]; System.arraycopy(m_tagAttributes, 0, newAttrib, 0, m_tagAttributePos); m_tagAttributes = newAttrib; } m_tagAttributes[m_tagAttributePos++] = aname; } /** * Parse an equals sign surrounded by optional whitespace. * [35] Eq ::= S? '=' S? */ void parseEq() throws java.lang.Exception { skipWhitespace(); require('='); skipWhitespace(); } /** * Parse an end tag. * [36] ETag ::= '' * *NOTE: parseContent() chains to here. */ void parseETag() throws java.lang.Exception { String name; name = readNmtoken(true); if(name != m_currentElement) { error("mismatched end tag", name, m_currentElement); } skipWhitespace(); require('>'); if(m_handler != null) { m_handler.endElement(name); } } /** * Parse the content of an element. 
* [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)* * [68] Reference ::= EntityRef | CharRef */ void parseContent() throws java.lang.Exception { char c; while(true) { switch(m_currentElementContent) { case CONTENT_ANY: case CONTENT_MIXED: parsePCData(); break; case CONTENT_ELEMENTS: parseWhitespace(); break; } // Handle delimiters c = readCh(); switch(c) { case '&': // Found "&" c = readCh(); if(c == '#') { parseCharRef(); } else { unread(c); parseEntityRef(true); } break; case '<': // Found "<" c = readCh(); switch(c) { case '!': // Found "' * [VC: Unique Element Declaration] * *NOTE: the ''); } /** * Content specification. * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements */ void parseContentspec(String name) throws java.lang.Exception { if(tryRead("EMPTY")) { setElement(name, CONTENT_EMPTY, null, null); return; } else if(tryRead("ANY")) { setElement(name, CONTENT_ANY, null, null); return; } else { require('('); dataBufferAppend('('); skipWhitespace(); if(tryRead("#PCDATA")) { dataBufferAppend("#PCDATA"); parseMixed(); setElement(name, CONTENT_MIXED, dataBufferToString(), null); } else { parseElements(); setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null); } } } /** * Parse an element-content model. * [42] elements ::= (choice | seq) ('?' | '*' | '+')? * [44] cps ::= S? %cp S? * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')' * [46] ctokplus ::= cps ('|' cps)+ * [47] ctoks ::= cps ('|' cps)* * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')' * [49] stoks ::= cps (',' cps)* * *NOTE: the opening '(' and S have already been read. * *TODO: go over parameter entity boundaries more carefully. */ void parseElements() throws java.lang.Exception { char c; char sep; // Parse the first content particle skipWhitespace(); parseCp(); // Check for end or for a separator. 
skipWhitespace(); c = readCh(); switch(c) { case ')': dataBufferAppend(')'); c = readCh(); switch(c) { case '*': case '+': case '?': dataBufferAppend(c); break; default: unread(c); } return; case ',': // Register the separator. case '|': sep = c; dataBufferAppend(c); break; default: error("bad separator in content model", c, null); return; } // Parse the rest of the content model. while(true) { skipWhitespace(); parseCp(); skipWhitespace(); c = readCh(); if(c == ')') { dataBufferAppend(')'); break; } else if(c != sep) { error("bad separator in content model", c, null); return; } else { dataBufferAppend(c); } } // Check for the occurrence indicator. c = readCh(); switch(c) { case '?': case '*': case '+': dataBufferAppend(c); return; default: unread(c); return; } } /** * Parse a content particle. * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+') * *NOTE: I actually use a slightly different production here: * cp ::= (elements | (Name ('?' | '*' | '+')?)) */ void parseCp() throws java.lang.Exception { char c; if(tryRead('(')) { dataBufferAppend('('); parseElements(); } else { dataBufferAppend(readNmtoken(true)); c = readCh(); switch(c) { case '?': case '*': case '+': dataBufferAppend(c); break; default: unread(c); break; } } } /** * Parse mixed content. * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*' * | '(' S? %('#PCDATA') S? ')' * [51] Mtoks ::= %Name (S? '|' S? %Name)* * *NOTE: the S and '#PCDATA' have already been read. */ void parseMixed() throws java.lang.Exception { // Check for PCDATA alone. skipWhitespace(); if(tryRead(')')) { dataBufferAppend(")*"); tryRead('*'); return; } // Parse mixed content. skipWhitespace(); while(!tryRead(")*")) { require('|'); dataBufferAppend('|'); skipWhitespace(); dataBufferAppend(readNmtoken(true)); skipWhitespace(); } dataBufferAppend(")*"); } /** * Parse an attribute list declaration. 
* [52] AttlistDecl ::= '' * *NOTE: the '')) { parseAttDef(elementName); skipWhitespace(); } } /** * Parse a single attribute definition. * [53] AttDef ::= S %Name S %AttType S %Default */ void parseAttDef(String elementName) throws java.lang.Exception { String name; int type; String enumeration = null; // Read the attribute name. name = readNmtoken(true); // Read the attribute type. requireWhitespace(); type = readAttType(); // Get the string of enumerated values // if necessary. if(type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) { enumeration = dataBufferToString(); } // Read the default value. requireWhitespace(); parseDefault(elementName, name, type, enumeration); } /** * Parse the attribute type. * [54] AttType ::= StringType | TokenizedType | EnumeratedType * [55] StringType ::= 'CDATA' * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | * 'NMTOKEN' | 'NMTOKENS' * [57] EnumeratedType ::= NotationType | Enumeration * *TODO: validate the type!! */ int readAttType() throws java.lang.Exception { String typeString; Integer type; if(tryRead('(')) { parseEnumeration(); return ATTRIBUTE_ENUMERATED; } else { typeString = readNmtoken(true); if(typeString.equals("NOTATION")) { parseNotationType(); } type = (Integer) attributeTypeHash.get(typeString); if(type == null) { error("illegal attribute type", typeString, null); return ATTRIBUTE_UNDECLARED; } else { return type.intValue(); } } } /** * Parse an enumeration. * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')' * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)* * *NOTE: the '(' has already been read. */ void parseEnumeration() throws java.lang.Exception { dataBufferAppend('('); // Read the first token. skipWhitespace(); dataBufferAppend(readNmtoken(true)); // Read the remaining tokens. 
skipWhitespace(); while(!tryRead(')')) { require('|'); dataBufferAppend('|'); skipWhitespace(); dataBufferAppend(readNmtoken(true)); skipWhitespace(); } dataBufferAppend(')'); } /** * Parse a notation type for an attribute. * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)* * S? ')' * [59] Ntoks ::= %Name (S? '|' S? %Name) * *NOTE: the 'NOTATION' has already been read */ void parseNotationType() throws java.lang.Exception { requireWhitespace(); require('('); parseEnumeration(); } /** * Parse the default value for an attribute. * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue */ void parseDefault(String elementName, String name, int type, String enumeration) throws java.lang.Exception { int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; String value = null; if(tryRead('#')) { if(tryRead("FIXED")) { valueType = ATTRIBUTE_DEFAULT_FIXED; requireWhitespace(); m_context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); m_context = CONTEXT_DTD; } else if(tryRead("REQUIRED")) { valueType = ATTRIBUTE_DEFAULT_REQUIRED; } else if(tryRead("IMPLIED")) { valueType = ATTRIBUTE_DEFAULT_IMPLIED; } else { error("illegal keyword for attribute default value", null, null); } } else { m_context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); m_context = CONTEXT_DTD; } setAttribute(elementName, name, type, enumeration, value, valueType); } /** * Parse a conditional section. 
* [63] conditionalSect ::= includeSect || ignoreSect * [64] includeSect ::= '' * [65] ignoreSect ::= '' * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>')) * | ('') * | (Char - (']' | [<'"])) * | ('")) { parseMarkupdecl(); skipWhitespace(); } } else if(tryRead("IGNORE")) { skipWhitespace(); require('['); char c; for(int nest = 1; nest > 0;) { c = readCh(); switch(c) { case '<': if(tryRead("![")) { nest++; } case ']': if(tryRead("]>")) { nest--; } } } } else { error("conditional section must begin with INCLUDE or IGNORE", null, null); } } /** * Read a character reference. * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' * *NOTE: the '&#' has already been read. */ void parseCharRef() throws java.lang.Exception { int value = 0; char c; if(tryRead('x')) { loop1 : while(true) { c = readCh(); switch(c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'a': case 'A': case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'e': case 'E': case 'f': case 'F': value *= 16; value += Integer.parseInt(new Character(c).toString(), 16); break; case ';': break loop1; default: error("illegal character in character reference", c, null); break loop1; } } } else { loop2 : while(true) { c = readCh(); switch(c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': value *= 10; value += Integer.parseInt(new Character(c).toString(), 10); break; case ';': break loop2; default: error("illegal character in character reference", c, null); break loop2; } } } // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: if(value <= 0x0000ffff) { // no surrogates needed dataBufferAppend((char) value); } else if(value <= 0x000fffff) { // > 16 bits, surrogate needed dataBufferAppend((char) (0xd8 | ((value & 0x000ffc00) >> 10))); dataBufferAppend((char) (0xdc | (value & 0x0003ff))); } else { // too big for 
surrogate error("character reference " + value + " is too large for UTF-16", new Integer(value).toString(), null); } } /** * Parse a reference. * [69] EntityRef ::= '&' Name ';' * *NOTE: the '&' has already been read. * @param externalAllowed External entities are allowed here. */ void parseEntityRef(boolean externalAllowed) throws java.lang.Exception { String name; name = readNmtoken(true); require(';'); switch(getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared entity", name, null); break; case ENTITY_INTERNAL: pushString(name, getEntityValue(name)); break; case ENTITY_TEXT: if(externalAllowed) { pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); } else { error("reference to external entity in attribute value.", name, null); } break; case ENTITY_NDATA: if(externalAllowed) { error("data entity reference in content", name, null); } else { error("reference to external entity in attribute value.", name, null); } break; } } /** * Parse a parameter entity reference. * [70] PEReference ::= '%' Name ';' * *NOTE: the '%' has already been read. */ void parsePEReference(boolean isEntityValue) throws java.lang.Exception { String name; name = "%" + readNmtoken(true); require(';'); switch(getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared parameter entity", name, null); break; case ENTITY_INTERNAL: if(isEntityValue) { pushString(name, getEntityValue(name)); } else { pushString(name, " " + getEntityValue(name) + ' '); } break; case ENTITY_TEXT: if(isEntityValue) { pushString(null, " "); } pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); if(isEntityValue) { pushString(null, " "); } break; } } /** * Parse an entity declaration. * [71] EntityDecl ::= '' * | '' * [72] EntityDef ::= EntityValue | ExternalDef * [73] ExternalDef ::= ExternalID %NDataDecl? 
* [74] ExternalID ::= 'SYSTEM' S SystemLiteral * | 'PUBLIC' S PubidLiteral S SystemLiteral * [75] NDataDecl ::= S %'NDATA' S %Name * *NOTE: the ''); } /** * Parse a notation declaration. * [81] NotationDecl ::= '' * *NOTE: the ''); } /** * Parse PCDATA. *
	 * [16] PCData ::= [^<&]*
	 * 
*

The trick here is that the data stays in the dataBuffer without
	 * necessarily being converted to a string right away.
	 */
	void parsePCData() throws java.lang.Exception {
		// Fast path: usually the whole run of character data is already
		// sitting in the read buffer, so scan it in place and copy it in
		// one go.  If no terminator is found, fall back to the slow path.
		if(USE_CHEATS) {
			int newlines = 0;
			int columns = 0;
			for(int pos = m_rbpos; pos < m_rblen; pos++) {
				char ch = m_readBuffer[pos];
				if(ch == '\n') {
					newlines++;
					columns = 0;
				} else if(ch == '&' || ch == '<') {
					int first = m_rbpos;
					columns++;
					m_rbpos = pos;
					if(newlines > 0) {
						m_line += newlines;
						m_column = columns;
					} else {
						m_column += columns;
					}
					dataBufferAppend(m_readBuffer, first, pos - first);
					return;
				} else {
					columns++;
				}
			}
		}

		// Slow path: go character by character until markup starts.
		while(true) {
			char c = readCh();
			if(c == '<' || c == '&') {
				unread(c);
				return;
			}
			dataBufferAppend(c);
		}
	}

	//////////////////////////////////////////////////////////////////////
	// High-level reading and scanning methods.
	//////////////////////////////////////////////////////////////////////

	/**
	 * Require at least one whitespace character, then skip the rest.
	 * [1] S ::= (#x20 | #x9 | #xd | #xa)+
	 */
	void requireWhitespace() throws java.lang.Exception {
		char first = readCh();
		if(!isWhitespace(first)) {
			error("whitespace expected", first, null);
			return;
		}
		skipWhitespace();
	}

	/**
	 * Read whitespace characters and keep them in the data buffer.
	 */
	void parseWhitespace() throws java.lang.Exception {
		char c;
		for(c = readCh(); isWhitespace(c); c = readCh()) {
			dataBufferAppend(c);
		}
		// The first non-whitespace character belongs to the caller.
		unread(c);
	}

	/**
	 * Skip whitespace characters.
	 * [1] S ::= (#x20 | #x9 | #xd | #xa)+
	 */
	void skipWhitespace() throws java.lang.Exception {
		// Fast path: most whitespace runs end inside the current read
		// buffer.  A '%' in DTD or entity-value context may start a
		// parameter entity reference, so it is left to the slow path.
		if(USE_CHEATS) {
			final boolean peContext = m_context == CONTEXT_DTD || m_context == CONTEXT_ENTITYVALUE;
			int newlines = 0;
			int columns = 0;
			for(int pos = m_rbpos; pos < m_rblen; pos++) {
				char ch = m_readBuffer[pos];
				if(ch == ' ' || ch == '\t' || ch == '\r') {
					columns++;
					continue;
				}
				if(ch == '\n') {
					newlines++;
					columns = 0;
					continue;
				}
				if(ch == '%' && peContext) {
					break;
				}
				// First non-whitespace character: commit the position.
				m_rbpos = pos;
				if(newlines > 0) {
					m_line += newlines;
					m_column = columns;
				} else {
					m_column += columns;
				}
				return;
			}
		}

		// Slow path.
		char c;
		do {
			c = readCh();
		} while(isWhitespace(c));
		unread(c);
	}

	/**
	 * Read a name or name token.
	 * [5] Name ::= (Letter | '_' | ':') (NameChar)*
	 * [7] Nmtoken ::= (NameChar)+
	 *
	 * NOTE: [6] is implemented implicitly where required.
	 * @param isName not consulted by this implementation.
	 * @return the internalised name.
	 */
	String readNmtoken(boolean isName) throws java.lang.Exception {
		// Fast path: look for the delimiter inside the read buffer and
		// intern the name straight from there.
		if(USE_CHEATS) {
			final boolean peContext = m_context == CONTEXT_DTD || m_context == CONTEXT_ENTITYVALUE;
			for(int pos = m_rbpos; pos < m_rblen; pos++) {
				char ch = m_readBuffer[pos];
				boolean delimiter;
				switch(ch) {
					case '%':
					case '<':
					case '>':
					case '&':
					case ',':
					case '|':
					case '*':
					case '+':
					case '?':
					case ')':
					case '=':
					case '\'':
					case '"':
					case '[':
					case ' ':
					case '\t':
					case '\r':
					case '\n':
					case ';':
					case '/':
					case '#':
						delimiter = true;
						break;
					default:
						delimiter = false;
						break;
				}
				if(!delimiter) {
					continue;
				}
				if(ch == '%' && peContext) {
					// Possible parameter entity reference: let readCh() handle it.
					break;
				}
				int first = m_rbpos;
				if(pos == first) {
					error("name expected", m_readBuffer[pos], null);
				}
				m_rbpos = pos;
				return intern(m_readBuffer, first, pos - first);
			}
		}

		// Slow path: accumulate into the name buffer.
		// NOTE(review): unlike the fast path, '#' is not a delimiter here.
		m_nameBufferPos = 0;
		while(true) {
			char c = readCh();
			switch(c) {
				case '%':
				case '<':
				case '>':
				case '&':
				case ',':
				case '|':
				case '*':
				case '+':
				case '?':
				case ')':
				case '=':
				case '\'':
				case '"':
				case '[':
				case ' ':
				case '\t':
				case '\n':
				case '\r':
				case ';':
				case '/': {
					unread(c);
					if(m_nameBufferPos == 0) {
						error("name expected", null, null);
					}
					String result = intern(m_nameBuffer, 0, m_nameBufferPos);
					m_nameBufferPos = 0;
					return result;
				}
				default:
					m_nameBuffer = (char[]) extendArray(m_nameBuffer, m_nameBuffer.length, m_nameBufferPos);
					m_nameBuffer[m_nameBufferPos++] = c;
			}
		}
	}

	/**
	 * Read a quoted literal.
	 * [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
	 *                 | "'" ([^<&'] | Reference)* "'"
	 * [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'"
	 * [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
	 * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
	 *                   | "'" ([^%&'] | PEReference | Reference)* "'"
	 * @param flags a combination of LIT_CHAR_REF, LIT_ENTITY_REF and
	 *	LIT_NORMALIZE selecting reference expansion and whitespace
	 *	normalisation.
	 * @return the literal's text, or null if no opening delimiter was found.
	 */
	String readLiteral(int flags) throws java.lang.Exception {
		final int startLine = m_line;

		// The delimiter is a quote, or NUL for text pushed internally.
		final char delim = readCh();
		if(delim != '"' && delim != '\'' && delim != (char) 0) {
			error("expected '\"' or \"'\"", delim, null);
			return null;
		}

		try {
			char c = readCh();
			while(c != delim) {
				if(c == '\n' || c == '\r') {
					// Literals never contain line ends.
					c = ' ';
				} else if(c == '&' && (flags & LIT_CHAR_REF) > 0) {
					c = readCh();
					if(c == '#') {
						parseCharRef();
						c = readCh();
						continue;
					}
					if((flags & LIT_ENTITY_REF) > 0) {
						unread(c);
						parseEntityRef(false);
						c = readCh();
						continue;
					}
					// Entity references are not expanded: keep the '&'.
					dataBufferAppend('&');
				}
				dataBufferAppend(c);
				c = readCh();
			}
		} catch(EOFException e) {
			error("end of input while looking for delimiter (started on line " + startLine + ')', null, String.valueOf(delim));
		}

		if((flags & LIT_NORMALIZE) > 0) {
			dataBufferNormalize();
		}

		return dataBufferToString();
	}

	/**
	 * Try reading external identifiers.
	 *

The system identifier is not required for notations. * @param inNotation Are we in a notation? * @return A two-member String array containing the identifiers. */ String[] readExternalIds(boolean inNotation) throws java.lang.Exception { String ids[] = new String[2]; if(tryRead("PUBLIC")) { requireWhitespace(); ids[0] = readLiteral(LIT_NORMALIZE); // public id if(inNotation) { skipWhitespace(); if(tryRead('"') || tryRead('\'')) { ids[1] = readLiteral(0); } } else { requireWhitespace(); ids[1] = readLiteral(0); // system id } } else if(tryRead("SYSTEM")) { requireWhitespace(); ids[1] = readLiteral(0); // system id } return ids; } /** * Test if a character is whitespace. *

	 * [1] S ::= (#x20 | #x9 | #xd | #xa)+
	 * 
* @param c The character to test. * @return true if the character is whitespace. */ final boolean isWhitespace(char c) { switch(c) { case 0x20: case 0x09: case 0x0d: case 0x0a: return true; default: return false; } } ////////////////////////////////////////////////////////////////////// // Utility routines. ////////////////////////////////////////////////////////////////////// /** * Add a character to the data buffer. */ void dataBufferAppend(char c) { // Expand buffer if necessary. m_dataBuffer = (char[]) extendArray(m_dataBuffer, m_dataBuffer.length, m_dataBufferPos); m_dataBuffer[m_dataBufferPos++] = c; } /** * Add a string to the data buffer. */ void dataBufferAppend(String s) { dataBufferAppend(s.toCharArray(), 0, s.length()); } /** * Append (part of) a character array to the data buffer. */ void dataBufferAppend(char ch[], int start, int length) { m_dataBuffer = (char[]) extendArray(m_dataBuffer, m_dataBuffer.length, m_dataBufferPos + length); System.arraycopy(ch, start, m_dataBuffer, m_dataBufferPos, length); m_dataBufferPos += length; } /** * Normalise whitespace in the data buffer. */ void dataBufferNormalize() { int i = 0; int j = 0; int end = m_dataBufferPos; // Skip whitespace at the start. while(j < end && isWhitespace(m_dataBuffer[j])) { j++; } // Skip whitespace at the end. while(end > j && isWhitespace(m_dataBuffer[end - 1])) { end--; } // Start copying to the left. while(j < end) { char c = m_dataBuffer[j++]; // Normalise all other whitespace to // a single space. if(isWhitespace(c)) { while(j < end && isWhitespace(m_dataBuffer[j++])) {} m_dataBuffer[i++] = ' '; m_dataBuffer[i++] = m_dataBuffer[j - 1]; } else { m_dataBuffer[i++] = c; } } // The new length is <= the old one. m_dataBufferPos = i; } /** * Convert the data buffer to a string. * @param internFlag true if the contents should be interned. 
* @see #intern(char[],int,int) */ String dataBufferToString() { String s = new String(m_dataBuffer, 0, m_dataBufferPos); m_dataBufferPos = 0; return s; } /** * Flush the contents of the data buffer to the handler, if * appropriate, and reset the buffer for new input. */ void dataBufferFlush() throws java.lang.Exception { if(m_dataBufferPos > 0) { switch(m_currentElementContent) { case CONTENT_UNDECLARED: case CONTENT_EMPTY: // do nothing break; case CONTENT_MIXED: case CONTENT_ANY: if(m_handler != null) { m_handler.charData(m_dataBuffer, 0, m_dataBufferPos); } break; case CONTENT_ELEMENTS: if(m_handler != null) { m_handler.ignorableWhitespace(m_dataBuffer, 0, m_dataBufferPos); } break; } m_dataBufferPos = 0; } } /** * Require a string to appear, or throw an exception. */ void require(String delim) throws java.lang.Exception { char ch[] = delim.toCharArray(); for(int i = 0; i < ch.length; i++) { require(ch[i]); } } /** * Require a character to appear, or throw an exception. */ void require(char delim) throws java.lang.Exception { char c = readCh(); if(c != delim) { error("expected character", c, new Character(delim).toString()); } } /** * Return an internalised version of a string. *

Ælfred uses this method to create an internalised version * of all names and attribute values, so that it can test equality * with == instead of String.equals(). *

If you want to be able to test for equality in the same way, * you can use this method to internalise your own strings first: *

	 * String PARA = handler.intern("PARA");
	 * 
*

Note that this will not return the same results as String.intern().
	 * @param s The string to internalise.
	 * @return An internalised version of the string.
	 * @see #intern(char[],int,int)
	 * @see java.lang.String#intern
	 */
	public String intern(String s) {
		// Delegate to the array-based variant over the string's characters.
		return intern(s.toCharArray(), 0, s.length());
	}

	/**
	 * Create an internalised string from a character array.
	 *

This is much more efficient than constructing a non-internalised * string first, and then internalising it. *

Note that this will not return the same results as String.intern(). * @param ch an array of characters for building the string. * @param start the starting position in the array. * @param length the number of characters to place in the string. * @return an internalised string. * @see #intern(String) * @see java.lang.String#intern */ public String intern(char ch[], int start, int length) { int index; int hash = 0; // Generate a hash code. for(int i = start; i < start + length; i++) { hash = ((hash << 1) & 0xffffff) + ch[i]; } hash = hash % SYMBOL_TABLE_LENGTH; // Get the bucket. Object bucket[] = (Object[]) m_symbolTable[hash]; if(bucket == null) { m_symbolTable[hash] = bucket = new Object[8]; } // Search for a matching tuple, and // return the string if we find one. for(index = 0; index < bucket.length; index += 2) { char chFound[] = (char[]) bucket[index]; // Stop when we hit a null index. if(chFound == null) { break; } // If they're the same length, // check for a match. // If the loop finishes, 'index' will // contain the current bucket // position. if(chFound.length == length) { for(int i = 0; i < chFound.length; i++) { // Stop if there are no more tuples. if(ch[start + i] != chFound[i]) { break; } else if(i == length - 1) { // That's it, we have a match! return (String) bucket[index + 1]; } } } } // Not found -- we'll have to add it. // Do we have to grow the bucket? bucket = (Object[]) extendArray(bucket, bucket.length, index); // OK, add it to the end of the // bucket. String s = new String(ch, start, length); bucket[index] = s.toCharArray(); bucket[index + 1] = s; m_symbolTable[hash] = bucket; return s; } /** * Ensure the capacity of an array, allocating a new one if * necessary. 
*/ Object extendArray(Object array, int currentSize, int requiredSize) { if(requiredSize < currentSize) { return array; } else { Object newArray = null; int newSize = currentSize * 2; if(newSize <= requiredSize) { newSize = requiredSize + 1; } if(array instanceof char[]) { newArray = new char[newSize]; } else if(array instanceof Object[]) { newArray = new Object[newSize]; } System.arraycopy(array, 0, newArray, 0, currentSize); return newArray; } } ////////////////////////////////////////////////////////////////////// // XML query routines. ////////////////////////////////////////////////////////////////////// // // Elements // /** * Get the declared elements for an XML document. *

The results will be valid only after the DTD (if any) has been * parsed. * @return An enumeration of all element types declared for this * document (as Strings). * @see #getElementContentType * @see #getElementContentModel */ public Enumeration declaredElements() { return m_elementInfo.keys(); } /** * Look up the content type of an element. * @param name The element type name. * @return An integer constant representing the content type. * @see #getElementContentModel * @see #CONTENT_UNDECLARED * @see #CONTENT_ANY * @see #CONTENT_EMPTY * @see #CONTENT_MIXED * @see #CONTENT_ELEMENTS */ public int getElementContentType(String name) { Object element[] = (Object[]) m_elementInfo.get(name); if(element == null) { return CONTENT_UNDECLARED; } else { return ((Integer) element[0]).intValue(); } } /** * Look up the content model of an element. *

The result will always be null unless the content type is * CONTENT_ELEMENTS or CONTENT_MIXED. * @param name The element type name. * @return The normalised content model, as a string. * @see #getElementContentType */ public String getElementContentModel(String name) { Object element[] = (Object[]) m_elementInfo.get(name); if(element == null) { return null; } else { return (String) element[1]; } } /** * Register an element. * Array format: * element type * attribute hash table */ void setElement(String name, int contentType, String contentModel, Hashtable attributes) throws java.lang.Exception { Object element[]; // Try looking up the element element = (Object[]) m_elementInfo.get(name); // Make a new one if necessary. if(element == null) { element = new Object[3]; element[0] = new Integer(CONTENT_UNDECLARED); element[1] = null; element[2] = null; } else if(contentType != CONTENT_UNDECLARED && ((Integer) element[0]).intValue() != CONTENT_UNDECLARED) { error("multiple declarations for element type", name, null); return; } // Insert the content type, if any. if(contentType != CONTENT_UNDECLARED) { element[0] = new Integer(contentType); } // Insert the content model, if any. if(contentModel != null) { element[1] = contentModel; } // Insert the attributes, if any. if(attributes != null) { element[2] = attributes; } // Save the element info. m_elementInfo.put(name, element); } /** * Look up the attribute hash table for an element. * The hash table is the second item in the element array. */ Hashtable getElementAttributes(String name) { Object element[] = (Object[]) m_elementInfo.get(name); if(element == null) { return null; } else { return (Hashtable) element[2]; } } // // Attributes // /** * Get the declared attributes for an element type. * @param elname The name of the element type. * @return An Enumeration of all the attributes declared for * a specific element type. The results will be valid only * after the DTD (if any) has been parsed. 
* @see #getAttributeType * @see #getAttributeEnumeration * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue * @see #getAttributeExpandedValue */ public Enumeration declaredAttributes(String elname) { Hashtable attlist = getElementAttributes(elname); if(attlist == null) { return null; } else { return attlist.keys(); } } /** * Retrieve the declared type of an attribute. * @param name The name of the associated element. * @param aname The name of the attribute. * @return An integer constant representing the attribute type. * @see #ATTRIBUTE_UNDECLARED * @see #ATTRIBUTE_CDATA * @see #ATTRIBUTE_ID * @see #ATTRIBUTE_IDREF * @see #ATTRIBUTE_IDREFS * @see #ATTRIBUTE_ENTITY * @see #ATTRIBUTE_ENTITIES * @see #ATTRIBUTE_NMTOKEN * @see #ATTRIBUTE_NMTOKENS * @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public int getAttributeType(String name, String aname) { Object attribute[] = getAttribute(name, aname); if(attribute == null) { return ATTRIBUTE_UNDECLARED; } else { return ((Integer) attribute[0]).intValue(); } } /** * Retrieve the allowed values for an enumerated attribute type. * @param name The name of the associated element. * @param aname The name of the attribute. * @return A string containing the token list. * @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public String getAttributeEnumeration(String name, String aname) { Object attribute[] = getAttribute(name, aname); if(attribute == null) { return null; } else { return (String) attribute[3]; } } /** * Retrieve the default value of a declared attribute. * @param name The name of the associated element. * @param aname The name of the attribute. * @return The default value, or null if the attribute was * #IMPLIED or simply undeclared and unspecified. 
* @see #getAttributeExpandedValue
	 */
	public String getAttributeDefaultValue(String name, String aname) {
		Object decl[] = getAttribute(name, aname);
		// Slot 1 holds the default value exactly as written in the DTD.
		return (decl == null) ? null : (String) decl[1];
	}

	/**
	 * Retrieve the expanded value of a declared attribute.
	 *

All general entities will be expanded. * @param name The name of the associated element. * @param aname The name of the attribute. * @return The expanded default value, or null if the attribute was * #IMPLIED or simply undeclared * @see #getAttributeDefaultValue */ public String getAttributeExpandedValue(String name, String aname) { Object attribute[] = getAttribute(name, aname); if(attribute == null) { return null; } else if(attribute[4] == null && attribute[1] != null) { try { pushString(null, (char) 0 + (String) attribute[1] + (char) 0); attribute[4] = readLiteral(LIT_NORMALIZE | LIT_CHAR_REF | LIT_ENTITY_REF); } catch(Exception e) {} } return (String) attribute[4]; } /** * Retrieve the default value type of a declared attribute. * @see #ATTRIBUTE_DEFAULT_SPECIFIED * @see #ATTRIBUTE_DEFAULT_IMPLIED * @see #ATTRIBUTE_DEFAULT_REQUIRED * @see #ATTRIBUTE_DEFAULT_FIXED */ public int getAttributeDefaultValueType(String name, String aname) { Object attribute[] = getAttribute(name, aname); if(attribute == null) { return ATTRIBUTE_DEFAULT_UNDECLARED; } else { return ((Integer) attribute[2]).intValue(); } } /** * Register an attribute declaration for later retrieval. * Format: * - String type * - String default value * - int value type * *TODO: do something with attribute types. */ void setAttribute(String elName, String name, int type, String enumeration, String value, int valueType) throws java.lang.Exception { Hashtable attlist; Object attribute[]; // Create a new hashtable if necessary. attlist = getElementAttributes(elName); if(attlist == null) { attlist = new Hashtable(); } // Check that the attribute doesn't // already exist! if(attlist.get(name) != null) { return; } else { attribute = new Object[5]; attribute[0] = new Integer(type); attribute[1] = value; attribute[2] = new Integer(valueType); attribute[3] = enumeration; attribute[4] = null; attlist.put(name.intern(), attribute); // Use CONTENT_UNDECLARED to avoid overwriting // existing element declaration. 
setElement(elName, CONTENT_UNDECLARED, null, attlist); } } /** * Retrieve the three-member array representing an * attribute declaration. */ Object[] getAttribute(String elName, String name) { Hashtable attlist; Object attribute[]; attlist = getElementAttributes(elName); if(attlist == null) { return null; } attribute = (Object[]) attlist.get(name); return attribute; } // // Entities // /** * Get declared entities. * @return An Enumeration of all the entities declared for * this XML document. The results will be valid only * after the DTD (if any) has been parsed. * @see #getEntityType * @see #getEntityPublicId * @see #getEntitySystemId * @see #getEntityValue * @see #getEntityNotationName */ public Enumeration declaredEntities() { return m_entityInfo.keys(); } /** * Find the type of an entity. * @returns An integer constant representing the entity type. * @see #ENTITY_UNDECLARED * @see #ENTITY_INTERNAL * @see #ENTITY_NDATA * @see #ENTITY_TEXT */ public int getEntityType(String ename) { Object entity[] = (Object[]) m_entityInfo.get(ename); if(entity == null) { return ENTITY_UNDECLARED; } else { return ((Integer) entity[0]).intValue(); } } /** * Return an external entity's public identifier, if any. * @param ename The name of the external entity. * @return The entity's system identifier, or null if the * entity was not declared, if it is not an * external entity, or if no public identifier was * provided. * @see #getEntityType */ public String getEntityPublicId(String ename) { Object entity[] = (Object[]) m_entityInfo.get(ename); if(entity == null) { return null; } else { return (String) entity[1]; } } /** * Return an external entity's system identifier. * @param ename The name of the external entity. * @return The entity's system identifier, or null if the * entity was not declared, or if it is not an * external entity. 
* @see #getEntityType */ public String getEntitySystemId(String ename) { Object entity[] = (Object[]) m_entityInfo.get(ename); if(entity == null) { return null; } else { return (String) entity[2]; } } /** * Return the value of an internal entity. * @param ename The name of the internal entity. * @return The entity's value, or null if the entity was * not declared, or if it is not an internal entity. * @see #getEntityType */ public String getEntityValue(String ename) { Object entity[] = (Object[]) m_entityInfo.get(ename); if(entity == null) { return null; } else { return (String) entity[3]; } } /** * Get the notation name associated with an NDATA entity. * @param ename The NDATA entity name. * @return The associated notation name, or null if the * entity was not declared, or if it is not an * NDATA entity. * @see #getEntityType */ public String getEntityNotationName(String eName) { Object entity[] = (Object[]) m_entityInfo.get(eName); if(entity == null) { return null; } else { return (String) entity[4]; } } /** * Register an entity declaration for later retrieval. */ void setInternalEntity(String eName, String value) { setEntity(eName, ENTITY_INTERNAL, null, null, value, null); } /** * Register an external data entity. */ void setExternalDataEntity(String eName, String pubid, String sysid, String nName) { setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName); } /** * Register an external text entity. */ void setExternalTextEntity(String eName, String pubid, String sysid) { setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null); } /** * Register an entity declaration for later retrieval. */ void setEntity(String eName, int eClass, String pubid, String sysid, String value, String nName) { Object entity[]; if(m_entityInfo.get(eName) == null) { entity = new Object[5]; entity[0] = new Integer(eClass); entity[1] = pubid; entity[2] = sysid; entity[3] = value; entity[4] = nName; m_entityInfo.put(eName, entity); } } // // Notations. // /** * Get declared notations. 
* @return An Enumeration of all the notations declared for * this XML document. The results will be valid only * after the DTD (if any) has been parsed. * @see #getNotationPublicId * @see #getNotationSystemId */ public Enumeration declaredNotations() { return m_notationInfo.keys(); } /** * Look up the public identifier for a notation. * You will normally use this method to look up a notation * that was provided as an attribute value or for an NDATA entity. * @param nname The name of the notation. * @return A string containing the public identifier, or null * if none was provided or if no such notation was * declared. * @see #getNotationSystemId */ public String getNotationPublicId(String nname) { Object notation[] = (Object[]) m_notationInfo.get(nname); if(notation == null) { return null; } else { return (String) notation[0]; } } /** * Look up the system identifier for a notation. * You will normally use this method to look up a notation * that was provided as an attribute value or for an NDATA entity. * @param nname The name of the notation. * @return A string containing the system identifier, or null * if no such notation was declared. * @see #getNotationPublicId */ public String getNotationSystemId(String nname) { Object notation[] = (Object[]) m_notationInfo.get(nname); if(notation == null) { return null; } else { return (String) notation[1]; } } /** * Register a notation declaration for later retrieval. * Format: * - public id * - system id */ void setNotation(String nname, String pubid, String sysid) throws java.lang.Exception { Object notation[]; if(m_notationInfo.get(nname) == null) { notation = new Object[2]; notation[0] = pubid; notation[1] = sysid; m_notationInfo.put(nname, notation); } else { error("multiple declarations of notation", nname, null); } } // // Location. // /** * Return the current line number. */ public int getLineNumber() { return m_line; } /** * Return the current column number. 
*/ public int getColumnNumber() { return m_column; } ////////////////////////////////////////////////////////////////////// // High-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a single character from the readBuffer. *

The readDataChunk() method maintains the buffer. *

If we hit the end of an entity, try to pop the stack and * keep going. *

(This approach doesn't really enforce XML's rules about * entity boundaries, but this is not currently a validating * parser). *

This routine also attempts to keep track of the current * position in external entities, but it's not entirely accurate. * @return The next available input character. * @see #unread(char) * @see #unread(String) * @see #readDataChunk * @see #m_readBuffer * @see #m_line * @return The next character from the current input source. */ char readCh() throws java.lang.Exception { char c; // As long as there's nothing in the // read buffer, try reading more data // (for an external entity) or popping // the entity stack (for either). while(m_rbpos >= m_rblen) { switch(m_sourceType) { case INPUT_READER: case INPUT_EXTERNAL: case INPUT_STREAM: readDataChunk(); while(m_rblen < 1) { popInput(); if(m_rblen < 1) { readDataChunk(); } } break; default: popInput(); break; } } c = m_readBuffer[m_rbpos++]; // This is a particularly nasty bit // of code, that checks for a parameter // entity reference but peeks ahead to // catch the '%' in parameter entity // declarations. if(c == '%' && (m_context == CONTEXT_DTD || m_context == CONTEXT_ENTITYVALUE)) { char c2 = readCh(); unread(c2); if(!isWhitespace(c2)) { parsePEReference(m_context == CONTEXT_ENTITYVALUE); return readCh(); } } if(c == '\n') { m_line++; m_column = 0; } else { m_column++; } return c; } /** * Push a single character back onto the current input stream. *

This method usually pushes the character back onto * the readBuffer, while the unread(String) method treats the * string as a new internal entity. *

I don't think that this would ever be called with * readBufferPos = 0, because the methods always reads a character * before unreading it, but just in case, I've added a boundary * condition. * @param c The character to push back. * @see #readCh * @see #unread(String) * @see #unread(char[]) * @see #m_readBuffer */ void unread(char c) throws java.lang.Exception { // Normal condition. if(c == '\n') { m_line--; m_column = -1; } if(m_rbpos > 0) { m_readBuffer[--m_rbpos] = c; } else { pushString(null, new Character(c).toString()); } } /** * Push a char array back onto the current input stream. *

NOTE: you must never push back characters that you * haven't actually read: use pushString() instead. * @see #readCh * @see #unread(char) * @see #unread(String) * @see #m_readBuffer * @see #pushString */ void unread(char ch[], int length) throws java.lang.Exception { for(int i = 0; i < length; i++) { if(ch[i] == '\n') { m_line--; m_column = -1; } } if(length < m_rbpos) { m_rbpos -= length; } else { pushCharArray(null, ch, 0, length); m_sourceType = INPUT_BUFFER; } } /** * Push a new external input source. *

The source will be either an external text entity, or the DTD * external subset. *

TO DO: Right now, this method always attempts to autodetect * the encoding; in the future, it should allow the caller to * request an encoding explicitly, and it should also look at the * headers with an HTTP connection. * @param url The java.net.URL object for the entity. * @see XmlHandler#resolveEntity * @see #pushString * @see #m_sourceType * @see #pushInput * @see #detectEncoding * @see #m_sourceType * @see #m_readBuffer */ void pushURL(String ename, String publicId, String systemId, Reader reader, InputStream stream, String encoding) throws java.lang.Exception { URL url; boolean ignoreEncoding = false; // Push the existing status. pushInput(ename); // Create a new read buffer. // (Note the four-character margin) m_readBuffer = new char[READ_BUFFER_MAX + 4]; m_rbpos = 0; m_rblen = 0; m_rboverflow = -1; m_is = null; m_line = 1; m_currentByteCount = 0; // Flush any remaining data. dataBufferFlush(); // Make the URL absolute. if(systemId != null && m_externalEntity != null) { systemId = new URL(m_externalEntity.getURL(), systemId).toString(); } else if(m_baseURI != null) { try { systemId = new URL(new URL(m_baseURI), systemId).toString(); } catch(Exception e) {} } // See if the application wants to // redirect the system ID and/or // supply its own character stream. if(systemId != null && m_handler != null) { Object input = m_handler.resolveEntity(publicId, systemId); if(input != null) { if(input instanceof String) { systemId = (String) input; } else if(input instanceof InputStream) { stream = (InputStream) input; } else if(input instanceof Reader) { reader = (Reader) input; } } } // Start the entity. if(m_handler != null) { if(systemId != null) { m_handler.startExternalEntity(systemId); } else { m_handler.startExternalEntity("[external stream]"); } } // Figure out what we're reading from. if(reader != null) { // There's an explicit character stream. 
m_sourceType = INPUT_READER; this.m_reader = reader; tryEncodingDecl(true); return; } else if(stream != null) { m_sourceType = INPUT_STREAM; m_is = stream; } else { // We have to open our own stream // to the URL. // Set the new status m_sourceType = INPUT_EXTERNAL; url = new URL(systemId); m_externalEntity = url.openConnection(); m_externalEntity.connect(); m_is = m_externalEntity.getInputStream(); } // If we get to here, there must be // an InputStream available. if(!m_is.markSupported()) { m_is = new BufferedInputStream(m_is); } // Attempt to detect the encoding. if(encoding == null && m_externalEntity != null) { encoding = m_externalEntity.getContentEncoding(); } if(encoding != null) { checkEncoding(encoding, false); ignoreEncoding = true; } else { detectEncoding(); ignoreEncoding = false; } // Read an XML or text declaration. tryEncodingDecl(ignoreEncoding); } /** * Check for an encoding declaration. */ void tryEncodingDecl(boolean ignoreEncoding) throws java.lang.Exception { // Read the XML/Encoding declaration. if(tryRead(" 0) { parseTextDecl(ignoreEncoding); } else { parseXMLDecl(ignoreEncoding); } } else { unread("xml".toCharArray(), 3); parsePI(); } } } /** * Attempt to detect the encoding of an entity. *

The trick here (as suggested in the XML standard) is that * any entity not in UTF-8, or in UCS-2 with a byte-order mark, * must begin with an XML declaration or an encoding * declaration; we simply have to look for "<?XML" in various * encodings. *

This method has no way to distinguish among 8-bit encodings. * Instead, it assumes UTF-8, then (possibly) revises its assumption * later in checkEncoding(). Any ASCII-derived 8-bit encoding * should work, but most will be rejected later by checkEncoding(). *

I don't currently detect EBCDIC, since I'm concerned that it * could also be a valid UTF-8 sequence; I'll have to do more checking * later. * @see #tryEncoding(byte[], byte, byte, byte, byte) * @see #tryEncoding(byte[], byte, byte) * @see #checkEncoding * @see #read8bitEncodingDeclaration */ void detectEncoding() throws java.lang.Exception { byte signature[] = new byte[4]; // Read the first four bytes for // autodetection. m_is.mark(4); m_is.read(signature); m_is.reset(); // Look for a known signature. if(tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x3c)) { // UCS-4 must begin with "Utility routine for detectEncoding(). *

Always looks for some part of "Looks for a UCS-2 byte-order mark. *

Utility routine for detectEncoding().
     * @param sig The first four bytes read.
     * @param b1 The first byte of the signature
     * @param b2 The second byte of the signature
     * @see #detectEncoding
     */
    boolean tryEncoding(byte sig[], byte b1, byte b2) {
        // The second byte is only inspected once the first one matched.
        if(sig[0] != b1) {
            return false;
        }
        return sig[1] == b2;
    }

    /**
     * This method pushes a string back onto input.
     *

It is useful either as the expansion of an internal entity, * or for backtracking during the parse. *

Call pushCharArray() to do the actual work.
     * @param s The string to push back onto input.
     * @see #pushCharArray
     */
    void pushString(String ename, String s) throws java.lang.Exception {
        // Hand the complete character content of the string to pushCharArray().
        final char[] expansion = s.toCharArray();
        final int total = expansion.length;
        pushCharArray(ename, expansion, 0, total);
    }

    /**
     * Push a new internal input source.
     *

This method is useful for expanding an internal entity, * or for unreading a string of characters. It creates a new * readBuffer containing the characters in the array, instead * of characters converted from an input byte stream. *

I've added a couple of optimisations: don't push zero- * length strings, and just push back a single character * for 1-character strings; this should save some time and memory. * @param ch The char array to push. * @see #pushString * @see #pushURL * @see #m_readBuffer * @see #m_sourceType * @see #pushInput */ void pushCharArray(String ename, char ch[], int start, int length) throws java.lang.Exception { // Push the existing status pushInput(ename); m_sourceType = INPUT_INTERNAL; m_readBuffer = ch; m_rbpos = start; m_rblen = length; m_rboverflow = -1; } /** * Save the current input source onto the stack. *

This method saves all of the global variables associated with * the current input source, so that they can be restored when a new * input source has finished. It also tests for entity recursion. *

The method saves the following global variables onto a stack * using a fixed-length array: *

    *
  1. sourceType *
  2. externalEntity *
  3. readBuffer *
  4. readBufferPos *
  5. readBufferLength *
  6. line *
  7. encoding *
* @param ename The name of the entity (if any) causing the new input. * @see #popInput * @see #m_sourceType * @see #m_externalEntity * @see #m_readBuffer * @see #m_rbpos * @see #m_rblen * @see #m_line * @see #m_encoding */ void pushInput(String ename) throws java.lang.Exception { Object input[] = new Object[12]; // Check for entity recursion. if(ename != null) { Enumeration entities = m_entityStack.elements(); while(entities.hasMoreElements()) { String e = (String) entities.nextElement(); if(e == ename) { error("recursive reference to entity", ename, null); } } } m_entityStack.push(ename); // Don't bother if there is no input. if(m_sourceType == INPUT_NONE) { return; } // Set up a snapshot of the current // input source. input[0] = new Integer(m_sourceType); input[1] = m_externalEntity; input[2] = m_readBuffer; input[3] = new Integer(m_rbpos); input[4] = new Integer(m_rblen); input[5] = new Integer(m_line); input[6] = new Integer(m_encoding); input[7] = new Integer(m_rboverflow); input[8] = m_is; input[9] = new Integer(m_currentByteCount); input[10] = new Integer(m_column); input[11] = m_reader; // Push it onto the stack. m_inputStack.push(input); } /** * Restore a previous input source. *

This method restores all of the global variables associated with * the current input source. * @exception java.io.EOFException * If there are no more entries on the input stack. * @see #pushInput * @see #m_sourceType * @see #m_externalEntity * @see #m_readBuffer * @see #m_rbpos * @see #m_rblen * @see #m_line * @see #m_encoding */ void popInput() throws java.lang.Exception { Object input[]; switch(m_sourceType) { case INPUT_EXTERNAL: dataBufferFlush(); if(m_handler != null && m_externalEntity != null) { m_handler.endExternalEntity(m_externalEntity.getURL().toString()); } break; case INPUT_STREAM: dataBufferFlush(); if(m_baseURI != null) { if(m_handler != null) { m_handler.endExternalEntity(m_baseURI); } } break; case INPUT_READER: dataBufferFlush(); if(m_baseURI != null) { if(m_handler != null) { m_handler.endExternalEntity(m_baseURI); } } break; } // Throw an EOFException if there // is nothing else to pop. if(m_inputStack.isEmpty()) { throw new EOFException(); } else { input = (Object[]) m_inputStack.pop(); m_entityStack.pop(); } m_sourceType = ((Integer) input[0]).intValue(); m_externalEntity = (URLConnection) input[1]; m_readBuffer = (char[]) input[2]; m_rbpos = ((Integer) input[3]).intValue(); m_rblen = ((Integer) input[4]).intValue(); m_line = ((Integer) input[5]).intValue(); m_encoding = ((Integer) input[6]).intValue(); m_rboverflow = ((Integer) input[7]).intValue(); m_is = (InputStream) input[8]; m_currentByteCount = ((Integer) input[9]).intValue(); m_column = ((Integer) input[10]).intValue(); m_reader = (Reader) input[11]; } /** * Return true if we can read the expected character. *

Note that the character will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the character again if the method succeeds. * @param delim The character that should appear next. For a * insensitive match, you must supply this in upper-case. * @return true if the character was successfully read, or false if * it was not. * @see #tryRead(String) */ boolean tryRead(char delim) throws java.lang.Exception { char c; // Read the character c = readCh(); // Test for a match, and push the character // back if the match fails. if(c == delim) { return true; } else { unread(c); return false; } } /** * Return true if we can read the expected string. *

This is simply a convenience method. *

Note that the string will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the string again if the method succeeds. *

This method will push back a character rather than an * array whenever possible (probably the majority of cases). *

NOTE: This method currently has a hard-coded limit * of 100 characters for the delimiter. * @param delim The string that should appear next. * @return true if the string was successfully read, or false if * it was not. * @see #tryRead(char) */ boolean tryRead(String delim) throws java.lang.Exception { char ch[] = delim.toCharArray(); char c; // Compare the input, character- // by character. for(int i = 0; i < ch.length; i++) { c = readCh(); if(c != ch[i]) { unread(c); if(i != 0) { unread(ch, i); } return false; } } return true; } /** * Return true if we can read some whitespace. *

This is simply a convenience method. *

This method will push back a character rather than an * array whenever possible (probably the majority of cases). * @return true if whitespace was found. */ boolean tryWhitespace() throws java.lang.Exception { char c; c = readCh(); if(isWhitespace(c)) { skipWhitespace(); return true; } else { unread(c); return false; } } /** * Read all data until we find the specified string. *

This is especially useful for scanning marked sections. *

This is a a little inefficient right now, since it calls tryRead() * for every character. * @param delim The string delimiter * @see #tryRead(String, boolean) * @see #readCh */ void parseUntil(String delim) throws java.lang.Exception { char c; int startLine = m_line; try { while(!tryRead(delim)) { c = readCh(); dataBufferAppend(c); } } catch(EOFException e) { error("end of input while looking for delimiter (started on line " + startLine + ')', null, delim); } } /** * Skip all data until we find the specified string. *

This is especially useful for scanning comments. *

This is a a little inefficient right now, since it calls tryRead() * for every character. * @param delim The string delimiter * @see #tryRead(String, boolean) * @see #readCh */ void skipUntil(String delim) throws java.lang.Exception { while(!tryRead(delim)) { readCh(); } } /** * Read just the encoding declaration (or XML declaration) at the * start of an external entity. * When this method is called, we know that the declaration is * present (or appears to be). We also know that the entity is * in some sort of ASCII-derived 8-bit encoding. * The idea of this is to let us read what the 8-bit encoding is * before we've committed to converting any more of the file; the * XML or encoding declaration must be in 7-bit ASCII, so we're * safe as long as we don't go past it. */ void read8bitEncodingDeclaration() throws java.lang.Exception { int ch; m_rbpos = m_rblen = 0; while(true) { ch = m_is.read(); m_readBuffer[m_rblen++] = (char) ch; switch(ch) { case '>': return; case -1: error("end of file before end of XML or encoding declaration.", null, "?>"); return; } if(m_readBuffer.length == m_rblen) { error("unfinished XML or encoding declaration", null, null); } } } ////////////////////////////////////////////////////////////////////// // Low-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a chunk of data from an external input source. *

This is simply a front-end that fills the rawReadBuffer * with bytes, then calls the appropriate encoding handler. * @see #m_encoding * @see #m_rawReadBuffer * @see #m_readBuffer * @see #filterCR * @see #copyUtf8ReadBuffer * @see #copyIso8859_1ReadBuffer * @see #copyUcs_2ReadBuffer * @see #copyUcs_4ReadBuffer */ void readDataChunk() throws java.lang.Exception { int count; // See if we have any overflow. if(m_rboverflow > -1) { m_readBuffer[0] = (char) m_rboverflow; m_rboverflow = -1; m_rbpos = 1; m_sawCR = true; } else { m_rbpos = 0; m_sawCR = false; } // Special situation -- we're taking // input from a character stream. if(m_sourceType == INPUT_READER) { count = m_reader.read(m_readBuffer, m_rbpos, READ_BUFFER_MAX - 1); if(count < 0) { m_rblen = -1; } else { m_rblen = m_rbpos + count; filterCR(); m_sawCR = false; } return; } // Read as many bytes as possible // into the read buffer. count = m_is.read(m_rawReadBuffer, 0, READ_BUFFER_MAX); // Dispatch to an encoding-specific // reader method to populate the // readBuffer. switch(m_encoding) { case ENCODING_UTF_8: copyUtf8ReadBuffer(count); break; case ENCODING_ISO_8859_1: copyIso8859_1ReadBuffer(count); break; case ENCODING_UCS_2_12: copyUcs2ReadBuffer(count, 8, 0); break; case ENCODING_UCS_2_21: copyUcs2ReadBuffer(count, 0, 8); break; case ENCODING_UCS_4_1234: copyUcs4ReadBuffer(count, 24, 16, 8, 0); break; case ENCODING_UCS_4_4321: copyUcs4ReadBuffer(count, 0, 8, 16, 24); break; case ENCODING_UCS_4_2143: copyUcs4ReadBuffer(count, 16, 24, 0, 8); break; case ENCODING_UCS_4_3412: copyUcs4ReadBuffer(count, 8, 0, 24, 16); break; } // Filter out all carriage returns // if we've seen any. if(m_sawCR) { filterCR(); m_sawCR = false; } // Reset the position. m_rbpos = 0; m_currentByteCount += count; } /** * Filter carriage returns in the read buffer. *

CRLF becomes LF; CR becomes LF. * @see #readDataChunk * @see #m_readBuffer * @see #m_rboverflow */ void filterCR() { int i, j; m_rboverflow = -1; loop : for(i = 0, j = 0; j < m_rblen; i++, j++) { switch(m_readBuffer[j]) { case '\r': if(j == m_rblen - 1) { m_rboverflow = '\r'; m_rblen--; break loop; } else if(m_readBuffer[j + 1] == '\n') { j++; } m_readBuffer[i] = '\n'; break; case '\n': default: m_readBuffer[i] = m_readBuffer[j]; break; } } m_rblen = i; } /** * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. *

The tricky part of this is dealing with UTF-8 multi-byte * sequences, but it doesn't seem to slow things down too much. * @param count The number of bytes to convert. * @see #readDataChunk * @see #m_rawReadBuffer * @see #m_readBuffer * @see #getNextUtf8Byte */ void copyUtf8ReadBuffer(int count) throws java.lang.Exception { int i = 0; int j = m_rbpos; int b1; // boolean isSurrogate = false; while(i < count) { b1 = m_rawReadBuffer[i++]; // isSurrogate = false; // Determine whether we are dealing // with a one-, two-, three-, or four- // byte sequence. if((b1 & 0x80) == 0) { // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx m_readBuffer[j++] = (char) b1; } else if((b1 & 0xe0) == 0xc0) { // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx m_readBuffer[j++] = (char) (((b1 & 0x1f) << 6) | getNextUtf8Byte(i++, count)); } else if((b1 & 0xf0) == 0xe0) { // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx m_readBuffer[j++] = (char) (((b1 & 0x0f) << 12) | (getNextUtf8Byte(i++, count) << 6) | getNextUtf8Byte(i++, count)); } else if((b1 & 0xf8) == 0xf0) { // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx // (uuuuu = wwww + 1) // isSurrogate = true; int b2 = getNextUtf8Byte(i++, count); int b3 = getNextUtf8Byte(i++, count); int b4 = getNextUtf8Byte(i++, count); m_readBuffer[j++] = (char) (0xd800 | ((((b1 & 0x07) << 2) | ((b2 & 0x30) >> 4) - 1) << 6) | ((b2 & 0x0f) << 2) | ((b3 & 0x30) >> 4)); m_readBuffer[j++] = (char) (0xdc | ((b3 & 0x0f) << 6) | b4); // TODO: test that surrogate value is legal. } else { // Otherwise, the 8th bit may not be set in UTF-8 encodingError("bad start for UTF-8 multi-byte sequence", b1, i); } if(m_readBuffer[j - 1] == '\r') { m_sawCR = true; } } // How many characters have we read? m_rblen = j; } /** * Return the next byte value in a UTF-8 sequence. * If it is not possible to get a byte from the current * entity, throw an exception. 
* @param pos The current position in the rawReadBuffer. * @param count The number of bytes in the rawReadBuffer * @return The significant six bits of a non-initial byte in * a UTF-8 sequence. * @exception EOFException If the sequence is incomplete. */ int getNextUtf8Byte(int pos, int count) throws java.lang.Exception { int val; // Take a character from the buffer // or from the actual input stream. if(pos < count) { val = m_rawReadBuffer[pos]; } else { val = m_is.read(); if(val == -1) { encodingError("unfinished multi-byte UTF-8 sequence at EOF", -1, pos); } } // Check for the correct bits at the // start. if((val & 0xc0) != 0x80) { encodingError("bad continuation of multi-byte UTF-8 sequence", val, pos + 1); } // Return the significant bits. return (val & 0x3f); } /** * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. *

This is a direct conversion, with no tricks. * @param count The number of bytes to convert. * @see #readDataChunk * @see #m_rawReadBuffer * @see #m_readBuffer */ void copyIso8859_1ReadBuffer(int count) { int i, j; for(i = 0, j = m_rbpos; i < count; i++, j++) { m_readBuffer[j] = (char) (m_rawReadBuffer[i] & 0xff); if(m_readBuffer[j] == '\r') { m_sawCR = true; } } m_rblen = j; } /** * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. * @param shift2 The number of bits to shift byte 2 * @see #readDataChunk * @see #m_rawReadBuffer * @see #m_readBuffer */ void copyUcs2ReadBuffer(int count, int shift1, int shift2) throws java.lang.Exception { int j = m_rbpos; if(count > 0 && (count % 2) != 0) { encodingError("odd number of bytes in UCS-2 encoding", -1, count); } for(int i = 0; i < count; i += 2) { m_readBuffer[j++] = (char) (((m_rawReadBuffer[i] & 0xff) << shift1) | ((m_rawReadBuffer[i + 1] & 0xff) << shift2)); if(m_readBuffer[j - 1] == '\r') { m_sawCR = true; } } m_rblen = j; } /** * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. *

Java has 16-bit chars, but this routine will attempt to use * surrogates to encoding values between 0x00010000 and 0x000fffff. * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. * @param shift2 The number of bits to shift byte 2 * @param shift3 The number of bits to shift byte 2 * @param shift4 The number of bits to shift byte 2 * @see #readDataChunk * @see #m_rawReadBuffer * @see #m_readBuffer */ void copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3, int shift4) throws java.lang.Exception { int j = m_rbpos; int value; if(count > 0 && (count % 4) != 0) { encodingError("number of bytes in UCS-4 encoding not divisible by 4", -1, count); } for(int i = 0; i < count; i += 4) { value = (((m_rawReadBuffer[i] & 0xff) << shift1) | ((m_rawReadBuffer[i + 1] & 0xff) << shift2) | ((m_rawReadBuffer[i + 2] & 0xff) << shift3) | ((m_rawReadBuffer[i + 3] & 0xff) << shift4)); if(value < 0x0000ffff) { m_readBuffer[j++] = (char) value; if(value == '\r') { m_sawCR = true; } } else if(value < 0x000fffff) { m_readBuffer[j++] = (char) (0xd8 | ((value & 0x000ffc00) >> 10)); m_readBuffer[j++] = (char) (0xdc | (value & 0x0003ff)); } else { encodingError("value cannot be represented in UTF-16", value, i); } } m_rblen = j; } /** * Report a character encoding error. */ void encodingError(String message, int value, int offset) throws java.lang.Exception { String uri; if(value >= 0) { message = message + " (byte value: 0x" + Integer.toHexString(value) + ')'; } if(m_externalEntity != null) { uri = m_externalEntity.getURL().toString(); } else { uri = m_baseURI; } m_handler.error(message, uri, -1, offset + m_currentByteCount); } ////////////////////////////////////////////////////////////////////// // Local Variables. ////////////////////////////////////////////////////////////////////// /** * Re-initialize the variables for each parse. 
*/ void initializeVariables() { // No errors; first line errorCount = 0; m_line = 1; m_column = 0; // Set up the buffers for data and names m_dataBufferPos = 0; m_dataBuffer = new char[DATA_BUFFER_INITIAL]; m_nameBufferPos = 0; m_nameBuffer = new char[NAME_BUFFER_INITIAL]; // Set up the DTD hash tables m_elementInfo = new Hashtable(); m_entityInfo = new Hashtable(); m_notationInfo = new Hashtable(); // Set up the variables for the current // element context. m_currentElement = null; m_currentElementContent = CONTENT_UNDECLARED; // Set up the input variables m_sourceType = INPUT_NONE; m_inputStack = new Stack(); m_entityStack = new Stack(); m_externalEntity = null; m_tagAttributePos = 0; m_tagAttributes = new String[100]; m_rawReadBuffer = new byte[READ_BUFFER_MAX]; m_rboverflow = -1; m_context = CONTEXT_NONE; m_symbolTable = new Object[SYMBOL_TABLE_LENGTH]; } /** * Clean up after the parse to allow some garbage collection. * Leave around anything that might be useful for queries. */ void cleanupVariables() { errorCount = -1; m_line = -1; m_column = -1; m_dataBuffer = null; m_nameBuffer = null; m_currentElement = null; m_currentElementContent = CONTENT_UNDECLARED; m_sourceType = INPUT_NONE; m_inputStack = null; m_externalEntity = null; m_entityStack = null; } // // The current XML handler interface. // XmlHandler m_handler; // // I/O information. // private Reader m_reader; // current reader private InputStream m_is; // current input stream private int m_line; // current line number private int m_column; // current column number private int m_sourceType; // type of input source private Stack m_inputStack; // stack of input soruces private URLConnection m_externalEntity; // current external entity private int m_encoding; // current character encoding. private int m_currentByteCount; // how many bytes read from current source. // // Maintain a count of errors. // private int errorCount; // // Buffers for decoded but unparsed character input. 
// private final static int READ_BUFFER_MAX = 16384; private char m_readBuffer[]; private int m_rbpos; private int m_rblen; private int m_rboverflow; // overflow character from last data chunk. // // Buffer for undecoded raw byte input. // private byte m_rawReadBuffer[]; // // Buffer for parsed character data. // private static int DATA_BUFFER_INITIAL = 4096; private char m_dataBuffer[]; private int m_dataBufferPos; // // Buffer for parsed names. // private static int NAME_BUFFER_INITIAL = 1024; private char m_nameBuffer[]; private int m_nameBufferPos; // // Hashtables for DTD information on elements, entities, and notations. // private Hashtable m_elementInfo; private Hashtable m_entityInfo; private Hashtable m_notationInfo; // // Element type currently in force. // private String m_currentElement; private int m_currentElementContent; // // Base external identifiers for resolution. // private String m_basePublicId; private String m_baseURI; private Reader m_baseReader; private InputStream m_baseInputStream; // // Stack of entity names, to help detect recursion. // private Stack m_entityStack; // // Are we in a context where PEs are allowed? // private int m_context; // // Symbol table, for internalising names. // private Object m_symbolTable[]; private final static int SYMBOL_TABLE_LENGTH = 1087; // // Hash table of attributes found in current start tag. // private String m_tagAttributes[]; private int m_tagAttributePos; // // Utility flag: have we noticed a CR while reading the last // data chunk? If so, we will have to go back and normalise // CR/LF. // private boolean m_sawCR; }



© 2015 - 2024 Weber Informatics LLC | Privacy Policy