com.caucho.xml.XmlParser Maven / Gradle / Ivy
Show all versions of resin Show documentation
/*
* Copyright (c) 1998-2018 Caucho Technology -- all rights reserved
*
* This file is part of Resin(R) Open Source
*
* Each copy or derived work must preserve the copyright notice and this
* notice unmodified.
*
* Resin Open Source is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Resin Open Source is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
* of NON-INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with Resin Open Source; if not, write to the
* Free SoftwareFoundation, Inc.
* 59 Temple Place, Suite 330
* Boston, MA 02111-1307 USA
*
* @author Scott Ferguson
*/
package com.caucho.xml;
import com.caucho.util.CharBuffer;
import com.caucho.vfs.Path;
import com.caucho.vfs.ReadStream;
import com.caucho.vfs.ReaderWriterStream;
import com.caucho.vfs.Vfs;
import com.caucho.vfs.WriteStream;
import com.caucho.xml.readers.MacroReader;
import com.caucho.xml.readers.Utf16Reader;
import com.caucho.xml.readers.Utf8Reader;
import com.caucho.xml.readers.XmlReader;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.logging.Level;
/**
* A configurable XML parser. Loose versions of XML and HTML are supported
* by changing the Policy object.
*
* Normally, applications will use Xml, LooseXml, Html, or LooseHtml.
*/
public class XmlParser extends AbstractParser {
// Xerces uses the following
// Namespace URIs bound to the reserved xmlns and xml prefixes.
public static final String XMLNS = "http://www.w3.org/2000/xmlns/";
public static final String XML = "http://www.w3.org/XML/1998/namespace";
// Names with synthetic #-prefixed local parts. DOC_NAME is the value
// parseInt() stores in _activeNode while parsing is at the document level.
static final QName DOC_NAME = new QName(null, "#document", null);
static final QName TEXT_NAME = new QName(null, "#text", null);
static final QName JSP_NAME = new QName(null, "#jsp", null);
static final QName WHITESPACE_NAME = new QName(null, "#whitespace", null);
// Attribute name under which parseJspAttribute() records an inline JSP
// expression found inside an element tag.
static final QName JSP_ATTRIBUTE_NAME = new QName("xtp", "jsp-attribute", null);
// Attribute holders; both are re-allocated by init().
QAttributes _attributes;
QAttributes _nullAttributes;
// Set to true by parseDoctype() on entry and back to false before it
// checks for the closing bracket of the declaration.
boolean _inDtd;
// Scratch character buffers; _text and _eltName are re-allocated by init().
CharBuffer _text;
CharBuffer _eltName;
CharBuffer _cb;
CharBuffer _buf = new CharBuffer();
// Filename and line captured by parseNode() whenever no text is pending,
// i.e. the position at which the next text run begins.
String _textFilename;
int _textLine;
// Pending character data: parseNode() inspects _textBuffer[_textLength - 1]
// and treats _textLength == 0 as no pending text.
char []_textBuffer = new char[1024];
int _textLength;
int _textCapacity = _textBuffer.length;
boolean _isIgnorableWhitespace;
// Set by parseNode() around a JSP scriptlet block; false when the block is
// an expression (the character after the percent sign is an equals sign).
boolean _isJspText;
CharBuffer _name = new CharBuffer();
CharBuffer _nameBuffer = new CharBuffer();
// Macro (entity replacement text) state; the index and length are zeroed
// by init().
// NOTE(review): exact buffering semantics are not visible here - confirm.
MacroReader _macro = new MacroReader();
int _macroIndex = 0;
int _macroLength = 0;
char []_macroBuffer;
// Fixed-size arrays of 64 entries plus _elementTop, which init() zeroes.
// NOTE(review): presumably parallel stacks of open elements indexed by
// _elementTop - confirm against the push/pop code.
QName []_elementNames = new QName[64];
NamespaceMap []_namespaces = new NamespaceMap[64];
int []_elementLines = new int[64];
int _elementTop;
// Handed to the policy by parseInt(); deliberately not reset by init().
NamespaceMap _namespaceMap;
ArrayList _attrNames = new ArrayList();
ArrayList _attrValues = new ArrayList();
// The stream being parsed (assigned in parseInt()) and the character
// reader layered on top of it (assigned in parseXMLDeclaration()).
ReadStream _is;
XmlReader _reader;
// External identifiers; parseDoctype() copies them into the DTD after
// calling parseExternalID().
String _extPublicId;
String _extSystemId;
// Name of the node currently being parsed; DOC_NAME at the document level.
QName _activeNode;
QName _topNamespaceNode;
// When true, whitespace normalization in parseNode() drops whitespace
// that arrives while no text is pending.
boolean _isTagStart;
// When true, parseNode() does not pop back to an enclosing reader at end
// of input; parseDoctype() sets it while reading an external DTD.
boolean _stopOnIncludeEnd;
// Checked by parseInt() in strict mode: a document without a top-level
// element is an error.
boolean _hasTopElement;
// Set by parseDoctypeDecl() once DTD declarations are being parsed.
boolean _hasDoctype;
// Cached by init(): true when the policy is an HtmlPolicy.
boolean _isHtml;
// Locator passed to the content handler at the start of parseInt().
Locator _locator = new LocatorImpl(this);
/**
 * Creates a new parser and puts it into its cleared state.
 */
public XmlParser() {
  clear();
}
/**
 * Creates a new parser bound to a parsing policy and a dtd, then puts it
 * into its cleared state.
 *
 * @param policy the parsing policy, handling optional tags.
 * @param dtd the parser's dtd.
 */
XmlParser(Policy policy, QDocumentType dtd) {
  super(policy, dtd);

  clear();
}
/**
 * Initializes the parser state.
 *
 * The superclass is initialized first, then the attribute holders and
 * scratch buffers are re-allocated, every per-document flag, counter and
 * identifier is reset, and finally the policy itself is initialized.
 * The namespace map, the filename and the owner document are deliberately
 * left untouched (their resets are disabled; see jsp/193b).
 */
void init()
{
  super.init();

  // remember whether this parse runs under an HTML policy
  _isHtml = (_policy instanceof HtmlPolicy);

  // fresh attribute holders and scratch buffers
  _attributes = new QAttributes();
  _nullAttributes = new QAttributes();
  _eltName = new CharBuffer();
  _text = new CharBuffer();

  // no pending text, no macro being replayed
  _textLength = 0;
  _isIgnorableWhitespace = true;
  _macroIndex = _macroLength = 0;

  // empty element stack, positioned on the first line
  _elementTop = 0;
  _elementLines[0] = 1;
  _line = 1;

  // no DTD and no identifiers known yet
  _dtd = null;
  _extPublicId = _extSystemId = null;
  _publicId = null;
  _systemId = null;

  // parse-progress flags
  _inDtd = _isTagStart = _stopOnIncludeEnd = false;
  _hasTopElement = _hasDoctype = false;

  _reader = null;

  _policy.init();
}
/**
 * Parses a complete document from a read stream.
 *
 * Missing filename and system id values are defaulted from the stream,
 * the content handler and owner document are created when absent, and the
 * document is then parsed between startDocument() and endDocument()
 * callbacks. When parsing completes normally the stream is closed and its
 * path is registered as a dependency of the returned document; the stream
 * is not closed here when parsing fails with an exception.
 *
 * @param is read stream to parse from.
 *
 * @return The parsed document.
 */
@Override
Document parseInt(ReadStream is)
  throws IOException, SAXException
{
  _is = is;

  // filename: prefer an explicit system id, otherwise the stream's user path
  if (_filename == null) {
    _filename = (_systemId != null) ? _systemId : _is.getUserPath();
  }

  // system id: fall back to the stream URL, mapping the anonymous
  // pseudo-URLs onto the generic "stream" label
  if (_systemId == null) {
    String url = _is.getPath().getURL();
    boolean isPseudoUrl = "null:".equals(url) || "string:".equals(url);

    _systemId = isPseudoUrl ? "stream" : url;
  }

  // xsl/0401: a root NamespaceMap is intentionally not created here, even
  // when the parser is namespace aware.
  _policy.setNamespaceAware(_isNamespaceAware);

  if (_filename == null) {
    _filename = (_systemId != null) ? _systemId : "stream";
  }

  if (_dtd != null) {
    _dtd.setSystemId(_systemId);
  }

  if (_builder != null) {
    boolean isAnonymous
      = "string:".equals(_systemId) || "stream".equals(_systemId);

    if (! isAnonymous) {
      _builder.setSystemId(_systemId);
    }

    _builder.setFilename(_is.getPath().getURL());
  }

  if (_contentHandler == null) {
    _contentHandler = new org.xml.sax.helpers.DefaultHandler();
  }
  _contentHandler.setDocumentLocator(_locator);

  if (_owner == null) {
    _owner = new QDocument();
  }
  if (_defaultEncoding != null) {
    _owner.setAttribute("encoding", _defaultEncoding);
  }

  _activeNode = DOC_NAME;

  _policy.setStream(is);
  _policy.setNamespace(_namespaceMap);

  _contentHandler.startDocument();

  // prolog first, then everything else through the main dispatch loop
  int firstChar = skipWhitespace(parseXMLDeclaration(null));
  parseNode(firstChar, false);

  if (! _hasTopElement && _strictXml) {
    throw error(L.l("XML file has no top-element. All well-formed XML files have a single top-level element."));
  }

  if (_contentHandler != null) {
    _contentHandler.endDocument();
  }

  // detach the document from the parser before handing it out
  QDocument document = _owner;
  _owner = null;

  // ioc/23l0: fetch the path before the stream is closed
  Path sourcePath = is.getPath();
  is.close();
  document.addDepend(sourcePath);

  return document;
}
/**
 * The main dispatch loop. Reads characters one at a time and dispatches on
 * each until end of input, the 0xffff end-of-text marker, or (in the short
 * form only) the terminating slash of the node.
 *
 * Ordinary characters are accumulated as pending text; markup characters
 * flush the pending text and hand off to the specific parse routine.
 *
 * @param ch the next character
 * @param special true for the short form, <foo/bar/>
 */
private void parseNode(int ch, boolean special)
throws IOException, SAXException
{
//boolean isTop = node instanceof QDocument;
_text.clear();
loop:
while (true) {
// no text pending: record where the next text run starts
if (_textLength == 0) {
_textFilename = getFilename();
_textLine = getLine();
}
switch (ch) {
case -1:
// end of the current reader: flush whatever text is pending
if (_textLength != 0)
appendText();
// an enclosing reader exists and includes are not being held open:
// pop the include and keep parsing from the reader that is current
// afterwards (popInclude may leave _reader null, hence the check)
if (! _stopOnIncludeEnd && _reader.getNext() != null) {
popInclude();
if (_reader != null)
parseNode(_reader.read(), special);
return;
}
// real end of input: run close-tag handling with an empty tag name
closeTag("");
return;
case ' ': case '\t': case '\n': case '\r':
// without normalization whitespace is kept verbatim; with it, runs
// collapse to a single space and whitespace arriving with no text
// pending is dropped when _isTagStart is set
if (! _normalizeWhitespace)
addText((char) ch);
else if (_textLength == 0) {
if (! _isTagStart)
addText(' ');
}
else if (_textBuffer[_textLength - 1] != ' ') {
addText(' ');
}
ch = _reader.read();
break;
case 0xffff:
// marker for end of text for serialization
return;
default:
addText((char) ch);
ch = _reader.read();
break;
case '/':
// outside the short form a slash is ordinary text
if (! special) {
addText((char) ch);
ch = _reader.read();
continue;
}
// short form: a slash followed by the closing bracket, or by end of
// input, ends the node; any other slash is kept as text
ch = _reader.read();
if (ch == '>' || ch == -1) {
appendText();
popNode();
return;
}
addText('/');
break;
case '&':
ch = parseEntityReference();
break;
case '<':
boolean endTag = false;
ch = _reader.read();
if (ch == '/' && ! special) {
// end tag: drop the single trailing space left by normalization
// before the pending text is flushed
if (_normalizeWhitespace &&
_textLength > 0 && _textBuffer[_textLength - 1] == ' ') {
_textLength--;
}
appendText();
ch = _reader.parseName(_name, _reader.read());
if (ch != '>') {
// XXX: Hack for Java PetStore
while (XmlChar.isWhitespace(ch))
ch = _reader.read();
if (ch != '>')
throw error(L.l("`{0}>' expected `>' at {1}. Closing tags must close immediately after the tag name.", _name, badChar(ch)));
}
closeTag(_policy.getName(_name).getName());
ch = _reader.read();
}
// NOTE(review): from here to the end of this case the text appears to
// have been damaged in extraction (branch headers are missing and one
// string literal is unterminated); restore it from the upstream source
// before relying on these lines.
// element: ...
else if (XmlChar.isNameStart(ch)) {
appendText();
parseElement(ch);
ch = _reader.read();
}
//
if ((ch = _reader.read()) == '[') {
parseCdata();
ch = _reader.read();
}
//
else if (ch == '-') {
parseComment();
ch = _reader.read();
}
else if (XmlChar.isNameStart(ch)) {
appendText();
ch = _reader.parseName(_name, ch);
String declName = _name.toString();
if (declName.equals("DOCTYPE")) {
parseDoctype(ch);
if (_contentHandler instanceof DOMBuilder)
((DOMBuilder) _contentHandler).dtd(_dtd);
ch = _reader.read();
} else if (_forgiving && declName.equalsIgnoreCase("doctype")) {
parseDoctype(ch);
if (_contentHandler instanceof DOMBuilder)
((DOMBuilder) _contentHandler).dtd(_dtd);
ch = _reader.read();
} else
throw error(L.l("expected `
else if (ch == '?') {
ch = parsePI();
}
else if (_strictXml) {
throw error(L.l("expected tag name after `<' at {0}. Open tag names must immediately follow the open brace like `'", badChar(ch)));
}
// implicit
else if (_isJsp && ch == '%') {
ch = _reader.read();
appendText();
_isJspText = ch != '=';
addText("<%");
while (ch >= 0) {
if (ch == '%') {
ch = _reader.read();
if (ch == '>') {
addText("%>");
ch = _reader.read();
break;
}
else
addText('%');
}
else {
addText((char) ch);
ch = _reader.read();
}
}
appendText();
_isJspText = false;
}
else {
addText('<');
}
}
}
}
/**
* Parses the <!DOCTYPE> declaration.
*/
private void parseDoctype(int ch)
throws IOException, SAXException
{
if (_activeNode != DOC_NAME)
throw error(L.l(" declaration."));
_inDtd = true;
ch = skipWhitespace(ch);
ch = _reader.parseName(_nameBuffer, ch);
String name = _nameBuffer.toString();
ch = skipWhitespace(ch);
if (_dtd == null)
_dtd = new QDocumentType(name);
_dtd.setName(name);
if (XmlChar.isNameStart(ch)) {
ch = parseExternalID(ch);
ch = skipWhitespace(ch);
_dtd._publicId = _extPublicId;
_dtd._systemId = _extSystemId;
}
if (_dtd._systemId != null && ! _dtd._systemId.equals("")) {
InputStream is = null;
unread(ch);
XmlReader oldReader = _reader;
boolean hasInclude = false;
try {
pushInclude(_extPublicId, _extSystemId);
hasInclude = true;
} catch (RemoteURLException e) {
log.finest(e.toString());
} catch (Exception e) {
if (log.isLoggable(Level.FINEST))
log.log(Level.FINER, e.toString(), e);
else
log.finer(e.toString());
}
if (hasInclude) {
_stopOnIncludeEnd = true;
try {
ch = parseDoctypeDecl(_dtd);
} catch (XmlParseException e) {
if (_extSystemId != null &&
_extSystemId.startsWith("http")) {
log.log(Level.FINE, e.toString(), e);
}
else
throw e;
}
_stopOnIncludeEnd = false;
while (_reader != null && _reader != oldReader)
popInclude();
}
if (_reader != null)
ch = skipWhitespace(read());
}
if (ch == '[')
ch = parseDoctypeDecl(_dtd);
ch = skipWhitespace(ch);
_inDtd = false;
if (ch != '>')
throw error(L.l("expected `>' in
* dtd-item ::= <!ELEMENT ... |
* <!ATTLIST ... |
* <!NOTATION ... |
* <!ENTITY ... |
* <!-- comment |
* <? pi |
* %pe-ref;
*
*
* @return the next character.
*/
private int parseDoctypeDecl(QDocumentType doctype)
throws IOException, SAXException
{
_hasDoctype = true;
int ch = 0;
for (ch = skipWhitespace(read());
ch >= 0 && ch != ']';
ch = skipWhitespace(read())) {
if (ch == '<') {
if ((ch = read()) == '!') {
if (XmlChar.isNameStart(ch = read())) {
ch = _reader.parseName(_text, ch);
String name = _text.toString();
if (name.equals("ELEMENT"))
parseElementDecl(doctype);
else if (name.equals("ATTLIST"))
parseAttlistDecl(doctype);
else if (name.equals("NOTATION"))
parseNotationDecl(doctype);
else if (name.equals("ENTITY"))
parseEntityDecl(doctype);
else
throw error("unknown declaration `" + name + "'");
}
else if (ch == '-')
parseComment();
else if (ch == '[') {
ch = _reader.parseName(_text, read());
String name = _text.toString();
if (name.equals("IGNORE")) {
parseIgnore();
}
else if (name.equals("INCLUDE")) {
parseIgnore();
}
else
throw error("unknown declaration `" + name + "'");
}
}
else if (ch == '?') {
parsePI();
}
else
throw error(L.l("expected markup at {0}", badChar(ch)));
}
else if (ch == '%') {
ch = _reader.parseName(_buf, read());
if (ch != ';')
throw error(L.l("`%{0};' expects `;' at {1}. Parameter entities have a `%name;' syntax.", _buf, badChar(ch)));
addPEReference(_text, _buf.toString());
}
else {
throw error(L.l("expected '<' at {0}", badChar(ch)));
}
_text.clear();
}
_text.clear();
return read();
}
/**
* Parses an element.
*
* @param ch the current character
*/
private void parseElement(int ch)
throws IOException, SAXException
{
ch = _reader.parseName(_eltName, ch);
NamespaceMap oldNamespace = _namespaceMap;
if (ch != '>' && ch != '/')
ch = parseAttributes(ch, true);
else
_attributes.clear();
QName qname = _policy.getName(_eltName);
if (_isValidating && _dtd != null) {
QElementDef elementDef = _dtd.getElement(qname.getName());
if (elementDef != null)
elementDef.fillDefaults(_attributes);
}
if (ch == '/') {
// empty tag: * <options> * <option name="foo" <%= test.isSelected("foo") %>/> * </options> ** * @param element the parent element * * @return the next character to read. */ private int parseJspAttribute(boolean isElement) throws IOException, XmlParseException { int ch = _reader.read(); if (ch != '%') throw error(L.l("unexpected char `{0}' in element", "%")); ch = _reader.read(); if (ch != '=') throw error(L.l("unexpected char `{0}' in element", "=")); _text.clear(); ch = _reader.read(); while (ch >= 0) { if (ch == '%') { ch = _reader.read(); if (ch == '>') { ch = _reader.read(); break; } _text.append((char) ch); } else { _text.append((char) ch); ch = _reader.read(); } } String value = _text.toString(); if (isElement) _attributes.add(JSP_ATTRIBUTE_NAME, value); return ch; } /** * Handle processing at a close tag. For strict XML, this will normally * just change the current node to its parent, but HTML has a more * complicated policy. */ private void closeTag(String endTagName) throws IOException, SAXException { while (_activeNode != null && _activeNode != DOC_NAME) { switch (_policy.elementCloseAction(this, _activeNode, endTagName)) { case Policy.POP: //if (dbg.canWrite()) // dbg.println("" + activeNode.getNodeName() + ">"); popNode(); return; case Policy.POP_AND_LOOP: //if (dbg.canWrite()) // dbg.println("" + activeNode.getNodeName() + ">"); popNode(); break; case Policy.IGNORE: return; default: throw new RuntimeException(); } } if (! _extraForgiving && endTagName != null && ! endTagName.equals("")) throw error(L.l("Unexpected end tag `{0}>' at top-level. All open tags have already been closed.", endTagName)); } /** * Handles processing of the resin:include tag. */ private void handleResinInclude() throws IOException, SAXException { String filename = _attributes.getValue("path"); if (filename == null || filename.equals("")) filename = _attributes.getValue("href"); if (filename.equals("")) throw error(L.l("
* er ::= d+; * ::= &name; **/ private int parseEntityReference() throws IOException, SAXException { int ch; ch = _reader.read(); // character reference if (ch == '#') { addText((char) parseCharacterReference()); return _reader.read(); } // entity reference else if (XmlChar.isNameStart(ch)) { ch = _reader.parseName(_buf, ch); if (ch != ';' && _strictXml) throw error(L.l("`&{0};' expected `;' at {0}. Entity references have a `&name;' syntax.", _buf, badChar(ch))); else if (ch != ';') { addText('&'); addText(_buf.toString()); return ch; } addEntityReference(_buf.toString()); ch = _reader.read(); return ch; } else if (_strictXml) { throw error(L.l("expected name at {0}", badChar(ch))); } else { addText('&'); return ch; } } private int parseCharacterReference() throws IOException, SAXException { int ch = _reader.read(); int radix = 10; if (ch == 'x') { radix = 16; ch = _reader.read(); } int value = 0; for (; ch != ';'; ch = _reader.read()) { if (ch >= '0' && ch <= '9') value = radix * value + ch - '0'; else if (radix == 16 && ch >= 'a' && ch <= 'f') value = radix * value + ch - 'a' + 10; else if (radix == 16 && ch >= 'A' && ch <= 'F') value = radix * value + ch - 'A' + 10; else throw error(L.l("malformed entity ref at {0}", badChar(ch))); } if (value > 0xffff) throw error(L.l("malformed entity ref at {0}", "" + value)); // xml/0072 if (_strictCharacters && ! isChar(value)) throw error(L.l("illegal character ref at {0}", badChar(value))); return value; } /** * Looks up a named entity reference, filling the text. */ private void addEntityReference(String name) throws IOException, SAXException { boolean expand = ! _entitiesAsText || _hasDoctype || ! _switchToXml; // XXX: not quite the right logic. There should be a soft expandEntities if (! expand) { addText("&" + name + ";"); return; } int ch = _entities.getEntity(name); if (ch >= 0 && ch <= 0xffff) { addText((char) ch); return; } QEntity entity = _dtd == null ? null : _dtd.getEntity(name); if (! 
_expandEntities) { addText("&" + name + ";"); return; } if (entity == null && (_dtd == null || _dtd.getName() == null || ! _dtd.isExternal())) { if (_strictXml) throw error(L.l("`&{0};' is an unknown entity. XML predefines only `<', `&', `>', `'' and `"'. All other entities must be defined in an <!ENTITY> definition in the DTD.", name)); else { if (expand && _contentHandler instanceof DOMBuilder) { appendText(); ((DOMBuilder) _contentHandler).entityReference(name); } else addText("&" + name + ";"); } } else if (entity != null) { if (expand && entity._isSpecial && entity._value != null) addText(entity._value); else if (entity.getSystemId() != null) { if (pushSystemEntity(entity)) { } /* XXX:?? else if (strictXml) { throw error(L.l("can't open external entity at `&{0};'", name)); } */ else if (_contentHandler instanceof DOMBuilder) { appendText(); ((DOMBuilder) _contentHandler).entityReference(name); } else addText("&" + name + ";"); } else if (expand && entity._value != null) setMacro(entity._value); else addText("&" + name + ";"); } else { if (expand && _contentHandler instanceof DOMBuilder) { appendText(); ((DOMBuilder) _contentHandler).entityReference(name); } else // XXX: error? 
addText("&" + name + ";"); } } private boolean pushSystemEntity(QEntity entity) throws IOException, SAXException { String publicId = entity.getPublicId(); String systemId = entity.getSystemId(); String value = null; InputSource source = null; ReadStream is = null; if (_entityResolver != null) source = _entityResolver.resolveEntity(publicId, systemId); if (source != null && source.getByteStream() != null) is = Vfs.openRead(source.getByteStream()); else if (source != null && source.getCharacterStream() != null) is = Vfs.openRead(source.getCharacterStream()); else if (source != null && source.getSystemId() != null && _searchPath.lookup(source.getSystemId()).isFile()) { _owner.addDepend(_searchPath.lookup(source.getSystemId())); is = _searchPath.lookup(source.getSystemId()).openRead(); } else if (systemId != null && ! systemId.equals("")) { String path = systemId; if (path.startsWith("file:")) path = path.substring(5); if (_searchPath != null && _searchPath.lookup(path).isFile()) { _owner.addDepend(_searchPath.lookup(path)); is = _searchPath.lookup(path).openRead(); } } if (is == null) return false; _filename = systemId; _systemId = systemId; Path oldSearchPath = _searchPath; Path path = is.getPath(); if (path != null) { _owner.addDepend(path); if (_searchPath != null) { _searchPath = path.getParent(); _reader.setSearchPath(oldSearchPath); } } _is = is; _line = 1; XmlReader oldReader = _reader; _reader = null; int ch = parseXMLDeclaration(oldReader); unread(ch); return true; } /** * Parses an attribute value. * *
* value ::= '[^']*' * ::= "[^"]*" * ::= [^ />]* ** * @param value the CharBuffer which will contain the value. * @param ch the next character from the input stream. * @param isGeneral true if general entities are allowed. * * @return the following character from the input stream */ private int parseValue(CharBuffer value, int ch, boolean isGeneral) throws IOException, SAXException { int end = ch; value.clear(); if (end == '\'' || end == '"') ch = _reader.read(); else if (_strictAttributes) { value.append((char) end); for (ch = _reader.read(); ch >= 0 && XmlChar.isNameChar(ch); ch = _reader.read()) value.append((char) ch); throw error(L.l("XML attribute value must be quoted at `{0}'. XML attribute syntax is either attr=\"value\" or attr='value'.", value)); } else end = 0; while (ch != -1 && (end != 0 && ch != end || end == 0 && isAttributeChar(ch))) { if (end == 0 && ch == '/') { ch = _reader.read(); if (! isWhitespace(ch) && ch != '>') { value.append('/'); value.append((char) ch); } else { unread(ch); return '/'; } } else if (ch == '&' && ! _entitiesAsText) { if ((ch = _reader.read()) == '#') value.append((char) parseCharacterReference()); else if (! isGeneral) { value.append('&'); value.append((char) ch); } else if (XmlChar.isNameStart(ch)) { ch = _reader.parseName(_buf, ch); String name = _buf.toString(); if (ch != ';' && _strictXml) throw error(L.l("expected `{0}' at {1}", ";", badChar(ch))); else if (ch != ';') { value.append('&'); value.append(name); continue; } else { int lookup = _entities.getEntity(name); if (lookup >= 0 && lookup <= 0xffff) { ch = _reader.read(); value.append((char) lookup); continue; } QEntity entity = _dtd == null ? null : _dtd.getEntity(name); if (entity != null && entity._value != null) setMacroAttr(entity._value); else if (_strictXml) throw error(L.l("expected local reference at `&{0};'", name)); else { value.append('&'); value.append(name); value.append(';'); } } } } else if (ch == '%' && ! isGeneral) { ch = _reader.read(); if (! 
XmlChar.isNameStart(ch)) { value.append('%'); continue; } else { ch = _reader.parseName(_buf, ch); if (ch != ';') throw error(L.l("expected `{0}' at {1}", ";", badChar(ch))); else addPEReference(value, _buf.toString()); } } else if (ch == '<' && _isJsp) { value.append('<'); ch = _reader.read(); if (ch != '%') continue; value.append('%'); ch = _reader.read(); while (ch >= 0) { if (ch == '%') { ch = _reader.read(); if (ch == '>') { value.append("%>"); break; } else value.append('%'); } else { value.append((char) ch); ch = _reader.read(); } } } else if (isGeneral) { if (ch == '\r') { ch = _reader.read(); if (ch != '\n') { value.append('\n'); continue; } } value.append((char) ch); } else if (ch == '\r') { value.append(' '); if ((ch = _reader.read()) != '\n') continue; } else if (ch == '\n') value.append(' '); else value.append((char) ch); ch = _reader.read(); } if (end != 0) ch = _reader.read(); return ch; } private boolean isAttributeChar(int ch) { switch (ch) { case ' ': case '\t': case '\n': case '\r': return false; case '<': case '>': case '\'':case '"': case '=': return false; default: return true; } } private void parsePcdata(QNode node) throws IOException, SAXException { int ch; String tail = "" + node.getNodeName() + ">"; _text.clear(); ch = _reader.read(); if (ch == '\n') ch = _reader.read(); for (; ch != -1; ch = _reader.read()) { addText((char) ch); if (_text.endsWith(tail)) { _text.setLength(_text.length() - tail.length()); if (_text.length() > 1 && _text.charAt(_text.length() - 1) == '\n') _text.setLength(_text.length() - 1); appendText(); return; } } throw error("bad pcdata"); } private int parseXMLDeclaration(XmlReader oldReader) throws IOException, SAXException { int startOffset = _is.getOffset(); boolean isEBCDIC = false; int ch = _is.read(); XmlReader reader = null; // utf-16 starts with \xfe \xff if (ch == 0xfe) { ch = _is.read(); if (ch == 0xff) { _owner.setAttribute("encoding", "UTF-16"); _is.setEncoding("utf-16"); reader = new Utf16Reader(this, 
_is); ch = reader.read(); } } // utf-16 rev starts with \xff \xfe else if (ch == 0xff) { ch = _is.read(); if (ch == 0xfe) { _owner.setAttribute("encoding", "UTF-16"); _is.setEncoding("utf-16"); reader = new Utf16Reader(this, _is); ((Utf16Reader) reader).setReverse(true); ch = reader.read(); } } // utf-16 can also start with \x00 < else if (ch == 0x00) { ch = _is.read(); _owner.setAttribute("encoding", "UTF-16"); _is.setEncoding("utf-16"); reader = new Utf16Reader(this, _is); } // utf-8 BOM is \xef \xbb \xbf else if (ch == 0xef) { ch = _is.read(); if (ch == 0xbb) { ch = _is.read(); if (ch == 0xbf) { ch = _is.read(); _owner.setAttribute("encoding", "UTF-8"); _is.setEncoding("utf-8"); reader = new Utf8Reader(this, _is); } } } else if (ch == 0x4c) { // ebcdic // xml/00l1 _is.unread(); // _is.setEncoding("cp037"); _is.setEncoding("cp500"); isEBCDIC = true; reader = new XmlReader(this, _is); ch = reader.read(); } else { int ch2 = _is.read(); if (ch2 == 0x00) { _owner.setAttribute("encoding", "UTF-16LE"); _is.setEncoding("utf-16le"); reader = new Utf16Reader(this, _is); ((Utf16Reader) reader).setReverse(true); } else if (ch2 > 0) _is.unread(); } if (reader != null && reader != oldReader) { } else if (_policy instanceof HtmlPolicy || _is.getSource() instanceof ReaderWriterStream) { reader = new XmlReader(this, _is); } else { reader = new Utf8Reader(this, _is); } if (ch == '\n') reader.setLine(2); reader.setSystemId(_systemId); if (_systemId == null) reader.setSystemId(_filename); reader.setFilename(_filename); reader.setPublicId(_publicId); reader.setNext(oldReader); _reader = reader; /* XXX: this might be too strict. */ /* if (! 
strictXml) { for (; XmlChar.isWhitespace(ch); ch = reader.read()) { } } */ if (ch != '<') return ch; if (parseXMLDecl(_reader) && isEBCDIC) { // EBCDIC requires a re-read _is.setOffset(startOffset); ch = _reader.read(); if (ch != '<') throw new IllegalStateException(); parseXMLDecl(_reader); } return _reader.read(); } private boolean parseXMLDecl(XmlReader reader) throws IOException, SAXException { int ch = reader.read(); if (ch != '?') { unread((char) ch); unread('<'); return false; } ch = _reader.read(); if (! XmlChar.isNameStart(ch)) throw error(L.l("expected name after '' at {0}. Processing instructions expect a name like ", badChar(ch))); ch = _reader.parseName(_text, ch); String piName = _text.toString(); if (! piName.equals("xml")) { ch = parsePITail(piName, ch); unread(ch); return false; } if (_switchToXml && _activeNode == DOC_NAME && ! _inDtd) { _policy = new XmlPolicy(); } ch = parseAttributes(ch, false); if (ch != '?') throw error(L.l("expected `?' at {0}. Processing instructions end with `?>' like ", badChar(ch))); if ((ch = _reader.read()) != '>') throw error(L.l("expected `>' at {0}. Processing instructions end with `?>' like ", ">", badChar(ch))); for (int i = 0; i < _attributes.getLength(); i++) { QName name = _attributes.getName(i); String value = _attributes.getValue(i); if (_owner != null) _owner.setAttribute(name.getName(), value); if (name.getName().equals("encoding")) { // xml/00hb // && ! _inDtd) { String encoding = value; if (! _isStaticEncoding && ! encoding.equalsIgnoreCase("UTF-8") && ! encoding.equalsIgnoreCase("UTF-16") && ! (_is.getSource() instanceof ReaderWriterStream)) { _is.setEncoding(encoding); XmlReader oldReader = _reader; _reader = new XmlReader(this, _is); // _reader.setNext(oldReader); _reader.setLine(oldReader.getLine()); _reader.setSystemId(_filename); _reader.setPublicId(null); } } } return true; } private int parsePI() throws IOException, SAXException { int ch; appendText(); ch = _reader.read(); if (! 
XmlChar.isNameStart(ch)) throw error(L.l("expected name after '' at {0}. Processing instructions expect a name like ", badChar(ch))); ch = _reader.parseName(_text, ch); String piName = _text.toString(); if (! piName.equals("xml")) return parsePITail(piName, ch); else if (_switchToXml && _activeNode == DOC_NAME && ! _inDtd) { _policy = new XmlPolicy(); return parsePITail(piName, ch); } else { throw error(L.l(" occurs after content. The prolog must be at the document start.")); } } private int parsePITail(String piName, int ch) throws IOException, SAXException { ch = skipWhitespace(ch); _text.clear(); while (ch != -1) { if (ch == '?') { if ((ch = _reader.read()) == '>') break; else _text.append('?'); } else { _text.append((char) ch); ch = _reader.read(); } } if (_inDtd) { QProcessingInstruction pi; pi = new QProcessingInstruction(piName, _text.toString()); pi._owner = _dtd._owner; _dtd.appendChild(pi); } else _contentHandler.processingInstruction(piName, _text.toString()); return _reader.read(); } /** * Parses a comment. The "<!--" has already been read. */ private void parseComment() throws IOException, SAXException { if (! _skipComments) appendText(); int ch = _reader.read(); if (ch != '-') throw error(L.l("expected comment at {0}", badChar(ch))); ch = _reader.read(); if (! _skipComments) _buf.clear(); comment: while (ch != -1) { if (ch == '-') { ch = _reader.read(); while (ch == '-') { if ((ch = _reader.read()) == '>') break comment; else if (_strictComments) throw error(L.l("XML forbids `--' in comments")); else if (ch == '-') { if (! _skipComments) _buf.append('-'); } else { if (! _skipComments) _buf.append("--"); break; } } _buf.append('-'); } else if (! 
XmlChar.isChar(ch)) { throw error(L.l("bad character {0}", hex(ch))); } else { _buf.append((char) ch); ch = _reader.read(); } } if (_inDtd) { QComment comment = new QComment(_buf.toString()); comment._owner = _dtd._owner; _dtd.appendChild(comment); } else if (_skipComments) { } else if (_contentHandler instanceof XMLWriter && ! _skipComments) { ((XMLWriter) _contentHandler).comment(_buf.toString()); _isIgnorableWhitespace = true; } else if (_lexicalHandler != null) { _lexicalHandler.comment(_buf.getBuffer(), 0, _buf.getLength()); _isIgnorableWhitespace = true; } } /** * Parses the contents of a cdata section. * *
* cdata ::= <![CDATA[ ... ]]> **/ private void parseCdata() throws IOException, SAXException { int ch; if (_forgiving) { if ((ch = _reader.read()) != 'C') { appendText("') break cdata; else if (ch == ']') addText(']'); else { addText(']'); break; } } addText(']'); } else if (_strictCharacters && ! isChar(ch)) { throw error(L.l("expected character in cdata at {0}", badChar(ch))); } else { addText((char) ch); ch = _reader.read(); } } if (_lexicalHandler != null) { appendText(); _lexicalHandler.endCDATA(); } else if (! _isCoalescing) appendText(); } /** * Ignores content to the ']]>' */ private void parseIgnore() throws IOException, SAXException { int ch = read(); while (ch >= 0) { if (ch != ']') { ch = read(); } else if ((ch = read()) != ']') { } else if ((ch = read()) == '>') return; } } private int parseContentSpec(QElementDef def, int ch) throws IOException, SAXException { ch = expandPE(ch); if (XmlChar.isNameStart(ch)) { ch = _reader.parseName(_text, ch); String name = _text.toString(); if (name.equals("EMPTY")) { def._content = "EMPTY"; return ch; } else if (name.equals("ANY")) { def._content = "ANY"; return ch; } else throw error(L.l("expected EMPTY or ANY at `{0}'", name)); } else if (ch != '(') { throw error(L.l("expected grammar definition starting with '(' at {0}. definitions have the syntax ", badChar(ch))); } else { QContentParticle cp = new QContentParticle(); def._content = cp; return parseContentParticle(cp, true); } } /** * Parses a content-particle, i.e. a grammer particle in the DTD * regexp. 
*/ private int parseContentParticle(QContentParticle cp, boolean isTop) throws IOException, SAXException { boolean hasCdata = false; cp._separator = 0; cp._repeat = 0; int ch; ch = expandPE(_reader.read()); for (; ch != -1; ch = expandPE(ch)) { if (ch == '(') { QContentParticle child = new QContentParticle(); cp.addChild(child); ch = parseContentParticle(child, false); } else if (XmlChar.isNameStart(ch)) { ch = _reader.parseName(_text, ch); cp.addChild(_text.toString()); } else if (ch == '#') { ch = _reader.parseName(_text, _reader.read()); String name = _text.toString(); if (_strictXml && cp._children.size() != 0) throw error(L.l("`#{0}' must occur first", name)); if (_strictXml && ! isTop) throw error(L.l("`#{0}' may only occur at top level", name)); if (name.equals("PCDATA")) cp.addChild("#PCDATA"); else throw error(L.l("illegal content particle at `#{0}'", name)); hasCdata = true; } else throw error(L.l("expected content particle at {0}", badChar(ch))); ch = expandPE(ch); if (ch == '?' || ch == '*' || ch == '+') { Object child = cp.getChild(cp.getChildSize() - 1); if (child instanceof QContentParticle) { QContentParticle cpChild = (QContentParticle) child; cpChild._repeat = ch; } else { QContentParticle cpChild = new QContentParticle(); cpChild.addChild(child); cpChild._repeat = ch; cp.setChild(cp.getChildSize() - 1, cpChild); } ch = expandPE(_reader.read()); } if (ch == ')') break; else if (cp._separator == 0) { if (ch == '|') cp._separator = ch; else if (hasCdata && _strictXml) throw error(L.l("#PCDATA must be separated by `|' at {0}", badChar(ch))); else if (ch == ',') cp._separator = ch; else if (! 
_strictXml && ch =='&') cp._separator = ch; else throw error(L.l("expected separator at {0}", badChar(ch))); ch = _reader.read(); } else if (ch != cp._separator) throw error(L.l("expected `{0}' at {1}", "" + (char) cp._separator, badChar(ch))); else ch = _reader.read(); } ch = expandPE(_reader.read()); if (_strictXml && hasCdata && (ch == '+' || ch == '?')) throw error(L.l("pcdata clause can not have {0}", badChar(ch))); else if (ch == '*' || ch == '+' || ch == '?') { cp._repeat = ch; return _reader.read(); } else return ch; } private int expandPE(int ch) throws IOException, SAXException { ch = skipWhitespace(ch); while (ch == '%') { parsePEReference(); ch = skipWhitespace(_reader.read()); } return ch; } /** * Parses a PE reference %foo; and inserts the macro text to the input * stream. */ private void parsePEReference() throws IOException, SAXException { int ch = _reader.parseName(_buf, _reader.read()); if (ch != ';') throw error(L.l("`%{0};' expects `;' at {1}. Parameter entities have a `%name;' syntax.", _buf, badChar(ch))); addPEReference(_text, _buf.toString()); } /** * Expands the macro value of a PE reference. */ private void addPEReference(CharBuffer value, String name) throws IOException, SAXException { QEntity entity = _dtd.getParameterEntity(name); if (entity == null && ! _dtd.isExternal()) throw error(L.l("`%{0};' is an unknown parameter entity. 
Parameter entities must be defined in an declaration before use.", name)); else if (entity != null && entity._value != null) { setMacro(entity._value); } else if (entity != null && entity.getSystemId() != null) { pushInclude(entity.getPublicId(), entity.getSystemId()); } else { value.append("%"); value.append(name); value.append(";"); } } /** * */ private void parseElementDecl(QDocumentType doctype) throws IOException, SAXException { int ch = skipWhitespace(_reader.read()); ch = _reader.parseName(_text, ch); String name = _text.toString(); ch = skipWhitespace(ch); QElementDef def = _dtd.addElement(name); def.setLocation(getSystemId(), getFilename(), getLine(), getColumn()); boolean needsStartTag = true; boolean needsEndTag = true; if (_optionalTags && (ch == 'O' || ch == '-')) { needsStartTag = ch == '-'; ch = skipWhitespace(ch); if (ch == '0') needsEndTag = false; else if (ch == '-') needsEndTag = true; else throw error(L.l("unknown short tag")); } ch = parseContentSpec(def, ch); ch = skipWhitespace(ch); if (ch != '>') throw error(L.l("`' at {0}", badChar(ch))); } private static String toAttrDefault(CharBuffer text) { for (int i = 0; i < text.length(); i++) { int ch = text.charAt(i); if (ch == '"') { text.delete(i, i + 1); text.insert(i, """); i--; } else if (ch == '\'') { text.delete(i, i + 1); text.insert(i, "'"); i--; } } return text.toString(); } /** * */ private void parseAttlistDecl(QDocumentType doctype) throws IOException, SAXException { int ch = skipWhitespace(_reader.read()); ch = _reader.parseName(_text, ch); String name = _text.toString(); ch = skipWhitespace(ch); QElementDef def = _dtd.addElement(name); while (XmlChar.isNameStart((ch = expandPE(ch)))) { ch = _reader.parseName(_text, ch); String attrName = _text.toString(); String attrType = null; ArrayList