org.apache.xerces.impl.XML11DocumentScannerImpl Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.xerces.impl;
import java.io.IOException;
import org.apache.xerces.impl.msg.XMLMessageFormatter;
import org.apache.xerces.util.XML11Char;
import org.apache.xerces.util.XMLChar;
import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
/**
* This class is responsible for scanning XML document structure
* and content. The scanner acts as the source for the document
* information which is communicated to the document handler.
*
* This component requires the following features and properties from the
* component manager that uses it:
*
* - http://xml.org/sax/features/namespaces
* - http://xml.org/sax/features/validation
* - http://apache.org/xml/features/nonvalidating/load-external-dtd
* - http://apache.org/xml/features/scanner/notify-char-refs
* - http://apache.org/xml/features/scanner/notify-builtin-refs
* - http://apache.org/xml/properties/internal/symbol-table
* - http://apache.org/xml/properties/internal/error-reporter
* - http://apache.org/xml/properties/internal/entity-manager
* - http://apache.org/xml/properties/internal/dtd-scanner
*
*
* @xerces.internal
*
* @author Glenn Marcy, IBM
* @author Andy Clark, IBM
* @author Arnaud Le Hors, IBM
* @author Eric Ye, IBM
*
* @version $Id: XML11DocumentScannerImpl.java 572055 2007-09-02 17:55:43Z mrglavas $
*/
public class XML11DocumentScannerImpl
extends XMLDocumentScannerImpl {
/** String. */
private final XMLString fString = new XMLString();
/** String buffer. */
private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer();
private final XMLStringBuffer fStringBuffer3 = new XMLStringBuffer();
//
// Constructors
//
/** Default constructor. */
public XML11DocumentScannerImpl() {super();} // ()
//
// overridden methods
//
// XMLDocumentFragmentImpl methods
/**
* Scans element content.
*
* @return Returns the next character on the stream.
*/
protected int scanContent() throws IOException, XNIException {
XMLString content = fString;
int c = fEntityScanner.scanContent(content);
if (c == '\r' || c == 0x85 || c == 0x2028) {
// happens when there is the character reference
// but scanContent doesn't do entity expansions...
// is this *really* necessary??? - NG
fEntityScanner.scanChar();
fStringBuffer.clear();
fStringBuffer.append(fString);
fStringBuffer.append((char)c);
content = fStringBuffer;
c = -1;
}
if (fDocumentHandler != null && content.length > 0) {
fDocumentHandler.characters(content, null);
}
if (c == ']' && fString.length == 0) {
fStringBuffer.clear();
fStringBuffer.append((char)fEntityScanner.scanChar());
// remember where we are in case we get an endEntity before we
// could flush the buffer out - this happens when we're parsing an
// entity which ends with a ]
fInScanContent = true;
//
// We work on a single character basis to handle cases such as:
// ']]]>' which we might otherwise miss.
//
if (fEntityScanner.skipChar(']')) {
fStringBuffer.append(']');
while (fEntityScanner.skipChar(']')) {
fStringBuffer.append(']');
}
if (fEntityScanner.skipChar('>')) {
reportFatalError("CDEndInContent", null);
}
}
if (fDocumentHandler != null && fStringBuffer.length != 0) {
fDocumentHandler.characters(fStringBuffer, null);
}
fInScanContent = false;
c = -1;
}
return c;
} // scanContent():int
/**
* Scans an attribute value and normalizes whitespace converting all
* whitespace characters to space characters.
*
* [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
*
* @param value The XMLString to fill in with the value.
* @param nonNormalizedValue The XMLString to fill in with the
* non-normalized value.
* @param atName The name of the attribute being parsed (for error msgs).
* @param checkEntities true if undeclared entities should be reported as VC violation,
* false if undeclared entities should be reported as WFC violation.
* @param eleName The name of element to which this attribute belongs.
*
* @return true if the non-normalized and normalized value are the same
*
* Note: This method uses fStringBuffer2, anything in it
* at the time of calling is lost.
**/
protected boolean scanAttributeValue(XMLString value,
XMLString nonNormalizedValue,
String atName,
boolean checkEntities,String eleName)
throws IOException, XNIException
{
// quote
int quote = fEntityScanner.peekChar();
if (quote != '\'' && quote != '"') {
reportFatalError("OpenQuoteExpected", new Object[]{eleName,atName});
}
fEntityScanner.scanChar();
int entityDepth = fEntityDepth;
int c = fEntityScanner.scanLiteral(quote, value);
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** scanLiteral -> \""
+ value.toString() + "\"");
}
int fromIndex = 0;
if (c == quote && (fromIndex = isUnchangedByNormalization(value)) == -1) {
/** Both the non-normalized and normalized attribute values are equal. **/
nonNormalizedValue.setValues(value);
int cquote = fEntityScanner.scanChar();
if (cquote != quote) {
reportFatalError("CloseQuoteExpected", new Object[]{eleName,atName});
}
return true;
}
fStringBuffer2.clear();
fStringBuffer2.append(value);
normalizeWhitespace(value, fromIndex);
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** normalizeWhitespace -> \""
+ value.toString() + "\"");
}
if (c != quote) {
fScanningAttribute = true;
fStringBuffer.clear();
do {
fStringBuffer.append(value);
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** value2: \""
+ fStringBuffer.toString() + "\"");
}
if (c == '&') {
fEntityScanner.skipChar('&');
if (entityDepth == fEntityDepth) {
fStringBuffer2.append('&');
}
if (fEntityScanner.skipChar('#')) {
if (entityDepth == fEntityDepth) {
fStringBuffer2.append('#');
}
int ch = scanCharReferenceValue(fStringBuffer, fStringBuffer2);
if (ch != -1) {
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** value3: \""
+ fStringBuffer.toString()
+ "\"");
}
}
}
else {
String entityName = fEntityScanner.scanName();
if (entityName == null) {
reportFatalError("NameRequiredInReference", null);
}
else if (entityDepth == fEntityDepth) {
fStringBuffer2.append(entityName);
}
if (!fEntityScanner.skipChar(';')) {
reportFatalError("SemicolonRequiredInReference",
new Object []{entityName});
}
else if (entityDepth == fEntityDepth) {
fStringBuffer2.append(';');
}
if (entityName == fAmpSymbol) {
fStringBuffer.append('&');
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** value5: \""
+ fStringBuffer.toString()
+ "\"");
}
}
else if (entityName == fAposSymbol) {
fStringBuffer.append('\'');
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** value7: \""
+ fStringBuffer.toString()
+ "\"");
}
}
else if (entityName == fLtSymbol) {
fStringBuffer.append('<');
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** value9: \""
+ fStringBuffer.toString()
+ "\"");
}
}
else if (entityName == fGtSymbol) {
fStringBuffer.append('>');
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** valueB: \""
+ fStringBuffer.toString()
+ "\"");
}
}
else if (entityName == fQuotSymbol) {
fStringBuffer.append('"');
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** valueD: \""
+ fStringBuffer.toString()
+ "\"");
}
}
else {
if (fEntityManager.isExternalEntity(entityName)) {
reportFatalError("ReferenceToExternalEntity",
new Object[] { entityName });
}
else {
if (!fEntityManager.isDeclaredEntity(entityName)) {
//WFC & VC: Entity Declared
if (checkEntities) {
if (fValidation) {
fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
"EntityNotDeclared",
new Object[]{entityName},
XMLErrorReporter.SEVERITY_ERROR);
}
}
else {
reportFatalError("EntityNotDeclared",
new Object[]{entityName});
}
}
fEntityManager.startEntity(entityName, true);
}
}
}
}
else if (c == '<') {
reportFatalError("LessthanInAttValue",
new Object[] { eleName, atName });
fEntityScanner.scanChar();
if (entityDepth == fEntityDepth) {
fStringBuffer2.append((char)c);
}
}
else if (c == '%' || c == ']') {
fEntityScanner.scanChar();
fStringBuffer.append((char)c);
if (entityDepth == fEntityDepth) {
fStringBuffer2.append((char)c);
}
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** valueF: \""
+ fStringBuffer.toString() + "\"");
}
}
// note that none of these characters should ever get through
// XML11EntityScanner. Not sure why
// this check was originally necessary. - NG
else if (c == '\n' || c == '\r' || c == 0x85 || c == 0x2028) {
fEntityScanner.scanChar();
fStringBuffer.append(' ');
if (entityDepth == fEntityDepth) {
fStringBuffer2.append('\n');
}
}
else if (c != -1 && XMLChar.isHighSurrogate(c)) {
fStringBuffer3.clear();
if (scanSurrogates(fStringBuffer3)) {
fStringBuffer.append(fStringBuffer3);
if (entityDepth == fEntityDepth) {
fStringBuffer2.append(fStringBuffer3);
}
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** valueI: \""
+ fStringBuffer.toString()
+ "\"");
}
}
}
else if (c != -1 && isInvalidLiteral(c)) {
reportFatalError("InvalidCharInAttValue",
new Object[] {eleName, atName, Integer.toString(c, 16)});
fEntityScanner.scanChar();
if (entityDepth == fEntityDepth) {
fStringBuffer2.append((char)c);
}
}
c = fEntityScanner.scanLiteral(quote, value);
if (entityDepth == fEntityDepth) {
fStringBuffer2.append(value);
}
normalizeWhitespace(value);
} while (c != quote || entityDepth != fEntityDepth);
fStringBuffer.append(value);
if (DEBUG_ATTR_NORMALIZATION) {
System.out.println("** valueN: \""
+ fStringBuffer.toString() + "\"");
}
value.setValues(fStringBuffer);
fScanningAttribute = false;
}
nonNormalizedValue.setValues(fStringBuffer2);
// quote
int cquote = fEntityScanner.scanChar();
if (cquote != quote) {
reportFatalError("CloseQuoteExpected", new Object[]{eleName,atName});
}
return nonNormalizedValue.equals(value.ch, value.offset, value.length);
} // scanAttributeValue()
//
// XMLScanner methods
//
// NOTE: this is a carbon copy of the code in XML11DTDScannerImpl;
// we need to override these methods in both places.
// this needs to be refactored!!! - NG
/**
* Scans public ID literal.
*
* [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
* [13] PubidChar::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
*
* The returned string is normalized according to the following rule,
* from http://www.w3.org/TR/REC-xml#dt-pubid:
*
* Before a match is attempted, all strings of white space in the public
* identifier must be normalized to single space characters (#x20), and
* leading and trailing white space must be removed.
*
* @param literal The string to fill in with the public ID literal.
* @return True on success.
*
* Note: This method uses fStringBuffer, anything in it at
* the time of calling is lost.
*/
protected boolean scanPubidLiteral(XMLString literal)
throws IOException, XNIException
{
int quote = fEntityScanner.scanChar();
if (quote != '\'' && quote != '"') {
reportFatalError("QuoteRequiredInPublicID", null);
return false;
}
fStringBuffer.clear();
// skip leading whitespace
boolean skipSpace = true;
boolean dataok = true;
while (true) {
int c = fEntityScanner.scanChar();
// REVISIT: none of these except \n and 0x20 should make it past the entity scanner
if (c == ' ' || c == '\n' || c == '\r' || c == 0x85 || c == 0x2028) {
if (!skipSpace) {
// take the first whitespace as a space and skip the others
fStringBuffer.append(' ');
skipSpace = true;
}
}
else if (c == quote) {
if (skipSpace) {
// if we finished on a space let's trim it
fStringBuffer.length--;
}
literal.setValues(fStringBuffer);
break;
}
else if (XMLChar.isPubid(c)) {
fStringBuffer.append((char)c);
skipSpace = false;
}
else if (c == -1) {
reportFatalError("PublicIDUnterminated", null);
return false;
}
else {
dataok = false;
reportFatalError("InvalidCharInPublicID",
new Object[]{Integer.toHexString(c)});
}
}
return dataok;
}
/**
* Normalize whitespace in an XMLString converting all whitespace
* characters to space characters.
*/
protected void normalizeWhitespace(XMLString value) {
int end = value.offset + value.length;
for (int i = value.offset; i < end; ++i) {
int c = value.ch[i];
if (XMLChar.isSpace(c)) {
value.ch[i] = ' ';
}
}
}
/**
* Normalize whitespace in an XMLString converting all whitespace
* characters to space characters.
*/
protected void normalizeWhitespace(XMLString value, int fromIndex) {
int end = value.offset + value.length;
for (int i = value.offset + fromIndex; i < end; ++i) {
int c = value.ch[i];
if (XMLChar.isSpace(c)) {
value.ch[i] = ' ';
}
}
}
/**
* Checks whether this string would be unchanged by normalization.
*
* @return -1 if the value would be unchanged by normalization,
* otherwise the index of the first whitespace character which
* would be transformed.
*/
protected int isUnchangedByNormalization(XMLString value) {
int end = value.offset + value.length;
for (int i = value.offset; i < end; ++i) {
int c = value.ch[i];
if (XMLChar.isSpace(c)) {
return i - value.offset;
}
}
return -1;
}
// returns true if the given character is not
// valid with respect to the version of
// XML understood by this scanner.
protected boolean isInvalid(int value) {
return (XML11Char.isXML11Invalid(value));
} // isInvalid(int): boolean
// returns true if the given character is not
// valid or may not be used outside a character reference
// with respect to the version of XML understood by this scanner.
protected boolean isInvalidLiteral(int value) {
return (!XML11Char.isXML11ValidLiteral(value));
} // isInvalidLiteral(int): boolean
// returns true if the given character is
// a valid nameChar with respect to the version of
// XML understood by this scanner.
protected boolean isValidNameChar(int value) {
return (XML11Char.isXML11Name(value));
} // isValidNameChar(int): boolean
// returns true if the given character is
// a valid nameStartChar with respect to the version of
// XML understood by this scanner.
protected boolean isValidNameStartChar(int value) {
return (XML11Char.isXML11NameStart(value));
} // isValidNameStartChar(int): boolean
// returns true if the given character is
// a valid NCName character with respect to the version of
// XML understood by this scanner.
protected boolean isValidNCName(int value) {
return (XML11Char.isXML11NCName(value));
} // isValidNCName(int): boolean
// returns true if the given character is
// a valid high surrogate for a nameStartChar
// with respect to the version of XML understood
// by this scanner.
protected boolean isValidNameStartHighSurrogate(int value) {
return XML11Char.isXML11NameHighSurrogate(value);
} // isValidNameStartHighSurrogate(int): boolean
protected boolean versionSupported(String version) {
return (version.equals("1.1") || version.equals("1.0"));
} // versionSupported(String): boolean
// returns the error message key for unsupported
// versions of XML with respect to the version of
// XML understood by this scanner.
protected String getVersionNotSupportedKey () {
return "VersionNotSupported11";
} // getVersionNotSupportedKey: String
} // class XML11DocumentScannerImpl