nu.validator.checker.XmlPiChecker Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of validator Show documentation
Show all versions of validator Show documentation
An HTML-checking library (used by https://html5.validator.nu and the HTML5 facet of the W3C Validator)
/*
* Copyright (c) 2010 Mozilla Foundation
* Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
* Foundation, and Opera Software ASA.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.checker;
import org.xml.sax.Attributes;
import org.xml.sax.SAXParseException;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.AttributesImpl;
import org.relaxng.datatype.DatatypeException;
import nu.validator.datatype.Html5DatatypeLibrary;
import nu.validator.datatype.Html5DatatypeException;
import nu.validator.datatype.Charset;
import nu.validator.datatype.IriRef;
import nu.validator.datatype.MediaQuery;
import nu.validator.datatype.MimeType;
public class XmlPiChecker extends Checker implements LexicalHandler {
private static final char[][] NAMES = { "amp;".toCharArray(),
"lt;".toCharArray(), "gt;".toCharArray(), "quot;".toCharArray(),
"apos;".toCharArray(), };
private static final char[][] VALUES = { { '\u0026' }, { '\u003c' },
{ '\u003e' }, { '\u0022' }, { '\'' }, };
private static final int DATA_AND_RCDATA_MASK = ~1;
private static final int BEFORE_ATTRIBUTE_NAME = 0;
private static final int ATTRIBUTE_NAME = 1;
private static final int AFTER_ATTRIBUTE_NAME = 2;
private static final int BEFORE_ATTRIBUTE_VALUE = 3;
private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 4;
private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 5;
private static final int ATTRIBUTE_VALUE_UNQUOTED = 6;
private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 7;
private static final int CONSUME_CHARACTER_REFERENCE = 8;
private static final int CONSUME_NCR = 9;
private static final int CHARACTER_REFERENCE_LOOP = 10;
private static final int HEX_NCR_LOOP = 11;
private static final int DECIMAL_NRC_LOOP = 12;
private static final int HANDLE_NCR_VALUE = 13;
private static final int BUFFER_GROW_BY = 1024;
private static final char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
private char[] strBuf = new char[64];
private int strBufLen;
private char[] longStrBuf = new char[1024];
private int longStrBufLen;
private final char[] bmpChar = new char[1];
private final char[] astralChar = new char[2];
private int entCol;
private int lo;
private int hi;
private int candidate;
private int strBufMark;
private int prevValue;
private int value;
private boolean seenDigits;
private char additional;
private boolean alreadyWarnedAboutPrivateUseCharacters;
private AttributesImpl attributes;
private String attributeName;
private boolean inDoctype;
private boolean alreadyHasElement;
private String piTarget = null;
private boolean hasXsltPi;
private enum PseudoAttrName {
HREF, TYPE, TITLE, MEDIA, CHARSET, ALTERNATE, INVALID;
private static PseudoAttrName toCaps(String str) {
try {
if (!str.toLowerCase().equals(str)) {
return INVALID;
}
return valueOf(newAsciiUpperCaseStringFromString(str));
} catch (Exception ex) {
return INVALID;
}
}
}
public XmlPiChecker() {
super();
inDoctype = false;
hasXsltPi = false;
alreadyHasElement = false;
}
public void startDTD(String name, String publicId, String systemId)
throws SAXException {
inDoctype = true;
}
public void endDTD() throws SAXException {
inDoctype = false;
}
public void startEntity(String name) throws SAXException {
}
public void endEntity(String name) throws SAXException {
}
public void startCDATA() throws SAXException {
}
public void endCDATA() throws SAXException {
}
public void comment(char[] ch, int start, int len) throws SAXException {
}
@Override public void startDocument() throws SAXException {
inDoctype = false;
hasXsltPi = false;
alreadyHasElement = false;
}
@Override public void startElement(String uri, String localName,
String qName, Attributes atts) throws SAXException {
alreadyHasElement = true;
}
@Override public void processingInstruction(String target, String data)
throws SAXException {
piTarget = target;
if ("xml-stylesheet".equals(piTarget)) {
checkXmlStylesheetPiData(data);
}
}
private void errBadPseudoAttrDatatype(DatatypeException e,
Class> datatypeClass, String attrName, String attrValue)
throws SAXException, ClassNotFoundException {
if (getErrorHandler() != null) {
Html5DatatypeException ex5 = (Html5DatatypeException) e;
boolean warning = ex5.isWarning() ? true : false;
DatatypeMismatchException bpe = new DatatypeMismatchException(
"Bad value \u201c" + attrValue + "\u201d for \u201c"
+ piTarget + "\u201d pseudo-attribute \u201c"
+ attrName + "\u201d. "
+ e.getMessage(),
getDocumentLocator(), datatypeClass, warning);
getErrorHandler().error(bpe);
}
}
private void errAttributeWithNoValue() throws SAXException {
err("Found \u201c" + piTarget + "\u201d pseudo-attribute \u201c"
+ attributeName
+ "\u201d without a value. All pseudo-attributes in \u201c"
+ piTarget + "\u201d instructions must have values.");
}
private void errAttributeValueContainsLt() throws SAXException {
err("Found \u201c"
+ piTarget
+ "\u201d pseudo-attribute \u201c"
+ attributeName
+ "\u201d with the character \u201c<\u201d in its value. All pseudo-attribute values in \u201c"
+ piTarget
+ "\u201d instructions must not contain the character \u201c<\u201d.");
}
private void errUpperCaseXinHexNcr() throws SAXException {
err("In XML documents, a hexadecimal character reference must begin with "
+ "\u201c\u201d (lowercase \u201cx\u201d), not \u201c\u201d (uppercase \u201cX\u201d).");
}
private void checkXmlStylesheetPiData(String data) throws SAXException {
boolean hasHref = false;
boolean hasTitle = false;
boolean hasMedia = false;
boolean hasCharset = false;
boolean hasAlternate = false;
boolean hasNonEmptyTitle = false;
boolean alternateIsYes = false;
boolean badDatatype = false;
if (inDoctype) {
warn("An \u201cxml-stylesheet\u201d instruction should not be used within a \u201cDOCTYPE\u201d declaration.");
}
if (alreadyHasElement) {
err("Any \u201cxml-stylesheet\u201d instruction in a document must occur before any elements in the document. "
+ "Suppressing any further errors for this \u201cxml-stylesheet\u201d instruction.");
return;
}
if (!"".equals(data)) {
Html5DatatypeLibrary dl = new Html5DatatypeLibrary();
AttributesImpl patts = getPseudoAttributesFromPiData(data);
String attrName;
String attrValue;
for (int i = 0; i < patts.getLength(); i++) {
attrName = patts.getQName(i);
attrValue = patts.getValue(i);
switch (PseudoAttrName.toCaps(attrName)) {
case HREF:
hasHref = true;
if (attrValue == null) {
break;
}
try {
IriRef ir = (IriRef) dl.createDatatype("iri-ref");
ir.checkValid(attrValue);
} catch (DatatypeException e) {
try {
errBadPseudoAttrDatatype(e, IriRef.class,
"href", attrValue);
} catch (ClassNotFoundException ce) {
}
}
break;
case TYPE:
if (attrValue == null) {
break;
}
try {
MimeType mt = (MimeType) dl.createDatatype("mime-type");
mt.checkValid(attrValue);
attrValue = newAsciiLowerCaseStringFromString(attrValue);
} catch (DatatypeException e) {
badDatatype = true;
try {
errBadPseudoAttrDatatype(e, MimeType.class,
"type", attrValue);
} catch (ClassNotFoundException ce) {
}
}
if (!badDatatype) {
if (attrValue.matches("application/xml(;.*)?")
|| attrValue.matches("text/xml(;.*)?")
|| attrValue.matches("application/xslt+xml(;.*)?")
|| attrValue.matches("text/xsl(;.*)?")
|| attrValue.matches("text/xslt(;.*)?")) {
if (!attrValue.matches("text/xsl(;.*)?")) {
warn("For indicating XSLT, \u201ctext/xsl\u201d is the only MIME type for the "
+ "\u201cxml-stylesheet\u201d pseudo-attribute \u201ctype\u201d that is supported across browsers.");
}
if (hasXsltPi) {
warn("Browsers do not support multiple \u201cxml-stylesheet\u201d instructions with a "
+ "\u201ctype\u201d value that indicates XSLT.");
}
hasXsltPi = true;
} else if (!attrValue.matches("^text/css(;.*)?$")) {
warn("\u201ctext/css\u201d and \u201ctext/xsl\u201d are the only MIME types for the "
+ "\u201cxml-stylesheet\u201d pseudo-attribute \u201ctype\u201d that are supported across browsers.");
}
}
break;
case TITLE:
hasTitle = true;
if (attrValue == null) {
break;
}
if (!"".equals(attrValue)) {
hasNonEmptyTitle = true;
}
break;
case MEDIA:
hasMedia = true;
if (attrValue == null) {
break;
}
try {
MediaQuery mq = (MediaQuery) dl.createDatatype("media-query");
mq.checkValid(attrValue);
} catch (DatatypeException e) {
try {
errBadPseudoAttrDatatype(e, MediaQuery.class,
"media", attrValue);
} catch (ClassNotFoundException ce) {
}
}
break;
case CHARSET:
hasCharset = true;
if (attrValue == null) {
break;
}
try {
Charset c = (Charset) dl.createDatatype("charset");
c.checkValid(attrValue);
} catch (DatatypeException e) {
try {
errBadPseudoAttrDatatype(e, Charset.class,
"charset", attrValue);
} catch (ClassNotFoundException ce) {
}
}
break;
case ALTERNATE:
hasAlternate = true;
if (attrValue == null) {
break;
}
if ("yes".equals(attrValue)) {
alternateIsYes = true;
} else if (!"no".equals(attrValue)) {
err("The value of the \u201cxml-stylesheet\u201d pseudo-attribute \u201calternate\u201d "
+ "must be either \u201cyes\u201d or \u201cno\u201d.");
}
break;
default:
err("Pseudo-attribute \u201c"
+ attrName
+ "\u201D not allowed in \u201cxml-stylesheet\u201d instruction.");
break;
}
}
if (alternateIsYes && !hasNonEmptyTitle) {
err("An \u201cxml-stylesheet\u201d instruction with an \u201calternate\u201d pseudo-attribute "
+ "whose value is \u201cyes\u201d must also have a \u201ctitle\u201d pseudo-attribute with a non-empty value.");
}
}
if (!hasHref) {
err("\u201cxml-stylesheet\u201d instruction lacks \u201chref\u201d pseudo-attribute. "
+ "The \u201chref\u201d pseudo-attribute is required in all \u201cxml-stylesheet\u201d instructions.");
}
if (hasXsltPi && (hasTitle || hasMedia || hasCharset || hasAlternate)) {
warn("When processing \u201cxml-stylesheet\u201d instructions, browsers ignore the pseudo-attributes "
+ "\u201ctitle\u201d, \u201cmedia\u201d, \u201ccharset\u201d, and \u201calternate\u201d.");
} else if (hasCharset) {
warn("Some browsers ignore the value of the \u201cxml-stylesheet\u201d pseudo-attribute \u201ccharset\u201d.");
}
}
/**
* Collect a set of attribues and values from the data part of a PI.
*
*
* The bulk of this method and associated methods that follow it here are
* copied from the nu.validator.htmlparser.impl.Tokenizer class, with
* appropriate modifications.
*
*
* @see nu.validator.htmlparser.impl.Tokenizer
* @see nu.validator.htmlparser.impl.ErrorReportingTokenizer
*
*/
private AttributesImpl getPseudoAttributesFromPiData(String buf)
throws SAXException {
int state = BEFORE_ATTRIBUTE_NAME;
int returnState = BEFORE_ATTRIBUTE_NAME;
char c = '\u0000';
int pos = -1;
int endPos = buf.length();
boolean reconsume = false;
attributes = null;
attributeName = null;
stateloop: for (;;) {
switch (state) {
case BEFORE_ATTRIBUTE_NAME:
beforeattributenameloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = buf.charAt(pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\n':
case ' ':
case '\t':
continue;
case '/':
case '>':
case '\"':
case '\'':
case '<':
case '=':
/*
* U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
* (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
* SIGN (=) Parse error.
*/
errBadCharBeforeAttributeNameOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
/*
* Anything else Start a new attribute in the
* current tag token.
*/
/*
* Set that attribute's name to the current
* input character,
*/
clearStrBufAndAppendCurrentC(c);
/*
* and its value to the empty string.
*/
// Will do later.
/*
* Switch to the attribute name state.
*/
state = ATTRIBUTE_NAME;
break beforeattributenameloop;
// continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case ATTRIBUTE_NAME:
attributenameloop: for (;;) {
if (++pos == endPos) {
attributeNameComplete();
addAttributeWithoutValue();
break stateloop;
}
c = buf.charAt(pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\n':
case ' ':
case '\t':
attributeNameComplete();
state = AFTER_ATTRIBUTE_NAME;
continue stateloop;
case '=':
/*
* U+003D EQUALS SIGN (=) Switch to the before
* attribute value state.
*/
attributeNameComplete();
state = BEFORE_ATTRIBUTE_VALUE;
break attributenameloop;
// continue stateloop;
case '\"':
case '\'':
case '<':
/*
* U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
* (') U+003C LESS-THAN SIGN (<) Parse error.
*/
errQuoteOrLtInAttributeNameOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
/*
* Anything else Append the current input
* character to the current attribute's name.
*/
appendStrBuf(c);
/*
* Stay in the attribute name state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case BEFORE_ATTRIBUTE_VALUE:
beforeattributevalueloop: for (;;) {
if (++pos == endPos) {
addAttributeWithoutValue();
break stateloop;
}
c = buf.charAt(pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\n':
case ' ':
case '\t':
continue;
case '"':
/*
* U+0022 QUOTATION MARK (") Switch to the
* attribute value (double-quoted) state.
*/
clearLongStrBufForNextState();
state = ATTRIBUTE_VALUE_DOUBLE_QUOTED;
break beforeattributevalueloop;
// continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the attribute
* value (unquoted) state and reconsume this
* input character.
*/
clearLongStrBuf();
state = ATTRIBUTE_VALUE_UNQUOTED;
reconsume = true;
continue stateloop;
case '\'':
/*
* U+0027 APOSTROPHE (') Switch to the attribute
* value (single-quoted) state.
*/
clearLongStrBufForNextState();
state = ATTRIBUTE_VALUE_SINGLE_QUOTED;
continue stateloop;
case '<':
case '=':
case '`':
/*
* U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
* (=) U+0060 GRAVE ACCENT (`)
*/
errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
clearLongStrBufAndAppendCurrentC(c);
/*
* Switch to the attribute value (unquoted)
* state.
*/
state = ATTRIBUTE_VALUE_UNQUOTED;
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
attributevaluedoublequotedloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
addAttributeWithoutValue();
break stateloop;
}
c = buf.charAt(pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '"':
/*
* U+0022 QUOTATION MARK (") Switch to the after
* attribute value (quoted) state.
*/
addAttributeWithValue();
state = AFTER_ATTRIBUTE_VALUE_QUOTED;
break attributevaluedoublequotedloop;
// continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in attribute value state, with the
* additional allowed character being U+0022
* QUOTATION MARK (").
*/
clearStrBufAndAppendCurrentC(c);
returnState = state;
state = CONSUME_CHARACTER_REFERENCE;
continue stateloop;
case '\n':
appendLongStrBufLineFeed();
continue;
default:
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
appendLongStrBuf(c);
/*
* Stay in the attribute value (double-quoted)
* state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case AFTER_ATTRIBUTE_VALUE_QUOTED:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = buf.charAt(pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\n':
case ' ':
case '\t':
state = BEFORE_ATTRIBUTE_NAME;
continue stateloop;
default:
/*
* Anything else Parse error.
*/
errNoSpaceBetweenAttributes();
/*
* Reconsume the character in the before
* attribute name state.
*/
state = BEFORE_ATTRIBUTE_NAME;
reconsume = true;
continue stateloop;
}
}
// FALLTHRU DON'T REORDER
case ATTRIBUTE_VALUE_UNQUOTED:
errUnquotedAttributeValOrNull();
for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
addAttributeWithValue();
break stateloop;
}
c = buf.charAt(pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\n':
case ' ':
case '\t':
addAttributeWithValue();
state = BEFORE_ATTRIBUTE_NAME;
continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in attribute value state, with the
* additional allowed character being U+003E
* GREATER-THAN SIGN (>)
*/
clearStrBufAndAppendCurrentC(c);
returnState = state;
state = CONSUME_CHARACTER_REFERENCE;
continue stateloop;
case '>':
/*
* U+003E GREATER-THAN SIGN (>) Emit the current
* tag token.
*/
// addAttributeWithValue();
// state = emitCurrentTagToken(false, pos);
// if (shouldSuspend) {
// break stateloop;
// }
/*
* Switch to the data state.
*/
continue stateloop;
case '<':
case '\"':
case '\'':
case '=':
case '`':
/*
* U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
* (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
* SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
*/
// errUnquotedAttributeValOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
// fall through
default:
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
appendLongStrBuf(c);
/*
* Stay in the attribute value (unquoted) state.
*/
continue;
}
}
// XXX reorder point
case AFTER_ATTRIBUTE_NAME:
for (;;) {
if (++pos == endPos) {
addAttributeWithoutValue();
break stateloop;
}
c = buf.charAt(pos);
/*
* Consume the next input character:
*/
switch (c) {
case '\n':
case ' ':
case '\t':
continue;
case '=':
/*
* U+003D EQUALS SIGN (=) Switch to the before
* attribute value state.
*/
state = BEFORE_ATTRIBUTE_VALUE;
continue stateloop;
case '\"':
case '\'':
case '<':
errQuoteOrLtInAttributeNameOrNull(c);
/*
* Treat it as per the "anything else" entry
* below.
*/
default:
addAttributeWithoutValue();
/*
* Anything else Start a new attribute in the
* current tag token.
*/
/*
* Set that attribute's name to the current
* input character,
*/
clearStrBufAndAppendCurrentC(c);
/*
* and its value to the empty string.
*/
// Will do later.
/*
* Switch to the attribute name state.
*/
state = ATTRIBUTE_NAME;
continue stateloop;
}
}
// XXX reorder point
case ATTRIBUTE_VALUE_SINGLE_QUOTED:
attributevaluesinglequotedloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
addAttributeWithoutValue();
break stateloop;
}
c = buf.charAt(pos);
}
/*
* Consume the next input character:
*/
switch (c) {
case '\'':
/*
* U+0027 APOSTROPHE (') Switch to the after
* attribute value (quoted) state.
*/
addAttributeWithValue();
state = AFTER_ATTRIBUTE_VALUE_QUOTED;
continue stateloop;
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in attribute value state, with the
* + additional allowed character being U+0027
* APOSTROPHE (').
*/
clearStrBufAndAppendCurrentC(c);
returnState = state;
state = CONSUME_CHARACTER_REFERENCE;
break attributevaluesinglequotedloop;
// continue stateloop;
case '\n':
appendLongStrBufLineFeed();
continue;
default:
/*
* Anything else Append the current input
* character to the current attribute's value.
*/
appendLongStrBuf(c);
/*
* Stay in the attribute value (double-quoted)
* state.
*/
continue;
}
}
// FALLTHRU DON'T REORDER
case CONSUME_CHARACTER_REFERENCE:
if (++pos == endPos) {
break stateloop;
}
c = buf.charAt(pos);
/*
* Unlike the definition is the spec, this state does not
* return a value and never requires the caller to
* backtrack. This state takes care of emitting characters
* or appending to the current attribute value. It also
* takes care of that in the case when consuming the
* character reference fails.
*/
/*
* This section defines how to consume a character
* reference. This definition is used when parsing character
* references in text and in attributes.
*
* The behavior depends on the identity of the next
* character (the one immediately after the U+0026 AMPERSAND
* character):
*/
switch (c) {
case '#':
/*
* U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
* SIGN.
*/
appendStrBuf('#');
state = CONSUME_NCR;
continue stateloop;
default:
if (c == additional) {
emitOrAppendStrBuf(returnState);
state = returnState;
reconsume = true;
continue stateloop;
}
entCol = -1;
lo = 0;
hi = (NAMES.length - 1);
candidate = -1;
strBufMark = 0;
state = CHARACTER_REFERENCE_LOOP;
reconsume = true;
// FALL THROUGH continue stateloop;
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case CHARACTER_REFERENCE_LOOP:
outer: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = buf.charAt(pos);
}
entCol++;
/*
* Consume the maximum number of characters possible,
* with the consumed characters matching one of the
* identifiers in the first column of the named
* character references table (in a case-sensitive
* manner).
*/
hiloop: for (;;) {
if (hi == -1) {
break hiloop;
}
if (entCol == NAMES[hi].length) {
break hiloop;
}
if (entCol > NAMES[hi].length) {
break outer;
} else if (c < NAMES[hi][entCol]) {
hi--;
} else {
break hiloop;
}
}
loloop: for (;;) {
if (hi < lo) {
break outer;
}
if (entCol == NAMES[lo].length) {
candidate = lo;
strBufMark = strBufLen;
lo++;
} else if (entCol > NAMES[lo].length) {
break outer;
} else if (c > NAMES[lo][entCol]) {
lo++;
} else {
break loloop;
}
}
if (hi < lo) {
break outer;
}
appendStrBuf(c);
continue;
}
if (candidate == -1) {
/*
* If no match can be made, then this is a parse error.
*/
errNoNamedCharacterMatch();
emitOrAppendStrBuf(returnState);
state = returnState;
reconsume = true;
continue stateloop;
} else {
char[] candidateArr = NAMES[candidate];
if (candidateArr[candidateArr.length - 1] != ';') {
/*
* If the last character matched is not a U+003B
* SEMICOLON (;), there is a parse error.
*/
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
/*
* If the entity is being consumed as part of an
* attribute, and the last character matched is
* not a U+003B SEMICOLON (;),
*/
char ch;
if (strBufMark == strBufLen) {
ch = c;
} else {
// if (strBufOffset != -1) {
// ch = buf[strBufOffset + strBufMark];
// } else {
ch = strBuf[strBufMark];
// }
}
if ((ch >= '0' && ch <= '9')
|| (ch >= 'A' && ch <= 'Z')
|| (ch >= 'a' && ch <= 'z')) {
/*
* and the next character is in the range
* U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
* U+0041 LATIN CAPITAL LETTER A to U+005A
* LATIN CAPITAL LETTER Z, or U+0061 LATIN
* SMALL LETTER A to U+007A LATIN SMALL
* LETTER Z, then, for historical reasons,
* all the characters that were matched
* after the U+0026 AMPERSAND (&) must be
* unconsumed, and nothing is returned.
*/
errNoNamedCharacterMatch();
appendStrBufToLongStrBuf();
state = returnState;
reconsume = true;
continue stateloop;
}
}
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
errUnescapedAmpersandInterpretedAsCharacterReference();
}
}
/*
* Otherwise, return a character token for the character
* corresponding to the entity name (as given by the
* second column of the named character references
* table).
*/
char[] val = VALUES[candidate];
emitOrAppend(val, returnState);
// this is so complicated!
if (strBufMark < strBufLen) {
// if (strBufOffset != -1) {
// if ((returnState & (~1)) != 0) {
// for (int i = strBufMark; i < strBufLen; i++) {
// appendLongStrBuf(buf[strBufOffset + i]);
// }
// } else {
// tokenHandler.characters(buf, strBufOffset
// + strBufMark, strBufLen
// - strBufMark);
// }
// } else {
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
for (int i = strBufMark; i < strBufLen; i++) {
appendLongStrBuf(strBuf[i]);
}
}
// }
}
state = returnState;
reconsume = true;
continue stateloop;
/*
* If the markup contains I'm ¬it; I tell you, the
* entity is parsed as "not", as in, I'm ¬it; I tell
* you. But if the markup was I'm ∉ I tell you,
* the entity would be parsed as "notin;", resulting in
* I'm ∉ I tell you.
*/
}
// XXX reorder point
case CONSUME_NCR:
if (++pos == endPos) {
break stateloop;
}
c = buf.charAt(pos);
prevValue = -1;
value = 0;
seenDigits = false;
/*
* The behavior further depends on the character after the
* U+0023 NUMBER SIGN:
*/
switch (c) {
case 'x':
/*
* U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
* LETTER X Consume the X.
*
* Follow the steps below, but using the range of
* characters U+0030 DIGIT ZERO through to U+0039
* DIGIT NINE, U+0061 LATIN SMALL LETTER A through
* to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
* CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
* LETTER F (in other words, 0-9, A-F, a-f).
*
* When it comes to interpreting the number,
* interpret it as a hexadecimal number.
*/
appendStrBuf(c);
state = HEX_NCR_LOOP;
continue stateloop;
case 'X':
/*
* XML requires a lowercase 'x' for hex character
* refs
*/
errUpperCaseXinHexNcr();
appendStrBuf(c);
state = HEX_NCR_LOOP;
continue stateloop;
default:
/*
* Anything else Follow the steps below, but using
* the range of characters U+0030 DIGIT ZERO through
* to U+0039 DIGIT NINE (i.e. just 0-9).
*
* When it comes to interpreting the number,
* interpret it as a decimal number.
*/
state = DECIMAL_NRC_LOOP;
reconsume = true;
// FALL THROUGH continue stateloop;
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case DECIMAL_NRC_LOOP:
decimalloop: for (;;) {
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
}
c = buf.charAt(pos);
}
// Deal with overflow gracefully
if (value < prevValue) {
value = 0x110000; // Value above Unicode range but
// within int
// range
}
prevValue = value;
/*
* Consume as many characters as match the range of
* characters given above.
*/
if (c >= '0' && c <= '9') {
seenDigits = true;
value *= 10;
value += c - '0';
continue;
} else if (c == ';') {
if (seenDigits) {
state = HANDLE_NCR_VALUE;
// FALL THROUGH continue stateloop;
break decimalloop;
} else {
errNoDigitsInNCR();
appendStrBuf(';');
emitOrAppendStrBuf(returnState);
state = returnState;
continue stateloop;
}
} else {
/*
* If no characters match the range, then don't
* consume any characters (and unconsume the U+0023
* NUMBER SIGN character and, if appropriate, the X
* character). This is a parse error; nothing is
* returned.
*
* Otherwise, if the next character is a U+003B
* SEMICOLON, consume that too. If it isn't, there
* is a parse error.
*/
if (!seenDigits) {
errNoDigitsInNCR();
emitOrAppendStrBuf(returnState);
state = returnState;
reconsume = true;
continue stateloop;
} else {
errCharRefLacksSemicolon();
state = HANDLE_NCR_VALUE;
reconsume = true;
// FALL THROUGH continue stateloop;
break decimalloop;
}
}
}
// WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
case HANDLE_NCR_VALUE:
// WARNING previous state sets reconsume
handleNcrValue(returnState);
state = returnState;
continue stateloop;
// XXX reorder point
case HEX_NCR_LOOP:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = buf.charAt(pos);
// Deal with overflow gracefully
if (value < prevValue) {
value = 0x110000; // Value above Unicode range but
// within int
// range
}
prevValue = value;
/*
* Consume as many characters as match the range of
* characters given above.
*/
if (c >= '0' && c <= '9') {
seenDigits = true;
value *= 16;
value += c - '0';
continue;
} else if (c >= 'A' && c <= 'F') {
seenDigits = true;
value *= 16;
value += c - 'A' + 10;
continue;
} else if (c >= 'a' && c <= 'f') {
seenDigits = true;
value *= 16;
value += c - 'a' + 10;
continue;
} else if (c == ';') {
if (seenDigits) {
state = HANDLE_NCR_VALUE;
continue stateloop;
} else {
errNoDigitsInNCR();
appendStrBuf(';');
emitOrAppendStrBuf(returnState);
state = returnState;
continue stateloop;
}
} else {
/*
* If no characters match the range, then don't
* consume any characters (and unconsume the U+0023
* NUMBER SIGN character and, if appropriate, the X
* character). This is a parse error; nothing is
* returned.
*
* Otherwise, if the next character is a U+003B
* SEMICOLON, consume that too. If it isn't, there
* is a parse error.
*/
if (!seenDigits) {
errNoDigitsInNCR();
emitOrAppendStrBuf(returnState);
state = returnState;
reconsume = true;
continue stateloop;
} else {
errCharRefLacksSemicolon();
state = HANDLE_NCR_VALUE;
reconsume = true;
continue stateloop;
}
}
}
}
}
return attributes;
}
private void appendStrBufToLongStrBuf() {
appendLongStrBuf(strBuf, 0, strBufLen);
}
private void appendLongStrBuf(char[] buffer, int offset, int length) {
int reqLen = longStrBufLen + length;
if (longStrBuf.length < reqLen) {
char[] newBuf = new char[reqLen + (reqLen >> 1)];
System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
longStrBuf = newBuf;
}
System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
longStrBufLen = reqLen;
}
private void appendLongStrBuf(char[] arr) {
appendLongStrBuf(arr, 0, arr.length);
}
private void appendLongStrBuf(char c) {
if (longStrBufLen == longStrBuf.length) {
char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
longStrBuf = newBuf;
}
longStrBuf[longStrBufLen++] = c;
}
private void appendLongStrBufLineFeed() {
appendLongStrBuf('\n');
}
private void appendStrBuf(char c) {
if (strBufLen == strBuf.length) {
char[] newBuf = new char[strBuf.length + BUFFER_GROW_BY];
System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
strBuf = newBuf;
}
strBuf[strBufLen++] = c;
}
private void clearLongStrBufForNextState() {
longStrBufLen = 0;
}
private void clearLongStrBuf() {
longStrBufLen = 0;
}
private void clearLongStrBufAndAppendCurrentC(char c) {
longStrBuf[0] = c;
longStrBufLen = 1;
// longStrBufOffset = pos;
}
private void clearStrBufAndAppendCurrentC(char c) {
strBuf[0] = c;
strBufLen = 1;
}
private void emitOrAppend(char[] val, int returnState) throws SAXException {
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
appendLongStrBuf(val);
}
}
private void emitOrAppendOne(char[] val, int returnState)
throws SAXException {
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
appendLongStrBuf(val[0]);
}
}
private void emitOrAppendTwo(char[] val, int returnState)
throws SAXException {
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
appendLongStrBuf(val[0]);
appendLongStrBuf(val[1]);
}
}
private void emitOrAppendStrBuf(int returnState) throws SAXException {
if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
appendStrBufToLongStrBuf();
}
}
private String longStrBufToString() {
return new String(longStrBuf, 0, longStrBufLen);
}
private void attributeNameComplete() throws SAXException {
attributeName = new String(strBuf, 0, strBufLen).intern();
if (attributes == null) {
attributes = new AttributesImpl();
}
/*
* When the user agent leaves the attribute name state (and before
* emitting the tag token, if appropriate), the complete attribute's
* name must be compared to the other attributes on the same token; if
* there is already an attribute on the token with the exact same name,
* then this is a parse error and the new attribute must be dropped,
* along with the value that gets associated with it (if any).
*/
for (int i = 0; i < attributes.getLength(); i++) {
if (attributes.getQName(i).equals(attributeName)) {
errDuplicateAttribute();
attributeName = null;
return;
}
}
}
private void addAttributeWithValue() throws SAXException {
if (attributeName != null) {
String value = longStrBufToString();
if (value.indexOf('<') != -1) {
errAttributeValueContainsLt();
return;
}
if (badCharInCandidateAttributeName()) {
return;
}
attributes.addAttribute("", "", attributeName, "", value);
attributeName = null;
}
}
private void addAttributeWithoutValue() throws SAXException {
if (attributeName != null) {
if (badCharInCandidateAttributeName()) {
return;
}
attributes.addAttribute("", "", attributeName, "", null);
errAttributeWithNoValue();
attributeName = null;
}
}
private boolean badCharInCandidateAttributeName() {
return attributeName.indexOf('/') != -1
|| attributeName.indexOf('>') != -1
|| attributeName.indexOf('\"') != -1
|| attributeName.indexOf('\'') != -1
|| attributeName.indexOf('<') != -1
|| attributeName.indexOf('=') != -1;
}
private void handleNcrValue(int returnState) throws SAXException {
if (!isLegalXmlCharValue(value)) {
errNcrIllegalValueForXml();
} else {
/*
* If one or more characters match the range, then take them all and
* interpret the string of characters as a number (either
* hexadecimal or decimal as appropriate).
*/
if (value <= 0xFFFF) {
/*
* about ((value <= 0x0008) || (value == 0x000B) || (value >=
* 0x000E && value <= 0x001F)) -- we already check for
* XML-illegal control characters in isLegalXmlCharValue
*/
if ((value & 0xF800) == 0xD800) {
errNcrSurrogate();
emitOrAppendOne(REPLACEMENT_CHARACTER, returnState);
} else {
/*
* Otherwise, return a character token for the Unicode
* character whose code point is that number.
*/
char ch = (char) value;
if (value >= 0xFDD0 && value <= 0xFDEF) {
errNcrUnassigned();
} else if ((value & 0xFFFE) == 0xFFFE) {
ch = errNcrNonCharacter(ch);
} else if (value >= 0x007F && value <= 0x009F) {
errNcrControlChar();
} else {
maybeWarnPrivateUse(ch);
}
bmpChar[0] = ch;
emitOrAppendOne(bmpChar, returnState);
}
} else if (value <= 0x10FFFF) {
maybeWarnPrivateUseAstral();
if ((value & 0xFFFE) == 0xFFFE) {
errAstralNonCharacter(value);
}
astralChar[0] = (char) (LEAD_OFFSET + (value >> 10));
astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
emitOrAppendTwo(astralChar, returnState);
} else {
errNcrOutOfRange();
emitOrAppendOne(REPLACEMENT_CHARACTER, returnState);
}
if ((value & 0xF800) == 0xD800) {
errNcrSurrogate();
emitOrAppendOne(REPLACEMENT_CHARACTER, returnState);
} else if (value <= 0xFFFF) {
/*
* Otherwise, return a character token for the Unicode character
* whose code point is that number.
*/
char ch = (char) value;
/*
* if ((value <= 0x0008) || (value == 0x000B) || (value >=
* 0x000E && value <= 0x001F)) { // we already check for
* XML-illegal control // characters in isLegalXmlCharValue ch =
* errNcrControlChar(ch); }
*/
if (value >= 0xFDD0 && value <= 0xFDEF) {
errNcrUnassigned();
} else if ((value & 0xFFFE) == 0xFFFE) {
ch = errNcrNonCharacter(ch);
} else if (value >= 0x007F && value <= 0x009F) {
errNcrControlChar();
} else {
maybeWarnPrivateUse(ch);
}
bmpChar[0] = ch;
emitOrAppendOne(bmpChar, returnState);
} else if (value <= 0x10FFFF) {
maybeWarnPrivateUseAstral();
astralChar[0] = (char) (LEAD_OFFSET + (value >> 10));
astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
emitOrAppend(astralChar, returnState);
} else {
errNcrOutOfRange();
emitOrAppendOne(REPLACEMENT_CHARACTER, returnState);
}
}
}
private String toUPlusString(char c) {
String hexString = Integer.toHexString(c);
switch (hexString.length()) {
case 1:
return "U+000" + hexString;
case 2:
return "U+00" + hexString;
case 3:
return "U+0" + hexString;
case 4:
return "U+" + hexString;
default:
throw new RuntimeException("Unreachable.");
}
}
private boolean isLegalXmlCharValue(int charval) {
return charval == 0x0009 || charval == 0x000A || charval == 0x000D
|| (charval >= 0x0020 && charval <= 0xD7FF)
|| (charval >= 0xE000 && charval <= 0xFFFD)
|| (charval >= 0x10000 && charval <= 0x10FFFF);
}
private boolean isPrivateUse(char c) {
return c >= '\uE000' && c <= '\uF8FF';
}
private boolean isAstralPrivateUse(int c) {
return (c >= 0xF0000 && c <= 0xFFFFD)
|| (c >= 0x100000 && c <= 0x10FFFD);
}
private void warnAboutPrivateUseChar() throws SAXException {
if (!alreadyWarnedAboutPrivateUseCharacters) {
warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
alreadyWarnedAboutPrivateUseCharacters = true;
}
}
private void errBadCharBeforeAttributeNameOrNull(char c)
throws SAXException {
if (c == '=') {
errEqualsSignBeforeAttributeName();
} else if (c != '\uFFFD') {
errQuoteBeforeAttributeName(c);
} else {
err("The character \u201C" + c + "\u201D is not allowed in \u201C"
+ piTarget + "\u201D pseudo-attribute names.");
}
}
private void errCharRefLacksSemicolon() throws SAXException {
err("Character reference was not terminated by a semicolon.");
}
private void errDuplicateAttribute() throws SAXException {
err("Duplicate \u201C" + piTarget + "\u201D pseudo-attribute \u201C"
+ attributeName + "\u201D.");
}
private void errEqualsSignBeforeAttributeName() throws SAXException {
err("Saw \u201C=\u201D when expecting \u201C"
+ piTarget
+ "\u201D pseudo-attribute name. Probable cause: Pseudo-attribute name missing.");
}
private void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
throws SAXException {
switch (c) {
case '=':
err("\u201C=\u201D at the start of an unquoted \u201C"
+ piTarget
+ "\u201D pseudo-attribute value. Probable cause: Stray duplicate equals sign.");
return;
case '<':
/*
* we deal with this case in the attribute-value error- checking
* code in the getPseudoAttributesFromPiData method
*/
// err("\u201C<\u201D at the start of an unquoted attribute value. Probable cause: Missing \u201C>\u201D immediately before.");
return;
case '`':
err("\u201C`\u201D at the start of an unquoted \u201C"
+ piTarget
+ "\u201D pseudo-attribute value. Probable cause: Using the wrong character as a quote.");
return;
}
}
private void errNoSpaceBetweenAttributes() throws SAXException {
err("Space is required between \u201C" + piTarget
+ "\u201D pseudo-attributes.");
}
private void errQuoteBeforeAttributeName(char c) throws SAXException {
err("Saw \u201C"
+ c
+ "\u201D when expecting a pseudo-attribute name. Probable cause: \u201C=\u201D missing immediately before.");
}
private void errQuoteOrLtInAttributeNameOrNull(char c) throws SAXException {
if (c != '\uFFFD') {
err("Quote \u201C"
+ c
+ "\u201D in pseudo-attribute name. Probable cause: Matching quote missing somewhere earlier.");
}
}
private void errUnquotedAttributeValOrNull() throws SAXException {
err("Found unquoted value for \u201c" + piTarget
+ "\u201d pseudo-attribute \u201c" + attributeName
+ "\u201d. The value of all pseudo-attributes in \u201c"
+ piTarget + "\u201d instructions must be quoted.");
}
private void errNoNamedCharacterMatch() throws SAXException {
if (getErrorHandler() == null) {
return;
}
SAXParseException spe = new SAXParseException(
"\u201C&\u201D did not start a character reference. (\u201C&\u201D probably should have been escaped as \u201C&\u201D.)",
getDocumentLocator());
getErrorHandler().error(spe);
}
private void errNcrControlChar() throws SAXException {
/*
* warn instead of error because these control characters are legal in
* XML
*/
warn("Character reference expands to a control character ("
+ toUPlusString((char) value) + ").");
}
private void errNcrIllegalValueForXml() throws SAXException {
err("Character reference expands to a character that is not legal in XML ("
+ toUPlusString((char) value) + ").");
}
private void errNcrSurrogate() throws SAXException {
err("Character reference expands to a surrogate.");
}
private void errNcrUnassigned() throws SAXException {
err("Character reference expands to a permanently unassigned code point.");
}
private char errNcrNonCharacter(char ch) throws SAXException {
err("Character reference expands to a non-character ("
+ toUPlusString((char) value) + ").");
return ch;
}
private void errNcrOutOfRange() throws SAXException {
err("Character reference outside the permissible Unicode range.");
}
private void errNoDigitsInNCR() throws SAXException {
err("No digits after \u201C" + new String(strBuf, 0, strBufLen)
+ "\u201D.");
}
private void errUnescapedAmpersandInterpretedAsCharacterReference()
throws SAXException {
if (getErrorHandler() == null) {
return;
}
SAXParseException spe = new SAXParseException(
"The string following \u201C&\u201D was interpreted as a character reference. (\u201C&\u201D probably should have been escaped as \u201C&\u201D.)",
getDocumentLocator());
getErrorHandler().error(spe);
}
private void maybeWarnPrivateUse(char ch) throws SAXException {
if (getErrorHandler() != null && isPrivateUse(ch)) {
warnAboutPrivateUseChar();
}
}
private void maybeWarnPrivateUseAstral() throws SAXException {
if (getErrorHandler() != null && isAstralPrivateUse(value)) {
warnAboutPrivateUseChar();
}
}
private void errAstralNonCharacter(int ch) throws SAXException {
err("Character reference expands to an astral non-character ("
+ toUPlusString((char) value) + ").");
}
private static String newAsciiLowerCaseStringFromString(String str) {
if (str == null) {
return null;
}
char[] buf = new char[str.length()];
for (int i = 0; i < str.length(); i++) {
char c = str.charAt(i);
if (c >= 'A' && c <= 'Z') {
c += 0x20;
}
buf[i] = c;
}
return new String(buf);
}
private static String newAsciiUpperCaseStringFromString(String str) {
if (str == null) {
return null;
}
char[] buf = new char[str.length()];
for (int i = 0; i < str.length(); i++) {
char c = str.charAt(i);
if (c >= 'a' && c <= 'z') {
c -= 0x20;
}
buf[i] = c;
}
return new String(buf);
}
}