All downloads are free. The search and download functionality uses the official Maven repository.

nu.validator.checker.XmlPiChecker Maven / Gradle / Ivy

Go to download

An HTML-checking library (used by https://html5.validator.nu and the HTML5 facet of the W3C Validator)

There is a newer version: 20.7.2
Show newest version
/*
 * Copyright (c) 2010 Mozilla Foundation
 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 
 * Foundation, and Opera Software ASA.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.checker;

import org.xml.sax.Attributes;
import org.xml.sax.SAXParseException;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;

import org.xml.sax.helpers.AttributesImpl;

import org.relaxng.datatype.DatatypeException;

import nu.validator.datatype.Html5DatatypeLibrary;
import nu.validator.datatype.Html5DatatypeException;
import nu.validator.datatype.Charset;
import nu.validator.datatype.IriRef;
import nu.validator.datatype.MediaQuery;
import nu.validator.datatype.MimeType;

/**
 * Checker for <code>xml-stylesheet</code> processing instructions.
 *
 * For every processing instruction whose target is "xml-stylesheet", the
 * instruction data is tokenized into pseudo-attributes and each
 * pseudo-attribute is validated; problems are reported as errors or
 * warnings. The class also implements <code>LexicalHandler</code> so that it
 * can tell whether an instruction occurs inside a DOCTYPE declaration.
 */
public class XmlPiChecker extends Checker implements LexicalHandler {

    /**
     * Names of the predefined XML entities, without the leading ampersand
     * and including the terminating semicolon.
     *
     * The entries MUST stay sorted in ascending character order: the named
     * character reference matcher in getPseudoAttributesFromPiData narrows a
     * [lo, hi] window over this table with less-than / greater-than
     * comparisons, which is only correct for a sorted table. With an
     * unsorted table some references resolve to the wrong entry.
     */
    private static final char[][] NAMES = { "amp;".toCharArray(),
            "apos;".toCharArray(), "gt;".toCharArray(), "lt;".toCharArray(),
            "quot;".toCharArray(), };

    /**
     * Expansions of the predefined entities; the element at index i is the
     * replacement text for the name at index i of NAMES, so the two tables
     * must be reordered together.
     */
    private static final char[][] VALUES = { { '\u0026' }, { '\'' },
            { '\u003e' }, { '\u003c' }, { '\u0022' }, };

    /**
     * Mask whose bitwise AND with a state number is non-zero for every state
     * number other than 0 and 1. The emitOrAppend helpers apply it to the
     * return state to decide whether to append to the value buffer.
     */
    private static final int DATA_AND_RCDATA_MASK = ~1;

    // States of the pseudo-attribute tokenizer in
    // getPseudoAttributesFromPiData. The numeric values matter because of
    // the mask above.

    private static final int BEFORE_ATTRIBUTE_NAME = 0;

    private static final int ATTRIBUTE_NAME = 1;

    private static final int AFTER_ATTRIBUTE_NAME = 2;

    private static final int BEFORE_ATTRIBUTE_VALUE = 3;

    private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 4;

    private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 5;

    private static final int ATTRIBUTE_VALUE_UNQUOTED = 6;

    private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 7;

    // States used while consuming a character reference inside a value.

    private static final int CONSUME_CHARACTER_REFERENCE = 8;

    private static final int CONSUME_NCR = 9;

    private static final int CHARACTER_REFERENCE_LOOP = 10;

    private static final int HEX_NCR_LOOP = 11;

    // State for the decimal numeric character reference loop (the name
    // spells "NRC" where the sibling constants spell "NCR").
    private static final int DECIMAL_NRC_LOOP = 12;

    private static final int HANDLE_NCR_VALUE = 13;

    /** Number of chars by which appendStrBuf enlarges strBuf when it is full. */
    private static final int BUFFER_GROW_BY = 1024;

    // Not referenced in the part of the file reviewed here; presumably used
    // when handling numeric character reference values -- TODO confirm.
    private static final char[] REPLACEMENT_CHARACTER = { '\uFFFD' };

    // Not referenced in the part of the file reviewed here; presumably the
    // offset for computing a UTF-16 lead surrogate -- TODO confirm.
    private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));

    /**
     * Short scratch buffer: holds the pseudo-attribute name being read and
     * the raw text of a character reference being consumed.
     */
    private char[] strBuf = new char[64];

    /** Number of chars of strBuf currently in use. */
    private int strBufLen;

    /** Buffer accumulating the current pseudo-attribute value. */
    private char[] longStrBuf = new char[1024];

    /** Number of chars of longStrBuf currently in use. */
    private int longStrBufLen;

    // Not referenced in the part of the file reviewed here.
    private final char[] bmpChar = new char[1];

    // Not referenced in the part of the file reviewed here.
    private final char[] astralChar = new char[2];

    /** Index of the character being compared within the candidate entity names. */
    private int entCol;

    /** Lower bound (inclusive) of the window of still-matching entries of NAMES. */
    private int lo;

    /** Upper bound (inclusive) of the window of still-matching entries of NAMES. */
    private int hi;

    /** Index into NAMES of the last complete match, or -1 when there is none. */
    private int candidate;

    /** Value of strBufLen at the moment candidate was recorded. */
    private int strBufMark;

    /** Previous value of the numeric character reference accumulator, used to detect overflow. */
    private int prevValue;

    /** Accumulated code point of the numeric character reference being read. */
    private int value;

    /** Whether at least one digit of the numeric character reference has been read. */
    private boolean seenDigits;

    // Compared against the character following an ampersand; never assigned
    // in the part of the file reviewed here.
    private char additional;

    // Not referenced in the part of the file reviewed here.
    private boolean alreadyWarnedAboutPrivateUseCharacters;

    /** Pseudo-attributes collected from the instruction data being parsed; null until the first name is complete. */
    private AttributesImpl attributes;

    /** Name of the pseudo-attribute currently being parsed. */
    private String attributeName;

    /** True between startDTD and endDTD. */
    private boolean inDoctype;

    /** True once any element start has been seen in the current document. */
    private boolean alreadyHasElement;

    /** Target of the processing instruction most recently received. */
    private String piTarget = null;

    /** True once an xml-stylesheet instruction with an XSLT-indicating type has been seen in the current document. */
    private boolean hasXsltPi;

    /**
     * The pseudo-attribute names recognized in an xml-stylesheet
     * instruction, plus INVALID for everything else.
     */
    private enum PseudoAttrName {
        HREF, TYPE, TITLE, MEDIA, CHARSET, ALTERNATE, INVALID;

        /**
         * Maps a pseudo-attribute name to its constant. Names that are not
         * entirely lower case, that match no constant, or that cause any
         * exception during the lookup (including a null name) yield INVALID.
         */
        private static PseudoAttrName toCaps(String str) {
            try {
                final String lowered = str.toLowerCase();
                return lowered.equals(str)
                        ? valueOf(newAsciiUpperCaseStringFromString(str))
                        : INVALID;
            } catch (Exception ex) {
                return INVALID;
            }
        }
    }

    /**
     * Creates a checker with all per-document flags cleared.
     */
    public XmlPiChecker() {
        super();
        inDoctype = hasXsltPi = alreadyHasElement = false;
    }

    /**
     * Records that a DOCTYPE declaration has started. The arguments are not
     * used.
     */
    public void startDTD(String name, String publicId, String systemId)
            throws SAXException {
        inDoctype = true;
    }

    /**
     * Records that the DOCTYPE declaration has ended.
     */
    public void endDTD() throws SAXException {
        inDoctype = false;
    }

    /**
     * Does nothing; entity boundaries are not relevant to this checker.
     */
    public void startEntity(String name) throws SAXException {
    }

    /**
     * Does nothing; entity boundaries are not relevant to this checker.
     */
    public void endEntity(String name) throws SAXException {
    }

    /**
     * Does nothing; CDATA section boundaries are not relevant to this checker.
     */
    public void startCDATA() throws SAXException {
    }

    /**
     * Does nothing; CDATA section boundaries are not relevant to this checker.
     */
    public void endCDATA() throws SAXException {
    }

    /**
     * Does nothing; comments are ignored by this checker.
     */
    public void comment(char[] ch, int start, int len) throws SAXException {
    }

    /**
     * Clears the per-document flags so the checker can be reused for a new
     * document.
     */
    @Override
    public void startDocument() throws SAXException {
        inDoctype = hasXsltPi = alreadyHasElement = false;
    }

    /**
     * Notes that an element has been seen, so that any xml-stylesheet
     * instruction arriving afterwards is reported as misplaced. The
     * arguments are not used.
     */
    @Override public void startElement(String uri, String localName,
            String qName, Attributes atts) throws SAXException {
        alreadyHasElement = true;
    }

    /**
     * Remembers the target of the instruction and, when the target is
     * "xml-stylesheet", checks the instruction data.
     *
     * @param target the processing instruction target
     * @param data the processing instruction data
     * @throws SAXException propagated from the data check
     */
    @Override
    public void processingInstruction(String target, String data)
            throws SAXException {
        piTarget = target;
        if (!"xml-stylesheet".equals(target)) {
            return;
        }
        checkXmlStylesheetPiData(data);
    }

    /**
     * Reports a pseudo-attribute value that failed datatype validation.
     * Nothing is reported when no error handler is installed.
     *
     * @param e the exception raised by the datatype check
     * @param datatypeClass the datatype class the value was checked against
     * @param attrName the name of the offending pseudo-attribute
     * @param attrValue the offending value
     * @throws SAXException propagated from the error handler
     * @throws ClassNotFoundException declared for the benefit of existing
     *         callers, which catch it
     */
    private void errBadPseudoAttrDatatype(DatatypeException e,
            Class datatypeClass, String attrName, String attrValue)
            throws SAXException, ClassNotFoundException {
        if (getErrorHandler() == null) {
            return;
        }
        // Only Html5DatatypeException carries a warning flag. A plain
        // DatatypeException is treated as a non-warning instead of being
        // downcast blindly, which would throw ClassCastException.
        boolean warning = (e instanceof Html5DatatypeException)
                && ((Html5DatatypeException) e).isWarning();
        DatatypeMismatchException bpe = new DatatypeMismatchException(
                "Bad value \u201c" + attrValue + "\u201d for \u201c"
                        + piTarget + "\u201d pseudo-attribute \u201c"
                        + attrName + "\u201d. "
                        + e.getMessage(),
                getDocumentLocator(), datatypeClass, warning);
        getErrorHandler().error(bpe);
    }

    /**
     * Reports that the current pseudo-attribute of the current instruction
     * has no value.
     */
    private void errAttributeWithNoValue() throws SAXException {
        StringBuilder message = new StringBuilder();
        message.append("Found \u201c").append(piTarget);
        message.append("\u201d pseudo-attribute \u201c").append(attributeName);
        message.append("\u201d without a value. All pseudo-attributes in \u201c");
        message.append(piTarget);
        message.append("\u201d instructions must have values.");
        err(message.toString());
    }

    /**
     * Reports that the value of the current pseudo-attribute of the current
     * instruction contains a less-than sign.
     */
    private void errAttributeValueContainsLt() throws SAXException {
        StringBuilder message = new StringBuilder();
        message.append("Found \u201c");
        message.append(piTarget);
        message.append("\u201d pseudo-attribute \u201c");
        message.append(attributeName);
        message.append("\u201d with the character \u201c<\u201d in its value. All pseudo-attribute values in \u201c");
        message.append(piTarget);
        message.append("\u201d instructions must not contain the character \u201c<\u201d.");
        err(message.toString());
    }

    /**
     * Reports a hexadecimal character reference introduced with an
     * upper-case X.
     */
    private void errUpperCaseXinHexNcr() throws SAXException {
        final String message = "In XML documents, a hexadecimal character reference must begin with "
                + "\u201c&#x\u201d (lowercase \u201cx\u201d), not \u201c&#X\u201d (uppercase \u201cX\u201d).";
        err(message);
    }

    /**
     * Checks the data part of an "xml-stylesheet" processing instruction and
     * reports problems through <code>err</code> and <code>warn</code>.
     *
     * The placement of the instruction is checked first (inside a DOCTYPE,
     * after an element). The data is then split into pseudo-attributes, each
     * of which is validated according to its name. Finally, constraints that
     * span several pseudo-attributes are checked.
     *
     * @param data the processing instruction data; null or empty data is
     *        treated as an instruction without any pseudo-attributes
     * @throws SAXException propagated from message reporting or from parsing
     *         the pseudo-attributes
     */
    private void checkXmlStylesheetPiData(String data) throws SAXException {
        boolean hasHref = false;
        boolean hasTitle = false;
        boolean hasMedia = false;
        boolean hasCharset = false;
        boolean hasAlternate = false;
        boolean hasNonEmptyTitle = false;
        boolean alternateIsYes = false;
        if (inDoctype) {
            warn("An \u201cxml-stylesheet\u201d instruction should not be used within a \u201cDOCTYPE\u201d declaration.");
        }
        if (alreadyHasElement) {
            err("Any \u201cxml-stylesheet\u201d instruction in a document must occur before any elements in the document. "
                    + "Suppressing any further errors for this \u201cxml-stylesheet\u201d instruction.");
            return;
        }
        // SAX allows the data argument to be null when no data was supplied,
        // so guard against that as well as against the empty string.
        if (data != null && !"".equals(data)) {
            Html5DatatypeLibrary dl = new Html5DatatypeLibrary();
            AttributesImpl patts = getPseudoAttributesFromPiData(data);
            // The parser leaves its attribute collection null when it never
            // completes an attribute name.
            int pattsLength = (patts == null) ? 0 : patts.getLength();
            String attrName;
            String attrValue;
            for (int i = 0; i < pattsLength; i++) {
                attrName = patts.getQName(i);
                attrValue = patts.getValue(i);
                switch (PseudoAttrName.toCaps(attrName)) {
                    case HREF:
                        hasHref = true;
                        if (attrValue == null) {
                            break;
                        }
                        try {
                            IriRef ir = (IriRef) dl.createDatatype("iri-ref");
                            ir.checkValid(attrValue);
                        } catch (DatatypeException e) {
                            try {
                                errBadPseudoAttrDatatype(e, IriRef.class,
                                        "href", attrValue);
                            } catch (ClassNotFoundException ignored) {
                                // Declared by the reporting helper; there is
                                // nothing further to report here.
                            }
                        }
                        break;
                    case TYPE:
                        if (attrValue == null) {
                            break;
                        }
                        try {
                            MimeType mt = (MimeType) dl.createDatatype("mime-type");
                            mt.checkValid(attrValue);
                            attrValue = newAsciiLowerCaseStringFromString(attrValue);
                        } catch (DatatypeException e) {
                            try {
                                errBadPseudoAttrDatatype(e, MimeType.class,
                                        "type", attrValue);
                            } catch (ClassNotFoundException ignored) {
                                // Declared by the reporting helper; there is
                                // nothing further to report here.
                            }
                            // An invalid MIME type cannot be classified, so
                            // skip the checks below for this attribute only.
                            break;
                        }
                        // The plus sign is escaped: unescaped it is a regex
                        // quantifier and "application/xslt+xml" never matches.
                        if (attrValue.matches("application/xml(;.*)?")
                                || attrValue.matches("text/xml(;.*)?")
                                || attrValue.matches("application/xslt\\+xml(;.*)?")
                                || attrValue.matches("text/xsl(;.*)?")
                                || attrValue.matches("text/xslt(;.*)?")) {
                            if (!attrValue.matches("text/xsl(;.*)?")) {
                                warn("For indicating XSLT, \u201ctext/xsl\u201d is the only MIME type for the "
                                        + "\u201cxml-stylesheet\u201d pseudo-attribute \u201ctype\u201d that is supported across browsers.");
                            }
                            if (hasXsltPi) {
                                warn("Browsers do not support multiple \u201cxml-stylesheet\u201d instructions with a "
                                        + "\u201ctype\u201d value that indicates XSLT.");
                            }
                            hasXsltPi = true;
                        } else if (!attrValue.matches("^text/css(;.*)?$")) {
                            warn("\u201ctext/css\u201d and \u201ctext/xsl\u201d are the only MIME types for the "
                                    + "\u201cxml-stylesheet\u201d pseudo-attribute \u201ctype\u201d that are supported across browsers.");
                        }
                        break;
                    case TITLE:
                        hasTitle = true;
                        if (attrValue == null) {
                            break;
                        }
                        if (!"".equals(attrValue)) {
                            hasNonEmptyTitle = true;
                        }
                        break;
                    case MEDIA:
                        hasMedia = true;
                        if (attrValue == null) {
                            break;
                        }
                        try {
                            MediaQuery mq = (MediaQuery) dl.createDatatype("media-query");
                            mq.checkValid(attrValue);
                        } catch (DatatypeException e) {
                            try {
                                errBadPseudoAttrDatatype(e, MediaQuery.class,
                                        "media", attrValue);
                            } catch (ClassNotFoundException ignored) {
                                // Declared by the reporting helper; there is
                                // nothing further to report here.
                            }
                        }
                        break;
                    case CHARSET:
                        hasCharset = true;
                        if (attrValue == null) {
                            break;
                        }
                        try {
                            Charset c = (Charset) dl.createDatatype("charset");
                            c.checkValid(attrValue);
                        } catch (DatatypeException e) {
                            try {
                                errBadPseudoAttrDatatype(e, Charset.class,
                                        "charset", attrValue);
                            } catch (ClassNotFoundException ignored) {
                                // Declared by the reporting helper; there is
                                // nothing further to report here.
                            }
                        }
                        break;
                    case ALTERNATE:
                        hasAlternate = true;
                        if (attrValue == null) {
                            break;
                        }
                        if ("yes".equals(attrValue)) {
                            alternateIsYes = true;
                        } else if (!"no".equals(attrValue)) {
                            err("The value of the \u201cxml-stylesheet\u201d pseudo-attribute \u201calternate\u201d "
                                    + "must be either \u201cyes\u201d or \u201cno\u201d.");
                        }
                        break;
                    default:
                        err("Pseudo-attribute \u201c"
                                + attrName
                                + "\u201D not allowed in \u201cxml-stylesheet\u201d instruction.");
                        break;
                }
            }
            if (alternateIsYes && !hasNonEmptyTitle) {
                err("An \u201cxml-stylesheet\u201d instruction with an \u201calternate\u201d pseudo-attribute "
                        + "whose value is \u201cyes\u201d must also have a \u201ctitle\u201d pseudo-attribute with a non-empty value.");
            }
        }
        if (!hasHref) {
            err("\u201cxml-stylesheet\u201d instruction lacks \u201chref\u201d pseudo-attribute. "
                    + "The \u201chref\u201d pseudo-attribute is required in all \u201cxml-stylesheet\u201d instructions.");
        }
        // NOTE(review): hasXsltPi is document-wide state, so this warning
        // also fires for a later non-XSLT instruction once any XSLT
        // instruction has been seen -- confirm that this is intended.
        if (hasXsltPi && (hasTitle || hasMedia || hasCharset || hasAlternate)) {
            warn("When processing \u201cxml-stylesheet\u201d instructions, browsers ignore the pseudo-attributes "
                    + "\u201ctitle\u201d, \u201cmedia\u201d, \u201ccharset\u201d, and \u201calternate\u201d.");
        } else if (hasCharset) {
            warn("Some browsers ignore the value of the \u201cxml-stylesheet\u201d pseudo-attribute \u201ccharset\u201d.");
        }
    }

    /**
     * Collect a set of attributes and values from the data part of a PI.
     * 
     * 

* The bulk of this method and associated methods that follow it here are * copied from the nu.validator.htmlparser.impl.Tokenizer class, with * appropriate modifications. *

* * @see nu.validator.htmlparser.impl.Tokenizer * @see nu.validator.htmlparser.impl.ErrorReportingTokenizer * */ private AttributesImpl getPseudoAttributesFromPiData(String buf) throws SAXException { int state = BEFORE_ATTRIBUTE_NAME; int returnState = BEFORE_ATTRIBUTE_NAME; char c = '\u0000'; int pos = -1; int endPos = buf.length(); boolean reconsume = false; attributes = null; attributeName = null; stateloop: for (;;) { switch (state) { case BEFORE_ATTRIBUTE_NAME: beforeattributenameloop: for (;;) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { break stateloop; } c = buf.charAt(pos); } /* * Consume the next input character: */ switch (c) { case '\n': case ' ': case '\t': continue; case '/': case '>': case '\"': case '\'': case '<': case '=': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS * SIGN (=) Parse error. */ errBadCharBeforeAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ default: /* * Anything else Start a new attribute in the * current tag token. */ /* * Set that attribute's name to the current * input character, */ clearStrBufAndAppendCurrentC(c); /* * and its value to the empty string. */ // Will do later. /* * Switch to the attribute name state. */ state = ATTRIBUTE_NAME; break beforeattributenameloop; // continue stateloop; } } // FALLTHRU DON'T REORDER case ATTRIBUTE_NAME: attributenameloop: for (;;) { if (++pos == endPos) { attributeNameComplete(); addAttributeWithoutValue(); break stateloop; } c = buf.charAt(pos); /* * Consume the next input character: */ switch (c) { case '\n': case ' ': case '\t': attributeNameComplete(); state = AFTER_ATTRIBUTE_NAME; continue stateloop; case '=': /* * U+003D EQUALS SIGN (=) Switch to the before * attribute value state. 
*/ attributeNameComplete(); state = BEFORE_ATTRIBUTE_VALUE; break attributenameloop; // continue stateloop; case '\"': case '\'': case '<': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) Parse error. */ errQuoteOrLtInAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ default: /* * Anything else Append the current input * character to the current attribute's name. */ appendStrBuf(c); /* * Stay in the attribute name state. */ continue; } } // FALLTHRU DON'T REORDER case BEFORE_ATTRIBUTE_VALUE: beforeattributevalueloop: for (;;) { if (++pos == endPos) { addAttributeWithoutValue(); break stateloop; } c = buf.charAt(pos); /* * Consume the next input character: */ switch (c) { case '\n': case ' ': case '\t': continue; case '"': /* * U+0022 QUOTATION MARK (") Switch to the * attribute value (double-quoted) state. */ clearLongStrBufForNextState(); state = ATTRIBUTE_VALUE_DOUBLE_QUOTED; break beforeattributevalueloop; // continue stateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the attribute * value (unquoted) state and reconsume this * input character. */ clearLongStrBuf(); state = ATTRIBUTE_VALUE_UNQUOTED; reconsume = true; continue stateloop; case '\'': /* * U+0027 APOSTROPHE (') Switch to the attribute * value (single-quoted) state. */ clearLongStrBufForNextState(); state = ATTRIBUTE_VALUE_SINGLE_QUOTED; continue stateloop; case '<': case '=': case '`': /* * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN * (=) U+0060 GRAVE ACCENT (`) */ errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); /* * Treat it as per the "anything else" entry * below. */ default: /* * Anything else Append the current input * character to the current attribute's value. */ clearLongStrBufAndAppendCurrentC(c); /* * Switch to the attribute value (unquoted) * state. 
*/ state = ATTRIBUTE_VALUE_UNQUOTED; continue stateloop; } } // FALLTHRU DON'T REORDER case ATTRIBUTE_VALUE_DOUBLE_QUOTED: attributevaluedoublequotedloop: for (;;) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { addAttributeWithoutValue(); break stateloop; } c = buf.charAt(pos); } /* * Consume the next input character: */ switch (c) { case '"': /* * U+0022 QUOTATION MARK (") Switch to the after * attribute value (quoted) state. */ addAttributeWithValue(); state = AFTER_ATTRIBUTE_VALUE_QUOTED; break attributevaluedoublequotedloop; // continue stateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * additional allowed character being U+0022 * QUOTATION MARK ("). */ clearStrBufAndAppendCurrentC(c); returnState = state; state = CONSUME_CHARACTER_REFERENCE; continue stateloop; case '\n': appendLongStrBufLineFeed(); continue; default: /* * Anything else Append the current input * character to the current attribute's value. */ appendLongStrBuf(c); /* * Stay in the attribute value (double-quoted) * state. */ continue; } } // FALLTHRU DON'T REORDER case AFTER_ATTRIBUTE_VALUE_QUOTED: for (;;) { if (++pos == endPos) { break stateloop; } c = buf.charAt(pos); /* * Consume the next input character: */ switch (c) { case '\n': case ' ': case '\t': state = BEFORE_ATTRIBUTE_NAME; continue stateloop; default: /* * Anything else Parse error. */ errNoSpaceBetweenAttributes(); /* * Reconsume the character in the before * attribute name state. 
*/ state = BEFORE_ATTRIBUTE_NAME; reconsume = true; continue stateloop; } } // FALLTHRU DON'T REORDER case ATTRIBUTE_VALUE_UNQUOTED: errUnquotedAttributeValOrNull(); for (;;) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { addAttributeWithValue(); break stateloop; } c = buf.charAt(pos); } /* * Consume the next input character: */ switch (c) { case '\n': case ' ': case '\t': addAttributeWithValue(); state = BEFORE_ATTRIBUTE_NAME; continue stateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * additional allowed character being U+003E * GREATER-THAN SIGN (>) */ clearStrBufAndAppendCurrentC(c); returnState = state; state = CONSUME_CHARACTER_REFERENCE; continue stateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ // addAttributeWithValue(); // state = emitCurrentTagToken(false, pos); // if (shouldSuspend) { // break stateloop; // } /* * Switch to the data state. */ continue stateloop; case '<': case '\"': case '\'': case '=': case '`': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. */ // errUnquotedAttributeValOrNull(c); /* * Treat it as per the "anything else" entry * below. */ // fall through default: /* * Anything else Append the current input * character to the current attribute's value. */ appendLongStrBuf(c); /* * Stay in the attribute value (unquoted) state. */ continue; } } // XXX reorder point case AFTER_ATTRIBUTE_NAME: for (;;) { if (++pos == endPos) { addAttributeWithoutValue(); break stateloop; } c = buf.charAt(pos); /* * Consume the next input character: */ switch (c) { case '\n': case ' ': case '\t': continue; case '=': /* * U+003D EQUALS SIGN (=) Switch to the before * attribute value state. 
*/ state = BEFORE_ATTRIBUTE_VALUE; continue stateloop; case '\"': case '\'': case '<': errQuoteOrLtInAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ default: addAttributeWithoutValue(); /* * Anything else Start a new attribute in the * current tag token. */ /* * Set that attribute's name to the current * input character, */ clearStrBufAndAppendCurrentC(c); /* * and its value to the empty string. */ // Will do later. /* * Switch to the attribute name state. */ state = ATTRIBUTE_NAME; continue stateloop; } } // XXX reorder point case ATTRIBUTE_VALUE_SINGLE_QUOTED: attributevaluesinglequotedloop: for (;;) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { addAttributeWithoutValue(); break stateloop; } c = buf.charAt(pos); } /* * Consume the next input character: */ switch (c) { case '\'': /* * U+0027 APOSTROPHE (') Switch to the after * attribute value (quoted) state. */ addAttributeWithValue(); state = AFTER_ATTRIBUTE_VALUE_QUOTED; continue stateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * + additional allowed character being U+0027 * APOSTROPHE ('). */ clearStrBufAndAppendCurrentC(c); returnState = state; state = CONSUME_CHARACTER_REFERENCE; break attributevaluesinglequotedloop; // continue stateloop; case '\n': appendLongStrBufLineFeed(); continue; default: /* * Anything else Append the current input * character to the current attribute's value. */ appendLongStrBuf(c); /* * Stay in the attribute value (double-quoted) * state. */ continue; } } // FALLTHRU DON'T REORDER case CONSUME_CHARACTER_REFERENCE: if (++pos == endPos) { break stateloop; } c = buf.charAt(pos); /* * Unlike the definition is the spec, this state does not * return a value and never requires the caller to * backtrack. This state takes care of emitting characters * or appending to the current attribute value. 
It also * takes care of that in the case when consuming the * character reference fails. */ /* * This section defines how to consume a character * reference. This definition is used when parsing character * references in text and in attributes. * * The behavior depends on the identity of the next * character (the one immediately after the U+0026 AMPERSAND * character): */ switch (c) { case '#': /* * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER * SIGN. */ appendStrBuf('#'); state = CONSUME_NCR; continue stateloop; default: if (c == additional) { emitOrAppendStrBuf(returnState); state = returnState; reconsume = true; continue stateloop; } entCol = -1; lo = 0; hi = (NAMES.length - 1); candidate = -1; strBufMark = 0; state = CHARACTER_REFERENCE_LOOP; reconsume = true; // FALL THROUGH continue stateloop; } // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER case CHARACTER_REFERENCE_LOOP: outer: for (;;) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { break stateloop; } c = buf.charAt(pos); } entCol++; /* * Consume the maximum number of characters possible, * with the consumed characters matching one of the * identifiers in the first column of the named * character references table (in a case-sensitive * manner). */ hiloop: for (;;) { if (hi == -1) { break hiloop; } if (entCol == NAMES[hi].length) { break hiloop; } if (entCol > NAMES[hi].length) { break outer; } else if (c < NAMES[hi][entCol]) { hi--; } else { break hiloop; } } loloop: for (;;) { if (hi < lo) { break outer; } if (entCol == NAMES[lo].length) { candidate = lo; strBufMark = strBufLen; lo++; } else if (entCol > NAMES[lo].length) { break outer; } else if (c > NAMES[lo][entCol]) { lo++; } else { break loloop; } } if (hi < lo) { break outer; } appendStrBuf(c); continue; } if (candidate == -1) { /* * If no match can be made, then this is a parse error. 
*/ errNoNamedCharacterMatch(); emitOrAppendStrBuf(returnState); state = returnState; reconsume = true; continue stateloop; } else { char[] candidateArr = NAMES[candidate]; if (candidateArr[candidateArr.length - 1] != ';') { /* * If the last character matched is not a U+003B * SEMICOLON (;), there is a parse error. */ if ((returnState & DATA_AND_RCDATA_MASK) != 0) { /* * If the entity is being consumed as part of an * attribute, and the last character matched is * not a U+003B SEMICOLON (;), */ char ch; if (strBufMark == strBufLen) { ch = c; } else { // if (strBufOffset != -1) { // ch = buf[strBufOffset + strBufMark]; // } else { ch = strBuf[strBufMark]; // } } if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) { /* * and the next character is in the range * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, * U+0041 LATIN CAPITAL LETTER A to U+005A * LATIN CAPITAL LETTER Z, or U+0061 LATIN * SMALL LETTER A to U+007A LATIN SMALL * LETTER Z, then, for historical reasons, * all the characters that were matched * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. */ errNoNamedCharacterMatch(); appendStrBufToLongStrBuf(); state = returnState; reconsume = true; continue stateloop; } } if ((returnState & DATA_AND_RCDATA_MASK) != 0) { errUnescapedAmpersandInterpretedAsCharacterReference(); } } /* * Otherwise, return a character token for the character * corresponding to the entity name (as given by the * second column of the named character references * table). */ char[] val = VALUES[candidate]; emitOrAppend(val, returnState); // this is so complicated! 
if (strBufMark < strBufLen) { // if (strBufOffset != -1) { // if ((returnState & (~1)) != 0) { // for (int i = strBufMark; i < strBufLen; i++) { // appendLongStrBuf(buf[strBufOffset + i]); // } // } else { // tokenHandler.characters(buf, strBufOffset // + strBufMark, strBufLen // - strBufMark); // } // } else { if ((returnState & DATA_AND_RCDATA_MASK) != 0) { for (int i = strBufMark; i < strBufLen; i++) { appendLongStrBuf(strBuf[i]); } } // } } state = returnState; reconsume = true; continue stateloop; /* * If the markup contains I'm ¬it; I tell you, the * entity is parsed as "not", as in, I'm ¬it; I tell * you. But if the markup was I'm ∉ I tell you, * the entity would be parsed as "notin;", resulting in * I'm ∉ I tell you. */ } // XXX reorder point case CONSUME_NCR: if (++pos == endPos) { break stateloop; } c = buf.charAt(pos); prevValue = -1; value = 0; seenDigits = false; /* * The behavior further depends on the character after the * U+0023 NUMBER SIGN: */ switch (c) { case 'x': /* * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL * LETTER X Consume the X. * * Follow the steps below, but using the range of * characters U+0030 DIGIT ZERO through to U+0039 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL * LETTER F (in other words, 0-9, A-F, a-f). * * When it comes to interpreting the number, * interpret it as a hexadecimal number. */ appendStrBuf(c); state = HEX_NCR_LOOP; continue stateloop; case 'X': /* * XML requires a lowercase 'x' for hex character * refs */ errUpperCaseXinHexNcr(); appendStrBuf(c); state = HEX_NCR_LOOP; continue stateloop; default: /* * Anything else Follow the steps below, but using * the range of characters U+0030 DIGIT ZERO through * to U+0039 DIGIT NINE (i.e. just 0-9). * * When it comes to interpreting the number, * interpret it as a decimal number. 
*/ state = DECIMAL_NRC_LOOP; reconsume = true; // FALL THROUGH continue stateloop; } // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER case DECIMAL_NRC_LOOP: decimalloop: for (;;) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { break stateloop; } c = buf.charAt(pos); } // Deal with overflow gracefully if (value < prevValue) { value = 0x110000; // Value above Unicode range but // within int // range } prevValue = value; /* * Consume as many characters as match the range of * characters given above. */ if (c >= '0' && c <= '9') { seenDigits = true; value *= 10; value += c - '0'; continue; } else if (c == ';') { if (seenDigits) { state = HANDLE_NCR_VALUE; // FALL THROUGH continue stateloop; break decimalloop; } else { errNoDigitsInNCR(); appendStrBuf(';'); emitOrAppendStrBuf(returnState); state = returnState; continue stateloop; } } else { /* * If no characters match the range, then don't * consume any characters (and unconsume the U+0023 * NUMBER SIGN character and, if appropriate, the X * character). This is a parse error; nothing is * returned. * * Otherwise, if the next character is a U+003B * SEMICOLON, consume that too. If it isn't, there * is a parse error. 
*/ if (!seenDigits) { errNoDigitsInNCR(); emitOrAppendStrBuf(returnState); state = returnState; reconsume = true; continue stateloop; } else { errCharRefLacksSemicolon(); state = HANDLE_NCR_VALUE; reconsume = true; // FALL THROUGH continue stateloop; break decimalloop; } } } // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER case HANDLE_NCR_VALUE: // WARNING previous state sets reconsume handleNcrValue(returnState); state = returnState; continue stateloop; // XXX reorder point case HEX_NCR_LOOP: for (;;) { if (++pos == endPos) { break stateloop; } c = buf.charAt(pos); // Deal with overflow gracefully if (value < prevValue) { value = 0x110000; // Value above Unicode range but // within int // range } prevValue = value; /* * Consume as many characters as match the range of * characters given above. */ if (c >= '0' && c <= '9') { seenDigits = true; value *= 16; value += c - '0'; continue; } else if (c >= 'A' && c <= 'F') { seenDigits = true; value *= 16; value += c - 'A' + 10; continue; } else if (c >= 'a' && c <= 'f') { seenDigits = true; value *= 16; value += c - 'a' + 10; continue; } else if (c == ';') { if (seenDigits) { state = HANDLE_NCR_VALUE; continue stateloop; } else { errNoDigitsInNCR(); appendStrBuf(';'); emitOrAppendStrBuf(returnState); state = returnState; continue stateloop; } } else { /* * If no characters match the range, then don't * consume any characters (and unconsume the U+0023 * NUMBER SIGN character and, if appropriate, the X * character). This is a parse error; nothing is * returned. * * Otherwise, if the next character is a U+003B * SEMICOLON, consume that too. If it isn't, there * is a parse error. 
*/ if (!seenDigits) { errNoDigitsInNCR(); emitOrAppendStrBuf(returnState); state = returnState; reconsume = true; continue stateloop; } else { errCharRefLacksSemicolon(); state = HANDLE_NCR_VALUE; reconsume = true; continue stateloop; } } } } } return attributes; } private void appendStrBufToLongStrBuf() { appendLongStrBuf(strBuf, 0, strBufLen); } private void appendLongStrBuf(char[] buffer, int offset, int length) { int reqLen = longStrBufLen + length; if (longStrBuf.length < reqLen) { char[] newBuf = new char[reqLen + (reqLen >> 1)]; System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); longStrBuf = newBuf; } System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length); longStrBufLen = reqLen; } private void appendLongStrBuf(char[] arr) { appendLongStrBuf(arr, 0, arr.length); } private void appendLongStrBuf(char c) { if (longStrBufLen == longStrBuf.length) { char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)]; System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length); longStrBuf = newBuf; } longStrBuf[longStrBufLen++] = c; } private void appendLongStrBufLineFeed() { appendLongStrBuf('\n'); } private void appendStrBuf(char c) { if (strBufLen == strBuf.length) { char[] newBuf = new char[strBuf.length + BUFFER_GROW_BY]; System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); strBuf = newBuf; } strBuf[strBufLen++] = c; } private void clearLongStrBufForNextState() { longStrBufLen = 0; } private void clearLongStrBuf() { longStrBufLen = 0; } private void clearLongStrBufAndAppendCurrentC(char c) { longStrBuf[0] = c; longStrBufLen = 1; // longStrBufOffset = pos; } private void clearStrBufAndAppendCurrentC(char c) { strBuf[0] = c; strBufLen = 1; } private void emitOrAppend(char[] val, int returnState) throws SAXException { if ((returnState & DATA_AND_RCDATA_MASK) != 0) { appendLongStrBuf(val); } } private void emitOrAppendOne(char[] val, int returnState) throws SAXException { if ((returnState & DATA_AND_RCDATA_MASK) != 0) { 
appendLongStrBuf(val[0]); } } private void emitOrAppendTwo(char[] val, int returnState) throws SAXException { if ((returnState & DATA_AND_RCDATA_MASK) != 0) { appendLongStrBuf(val[0]); appendLongStrBuf(val[1]); } } private void emitOrAppendStrBuf(int returnState) throws SAXException { if ((returnState & DATA_AND_RCDATA_MASK) != 0) { appendStrBufToLongStrBuf(); } } private String longStrBufToString() { return new String(longStrBuf, 0, longStrBufLen); } private void attributeNameComplete() throws SAXException { attributeName = new String(strBuf, 0, strBufLen).intern(); if (attributes == null) { attributes = new AttributesImpl(); } /* * When the user agent leaves the attribute name state (and before * emitting the tag token, if appropriate), the complete attribute's * name must be compared to the other attributes on the same token; if * there is already an attribute on the token with the exact same name, * then this is a parse error and the new attribute must be dropped, * along with the value that gets associated with it (if any). 
*/ for (int i = 0; i < attributes.getLength(); i++) { if (attributes.getQName(i).equals(attributeName)) { errDuplicateAttribute(); attributeName = null; return; } } } private void addAttributeWithValue() throws SAXException { if (attributeName != null) { String value = longStrBufToString(); if (value.indexOf('<') != -1) { errAttributeValueContainsLt(); return; } if (badCharInCandidateAttributeName()) { return; } attributes.addAttribute("", "", attributeName, "", value); attributeName = null; } } private void addAttributeWithoutValue() throws SAXException { if (attributeName != null) { if (badCharInCandidateAttributeName()) { return; } attributes.addAttribute("", "", attributeName, "", null); errAttributeWithNoValue(); attributeName = null; } } private boolean badCharInCandidateAttributeName() { return attributeName.indexOf('/') != -1 || attributeName.indexOf('>') != -1 || attributeName.indexOf('\"') != -1 || attributeName.indexOf('\'') != -1 || attributeName.indexOf('<') != -1 || attributeName.indexOf('=') != -1; } private void handleNcrValue(int returnState) throws SAXException { if (!isLegalXmlCharValue(value)) { errNcrIllegalValueForXml(); } else { /* * If one or more characters match the range, then take them all and * interpret the string of characters as a number (either * hexadecimal or decimal as appropriate). */ if (value <= 0xFFFF) { /* * about ((value <= 0x0008) || (value == 0x000B) || (value >= * 0x000E && value <= 0x001F)) -- we already check for * XML-illegal control characters in isLegalXmlCharValue */ if ((value & 0xF800) == 0xD800) { errNcrSurrogate(); emitOrAppendOne(REPLACEMENT_CHARACTER, returnState); } else { /* * Otherwise, return a character token for the Unicode * character whose code point is that number. 
*/ char ch = (char) value; if (value >= 0xFDD0 && value <= 0xFDEF) { errNcrUnassigned(); } else if ((value & 0xFFFE) == 0xFFFE) { ch = errNcrNonCharacter(ch); } else if (value >= 0x007F && value <= 0x009F) { errNcrControlChar(); } else { maybeWarnPrivateUse(ch); } bmpChar[0] = ch; emitOrAppendOne(bmpChar, returnState); } } else if (value <= 0x10FFFF) { maybeWarnPrivateUseAstral(); if ((value & 0xFFFE) == 0xFFFE) { errAstralNonCharacter(value); } astralChar[0] = (char) (LEAD_OFFSET + (value >> 10)); astralChar[1] = (char) (0xDC00 + (value & 0x3FF)); emitOrAppendTwo(astralChar, returnState); } else { errNcrOutOfRange(); emitOrAppendOne(REPLACEMENT_CHARACTER, returnState); } if ((value & 0xF800) == 0xD800) { errNcrSurrogate(); emitOrAppendOne(REPLACEMENT_CHARACTER, returnState); } else if (value <= 0xFFFF) { /* * Otherwise, return a character token for the Unicode character * whose code point is that number. */ char ch = (char) value; /* * if ((value <= 0x0008) || (value == 0x000B) || (value >= * 0x000E && value <= 0x001F)) { // we already check for * XML-illegal control // characters in isLegalXmlCharValue ch = * errNcrControlChar(ch); } */ if (value >= 0xFDD0 && value <= 0xFDEF) { errNcrUnassigned(); } else if ((value & 0xFFFE) == 0xFFFE) { ch = errNcrNonCharacter(ch); } else if (value >= 0x007F && value <= 0x009F) { errNcrControlChar(); } else { maybeWarnPrivateUse(ch); } bmpChar[0] = ch; emitOrAppendOne(bmpChar, returnState); } else if (value <= 0x10FFFF) { maybeWarnPrivateUseAstral(); astralChar[0] = (char) (LEAD_OFFSET + (value >> 10)); astralChar[1] = (char) (0xDC00 + (value & 0x3FF)); emitOrAppend(astralChar, returnState); } else { errNcrOutOfRange(); emitOrAppendOne(REPLACEMENT_CHARACTER, returnState); } } } private String toUPlusString(char c) { String hexString = Integer.toHexString(c); switch (hexString.length()) { case 1: return "U+000" + hexString; case 2: return "U+00" + hexString; case 3: return "U+0" + hexString; case 4: return "U+" + hexString; 
default: throw new RuntimeException("Unreachable."); } } private boolean isLegalXmlCharValue(int charval) { return charval == 0x0009 || charval == 0x000A || charval == 0x000D || (charval >= 0x0020 && charval <= 0xD7FF) || (charval >= 0xE000 && charval <= 0xFFFD) || (charval >= 0x10000 && charval <= 0x10FFFF); } private boolean isPrivateUse(char c) { return c >= '\uE000' && c <= '\uF8FF'; } private boolean isAstralPrivateUse(int c) { return (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD); } private void warnAboutPrivateUseChar() throws SAXException { if (!alreadyWarnedAboutPrivateUseCharacters) { warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)"); alreadyWarnedAboutPrivateUseCharacters = true; } } private void errBadCharBeforeAttributeNameOrNull(char c) throws SAXException { if (c == '=') { errEqualsSignBeforeAttributeName(); } else if (c != '\uFFFD') { errQuoteBeforeAttributeName(c); } else { err("The character \u201C" + c + "\u201D is not allowed in \u201C" + piTarget + "\u201D pseudo-attribute names."); } } private void errCharRefLacksSemicolon() throws SAXException { err("Character reference was not terminated by a semicolon."); } private void errDuplicateAttribute() throws SAXException { err("Duplicate \u201C" + piTarget + "\u201D pseudo-attribute \u201C" + attributeName + "\u201D."); } private void errEqualsSignBeforeAttributeName() throws SAXException { err("Saw \u201C=\u201D when expecting \u201C" + piTarget + "\u201D pseudo-attribute name. Probable cause: Pseudo-attribute name missing."); } private void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c) throws SAXException { switch (c) { case '=': err("\u201C=\u201D at the start of an unquoted \u201C" + piTarget + "\u201D pseudo-attribute value. 
Probable cause: Stray duplicate equals sign."); return; case '<': /* * we deal with this case in the attribute-value error- checking * code in the getPseudoAttributesFromPiData method */ // err("\u201C<\u201D at the start of an unquoted attribute value. Probable cause: Missing \u201C>\u201D immediately before."); return; case '`': err("\u201C`\u201D at the start of an unquoted \u201C" + piTarget + "\u201D pseudo-attribute value. Probable cause: Using the wrong character as a quote."); return; } } private void errNoSpaceBetweenAttributes() throws SAXException { err("Space is required between \u201C" + piTarget + "\u201D pseudo-attributes."); } private void errQuoteBeforeAttributeName(char c) throws SAXException { err("Saw \u201C" + c + "\u201D when expecting a pseudo-attribute name. Probable cause: \u201C=\u201D missing immediately before."); } private void errQuoteOrLtInAttributeNameOrNull(char c) throws SAXException { if (c != '\uFFFD') { err("Quote \u201C" + c + "\u201D in pseudo-attribute name. Probable cause: Matching quote missing somewhere earlier."); } } private void errUnquotedAttributeValOrNull() throws SAXException { err("Found unquoted value for \u201c" + piTarget + "\u201d pseudo-attribute \u201c" + attributeName + "\u201d. The value of all pseudo-attributes in \u201c" + piTarget + "\u201d instructions must be quoted."); } private void errNoNamedCharacterMatch() throws SAXException { if (getErrorHandler() == null) { return; } SAXParseException spe = new SAXParseException( "\u201C&\u201D did not start a character reference. 
(\u201C&\u201D probably should have been escaped as \u201C&\u201D.)", getDocumentLocator()); getErrorHandler().error(spe); } private void errNcrControlChar() throws SAXException { /* * warn instead of error because these control characters are legal in * XML */ warn("Character reference expands to a control character (" + toUPlusString((char) value) + ")."); } private void errNcrIllegalValueForXml() throws SAXException { err("Character reference expands to a character that is not legal in XML (" + toUPlusString((char) value) + ")."); } private void errNcrSurrogate() throws SAXException { err("Character reference expands to a surrogate."); } private void errNcrUnassigned() throws SAXException { err("Character reference expands to a permanently unassigned code point."); } private char errNcrNonCharacter(char ch) throws SAXException { err("Character reference expands to a non-character (" + toUPlusString((char) value) + ")."); return ch; } private void errNcrOutOfRange() throws SAXException { err("Character reference outside the permissible Unicode range."); } private void errNoDigitsInNCR() throws SAXException { err("No digits after \u201C" + new String(strBuf, 0, strBufLen) + "\u201D."); } private void errUnescapedAmpersandInterpretedAsCharacterReference() throws SAXException { if (getErrorHandler() == null) { return; } SAXParseException spe = new SAXParseException( "The string following \u201C&\u201D was interpreted as a character reference. 
(\u201C&\u201D probably should have been escaped as \u201C&\u201D.)", getDocumentLocator()); getErrorHandler().error(spe); } private void maybeWarnPrivateUse(char ch) throws SAXException { if (getErrorHandler() != null && isPrivateUse(ch)) { warnAboutPrivateUseChar(); } } private void maybeWarnPrivateUseAstral() throws SAXException { if (getErrorHandler() != null && isAstralPrivateUse(value)) { warnAboutPrivateUseChar(); } } private void errAstralNonCharacter(int ch) throws SAXException { err("Character reference expands to an astral non-character (" + toUPlusString((char) value) + ")."); } private static String newAsciiLowerCaseStringFromString(String str) { if (str == null) { return null; } char[] buf = new char[str.length()]; for (int i = 0; i < str.length(); i++) { char c = str.charAt(i); if (c >= 'A' && c <= 'Z') { c += 0x20; } buf[i] = c; } return new String(buf); } private static String newAsciiUpperCaseStringFromString(String str) { if (str == null) { return null; } char[] buf = new char[str.length()]; for (int i = 0; i < str.length(); i++) { char c = str.charAt(i); if (c >= 'a' && c <= 'z') { c -= 0x20; } buf[i] = c; } return new String(buf); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy