// Source listing of de.unkrig.commons.text.scanner.XmlScanner (artifact available for Maven / Gradle / Ivy download).

/*
 * de.unkrig.commons - A general-purpose Java class library
 *
 * Copyright (c) 2017, Arno Unkrig
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 *       following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 *       following disclaimer in the documentation and/or other materials provided with the distribution.
 *    3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package de.unkrig.commons.text.scanner;

/**
 * A scanner for XML.
 */
public final
class XmlScanner {

    private XmlScanner() {}

    // PUBLIC INTERFACE

    private
    enum State { TAG }

    /**
     * The token types that form an XML document.
     */
    public
    enum TokenType {

        /**
         * {@code XMLDecl      ::= ''}

         * {@code VersionInfo  ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')}

         * {@code Eq           ::= S? '=' S?}

         * {@code VersionNum   ::= '1.' [0-9]+}

         * {@code EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )}

         * {@code EncName      ::= [A-Za-z] ([A-Za-z0-9._] | '-')*}

         * {@code SDDecl       ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))}
         */
        XML_DECLARATION,

        /** {@code Comment ::= ''} */
        COMMENT,

        /**
         * Example:
         * 
         *   {@code }
         * 
         *
         * {@code PI       ::= '' Char*)))? '?>' }

         * {@code PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))}
         */
        PROCESSING_INSTRUCTION,

        /**
         * {@code doctypedecl   ::= ''}

         * {@code ExternalID    ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral}

         * {@code SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")}

         * {@code PubidLiteral  ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"}

         * {@code PubidChar     ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]}

         * {@code intSubset     ::= (markupdecl | DeclSep)*}

         * {@code markupdecl    ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment}

         * {@code elementdecl   ::= ''}

         * {@code contentspec   ::= 'EMPTY' | 'ANY' | Mixed | children}

         * {@code Mixed         ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')'}

         * {@code children      ::= (choice | seq) ('?' | '*' | '+')?}

         * {@code cp            ::= (Name | choice | seq) ('?' | '*' | '+')?}

         * {@code choice        ::= '(' S? cp ( S? '|' S? cp )+ S? ')'}

         * {@code seq           ::= '(' S? cp ( S? ',' S? cp )* S? ')'}
         */
        DOCUMENT_TYPE_DECLARATION,

        /**
         * The beginning of an empty element tag or a start tag (but not the beginning of an end
         * tag!).
         * 
         *   {@code StartTag        ::= BeginTag Attribute* EndStartTag}

         *   {@code EmptyElementTag ::= BeginTag Attribute* EndEmptyElementTag}

         *   {@code BeginTag        ::= '<' Name}

         *   {@code Attribute       ::= S AttributeName Eq AttributeValue}
         * 
         */
        BEGIN_TAG,

        /**
         * {@code AttributeName ::= S Name}
         *
         * @see #BEGIN_TAG
         */
        ATTRIBUTE_NAME,

        /**
         * {@code AttributeValue  ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"}
         *
         * @see #BEGIN_TAG
         */
        ATTRIBUTE_VALUE,

        /**
         * The end of a start tag.
         * {@code EndOfStartTag ::= S? '>'}
         *
         * @see #BEGIN_TAG
         */
        END_START_TAG,

        /**
         * The end of an empty element tag.
         * {@code EndOfEmptyElementTag ::= S? '/>'}
         *
         * @see #BEGIN_TAG
         */
        END_EMPTY_ELEMENT_TAG,

        /** {@code ETag ::= ''} */
        END_TAG,

        /** {@code CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)} */
        CHAR_DATA,

        /** {@code EntityRef ::= '&' Name ';'} */
        ENTITY_REFERENCE,

        /** {@code CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'} */
        CHARACTER_REFERENCE,

        /**
         * {@code CDSect  ::= CDStart CData CDEnd}

         * {@code CDStart ::= '
         * {@code CData   ::= (Char* - (Char* ']]>' Char*))}

         * {@code CDEnd   ::= ']]>'}
         */
        CDATA_SECTION,
    }

    /**
     * Creates a {@link StringScanner} that scans XML documents.
     */
    public static StringScanner
    stringScanner() {
        StatefulScanner scanner = new StatefulScanner(State.class);

        final String s                  = "(?:[ \\t\\r\\n]+)";
        final String eq                 = "(?:" + s + "?=" + s + "?)";
        final String nameStartChar      = "[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]"; // SUPPRESS CHECKSTYLE LineLength
        final String nameChar           = "(?:" + nameStartChar + "|[\\-.0-9\u00B7\u0300-\u036F\u203F-\u2040])";
        final String name               = "(?:" + nameStartChar + nameChar + "*)";
        final String entityReference    = "(?:&" + name + ";)";
        final String characterReference = "(?:&#[0-9]+|&#x[0-9a-zA-Z]+;)";
        final String reference          = "(?:" + entityReference + "|" + characterReference + ")";
        final String attributeValue     = "(?:'(?:[^<&\"]|" + reference + ")*'|\"(?:[^<&\"]|" + reference + ")*\")";

        scanner.addRule("", TokenType.COMMENT);

        // The ""
        ).replace("@S@", s).replace("@Eq@", eq), TokenType.XML_DECLARATION);
        scanner.addRule((
            ""
            + "<\\?(@Name@)"
            + "(@S@.*?)?"
            + "\\?>"
        ).replace("@Name@", name).replace("@S@", s), TokenType.PROCESSING_INSTRUCTION);

        scanner.addRule("", TokenType.DOCUMENT_TYPE_DECLARATION);

        scanner.addRule("<(@Name@)".replace("@Name@", name), TokenType.BEGIN_TAG).goTo(State.TAG);

        scanner.addRule(State.TAG, s + "(" + name + ")", TokenType.ATTRIBUTE_NAME).goTo(State.TAG);

        scanner.addRule(State.TAG, eq + "(" + attributeValue + ")", TokenType.ATTRIBUTE_VALUE).goTo(State.TAG);

        scanner.addRule(State.TAG, "@S@?>".replace("@S@", s), TokenType.END_START_TAG);

        scanner.addRule(State.TAG, "@S@?/>".replace("@S@", s), TokenType.END_EMPTY_ELEMENT_TAG);

        scanner.addRule((
            ""
            + ""
        ).replace("@Name@", name).replace("@S@", s), TokenType.END_TAG);

        scanner.addRule("(?:[^<&](?!-->))+", TokenType.CHAR_DATA);

        scanner.addRule(entityReference, TokenType.ENTITY_REFERENCE);

        scanner.addRule(characterReference, TokenType.CHARACTER_REFERENCE);

        scanner.addRule("(?:)", TokenType.CDATA_SECTION);

        return scanner;
    }

    /**
     * Must only be used for subpatterns that consume neither an apostrophe nor a quote, e.g. {@code
     * "1\\.[0-9]+"} is ok, but {@code ".*"} is not.
     */
    private static String
    quoted(String subpattern) {
        return (
            ""
            + "(?:"
            +   "'(?:" + subpattern + ")*'"
            +   "|"
            +   "\"(?:" + subpattern + ")*\""
            + ")"
        );
    }
}