All downloads are free. The search and download functionality uses the official Maven repository.

de.unkrig.commons.text.scanner.XmlScanner Maven / Gradle / Ivy


/*
 * de.unkrig.commons - A general-purpose Java class library
 *
 * Copyright (c) 2017, Arno Unkrig
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 *       following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 *       following disclaimer in the documentation and/or other materials provided with the distribution.
 *    3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package de.unkrig.commons.text.scanner;

/**
 * A scanner for XML.
 */
public final
class XmlScanner {

    private XmlScanner() {}

    // PUBLIC INTERFACE

    private
    enum State { TAG }

    /**
     * The token types that form an XML document.
     */
    public
    enum TokenType {

        /**
         * {@code XMLDecl      ::= ''}
* {@code VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')}
* {@code Eq ::= S? '=' S?}
* {@code VersionNum ::= '1.' [0-9]+}
* {@code EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )}
* {@code EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*}
* {@code SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))} */ XML_DECLARATION, /** {@code Comment ::= ''} */ COMMENT, /** * Example: *

* {@code } *

* * {@code PI ::= '' Char*)))? '?>' }
* {@code PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))} */ PROCESSING_INSTRUCTION, /** * {@code doctypedecl ::= ''}
* {@code ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral}
* {@code SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")}
* {@code PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"}
* {@code PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]}
* {@code intSubset ::= (markupdecl | DeclSep)*}
* {@code markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment}
* {@code elementdecl ::= ''}
* {@code contentspec ::= 'EMPTY' | 'ANY' | Mixed | children}
* {@code Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')'}
* {@code children ::= (choice | seq) ('?' | '*' | '+')?}
* {@code cp ::= (Name | choice | seq) ('?' | '*' | '+')?}
* {@code choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'}
* {@code seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'} */ DOCUMENT_TYPE_DECLARATION, /** * The beginning of an empty element tag or a start tag (but not the beginning of an end * tag!). *

* {@code StartTag ::= BeginTag Attribute* EndStartTag}
* {@code EmptyElementTag ::= BeginTag Attribute* EndEmptyElementTag}
* {@code BeginTag ::= '<' Name}
* {@code Attribute ::= S AttributeName Eq AttributeValue} *

*/ BEGIN_TAG, /** * {@code AttributeName ::= S Name} * * @see #BEGIN_TAG */ ATTRIBUTE_NAME, /** * {@code AttributeValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"} * * @see #BEGIN_TAG */ ATTRIBUTE_VALUE, /** * The end of a start tag. *

{@code EndOfStartTag ::= S? '>'}

* * @see #BEGIN_TAG */ END_START_TAG, /** * The end of an empty element tag. *

{@code EndOfEmptyElementTag ::= S? '/>'}

* * @see #BEGIN_TAG */ END_EMPTY_ELEMENT_TAG, /** {@code ETag ::= ''} */ END_TAG, /** {@code CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)} */ CHAR_DATA, /** {@code EntityRef ::= '&' Name ';'} */ ENTITY_REFERENCE, /** {@code CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'} */ CHARACTER_REFERENCE, /** * {@code CDSect ::= CDStart CData CDEnd}
* {@code CDStart ::= ' * {@code CData ::= (Char* - (Char* ']]>' Char*))}
* {@code CDEnd ::= ']]>'} */ CDATA_SECTION, } /** * Creates a {@link StringScanner} that scans XML documents. */ public static StringScanner stringScanner() { StatefulScanner scanner = new StatefulScanner(State.class); final String s = "(?:[ \\t\\r\\n]+)"; final String eq = "(?:" + s + "?=" + s + "?)"; final String nameStartChar = "[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]"; // SUPPRESS CHECKSTYLE LineLength final String nameChar = "(?:" + nameStartChar + "|[\\-.0-9\u00B7\u0300-\u036F\u203F-\u2040])"; final String name = "(?:" + nameStartChar + nameChar + "*)"; final String entityReference = "(?:&" + name + ";)"; final String characterReference = "(?:&#[0-9]+|&#x[0-9a-zA-Z]+;)"; final String reference = "(?:" + entityReference + "|" + characterReference + ")"; final String attributeValue = "(?:'(?:[^<&\"]|" + reference + ")*'|\"(?:[^<&\"]|" + reference + ")*\")"; scanner.addRule("", TokenType.COMMENT); // The "" ).replace("@S@", s).replace("@Eq@", eq), TokenType.XML_DECLARATION); scanner.addRule(( "" + "<\\?(@Name@)" + "(@S@.*?)?" 
+ "\\?>" ).replace("@Name@", name).replace("@S@", s), TokenType.PROCESSING_INSTRUCTION); scanner.addRule("", TokenType.DOCUMENT_TYPE_DECLARATION); scanner.addRule("<(@Name@)".replace("@Name@", name), TokenType.BEGIN_TAG).goTo(State.TAG); scanner.addRule(State.TAG, s + "(" + name + ")", TokenType.ATTRIBUTE_NAME).goTo(State.TAG); scanner.addRule(State.TAG, eq + "(" + attributeValue + ")", TokenType.ATTRIBUTE_VALUE).goTo(State.TAG); scanner.addRule(State.TAG, "@S@?>".replace("@S@", s), TokenType.END_START_TAG); scanner.addRule(State.TAG, "@S@?/>".replace("@S@", s), TokenType.END_EMPTY_ELEMENT_TAG); scanner.addRule(( "" + "" ).replace("@Name@", name).replace("@S@", s), TokenType.END_TAG); scanner.addRule("(?:[^<&](?!-->))+", TokenType.CHAR_DATA); scanner.addRule(entityReference, TokenType.ENTITY_REFERENCE); scanner.addRule(characterReference, TokenType.CHARACTER_REFERENCE); scanner.addRule("(?:)", TokenType.CDATA_SECTION); return scanner; } /** * Must only be used for subpatterns that consume neither an apostrophe nor a quote, e.g. {@code * "1\\.[0-9]+"} is ok, but {@code ".*"} is not. */ private static String quoted(String subpattern) { return ( "" + "(?:" + "'(?:" + subpattern + ")*'" + "|" + "\"(?:" + subpattern + ")*\"" + ")" ); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy