com.tangosol.run.xml.SimpleParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of coherence Show documentation
Oracle Coherence Community Edition
There is a newer version: 24.09
/*
 * Copyright (c) 2000, 2020, Oracle and/or its affiliates.
 *
 * Licensed under the Universal Permissive License v 1.0 as shown at
 * http://oss.oracle.com/licenses/upl.
 */
package com.tangosol.run.xml;

import com.oracle.coherence.common.base.Logger;

import com.tangosol.coherence.config.Config;

import com.tangosol.dev.compiler.CompilerException;
import com.tangosol.dev.compiler.SyntaxException;

import com.tangosol.util.Base;
import com.tangosol.util.ErrorList;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

/**
* This class uses the XmlTokenizer to produce an XmlDocument from XML text.
*
* @version 1.00, 2001.07.16
* @author  Cameron Purdy
*/
public class SimpleParser
        extends Base
    {
    // ----- construction ---------------------------------------------------

    /**
    * Construct an XML SimpleParser that requests validation.  This simply
    * delegates to {@link #SimpleParser(boolean)} with <tt>true</tt>, so if
    * the XML contains an XSD reference, the parser will validate using the
    * provided XSD (unless validation has been disabled globally).
    */
    public SimpleParser()
        {
        this(true);
        }

    /**
    * Construct an XML SimpleParser, optionally requesting XSD validation.
    * Validation is only performed when it is requested here and the
    * "coherence.xml.validation.disable" configuration property is not set
    * to true; when the property overrides a request, an informational
    * message is logged.
    *
    * @param fValidate  if true, validate XML if it contains
    *                   an XSD reference
    */
    public SimpleParser(boolean fValidate)
        {
        // hidden system property to disable all validation
        boolean fAllowed = !Config.getBoolean("coherence.xml.validation.disable");
        if (fValidate && !fAllowed)
            {
            Logger.info("XML validation disabled");
            }

        m_fValidate = fAllowed && fValidate;
        }

    /**
    * Internal initialization: discard the current token and the tokenizer
    * left over from any previous parse.
    */
    protected void init()
        {
        m_token = null;
        m_toker = null;
        }


    // ----- public interface -----------------------------------------------

    /**
    * Parse the specified String into a newly instantiated XmlDocument
    * object.  Delegates to {@link #parseXml(String, XmlDocument)} without
    * supplying a target document.
    *
    * @param sXml the String to parse
    *
    * @return an XmlDocument object
    *
    * @throws IOException  if I/O error occurs
    */
    public XmlDocument parseXml(String sXml)
            throws IOException
        {
        // no target document is supplied, so one will be created
        return parseXml(sXml, (XmlDocument) null);
        }

    /**
    * Parse the specified Reader into an XmlDocument object.  The complete
    * content of the Reader is read into a String first and then parsed.
    *
    * @param reader  the Reader object
    *
    * @return an XmlDocument object
    *
    * @throws IOException  if I/O error occurs
    */
    public XmlDocument parseXml(Reader reader)
            throws IOException
        {
        String sXml = read(reader);
        return parseXml(sXml, (XmlDocument) null);
        }

    /**
    * Parse the specified InputStream into an XmlDocument object.
    * <p>
    * The stream content is first copied into memory so that it can be read
    * more than once.  It is parsed as UTF-8 (falling back to the platform
    * default encoding if UTF-8 is unsupported); if the parsed document
    * declares a different encoding, the content is parsed a second time
    * using the declared encoding.  If the declared encoding is not
    * supported, a warning is logged and the result of the first parse is
    * returned.
    *
    * @param stream  the InputStream object
    *
    * @return an XmlDocument object
    *
    * @throws IOException  if I/O error occurs
    */
    public XmlDocument parseXml(InputStream stream)
            throws IOException
        {
        // in-memory copy of the content; it can be rewound for a re-parse
        InputStream in = new ByteArrayInputStream(read(stream));
        try
            {
            in.mark(0);

            XmlDocument doc;
            try
                {
                // first attempt: the XML default encoding (UTF-8)
                doc = parseXml(in, XML_DEFAULT_ENCODING);
                }
            catch (UnsupportedEncodingException e)
                {
                // in the unlikely event that UTF-8 isn't supported, try
                // the default (platform specific) encoding
                in.reset();
                doc = parseXml(new InputStreamReader(in));
                }

            String sEncoding = doc.getEncoding();
            if (sEncoding == null || sEncoding.equalsIgnoreCase(XML_DEFAULT_ENCODING))
                {
                // nothing declared, or it matches what was already used
                return doc;
                }

            // rewind and reparse the document with the declared encoding
            in.reset();
            try
                {
                return parseXml(in, sEncoding);
                }
            catch (UnsupportedEncodingException e)
                {
                // proceeding with the document from the first parse
                Logger.warn("Could not parse XML with encoding " + sEncoding);
                return doc;
                }
            }
        finally
            {
            in.close();
            }
        }

    /**
    * Parse the specified InputStream into an XmlDocument object using
    * the specified charset.  A leading UTF-8 byte order mark, if present,
    * is skipped before the bytes are decoded.
    *
    * @param stream    the InputStream object
    * @param sCharset  the charset name
    *
    * @return an XmlDocument object
    *
    * @throws IOException  if I/O error occurs (this includes an
    *                      UnsupportedEncodingException for an unknown
    *                      charset name)
    */
    public XmlDocument parseXml(InputStream stream, String sCharset)
            throws IOException
        {
        InputStream streamNoBom = skipBOM(stream);
        Reader      reader      = new InputStreamReader(streamNoBom, sCharset);
        return parseXml(reader);
        }

    /**
    * Parse the passed XML text into the passed XmlDocument, creating a new
    * document if none is passed.  If validation is enabled for this parser,
    * the XML text is additionally validated by a SaxParser after it has
    * been parsed.
    * <p>
    * A CompilerException raised while parsing is rethrown via
    * <tt>ensureRuntimeException</tt>; any other parsing failure, and any
    * validation failure, is reported as an IOException that carries the
    * original exception as its cause.
    *
    * @param sXml  the script to parse (as a string); must not be null
    * @param xml   the XML document object to parse into, or null to have
    *              one instantiated
    *
    * @return the XmlDocument object
    *
    * @throws IOException  if I/O error occurs
    */
    public synchronized XmlDocument parseXml(String sXml, XmlDocument xml)
            throws IOException
        {
        azzert(sXml != null);

        XmlDocument doc = xml == null ? instantiateDocument() : xml;

        // reset the parser state and collect tokenizer errors separately
        init();
        ErrorList errors = new ErrorList();

        try
            {
            m_toker = new XmlTokenizer(sXml, errors);
            m_token = next();
            parseDocument(doc);
            }
        catch (CompilerException e)
            {
            throw ensureRuntimeException(e, "Exception occurred during parsing: " + e.getMessage());
            }
        catch (Exception e)
            {
            StringBuilder sbMsg = new StringBuilder("Exception occurred during parsing: ")
                    .append(e.getMessage());
            if (!errors.isEmpty())
                {
                sbMsg.append("\nLogged errors:\n").append(errors);
                }
            throw new IOException(sbMsg.toString(), e);
            }

        if (!m_fValidate)
            {
            return doc;
            }

        try
            {
            // attempt to validate the XML if a schema Location
            // is specified, using sax parser
            new SaxParser().validateXsd(sXml, doc);
            }
        catch (Exception e)
            {
            throw new IOException("Exception occurred during schema validation: \n"
                    + e.getMessage(), e);
            }

        return doc;
        }


    // ----- factory methods ------------------------------------------------

    /**
    * Factory method to instantiate an XmlDocument implementation.  This
    * implementation returns a new SimpleDocument; it is used when a parse
    * is requested without a target document.
    *
    * @return an object implementing XmlDocument
    */
    protected XmlDocument instantiateDocument()
        {
        return new SimpleDocument();
        }


    // ----- unit test ------------------------------------------------------

    /**
    * Unit test: read an XML file, parse it, print the parsed document,
    * parse the printed form again and compare the two results.
    *
    * @param asArgs  the string array arguments; the first argument, if
    *                present, is the name of the XML file to use (the
    *                default is "TestXml.xml")
    */
    public static void main(String[] asArgs)
        {
        try
            {
            // the file name argument is optional; check for it explicitly
            // instead of relying on a swallowed exception
            String sFile = asArgs != null && asArgs.length > 0
                    ? asArgs[0]
                    : "TestXml.xml";

            File file = new File(sFile);
            azzert(file.exists() && file.canRead());

            out();
            out("Original Document:");
            String sXml = new String(read(file));
            out(sXml);

            out();
            out("Parsing ...");
            XmlDocument xml = new SimpleParser().parseXml(sXml);

            out();
            out("Parsed Document:");
            sXml = xml.toString();
            out(sXml);

            out();
            out("Parsing the Parsed Document...");
            XmlDocument xml2 = new SimpleParser().parseXml(sXml);

            out();
            out("Re-parsed Document:");
            String sXml2 = xml2.toString();
            out(sXml2);

            // both the document objects and their printed forms should match
            out();
            out("Comparing:");
            trace(xml.equals(xml2));
            trace(sXml.equals(sXml2));
            }
        catch (Exception e)
            {
            out("Exception occurred in test: " + e);
            out(e);
            }
        }


    // ----- implementation -------------------------------------------------

    /**
    * Parse the pseudo-attributes of an XML declaration: the mandatory
    * <tt>version</tt>, the optional <tt>encoding</tt> and the optional
    * <tt>standalone</tt>, followed by the token that closes the
    * declaration.  A declared encoding is validated and stored on the
    * passed document; the version and standalone values are checked and
    * then discarded.
    * <p>
    * NOTE(review): the method name and the first group of grammar comments
    * describe an entire document, but the visible body only handles the XML
    * declaration, and text appears to be missing around the "check for"
    * comment below.  Confirm against the upstream source.  Presumably the
    * tokens that open the declaration have already been consumed when the
    * body runs - TODO confirm.
    *
    * @param xml  the XmlDocument that receives the declared encoding
    *
    * @throws CompilerException  if the declaration is malformed or the
    *                            tokenizer reports an error
    */
    protected void parseDocument(XmlDocument xml)
            throws CompilerException
        {
        // document    ::= prolog element Misc*
        // prolog      ::= XMLDecl? Misc* (doctypedecl Misc*)?
        // XMLDecl     ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
        // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
        // Misc        ::= Comment | PI | S
        // Comment     ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
        // PI          ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

        // check for "<?xml"
        // NOTE(review): statements appear to be missing here (see above)

        // XMLDecl      ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
        // VersionInfo  ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
        // VersionNum   ::= ([a-zA-Z0-9_.:] | '-')+
        // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
        // EncName      ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
        // SDDecl       ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))

        if (peek("version") != null)
            {
            // version is assumed to be "1.0"
            match(XmlToken.EQUALS);
            match(XmlToken.LITERAL);
            }
        else
            {
            throw new SyntaxException("The version value is"
                + " missing from the XML declaration");
            }

        if (peek("encoding") != null)
            {
            match(XmlToken.EQUALS);
            String sValue = match(XmlToken.LITERAL).getText();
            if (!XmlHelper.isEncodingValid(sValue))
                {
                throw new SyntaxException("The encoding value in"
                    + " the XML declaration is illegal (" + sValue + ")");
                }
            xml.setEncoding(sValue);
            }

        if (peek("standalone") != null)
            {
            // standalone is discarded
            match(XmlToken.EQUALS);
            String sValue = match(XmlToken.LITERAL).getText();
            if (!(sValue.equals("yes") || sValue.equals("no")))
                {
                throw new SyntaxException("The standalone value in"
                    + " the XML declaration must be 'yes' or 'no'");
                }
            }

        // the declaration must be closed by "?>"
        match(XmlToken.PI_STOP);
        }

    /**
    * Parse a document type declaration.  The root element name, and the
    * public and system identifiers if present, are stored on the passed
    * document; an inline markup declaration section is skipped without
    * being interpreted.
    * <p>
    * Presumably the token that opens the DOCTYPE declaration has already
    * been consumed by the caller - TODO confirm.
    *
    * @param xml  the XML document
    *
    * @throws CompilerException  if compiler error occurs
    */
    protected void parseDoctype(XmlDocument xml)
            throws CompilerException
        {
        // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
        // ExternalID  ::= 'SYSTEM' S SystemLiteral
        //                 'PUBLIC' S PubidLiteral S SystemLiteral

        // the declaration starts with the root element name
        XmlToken tokenRoot = match(XmlToken.NAME);
        xml.setName(tokenRoot.getText());

        // ExternalID (optional): "PUBLIC" is followed by a public identifier
        boolean fHasPublicId = peek("PUBLIC") != null;
        if (fHasPublicId)
            {
            String sPublicId = XmlHelper.decodeAttribute(match(XmlToken.LITERAL).getText());
            if (!XmlHelper.isPublicIdentifierValid(sPublicId))
                {
                throw new SyntaxException("The public identifier in"
                    + " the XML DOCTYPE is invalid (" + sPublicId + ")");
                }
            xml.setDtdName(sPublicId);
            }

        // ExternalID (optional): a system identifier follows either a
        // public identifier or the keyword "SYSTEM"
        boolean fHasSystemId = fHasPublicId || peek("SYSTEM") != null;
        if (fHasSystemId)
            {
            String sSystemId = XmlHelper.decodeAttribute(match(XmlToken.LITERAL).getText());
            sSystemId = XmlHelper.decodeUri(sSystemId);
            if (!XmlHelper.isSystemIdentifierValid(sSystemId))
                {
                throw new SyntaxException("The system identifier in"
                    + " the XML DOCTYPE is invalid (" + sSystemId + ")");
                }
            xml.setDtdUri(sSystemId);
            }

        // ignore inline markup decl: discard tokens up to and including
        // the token that ends the section
        if (peek(XmlToken.DTD_DECL_START) != null)
            {
            XmlToken tokenSkipped;
            do
                {
                tokenSkipped = current();
                }
            while (tokenSkipped.getID() != XmlToken.DTD_DECL_STOP);
            }

        match(XmlToken.ELEMENT_STOP);
        }

    /**
    * Parse an element: its attributes and, unless it is an empty element,
    * its content (comments, processing instructions, character data and
    * child elements) up to and including the matching end tag.
    * <p>
    * Character data chunks are concatenated and, if any were found, set as
    * the element's value when the end tag is reached.  Regular character
    * data is trimmed and decoded and only contributes if it is not empty;
    * raw character data is appended unchanged.  Content tokens of any other
    * kind are skipped.
    * <p>
    * Note: '<' and element name have already been parsed
    *
    * @param xml  the XML element
    *
    * @throws CompilerException  if compiler error occurs
    */
    protected void parseElement(XmlElement xml)
            throws CompilerException
        {
        // element      ::= EmptyElemTag
        //                  STag content ETag
        // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
        // STag         ::= '<' Name (S Attribute)* S? '>'
        // Attribute    ::= Name Eq AttValue
        // ETag         ::= '</' Name S? '>'

        // parse attributes: "name = literal" pairs for as long as the
        // current token is a name
        for (XmlToken tokenAttr = peek(XmlToken.NAME); tokenAttr != null;
                tokenAttr = peek(XmlToken.NAME))
            {
            String sAttrName = tokenAttr.getText();
            if (!XmlHelper.isNameValid(sAttrName))
                {
                throw new SyntaxException("Illegal attribute name: " + sAttrName);
                }
            match(XmlToken.EQUALS);
            String sAttrValue = XmlHelper.decodeAttribute(match(XmlToken.LITERAL).getText());
            xml.addAttribute(sAttrName).setString(sAttrValue);
            }

        // an empty element has no content and no end tag
        if (peek(XmlToken.EMPTY_STOP) != null)
            {
            return;
            }

        // this element is the "content" type (not empty)
        match(XmlToken.ELEMENT_STOP);

        String sContent = null;
        for (;;)
            {
            XmlToken tokenContent = current();
            int      nId          = tokenContent.getID();

            if (nId == XmlToken.COMMENT_START)
                {
                parseComment(xml);
                }
            else if (nId == XmlToken.PI_START)
                {
                parsePi(xml);
                }
            else if (nId == XmlToken.CHARDATA)
                {
                String sText = XmlHelper.trim((String) tokenContent.getValue());
                sText = XmlHelper.decodeContent(sText);
                if (sText.length() > 0)
                    {
                    sContent = sContent == null ? sText : sContent + sText;
                    }
                }
            else if (nId == XmlToken.CHARDATA_RAW)
                {
                String sRaw = (String) tokenContent.getValue();
                sContent = sContent == null ? sRaw : sContent + sRaw;
                }
            else if (nId == XmlToken.ELEMENT_START)
                {
                // nested element: '<' is consumed, the name comes next
                XmlElement xmlChild = xml.addElement(match(XmlToken.NAME).getText());
                parseElement(xmlChild);
                }
            else if (nId == XmlToken.ENDTAG_START)
                {
                if (sContent != null)
                    {
                    xml.setString(sContent);
                    }
                // the end tag must carry this element's name
                match(xml.getName());
                match(XmlToken.ELEMENT_STOP);
                return;
                }
            }
        }

    /**
    * Parse any run of comments and processing instructions at the current
    * position.  Comments are treated as document-level comments.  Parsing
    * stops at the first token that starts neither a comment nor a
    * processing instruction, or when there is no current token.
    *
    * @param xml  the XML element; it is handed to the comment parser as a
    *             document, so it is expected to be an XmlDocument
    *
    * @throws CompilerException  if compiler error occurs
    */
    protected void parseMisc(XmlElement xml)
            throws CompilerException
        {
        // Misc        ::= Comment | PI | S
        // Comment     ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
        // PI          ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
        while (hasCurrent())
            {
            if (peek(XmlToken.COMMENT_START) != null)
                {
                parseComment(xml, true);
                }
            else if (peek(XmlToken.PI_START) != null)
                {
                parsePi(xml);
                }
            else
                {
                // neither a comment nor a PI: nothing more to consume
                return;
                }
            }
        }

    /**
    * Parse a comment and attach its text to the passed element (as opposed
    * to treating it as a document-level comment).
    *
    * @param xml  the XML element
    *
    * @throws CompilerException  if compiler error occurs
    */
    protected void parseComment(XmlElement xml)
            throws CompilerException
        {
        // false: the comment belongs to the element itself
        parseComment(xml, false);
        }

    /**
    * Parse a comment up to and including the token that ends it, and append
    * its text to the comment of the passed element or document.
    * <p>
    * Each comment token is processed as follows: the indentation (leading
    * spaces and tabs) of the first token that contains anything other than
    * spaces and tabs is measured, at most that much leading whitespace is
    * removed from every token, and trailing spaces and tabs are removed.
    * The resulting pieces are joined with newlines; pieces that end up
    * empty produce blank lines only between non-empty pieces, never at the
    * start or at the end.  If the target already has a comment, the new
    * text is appended to it after a newline.
    *
    * @param xml          the XML element
    * @param fIsDocument  whether the passed in XmlElement is an XmlDocument;
    *                     if true, the text is stored as the document
    *                     comment, otherwise as the element comment
    *
    * @throws CompilerException  if compiler error occurs
    */
    protected void parseComment(XmlElement xml, boolean fIsDocument)
            throws CompilerException
        {
        StringBuilder sb              = new StringBuilder();
        boolean       fFirst          = true;
        int           cchIndent       = 0;
        int           cDeferredBlanks = 0;
        while (peek(XmlToken.COMMENT_STOP) == null)
            {
            char[] ach = match(XmlToken.COMMENT).getText().toCharArray();
            int    cch = ach.length;

            if (fFirst)
                {
                // measure the indentation; keep measuring on subsequent
                // tokens for as long as only whitespace has been seen
                cchIndent = 0;
                while (cchIndent < cch
                        && (ach[cchIndent] == 0x20 || ach[cchIndent] == 0x09))
                    {
                    ++cchIndent;
                    }
                if (cchIndent < cch)
                    {
                    fFirst = false;
                    }
                }

            // unindent comment: skip at most cchIndent leading blanks
            int ofStart = 0;
            while (ofStart < cch && ofStart < cchIndent
                    && (ach[ofStart] == 0x20 || ach[ofStart] == 0x09))
                {
                ++ofStart;
                }

            // trim off whitespace from end of comment; ofEnd is exclusive
            int ofEnd = cch;
            while (ofEnd > ofStart
                    && (ach[ofEnd - 1] == 0x20 || ach[ofEnd - 1] == 0x09))
                {
                --ofEnd;
                }

            // every token after the first output starts on a new line, but
            // the line break is deferred until there is text to follow it
            if (sb.length() > 0)
                {
                ++cDeferredBlanks;
                }

            if (ofStart < ofEnd)
                {
                for (int i = 0; i < cDeferredBlanks; ++i)
                    {
                    sb.append('\n');
                    }
                cDeferredBlanks = 0;

                // the third argument is a character count, not an end
                // offset; passing the end offset would re-include trimmed
                // whitespace or run past the end of the array
                sb.append(ach, ofStart, ofEnd - ofStart);
                }
            }

        if (sb.length() == 0)
            {
            // nothing but whitespace: leave any existing comment untouched
            return;
            }

        String sAdded = sb.toString();
        if (fIsDocument)
            {
            XmlDocument doc       = (XmlDocument) xml;
            String      sExisting = doc.getDocumentComment();
            doc.setDocumentComment(sExisting == null || sExisting.length() == 0
                    ? sAdded
                    : sExisting + '\n' + sAdded);
            }
        else
            {
            String sExisting = xml.getComment();
            xml.setComment(sExisting == null || sExisting.length() == 0
                    ? sAdded
                    : sExisting + '\n' + sAdded);
            }
        }


    // ----- parsing helpers ------------------------------------------------

    /**
    * Determine if there is a current token, i.e. whether the current token
    * reference is non-null.
    *
    * @return true if there is a current token
    */
    protected boolean hasCurrent()
        {
        return m_token != null;
        }

    /**
    * Returns the current token and advances to the next token.
    *
    * @return the token that was current before advancing (null if there
    *         was no current token and the tokenizer still had tokens)
    *
    * @exception CompilerException  potentially thrown by the tokenizer, or
    *            by {@link #next()} when there is nothing left to advance to
    */
    protected XmlToken current()
            throws CompilerException
        {
        // remember the token to hand back before it gets replaced
        final XmlToken tokenPrevious = m_token;
        next();
        return tokenPrevious;
        }

    /**
    * Determine if there is a next token, by asking the tokenizer whether it
    * has more tokens.
    *
    * @return true if there is a next token
    */
    protected boolean hasNext()
        {
        return m_toker.hasMoreTokens();
        }

    /**
    * Advances to and returns the next token.  When the tokenizer is
    * exhausted, the current token is cleared and null is returned; if the
    * tokenizer is exhausted and there is no current token to clear either,
    * an exception is thrown instead.
    *
    * @return the next token, or null if the last token has just been
    *         passed
    *
    * @exception CompilerException  potentially thrown by the tokenizer, or
    *            thrown if there is neither a next nor a current token
    */
    protected XmlToken next()
            throws CompilerException
        {
        XmlTokenizer tokenizer = m_toker;
        if (!tokenizer.hasMoreTokens())
            {
            if (m_token == null)
                {
                throw new CompilerException("Invalid root element");
                }

            // step past the last token
            m_token = null;
            return null;
            }

        XmlToken tokenNext = (XmlToken) tokenizer.nextToken();
        m_token = tokenNext;
        return tokenNext;
        }

    /**
    * Verifies that the current token matches the passed token id and, if so,
    * advances to the next token.  Otherwise, a syntax exception is thrown.
    * The current token is dereferenced without a null check, so there must
    * be a current token (see {@link #hasCurrent()}).
    *
    * @param id the token id to match
    *
    * @return the current token
    *
    * @exception SyntaxException    thrown if the token does not match
    * @exception CompilerException  potentially thrown by the tokenizer
    */
    protected XmlToken match(int id)
            throws CompilerException
        {
        XmlToken tokenFound = m_token;
        int      nIdFound   = tokenFound.getID();
        if (nIdFound == id)
            {
            return current();
            }

        throw new SyntaxException("looking for id=" + id
            + ", found id=" + nIdFound + '(' + tokenFound + ')');
        }

    /**
    * Verifies that the current token is a name token whose name matches
    * the passed String and, if so, advances to the next token.  Otherwise,
    * a syntax exception is thrown.
    *
    * @param sName  the name token text to match
    *
    * @return the matched token
    *
    * @exception SyntaxException    thrown if the token does not match
    * @exception CompilerException  potentially thrown by the tokenizer
    */
    protected XmlToken match(String sName)
            throws CompilerException
        {
        XmlToken tokenMatched = peek(sName);
        if (tokenMatched != null)
            {
            return tokenMatched;
            }

        throw new SyntaxException("looking for name token=" + sName
            + ", found token=" + m_token + "... It is possible that " + sName
                + " is missing a closing tag");
        }

    /**
    * Tests if the current token matches the passed token id and, if so,
    * advances to the next token.  The current token is dereferenced
    * without a null check, so there must be a current token.
    *
    * @param id the token id to peek for
    *
    * @return the current token, if matched, or null
    *
    * @exception CompilerException  potentially thrown by the tokenizer
    */
    protected XmlToken peek(int id)
            throws CompilerException
        {
        if (m_token.getID() != id)
            {
            // no match: stay on the current token
            return null;
            }
        return current();
        }

    /**
    * Tests if the current token matches the passed token category and
    * sub-category.  If so, it returns the current token and advances
    * to the next token.  The current token is dereferenced without a null
    * check, so there must be a current token.
    *
    * @param cat     the category to peek for
    * @param subcat  the sub-category to peek for
    *
    * @return the current token, if matched, or null
    *
    * @exception CompilerException  potentially thrown by the tokenizer
    */
    protected XmlToken peek(int cat, int subcat)
            throws CompilerException
        {
        XmlToken tokenCurrent = m_token;
        if (tokenCurrent.getCategory() != cat)
            {
            return null;
            }
        if (tokenCurrent.getSubCategory() != subcat)
            {
            return null;
            }
        return current();
        }

    /**
    * Tests if the current token is a name that matches the passed String
    * and, if so, advances to the next token.  The comparison of the name
    * is case-sensitive.  The current token is dereferenced without a null
    * check, so there must be a current token.
    *
    * @param sName  the name token text to peek for
    *
    * @return the current token, if matched, or null
    *
    * @exception CompilerException  potentially thrown by the tokenizer
    */
    protected XmlToken peek(String sName)
            throws CompilerException
        {
        XmlToken tokenCurrent = m_token;
        boolean  fNameMatches = tokenCurrent.getID() == XmlToken.NAME
                && tokenCurrent.getText().equals(sName);
        if (fNameMatches)
            {
            return current();
            }
        return null;
        }

    /**
    * Marks the current position and returns it as a token.  This
    * implementation simply returns the current token; it does not advance
    * and does not record any state.
    *
    * @return the current token (null if there is none)
    */
    protected XmlToken mark()
        {
        return m_token;
        }

    /**
    * Read the provided {@link InputStream} to determine if the stream starts
    * with a UTF-8 BOM (http://www.unicode.org/faq/utf_bom.html#BOM). If the
    * BOM is present, advance the stream to skip it; otherwise the stream is
    * reset to where it was.
    * <p>
    * This is a workaround for the inability of the Java UTF-8 encoding to
    * recognize the UTF-8 BOM (http://bugs.sun.com/view_bug.do?bug_id=4508058).
    *
    * @param in  InputStream to check for BOM
    *
    * @return an InputStream with the UTF-8 BOM skipped; this is the passed
    *         stream if it supports mark/reset, otherwise a
    *         BufferedInputStream wrapping it
    *
    * @throws IOException  if I/O error occurs
    */
    protected InputStream skipBOM(InputStream in)
            throws IOException
        {
        // make sure we have a stream that supports mark/reset
        InputStream inMarkable = in.markSupported() ? in : new BufferedInputStream(in);

        // mark the beginning of the stream so that we can reset if necessary
        inMarkable.mark(UTF_8_BOM.length);

        // compare the leading bytes with the BOM, stopping at the first
        // mismatch or at end of stream
        boolean fBOM = true;
        for (int i = 0; fBOM && i < UTF_8_BOM.length; ++i)
            {
            int n = inMarkable.read();
            fBOM = n != -1 && ((byte) n) == UTF_8_BOM[i];
            }

        // if the UTF-8 BOM is not found, reset the stream
        if (!fBOM)
            {
            inMarkable.reset();
            }

        return inMarkable;
        }


    // ----- data members ---------------------------------------------------

    /**
    * If true, validate XML if it contains an XSD reference.  Assigned once
    * by the constructor.
    */
    protected final boolean m_fValidate;

    /**
    * The lexical tokenizer; cleared by {@link #init()} and created anew
    * for each parse.
    */
    protected XmlTokenizer m_toker;

    /**
    * The "current" token being evaluated; null when there is no current
    * token.
    */
    protected XmlToken m_token;

    /**
    * The default encoding of an XML document if no encoding declaration is present.
    */
    private static final String XML_DEFAULT_ENCODING = "UTF-8";

    /**
    * UTF-8 BOM (See http://www.unicode.org/faq/utf_bom.html#BOM).
    */
    private static final byte[] UTF_8_BOM = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    }