groovy.util.XmlSlurper Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of groovy-xml Show documentation
Groovy: A powerful, dynamic language for the JVM
There is a newer version: 3.0.23
/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 */
package groovy.util;

import groovy.util.slurpersupport.GPathResult;
import groovy.util.slurpersupport.NamespaceAwareHashMap;
import groovy.util.slurpersupport.Node;
import groovy.util.slurpersupport.NodeChild;
import groovy.xml.FactorySupport;
import groovy.xml.QName;
import org.xml.sax.Attributes;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

/**
 * Parse XML into a document tree that may be traversed similar to XPath
 * expressions.  For example:
 *  * def rootNode = new XmlSlurper().parseText(
 *    '<root><one a1="uno!"/><two>Some text!</two></root>' )
 *
 * assert rootNode.name() == 'root'
 * assert rootNode.one[0].@a1 == 'uno!'
 * assert rootNode.two.text() == 'Some text!'
 * rootNode.children().each { assert it.name() in ['one','two'] }
 * 
 * 
 * Note that in some cases, a 'selector' expression may not resolve to a
 * single node.  For example:
 * 
 * def rootNode = new XmlSlurper().parseText(
 *    '''<root>
 *         <a>one!</a>
 *         <a>two!</a>
 *       </root>''' )
 *
 * assert rootNode.a.size() == 2
 * rootNode.a.each { assert it.text() in ['one!','two!'] }
 * 
 *
 * @see GPathResult
 */
public class XmlSlurper extends DefaultHandler {
    private final XMLReader reader;
    private Node currentNode = null;
    private final Stack stack = new Stack();
    private final StringBuilder charBuffer = new StringBuilder();
    private final Map namespaceTagHints = new HashMap();
    private boolean keepIgnorableWhitespace = false;
    private boolean namespaceAware = false;

    /**
     * Creates a non-validating and namespace-aware XmlSlurper which does not allow DOCTYPE declarations in documents.
     *
     * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created.
     * @throws SAXException for SAX errors.
     */
    public XmlSlurper() throws ParserConfigurationException, SAXException {
        this(false, true);
    }
    
    /**
     * Creates a XmlSlurper which does not allow DOCTYPE declarations in documents.
     * 
     * @param validating true if the parser should validate documents as they are parsed; false otherwise.
     * @param namespaceAware true if the parser should provide support for XML namespaces; false otherwise.
     *
     * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created.
     * @throws SAXException for SAX errors.
     */
    public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException {
        this(validating, namespaceAware, false);
    }

    /**
     * Creates a XmlSlurper.
     * 
     * @param validating true if the parser should validate documents as they are parsed; false otherwise.
     * @param namespaceAware true if the parser should provide support for XML namespaces; false otherwise.
     * @param allowDocTypeDeclaration true if the parser should provide support for DOCTYPE declarations; false otherwise.
     *
     * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created.
     * @throws SAXException for SAX errors.
     */
    public XmlSlurper(final boolean validating, final boolean namespaceAware, boolean allowDocTypeDeclaration) throws ParserConfigurationException, SAXException {
        SAXParserFactory factory = FactorySupport.createSaxParserFactory();
        factory.setNamespaceAware(namespaceAware);
        this.namespaceAware = namespaceAware;
        factory.setValidating(validating);
        setQuietly(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
        setQuietly(factory, "http://apache.org/xml/features/disallow-doctype-decl", !allowDocTypeDeclaration);
        reader = factory.newSAXParser().getXMLReader();
    }

    public XmlSlurper(final XMLReader reader) {
        this.reader = reader;
    }

    public XmlSlurper(final SAXParser parser) throws SAXException {
        this(parser.getXMLReader());
    }
    
    private static void setQuietly(SAXParserFactory factory, String feature, boolean value) {
        try {
            factory.setFeature(feature, value);
        }
        catch (ParserConfigurationException | SAXNotSupportedException | SAXNotRecognizedException ignored) { }
    }

    /**
     * @deprecated use setKeepIgnorableWhitespace
     * @param keepWhitespace If true then whitespace before elements is kept.
     *                       The default is to discard the whitespace.
     */
    @Deprecated
    public void setKeepWhitespace(boolean keepWhitespace) {
        setKeepIgnorableWhitespace(keepWhitespace);
    }

    /**
     * @param keepIgnorableWhitespace If true then ignorable whitespace (i.e. whitespace before elements) is kept.
     *                       The default is to discard the whitespace.
     */
    public void setKeepIgnorableWhitespace(boolean keepIgnorableWhitespace) {
        this.keepIgnorableWhitespace = keepIgnorableWhitespace;
    }

    /**
     * @return true if ignorable whitespace is kept
     */
    public boolean isKeepIgnorableWhitespace() {
        return keepIgnorableWhitespace;
    }

    /**
     * @return The GPathResult instance created by consuming a stream of SAX events
     *         Note if one of the parse methods has been called then this returns null
     *         Note if this is called more than once all calls after the first will return null
     */
    public GPathResult getDocument() {
        try {
            // xml namespace is always defined
            if (namespaceAware) {
                namespaceTagHints.put("xml", "http://www.w3.org/XML/1998/namespace");
            }
            return new NodeChild(currentNode, null, namespaceTagHints);
        } finally {
            currentNode = null;
        }
    }

    /**
     * Parse the content of the specified input source into a GPathResult object
     *
     * @param input the InputSource to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     *         or character stream supplied by the application.
     */
    public GPathResult parse(final InputSource input) throws IOException, SAXException {
        reader.setContentHandler(this);
        reader.parse(input);
        return getDocument();
    }

    /**
     * Parses the content of the given file as XML turning it into a GPathResult object
     *
     * @param file the File to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     *         or character stream supplied by the application.
     */
    public GPathResult parse(final File file) throws IOException, SAXException {
        final FileInputStream fis = new FileInputStream(file);
        final InputSource input = new InputSource(fis);
        input.setSystemId("file://" + file.getAbsolutePath());
        try {
            return parse(input);
        } finally {
            fis.close();
        }
    }

    /**
     * Parse the content of the specified input stream into an GPathResult Object.
     * Note that using this method will not provide the parser with any URI
     * for which to find DTDs etc. It is up to you to close the InputStream
     * after parsing is complete (if required).
     *
     * @param input the InputStream to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     *         or character stream supplied by the application.
     */
    public GPathResult parse(final InputStream input) throws IOException, SAXException {
        return parse(new InputSource(input));
    }

    /**
     * Parse the content of the specified reader into a GPathResult Object.
     * Note that using this method will not provide the parser with any URI
     * for which to find DTDs etc. It is up to you to close the Reader
     * after parsing is complete (if required).
     *
     * @param in the Reader to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     *         or character stream supplied by the application.
     */
    public GPathResult parse(final Reader in) throws IOException, SAXException {
        return parse(new InputSource(in));
    }

    /**
     * Parse the content of the specified URI into a GPathResult Object
     *
     * @param uri a String containing the URI to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     *         or character stream supplied by the application.
     */
    public GPathResult parse(final String uri) throws IOException, SAXException {
        return parse(new InputSource(uri));
    }

    /**
     * A helper method to parse the given text as XML
     *
     * @param text a String containing XML to parse
     * @return An object which supports GPath expressions
     * @throws SAXException Any SAX exception, possibly wrapping another exception.
     * @throws IOException An IO exception from the parser, possibly from a byte stream
     *         or character stream supplied by the application.
     */
    public GPathResult parseText(final String text) throws IOException, SAXException {
        return parse(new StringReader(text));
    }

    // Delegated XMLReader methods
    //------------------------------------------------------------------------

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#getDTDHandler()
    */
    public DTDHandler getDTDHandler() {
        return reader.getDTDHandler();
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#getEntityResolver()
    */
    public EntityResolver getEntityResolver() {
        return reader.getEntityResolver();
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#getErrorHandler()
    */
    public ErrorHandler getErrorHandler() {
        return reader.getErrorHandler();
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
    */
    public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
        return reader.getFeature(uri);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
    */
    public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
        return reader.getProperty(uri);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
    */
    public void setDTDHandler(final DTDHandler dtdHandler) {
        reader.setDTDHandler(dtdHandler);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
    */
    public void setEntityResolver(final EntityResolver entityResolver) {
        reader.setEntityResolver(entityResolver);
    }

    /**
     * Resolves entities against using the supplied URL as the base for relative URLs
     *
     * @param base The URL used to resolve relative URLs
     */
    public void setEntityBaseUrl(final URL base) {
        reader.setEntityResolver(new EntityResolver() {
            public InputSource resolveEntity(final String publicId, final String systemId) throws IOException {
                return new InputSource(new URL(base, systemId).openStream());
            }
        });
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
    */
    public void setErrorHandler(final ErrorHandler errorHandler) {
        reader.setErrorHandler(errorHandler);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
    */
    public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
        reader.setFeature(uri, value);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
    */
    public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
        reader.setProperty(uri, value);
    }

    // ContentHandler interface
    //-------------------------------------------------------------------------

    /* (non-Javadoc)
    * @see org.xml.sax.ContentHandler#startDocument()
    */
    public void startDocument() throws SAXException {
        currentNode = null;
        charBuffer.setLength(0);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
    */
    public void startPrefixMapping(final String tag, final String uri) throws SAXException {
        if (namespaceAware) namespaceTagHints.put(tag, uri);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
    */
    public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException {
        addCdata();

        final Map attributes = new NamespaceAwareHashMap();
        final Map attributeNamespaces = new HashMap();

        for (int i = atts.getLength() - 1; i != -1; i--) {
            if (atts.getURI(i).length() == 0) {
                attributes.put(atts.getQName(i), atts.getValue(i));
            } else {
                String key = new QName(atts.getURI(i), atts.getLocalName(i)).toString();
                attributes.put(key, atts.getValue(i));
                attributeNamespaces.put(key, atts.getURI(i));
            }
        }

        final Node newElement;

        if (namespaceURI.length() == 0) {
            newElement = new Node(currentNode, qName, attributes, attributeNamespaces, namespaceURI);
        } else {
            newElement = new Node(currentNode, localName, attributes, attributeNamespaces, namespaceURI);
        }

        if (currentNode != null) {
            currentNode.addChild(newElement);
        }

        stack.push(currentNode);
        currentNode = newElement;
    }

    public void ignorableWhitespace(char buffer[], int start, int len) throws SAXException {
        if (keepIgnorableWhitespace) characters(buffer, start, len);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.ContentHandler#characters(char[], int, int)
    */
    public void characters(final char[] ch, final int start, final int length) throws SAXException {
        charBuffer.append(ch, start, length);
    }

    /* (non-Javadoc)
    * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
    */
    public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
        addCdata();
        Node oldCurrentNode = stack.pop();
        if (oldCurrentNode != null) {
            currentNode = oldCurrentNode;
        }
    }

    /* (non-Javadoc)
    * @see org.xml.sax.ContentHandler#endDocument()
    */
    public void endDocument() throws SAXException {
    }

    private void addCdata() {
        if (charBuffer.length() != 0) {
            //
            // This element is preceded by CDATA if keepIgnorableWhitespace is false (the default setting) and
            // it's not whitespace add it to the body
            // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace
            // but for the sort of work I'm doing ignoring the whitespace is preferable
            //
            final String cdata = charBuffer.toString();
            charBuffer.setLength(0);
            if (keepIgnorableWhitespace || cdata.trim().length() != 0) {
                currentNode.addChild(cdata);
            }
        }
    }
}