org.identityconnectors.common.XmlUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of framework Show documentation
There is a newer version: 0.4.3
/*
 * ====================
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 * 
 * Copyright 2008-2009 Sun Microsystems, Inc. All rights reserved.     
 * 
 * The contents of this file are subject to the terms of the Common Development 
 * and Distribution License("CDDL") (the "License").  You may not use this file 
 * except in compliance with the License.
 * 
 * You can obtain a copy of the License at 
 * http://IdentityConnectors.dev.java.net/legal/license.txt
 * See the License for the specific language governing permissions and limitations 
 * under the License. 
 * 
 * When distributing the Covered Code, include this CDDL Header Notice in each file
 * and include the License file at identityconnectors/legal/license.txt.
 * If applicable, add the following below this CDDL Header, with the fields 
 * enclosed by brackets [] replaced by your own identifying information: 
 * "Portions Copyrighted [year] [name of copyright owner]"
 * ====================
 */
package org.identityconnectors.common;

import java.io.IOException;
import java.io.StringReader;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Static helpers for parsing XML strings, navigating DOM trees and escaping
 * text so that it can be embedded in XML character data or attribute values.
 * The class is not instantiable.
 */
public final class XmlUtil {
    private XmlUtil() {
        // static utility class; never instantiated
    }

    /////////////////////////////////////////////////////////////
    //
    // Constants
    //
    ////////////////////////////////////////////////////////////

    /** Delimiter value telling {@link #escape} not to escape any quote character. */
    public static final char NO_DELIM     = 0;
    /** Delimiter value telling {@link #escape} to escape double quotes. */
    public static final char DOUBLE_QUOTE = '"';
    /** Delimiter value telling {@link #escape} to escape single quotes. */
    public static final char SINGLE_QUOTE = '\'';

    /////////////////////////////////////////////////////////////
    //
    // Parsing
    //
    ////////////////////////////////////////////////////////////

    /**
     * Parses a string without validation and returns the Document.
     *
     * @param xml The XML text to parse.
     * @return the parsed DOM document.
     * @throws IOException if the parser reports an I/O problem.
     * @throws SAXException if the text is not well-formed XML.
     * @throws ParserConfigurationException if no DocumentBuilder can be created.
     */
    public static Document parseString(String xml)
        throws IOException, SAXException, ParserConfigurationException
    {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setValidating(false);
        DocumentBuilder builder = factory.newDocumentBuilder();
        //some parsers will attempt to find and parse dtd even
        //if not validating and that makes it very slow
        builder.setEntityResolver(new DummyDTDResolver());
        // NOTE(review): external entities other than ".dtd" resources are still
        // resolved by the parser's default mechanism; if this method is ever fed
        // untrusted XML, external entity processing should be disabled (XXE).
        return builder.parse(new InputSource(new StringReader(xml)));
    }

    /**
     * Entity resolver that substitutes an empty document for anything whose
     * public or system identifier ends in ".dtd", so the parser never fetches
     * a DTD. Every other entity is left to the parser's default resolution
     * (signalled by returning null).
     */
    private static class DummyDTDResolver implements EntityResolver {
        public InputSource resolveEntity(String publicID, String systemID) {
            boolean publicIsDtd = publicID != null && publicID.endsWith(".dtd");
            boolean systemIsDtd = systemID != null && systemID.endsWith(".dtd");
            if (publicIsDtd || systemIsDtd) {
                return new InputSource(new StringReader(""));
            }
            return null;
        }
    }

    /////////////////////////////////////////////////////////////
    //
    // DOM Navigation utilities
    //
    ////////////////////////////////////////////////////////////

    /**
     * Return the value of an attribute on an element.  The DOM getAttribute
     * method returns an empty string if the attribute doesn't exist. Here, we
     * detect this and return null. Note that an attribute that is present but
     * empty is therefore also reported as null.
     *
     * @param e The element. Must not be null.
     * @param name The attribute name.
     * @return the attribute value, or null if it is missing or empty.
     */
    public static String getAttribute(Element e, String name) {
        String value = e.getAttribute(name);
        if (value != null && value.length() == 0) {
            value = null;
        }
        return value;
    }

    /**
     * Find an immediate child element with the given tag name.
     *
     * @param node The parent node. May be null.
     * @param name The tag name to look for.
     * @return the first matching direct child element, or null if there is none.
     */
    public static Element findImmediateChildElement(Node node, String name) {
        if (node == null) {
            return null;
        }
        for (Node child = node.getFirstChild(); child != null;
                child = child.getNextSibling()) {
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                Element candidate = (Element) child;
                if (candidate.getTagName().equals(name)) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Returns the first child element or null if none found.
     *
     * @param node The node. May be null.
     * @return the first child element or null if none found
     */
    public static Element getFirstChildElement(Node node) {
        if (node == null) {
            return null;
        }
        Node child = node.getFirstChild();
        if (child instanceof Element) {
            return (Element) child;
        }
        // first child is not an element (or there is no child at all):
        // scan its right siblings instead; getNextElement tolerates null
        return getNextElement(child);
    }

    /**
     * Get the next right sibling that is an element.
     *
     * @param node The starting node. May be null.
     * @return the nearest following sibling element, or null if there is none.
     */
    public static Element getNextElement(Node node) {
        if (node == null) {
            return null;
        }
        for (Node next = node.getNextSibling(); next != null;
                next = next.getNextSibling()) {
            if (next.getNodeType() == Node.ELEMENT_NODE) {
                return (Element) next;
            }
        }
        return null;
    }

    /**
     * Locate the first text node (text or CDATA section) at or below the
     * given node, searching depth-first in document order. If the
     * ignoreEmpty flag is true, we will ignore text nodes that contain only
     * whitespace characters.
     * <p>
     * Note that if you're trying to extract element content, you probably
     * don't want this since parsers can break up pcdata into multiple
     * adjacent text nodes. See getContent() for a more useful method.
     */
    private static Text findText(Node node, boolean ignoreEmpty) {
        if (node == null) {
            return null;
        }

        short type = node.getNodeType();
        if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
            Text text = (Text) node;
            if (!ignoreEmpty || text.getData().trim().length() > 0) {
                return text;
            }
        }

        for (Node child = node.getFirstChild(); child != null;
                child = child.getNextSibling()) {
            Text found = findText(child, ignoreEmpty);
            if (found != null) {
                return found;
            }
        }
        return null;
    }


    /**
     * Return the content of the given element.
     * <p>
     * We will descend to an arbitrary depth looking for the first text node.
     * <p>
     * Note that the parser may break what was originally a single string of
     * pcdata into multiple adjacent text nodes. Xerces appears to do this
     * when it encounters a '$' in the text, not sure if there is specified
     * behavior, or if its parser specific.
     * <p>
     * Here, we will congeal adjacent text nodes. We will NOT ignore text
     * nodes that have only whitespace.
     *
     * @param e The element. May be null.
     * @return the concatenated text, or null if the element is null or
     *         contains no text node.
     */
    public static String getContent(Element e) {
        if (e == null) {
            return null;
        }

        // find the first inner text node
        Text text = findText(e, false);
        if (text == null) {
            return null;
        }

        // we have at least some text; glue on directly adjacent text/CDATA siblings
        StringBuilder content = new StringBuilder();
        while (text != null) {
            content.append(text.getData());
            Node next = text.getNextSibling();
            if (next != null
                    && (next.getNodeType() == Node.TEXT_NODE
                            || next.getNodeType() == Node.CDATA_SECTION_NODE)) {
                text = (Text) next;
            } else {
                text = null;
            }
        }
        return content.toString();
    }

    /////////////////////////////////////////////////////////////
    //
    // Xml Encoding Utilities
    //
    ////////////////////////////////////////////////////////////

    /**
     * Escapes the given string and appends to the given buffer.
     * <p>
     * Markup characters are replaced by the predefined XML entities, line
     * feed, carriage return and tab by numeric character references (so they
     * survive attribute-value normalization), and the quote character
     * selected by <code>delim</code> by its entity. Characters that are not
     * legal in XML 1.0 (including unpaired surrogates) are dropped. The
     * string is processed by Unicode code point so that supplementary
     * characters are preserved.
     *
     * @param b The buffer
     * @param s The script to be escaped. May be null, in which case nothing is appended.
     * @param delim May be {@link #SINGLE_QUOTE}, {@link #DOUBLE_QUOTE}, or {@link #NO_DELIM}.
     */
    public static void escape(StringBuilder b, String s, char delim) {
        if (s == null) {
            return;
        }

        final int length = s.length();
        int i = 0;
        while (i < length) {
            final int cp = s.codePointAt(i);
            i += Character.charCount(cp);

            if (cp == '&') {
                // Ampersand: introduces a character entity.
                b.append("&amp;");
            } else if (cp == '<') {
                // LessThan: introduces a tag.
                b.append("&lt;");
            } else if (cp == '>') {
                // GreaterThan: some browsers impute an opening "<".
                b.append("&gt;");
            } else if (cp == 0xA) {
                // LineFeed: preserve format.
                b.append("&#xA;");
            } else if (cp == 0xD) {
                // CarriageReturn: preserve format.
                b.append("&#xD;");
            } else if (cp == 0x9) {
                // HorizontalTab: preserve format.
                b.append("&#x9;");
            } else if (cp == delim && delim == SINGLE_QUOTE) {
                // Accept only single or double quote as delimiter.
                b.append("&apos;");
            } else if (cp == delim && delim == DOUBLE_QUOTE) {
                // Accept only single or double quote as delimiter.
                // "&quot;" is one of the five entities predefined by XML 1.0.
                b.append("&quot;");
            } else if (validXmlChar(cp)) {
                b.appendCodePoint(cp);
            }
            // anything else is not a legal XML character and is silently dropped
        }
    }

    /**
     * Tests whether a Unicode code point is a legal XML 1.0 character:
     * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
     * [#x10000-#x10FFFF]
     */
    private static boolean validXmlChar(int cp) {
        if (cp >= 0x20 && cp < 0x7f) {
            return true; // short circuit test for printable ASCII
        }

        return cp == 0x09 || cp == 0x0A || cp == 0x0D
                || (cp >= 0x20 && cp <= 0xD7FF)
                || (cp >= 0xE000 && cp <= 0xFFFD)
                || (cp >= 0x10000 && cp <= 0x10FFFF);
    }
}