com.twelvemonkeys.xml.XMLSerializer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of common-io Show documentation
TwelveMonkeys Common I/O support classes.
The newest version!
/*
 * Copyright (c) 2008, Harald Kuhr
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the name of the copyright holder nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package com.twelvemonkeys.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.Date;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.*;
import org.xml.sax.SAXException;

import com.twelvemonkeys.lang.StringUtil;

/**
 * XMLSerializer
 *
 * @author Harald Kuhr
 * @author last modified by $Author: haku $
 * @version $Id: //depot/branches/personal/haraldk/twelvemonkeys/release-2/twelvemonkeys-core/src/main/java/com/twelvemonkeys/xml/XMLSerializer.java#1 $
 */
public class XMLSerializer {
    // TODO: Replace with DOMSerializer? Test performance, pretty printing etc...
    // Main problem: Sun's Java 5 does not have LS 3.0 support
    // This class has no dependencies, which probably makes it more useful

    // TODO: Don't insert initial and ending line-break for text-nodes
    // TODO: Support not inserting line-breaks, to preserve space
    // TODO: Support line breaking (at configurable width)
    // TODO: Support standalone?
    // TODO: Support more than version 1.0?
    // TODO: Consider using IOException to communicate trouble, rather than RTE,
    // to be more compatible...

    private final OutputStream output;
    private final Charset encoding;
    private final SerializationContext context;

    public XMLSerializer(final OutputStream pOutput, final String pEncoding) {
        output = pOutput;
        encoding = Charset.forName(pEncoding);
        context = new SerializationContext();
    }

    public final XMLSerializer indentation(String pIndent) {
        // TODO: Verify that indent value is only whitespace?
        context.indent = pIndent != null ? pIndent : "\t";
        return this;
    }

    public final XMLSerializer stripComments(boolean pStrip) {
        context.stripComments = pStrip;
        return this;
    }

    /**
     * Serializes the entire document, along with the XML declaration
     * ({@code <?xml version="1.0" encoding="..."?>}).
     *
     * @param pDocument the document to serialize.
     */
    public void serialize(final Document pDocument) {
        serialize(pDocument, true);
    }

    /**
     * Serializes the entire sub tree starting at {@code pRootNode}, along with an optional XML declaration
     * ({@code <?xml version="1.0" encoding="..."?>}).
     *
     * @param pRootNode the root node to serialize.
     * @param pWriteXMLDeclaration {@code true} if the XML declaration should be included, otherwise {@code false}.
     */
    public void serialize(final Node pRootNode, final boolean pWriteXMLDeclaration) {
        PrintWriter out = new PrintWriter(new OutputStreamWriter(output, encoding));
        try {
            if (pWriteXMLDeclaration) {
                writeXMLDeclaration(out);
            }
            writeXML(out, pRootNode, context.copy());
        }
        finally {
            out.flush();
        }
    }

    private void writeXMLDeclaration(final PrintWriter pOut) {
        pOut.print("");
    }

    private void writeXML(final PrintWriter pOut, final Node pDocument, final SerializationContext pContext) {
        writeNodeRecursive(pOut, pDocument, pContext);
    }

    private void writeNodeRecursive(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) {
        if (pNode.getNodeType() != Node.TEXT_NODE) {
            indentToLevel(pOut, pContext);
        }

        switch (pNode.getNodeType()) {
            case Node.DOCUMENT_NODE:
            case Node.DOCUMENT_FRAGMENT_NODE:
                writeDocument(pOut, pNode, pContext);
                break;
            case Node.DOCUMENT_TYPE_NODE:
                writeDoctype(pOut, (DocumentType) pNode);
                break;
            case Node.ELEMENT_NODE:
                boolean preserveSpace = pContext.preserveSpace;
                updatePreserveSpace(pNode, pContext);
                writeElement(pOut, (Element) pNode, pContext);
                pContext.preserveSpace = preserveSpace;
                break;
            case Node.CDATA_SECTION_NODE:
                writeCData(pOut, pNode);
                break;
            case Node.TEXT_NODE:
                writeText(pOut, pNode, pContext);
                break;
            case Node.COMMENT_NODE:
                writeComment(pOut, pNode, pContext);
                break;
            case Node.PROCESSING_INSTRUCTION_NODE:
                writeProcessingInstruction(pOut, (ProcessingInstruction) pNode);
                break;
            case Node.ATTRIBUTE_NODE:
                throw new IllegalArgumentException("Malformed input Document: Attribute nodes should only occur inside Element nodes");
            case Node.ENTITY_NODE:
                // ''
            case Node.ENTITY_REFERENCE_NODE:
                // ( '&' | '%' ) + getNodeName + ';'
            case Node.NOTATION_NODE:
                // ''
            default:
                throw new InternalError("Lazy programmer never implemented serialization of " + pNode.getClass());
        }
    }

    private void writeProcessingInstruction(final PrintWriter pOut, final ProcessingInstruction pNode) {
        pOut.print("\n");
    }

    private void writeText(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) {
        // TODO: Is this really as specified?
        String value = pNode.getNodeValue();
        if (pContext.preserveSpace) {
            pOut.print(maybeEscapeElementValue(value));
        }
        else if (!StringUtil.isEmpty(value)) {
            String escapedValue = maybeEscapeElementValue(value.trim());
            //if (escapedValue.length() + (pContext.level * pContext.indent.length()) > 78) {
                indentToLevel(pOut, pContext);
            //}
            pOut.println(escapedValue);
        }
    }

    private void writeCData(final PrintWriter pOut, final Node pNode) {
        pOut.print("");
    }

    private static void updatePreserveSpace(final Node pNode, final SerializationContext pContext) {
        NamedNodeMap attributes = pNode.getAttributes();
        if (attributes != null) {
            Node space = attributes.getNamedItem("xml:space");
            if (space != null) {
                if ("preserve".equals(space.getNodeValue())) {
                    pContext.preserveSpace = true;
                }
                else if ("default".equals(space.getNodeValue())) {
                    pContext.preserveSpace = false;
                }
                // No other values are allowed per spec, ignore
            }
        }
    }

    private static void indentToLevel(final PrintWriter pOut, final SerializationContext pContext) {
        for (int i = 0; i < pContext.level; i++) {
            pOut.print(pContext.indent);
        }
    }

    private void writeComment(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) {
        if (pContext.stripComments) {
            return;
        }

        String value = pNode.getNodeValue();
        validateCommentValue(value);

        if (value.startsWith(" ")) {
            pOut.print("");
        }
        else {
            pOut.println(" -->");
        }
    }

    /**
     * Returns an escaped version of the input string. The string is guaranteed
     * to not contain illegal XML characters ({@code &<>}).
     * If no escaping is needed, the input string is returned as is. 
     *
     * @param pValue the input string that might need escaping.
     * @return an escaped version of the input string.
     */
    static String maybeEscapeElementValue(final String pValue) {
        int startEscape = needsEscapeElement(pValue);

        if (startEscape < 0) {
            // If no escaping is needed, simply return original
            return pValue;
        }
        else {
            // Otherwise, start replacing
            StringBuilder builder = new StringBuilder(pValue.substring(0, startEscape));
            builder.ensureCapacity(pValue.length() + 30);

            int pos = startEscape;
            for (int i = pos; i < pValue.length(); i++) {
                switch (pValue.charAt(i)) {
                    case '&':
                        pos = appendAndEscape(pValue, pos, i, builder, "&");
                        break;
                    case '<':
                        pos = appendAndEscape(pValue, pos, i, builder, "<");
                        break;
                    case '>':
                        pos = appendAndEscape(pValue, pos, i, builder, ">");
                        break;
                    //case '\'':
                    //case '"':
                    default:
                        break;
                }
            }

            builder.append(pValue.substring(pos));
            return builder.toString();
        }
    }

    private static int appendAndEscape(final String pString, int pStart, final int pEnd, final StringBuilder pBuilder, final String pEntity) {
        pBuilder.append(pString, pStart, pEnd);
        pBuilder.append(pEntity);
        return pEnd + 1;
    }

    /**
     * Returns an the first index from the input string that should be escaped
     * if escaping is needed, otherwise {@code -1}.
     *
     * @param pString the input string that might need escaping.
     * @return the first index from the input string that should be escaped,
     *         or {@code -1}.
     */
    private static int needsEscapeElement(final String pString) {
        for (int i = 0; i < pString.length(); i++) {
            switch (pString.charAt(i)) {
                case '&':
                case '<':
                case '>':
                //case '\'':
                //case '"':
                    return i;
                default:
            }
        }
        return -1;
    }

    private static String maybeEscapeAttributeValue(final String pValue) {
        int startEscape = needsEscapeAttribute(pValue);

        if (startEscape < 0) {
            return pValue;
        }
        else {
            StringBuilder builder = new StringBuilder(pValue.substring(0, startEscape));
            builder.ensureCapacity(pValue.length() + 16);

            int pos = startEscape;
            for (int i = pos; i < pValue.length(); i++) {
                switch (pValue.charAt(i)) {
                    case '&':
                        pos = appendAndEscape(pValue, pos, i, builder, "&");
                        break;
                    case '"':
                        pos = appendAndEscape(pValue, pos, i, builder, """);
                        break;
                    default:
                        break;
                }
            }

            builder.append(pValue.substring(pos));

            return builder.toString();
        }
    }

    /**
     * Returns an the first index from the input string that should be escaped
     * if escaping is needed, otherwise {@code -1}.
     *
     * @param pString the input string that might need escaping.
     * @return the first index from the input string that should be escaped,
     *         or {@code -1}.
     */
    private static int needsEscapeAttribute(final String pString) {
        for (int i = 0; i < pString.length(); i++) {
            switch (pString.charAt(i)) {
                case '&':
                //case '<':
                //case '>':
                //case '\'':
                case '"':
                    return i;
                default:
            }
        }

        return -1;
    }

    private static String validateCDataValue(final String pValue) {
        if (pValue.contains("]]>")) {
            throw new IllegalArgumentException("Malformed input document: CDATA block may not contain the string ']]>'");
        }
        return pValue;
    }

    private static String validateCommentValue(final String pValue) {
        if (pValue.contains("--")) {
            throw new IllegalArgumentException("Malformed input document: Comment may not contain the string '--'");
        }
        return pValue;
    }

    private void writeDocument(final PrintWriter pOut, final Node pNode, final SerializationContext pContext) {
        // Document fragments might not have child nodes...
        if (pNode.hasChildNodes()) {
            Node child = pNode.getFirstChild();
            while (child != null) {
                writeNodeRecursive(pOut, child, pContext);
                child = child.getNextSibling();
            }
        }
    }

    private void writeElement(final PrintWriter pOut, final Element pNode, final SerializationContext pContext) {
        pOut.print("<");
        pOut.print(pNode.getTagName());

        // TODO: Attributes should probably include namespaces, so that it works
        // even if the document was created using attributes instead of namespaces...
        // In that case, prefix will be null...

        // Handle namespace
        String namespace = pNode.getNamespaceURI();
        if (namespace != null && !namespace.equals(pContext.defaultNamespace)) {
            String prefix = pNode.getPrefix();
            if (prefix == null) {
                pContext.defaultNamespace = namespace;
                pOut.print(" xmlns");
            }
            else {
                pOut.print(" xmlns:");
                pOut.print(prefix);
            }
            pOut.print("=\"");
            pOut.print(namespace);
            pOut.print("\"");
        }

        // Iterate attributes if any
        if (pNode.hasAttributes()) {
            NamedNodeMap attributes = pNode.getAttributes();
            for (int i = 0; i < attributes.getLength(); i++) {
                Attr attribute = (Attr) attributes.item(i);
                String name = attribute.getName();
                if (!(name.startsWith("xmlns") && (name.length() == 5 || name.charAt(5) == ':'))) {
                    pOut.print(" ");
                    pOut.print(name);
                    pOut.print("=\"");
                    pOut.print(maybeEscapeAttributeValue(attribute.getValue()));
                    pOut.print("\"");
                }
                //else {
                //    System.err.println("attribute.getName(): " + name);
                //}
            }
        }

        // TODO: Consider not indenting/newline if the first child is a text node
        // Iterate children if any
        if (pNode.hasChildNodes()) {
            pOut.print(">");
            if (!pContext.preserveSpace) {
                pOut.println();
            }

            Node child = pNode.getFirstChild();
            while (child != null) {
                writeNodeRecursive(pOut, child, pContext.push());
                child = child.getNextSibling();
            }

            if (!pContext.preserveSpace) {
                indentToLevel(pOut, pContext);
            }

            pOut.print("");
        }
        else if (pNode.getNodeValue() != null) {
            // NOTE: This is NOT AS SPECIFIED, but we do this to support
            // the weirdness that is the javax.imageio.metadata.IIOMetadataNode.
            // According to the spec, the nodeValue of an Element is null.
            pOut.print(">");
            pOut.print(pNode.getNodeValue());
            pOut.print("");
        }
        else {
            pOut.println("/>");
        }

    }

    private void writeDoctype(final PrintWriter pOut, final DocumentType pDoctype) {
        // NOTE: The DOMImplementationLS LSSerializer actually inserts SYSTEM or
        // PUBLIC identifiers even if they are empty strings. The result is, it
        // will create invalid documents.
        // Testing for empty strings seems to be more compatible.
        if (pDoctype != null) {
            pOut.print("");
        }
    }

    public static void main(String[] pArgs) throws IOException, SAXException {
        // Build XML tree (Document) and write
        // Find the implementation
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder;

        try {
            builder = factory.newDocumentBuilder();
        }
        catch (ParserConfigurationException e) {
            throw new IOException(e);
        }

        DOMImplementation dom = builder.getDOMImplementation();

        Document document = dom.createDocument("http://www.twelvemonkeys.com/xml/test", "test", dom.createDocumentType("test", null, null));

        Element root = document.getDocumentElement();

        // This is probably not the correct way of setting a default namespace
        //root.setAttribute("xmlns", "http://www.twelvemonkeys.com/xml/test");

        // Create and insert the normal Properties headers as XML comments
        document.insertBefore(document.createComment(new Date().toString()), root);

        Element test = document.createElement("sub");
        root.appendChild(test);
        Element more = document.createElementNS("http://more.com/1999/namespace", "more:more");
        more.setAttribute("foo", "test");
        more.setAttribute("bar", "'really' \"legal\" & ok");
        test.appendChild(more);
        more.appendChild(document.createTextNode("Simply some text."));
        more.appendChild(document.createCDATASection("&something escaped;"));
        more.appendChild(document.createTextNode("More & !"));
        more.appendChild(document.createTextNode("\"<<'&'>>\""));
        Element another = document.createElement("another");
        test.appendChild(another);
        Element yet = document.createElement("yet-another");
        yet.setAttribute("this-one", "with-params");
        test.appendChild(yet);

        Element pre = document.createElementNS("http://www.twelvemonkeys.com/xml/test", "pre");
        pre.setAttributeNS("http://www.w3.org/XML/1998/namespace", "xml:space", "preserve");
        pre.appendChild(document.createTextNode(" \t \n\r some text & white ' '   \n   "));
        test.appendChild(pre);

        Element pre2 = document.createElementNS("http://www.twelvemonkeys.com/xml/test", "tight");
        pre2.setAttributeNS("http://www.w3.org/XML/1998/namespace", "xml:space", "preserve");
        pre2.appendChild(document.createTextNode("no-space-around-me"));
        test.appendChild(pre2);

        // Create serializer and output document
        //XMLSerializer serializer = new XMLSerializer(pOutput, new OutputFormat(document, UTF_8_ENCODING, true));
        System.out.println("XMLSerializer:");
        XMLSerializer serializer = new XMLSerializer(System.out, "UTF-8");
        serializer.serialize(document);
        System.out.println();

        System.out.println("DOMSerializer:");
        DOMSerializer serializerD = new DOMSerializer(System.out, "UTF-8");
        serializerD.setPrettyPrint(true);
        serializerD.serialize(document);
        System.out.println();
        
        System.out.println("\n");

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        XMLSerializer serializer2 = new XMLSerializer(out, "UTF-8");
        serializer2.serialize(document);

        ByteArrayOutputStream outD = new ByteArrayOutputStream();
        DOMSerializer serializer2D = new DOMSerializer(outD, "UTF-8");
        serializer2D.serialize(document);

        Document document2 = builder.parse(new ByteArrayInputStream(out.toByteArray()));
        System.out.println("XMLSerializer reparsed XMLSerializer:");
        serializer.serialize(document2);
        System.out.println();
        System.out.println("DOMSerializer reparsed XMLSerializer:");
        serializerD.serialize(document2);
        System.out.println();


        Document documentD = builder.parse(new ByteArrayInputStream(outD.toByteArray()));
        System.out.println("XMLSerializer reparsed DOMSerializer:");
        serializer.serialize(documentD);
        System.out.println();
        System.out.println("DOMSerializer reparsed DOMSerializer:");
        serializerD.serialize(documentD);
        System.out.println();
    }

    static class SerializationContext implements Cloneable {
        String indent = "\t";
        int level = 0;
        boolean preserveSpace = false;
        boolean stripComments = false;
        String defaultNamespace;

        public SerializationContext copy() {
            try {
                return (SerializationContext) clone();
            }
            catch (CloneNotSupportedException e) {
                throw new Error(e);
            }
        }

        public SerializationContext push() {
            SerializationContext context = copy();
            context.level++;
            return context;
        }
    }
}