// org.xwiki.xml.html.HTMLUtils Maven / Gradle / Ivy
//
// Go to download
/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.xwiki.xml.html;

import java.io.IOException;
import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jdom.DocType;
import org.jdom.Element;
import org.jdom.input.DOMBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * HTML Utility methods.
 *
 * @version $Id: cc7ab52deafe00eed701a7c3e06bfbc67791776f $
 * @since 1.8.3
 */
// TODO: Create a separate class for each HTML version (XHTML 1.0, HTML5, etc...)
public final class HTMLUtils
{
    /**
     * In HTML5, some elements must be expanded (for example {@code <span></span>} instead of {@code <span/>}), and
     * some others must not (for example {@code <br/>} instead of {@code <br></br>}). Thus for the list of elements
     * below we need special handling (not expanding).
     */
    private static final List<String> OMIT_ELEMENT_EXPANDING_SET = Arrays.asList(
        "area", "base", "br", "col", "hr", "img", "input", "link", "meta", "param");

    /**
     * JDOM's XMLOutputter class converts reserved XML characters ({@code <, >, ' , &, \r and \n}) into their entity
     * format {@code <, > ' &  and \r\n}. However since we're using HTML Cleaner
     * (http://htmlcleaner.sourceforge.net/) and since it's buggy for character escapes we have turned off character
     * escaping for it and thus we need to perform selective escaping here.
     * 
     * Moreover, since we support HTML5, we need to
     * expand empty elements on some elements and not on the others. For example: {@code } is valid
     * meanwhile:
     * 
{@code
     * 

     * 
}
     * is not. See {@code OMIT_ELEMENT_EXPANDING_SET} for the list of elements to not expand.
     */
    // TODO: Remove the complex escaping code when SF HTML Cleaner will do proper escaping
    public static class XWikiXMLOutputter extends XMLOutputter
    {
        /**
         * Regex to recognize a XML Entity.
         */
        private static final Pattern ENTITY = Pattern.compile("&[a-z]+;|&#[0-9a-zA-Z]+;");

        /**
         * Ampersand character.
         */
        private static final String AMPERSAND = "&";

        private static final String[] REPLACE_ELEMENTS_SEARCH = new String[] { "<", ">" };

        private static final String[] REPLACE_ELEMENTS_RESULT = new String[] { "<", ">" };

        /**
         * Whether to omit the document type when printing the W3C Document or not.
         */
        private boolean omitDocType;

        /**
         * @param format the JDOM class used to control output formats, see {@link org.jdom.output.Format}
         * @param omitDocType if true then omit the document type when printing the W3C Document
         * @see XMLOutputter#XMLOutputter(Format)
         */
        public XWikiXMLOutputter(Format format, boolean omitDocType)
        {
            super(format);
            this.omitDocType = omitDocType;
        }

        @Override
        public String escapeElementEntities(String text)
        {
            if (text.length() == 0) {
                return text;
            }

            String result;
            int pos1 = text.indexOf(" -1) {
                int pos2 = text.indexOf("]]>", pos1 + 9);
                if (pos2 + 3 == text.length()) {
                    return text;
                }
                result = escapeElementEntities(text.substring(0, pos1));
                if (pos2 + 3 == text.length()) {
                    result = result + text.substring(pos1);
                } else {
                    result = result + text.substring(pos1, pos2 + 3) + escapeElementEntities(text.substring(pos2 + 3));
                }
            } else {
                result = escapeAmpersand(text);
                StringUtils.replaceEach(text, REPLACE_ELEMENTS_SEARCH, REPLACE_ELEMENTS_RESULT);
            }

            return result;
        }

        @Override
        public String escapeAttributeEntities(String text)
        {
            String result = escapeElementEntities(text);

            // Attribute values must have quotes escaped since attributes are defined with quotes...
            result = StringUtils.replace(result, "\"", """);

            return result;
        }

        /**
         * Escape ampersand when it's not defining an entity.
         *
         * @param text the text to escape
         * @return the escaped text
         */
        private String escapeAmpersand(String text)
        {
            StringBuilder buffer = new StringBuilder(text);
            // find all occurrences of &
            int pos = buffer.indexOf(AMPERSAND);
            while (pos > -1 && pos < buffer.length()) {
                // Check if the & is an entity
                Matcher matcher = ENTITY.matcher(buffer.substring(pos));
                if (matcher.lookingAt()) {
                    // We've found an entity, don't do anything, just skip it
                    pos = pos + matcher.end() - matcher.start();
                } else {
                    // No entity, escape the &
                    buffer.replace(pos, pos + 1, "&");
                    pos += 5;
                }
                pos = buffer.indexOf(AMPERSAND, pos);
            }
            return buffer.toString();
        }

        @Override
        protected void printDocType(Writer out, DocType docType) throws IOException
        {
            if (!this.omitDocType) {
                super.printDocType(out, docType);
            }
        }

        @Override
        protected void printElement(Writer out, Element element, int level, NamespaceStack namespaces)
            throws IOException
        {
            // We override the code from the super class to not expand some empty elements.
            boolean currentFormatPolicy = currentFormat.getExpandEmptyElements();
            try {
                String elementName = element.getName();
                for (String name : OMIT_ELEMENT_EXPANDING_SET) {
                    if (name.equals(elementName)) {
                        // We do not expand this empty element
                        currentFormat.setExpandEmptyElements(false);
                        break;
                    }
                }

                // Call the method from the super class
                super.printElement(out, element, level, namespaces);

            } finally {
                // Reset the format
                currentFormat.setExpandEmptyElements(currentFormatPolicy);
            }
        }
    }

    /**
     * Hidden constructor: this class only exposes static helpers and is never meant to be instantiated.
     */
    private HTMLUtils()
    {
        // Utility class, no state to initialize
    }

    /**
     * Serialize the passed document, printing both the XML declaration and the document type.
     *
     * @param document the W3C Document to transform into a String
     * @return the XML as a String
     */
    public static String toString(Document document)
    {
        return toString(document, false, false);
    }

    /**
     * @param document the W3C Document to transform into a String
     * @param omitDeclaration whether the XML declaration should be printed or not
     * @param omitDoctype whether the document type should be printed or not
     * @return the XML as a String
     */
    public static String toString(Document document, boolean omitDeclaration, boolean omitDoctype)
    {
        // Note: We don't use javax.xml.transform.Transformer since it prints our valid XHTML as HTML which is not
        // XHTML compliant. For example it transforms our "
" into ".
        DOMBuilder builder = new DOMBuilder();
        org.jdom.Document jdomDoc = builder.build(document);

        Format format = Format.getRawFormat();
        // Force newlines to use \n since otherwise the default is \n\r.
        // See http://www.jdom.org/docs/apidocs/org/jdom/output/Format.html#setLineSeparator(java.lang.String)
        format.setLineSeparator("\n");

        // Make sure all elements are expanded so that they can also be rendered fine in browsers that only support
        // HTML.
        format.setExpandEmptyElements(true);

        format.setOmitDeclaration(omitDeclaration);

        XMLOutputter outputter = new XWikiXMLOutputter(format, omitDoctype);
        String result = outputter.outputString(jdomDoc);

        return result;
    }

    /**
     * Strip the HTML envelope if it exists. Precisely this means removing the head tag and move all tags in the body
     * tag directly under the html element, at the position the body tag had. This is useful for example if you wish
     * to insert an HTML fragment into an existing HTML page.
     * <p>
     * Nothing is done when the document has no root element or when the root element is not an html element.
     *
     * @param document the w3c Document to strip
     */
    public static void stripHTMLEnvelope(Document document)
    {
        org.w3c.dom.Element root = document.getDocumentElement();
        // An empty document has no root element at all
        if (root == null || !root.getNodeName().equalsIgnoreCase(HTMLConstants.TAG_HTML)) {
            return;
        }

        // Look for a head element below the root element and for a body element. When several are present the last
        // one of each kind is retained.
        Node bodyNode = null;
        Node headNode = null;
        NodeList nodes = root.getChildNodes();
        for (int i = 0; i < nodes.getLength(); i++) {
            Node node = nodes.item(i);
            if (node.getNodeName().equalsIgnoreCase(HTMLConstants.TAG_HEAD)) {
                headNode = node;
            } else if (node.getNodeName().equalsIgnoreCase(HTMLConstants.TAG_BODY)) {
                bodyNode = node;
            }
        }

        if (headNode != null) {
            root.removeChild(headNode);
        }

        if (bodyNode != null) {
            // Move all children of body node under the root element. They are inserted right before the body node
            // so that they keep their position relative to the nodes that follow the body.
            while (bodyNode.hasChildNodes()) {
                root.insertBefore(bodyNode.getFirstChild(), bodyNode);
            }
            root.removeChild(bodyNode);
        }
    }

    /**
     * Remove the first element inside a parent element and copy the element's children in the parent, at the
     * position the removed element had.
     * <p>
     * Only the first element named {@code parentTagName} found in the document is considered, and only its very
     * first child node is compared (ignoring case) to {@code elementTagName}. Nothing is done when there is no such
     * parent, when the parent has no children or when its first child has another name.
     *
     * @param document the w3c document from which to remove the top level paragraph
     * @param parentTagName the name of the parent tag to look under
     * @param elementTagName the name of the first element to remove
     */
    public static void stripFirstElementInside(Document document, String parentTagName, String elementTagName)
    {
        NodeList parentNodes = document.getElementsByTagName(parentTagName);
        if (parentNodes.getLength() == 0) {
            return;
        }

        Node parentNode = parentNodes.item(0);
        // Look for the element to strip as first child of the parent; the parent may have no children at all
        Node firstChild = parentNode.getFirstChild();
        if (firstChild == null || !elementTagName.equalsIgnoreCase(firstChild.getNodeName())) {
            return;
        }

        // Move all children of the stripped element under the parent. They are inserted right before the stripped
        // element so that they stay ahead of the parent's other children.
        while (firstChild.hasChildNodes()) {
            parentNode.insertBefore(firstChild.getFirstChild(), firstChild);
        }
        parentNode.removeChild(firstChild);
    }
}