All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.htmlcleaner.XWikiDOMSerializer Maven / Gradle / Ivy

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.htmlcleaner;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;

/**
 * Generate a W3C Document from a SF's HTML Cleaner TagNode.
 *
 * Some code has been copy-pasted from SF's HTML Cleaner code (which is under a BSD license, see
 * http://htmlcleaner.sourceforge.net/license.php). Our goal is to remove this class completely if we can get SF's HTML
 * Cleaner to support the CDATA-related use cases that force us to have this class.
 *
 * Remove when the following issues have been fixed:
 * <ul>
 *   <li>https://sourceforge.net/p/htmlcleaner/bugs/169/</li>
 * </ul>
* * Note: Even though in a public package this code is not meant to be a public API. We've had to put in under the {@code * org.htmlcleaner} package because of https://sourceforge.net/p/htmlcleaner/bugs/167/. * * @version $Id: 124eceb29fd098c392e9dcffa5c21bfef5cecb8e $ * @since 1.8.2 */ public class XWikiDOMSerializer { /** * The Regex Pattern to recognize a CDATA block. */ private static final Pattern CDATA_PATTERN = Pattern.compile("| attributes = rootNode.getAttributes(); Iterator> entryIterator = attributes.entrySet().iterator(); while (entryIterator.hasNext()) { Map.Entry entry = entryIterator.next(); String attrName = entry.getKey(); String attrValue = entry.getValue(); if (escapeXml) { attrValue = Utils.escapeXml(attrValue, props, true); } document.getDocumentElement().setAttribute(attrName, attrValue); // // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173 // if (attrName.equalsIgnoreCase("id")) { document.getDocumentElement().setIdAttribute(attrName, true); } } createSubnodes(document, document.getDocumentElement(), rootNode.getAllChildren()); return document; } /** * Perform CDATA transformations if the user has specified to use CDATA inside scripts and style elements. 
* * @param document the W3C Document to use for creating new DOM elements * @param element the W3C element to which we'll add the text content to * @param bufferedContent the buffered text content on which we need to perform the CDATA transformations * @param item the current HTML Cleaner node being processed */ private void flushContent(Document document, Element element, StringBuffer bufferedContent, Object item) { if (bufferedContent.length() > 0 && !(item instanceof ContentNode)) { // Flush the buffered content boolean specialCase = this.props.isUseCdataForScriptAndStyle() && isScriptOrStyle(element); String content = bufferedContent.toString(); if (this.escapeXml && !specialCase) { content = Utils.escapeXml(content, this.props, true); } else if (specialCase) { content = processCDATABlocks(content); } // Generate a javascript comment in front on the CDATA block so that it works in IE and when // serving XHTML under a mimetype of HTML. if (specialCase) { if (SCRIPT_TAG_NAME.equalsIgnoreCase(element.getNodeName())) { // JS element.appendChild(document.createTextNode(JS_COMMENT)); element.appendChild(document.createCDATASection(NEW_LINE + content + NEW_LINE + JS_COMMENT)); } else { // CSS element.appendChild(document.createTextNode(CSS_COMMENT_START)); element.appendChild(document.createCDATASection(CSS_COMMENT_END + StringUtils.chomp(content) + NEW_LINE + CSS_COMMENT_START)); element.appendChild(document.createTextNode(CSS_COMMENT_END)); } } else { element.appendChild(document.createTextNode(content)); } bufferedContent.setLength(0); } } /** * Remove any existing CDATA section and unencode HTML entities that are not inside a CDATA block. 
* * @param content the text input to transform * @return the transformed content that will be wrapped inside a CDATA block */ private String processCDATABlocks(String content) { StringBuffer result = new StringBuffer(); Matcher matcher = CDATA_PATTERN.matcher(content); int cursor = 0; while (matcher.find()) { result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor, matcher.start()))); result.append(content.substring(matcher.start() + 9, matcher.end() - matcher.group(1).length())); cursor = matcher.end() - matcher.group(1).length() + 3; } // Copy the remaining text data in the result buffer if (cursor < content.length()) { result.append(StringEscapeUtils.unescapeHtml4(content.substring(cursor))); } // Ensure ther's no invalid remaining. String contentResult = result.toString().replace("", ""); return contentResult; } /** * @param element the element to check * @return true if the passed element is a script or style element */ protected boolean isScriptOrStyle(Element element) { String tagName = element.getNodeName(); return SCRIPT_TAG_NAME.equalsIgnoreCase(tagName) || STYLE_TAG_NAME.equalsIgnoreCase(tagName); } /** * Serialize a given SF HTML Cleaner node. * * @param document the W3C Document to use for creating new DOM elements * @param element the W3C element to which we'll add the subnodes to * @param tagChildren the SF HTML Cleaner nodes to serialize for that node */ private void createSubnodes(Document document, Element element, List tagChildren) { // We've modified the original implementation based in SF's HTML Cleaner to better handle CDATA. // More specifically we want to handle the following 3 use cases: // // Use case 1: useCdata = true && input is: // // In this case we must make sure to have only one CDATA block. // // Use case 2: useCdata = true && input is: // // We must generate a CDATA block around the whole content (the HTML Tokenizer split // ContentToken on "<" character so we need to join them before creating the CDATA block. 
// We must also unencode any entities (i.e. transform "<" into "<") since we'll be // wrapping them in a CDATA section. // // Use case 3: useCData = false // Simply group all ContentToken together. StringBuffer bufferedContent = new StringBuffer(); if (tagChildren != null) { for (Object item : tagChildren) { // Flush content tokens flushContent(document, element, bufferedContent, item); if (item instanceof CommentNode) { CommentNode commentToken = (CommentNode) item; Comment comment = document.createComment(commentToken.getContent()); element.appendChild(comment); } else if (item instanceof ContentNode) { ContentNode contentToken = (ContentNode) item; bufferedContent.append(contentToken.getContent()); } else if (item instanceof TagNode) { TagNode subTagNode = (TagNode) item; Element subelement = document.createElement(subTagNode.getName()); Map attributes = subTagNode.getAttributes(); for (Map.Entry entry : attributes.entrySet()) { String attrName = entry.getKey(); String attrValue = entry.getValue(); if (this.escapeXml) { attrValue = Utils.escapeXml(attrValue, this.props, true); } subelement.setAttribute(attrName, attrValue); } // recursively create subnodes createSubnodes(document, subelement, subTagNode.getAllChildren()); element.appendChild(subelement); } else if (item instanceof List) { @SuppressWarnings("unchecked") List sublist = (List) item; createSubnodes(document, element, sublist); } } flushContent(document, element, bufferedContent, null); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy