All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencms.util.CmsHtmlConverterJTidy Maven / Gradle / Ivy

Go to download

OpenCms is an enterprise-ready, easy to use website content management system based on Java and XML technology. Offering a complete set of features, OpenCms helps content managers worldwide to create and maintain beautiful websites fast and efficiently.

There is a newer version: 17.0
Show newest version
/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.util;

import org.opencms.main.CmsLog;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;

import org.w3c.tidy.Tidy;

/**
 * HTML cleaner and pretty printer using JTidy.

* * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.

* * @since 6.0.0 */ public class CmsHtmlConverterJTidy extends A_CmsHtmlConverter { /** The log object for this class. */ private static final Log LOG = CmsLog.getLog(CmsHtmlConverterJTidy.class); /** List of default modes if none were specified explicitly. */ private static final List MODES_DEFAULT = Collections.unmodifiableList( Arrays.asList(new String[] {CmsHtmlConverter.PARAM_ENABLED})); /** Regular expression for cleanup. */ String[] m_cleanupPatterns = { ".*(\\r\\n)*.*", ".*(\\r\\n)*.*", "<\\?xml:.*(\\r\\n).*/>", "<\\?xml:.*(\\r\\n).*(\\r\\n).*/\\?>", "<\\?xml:.*(\\r\\n).*(\\r\\n).*/>", "<\\?xml:(.*(\\r\\n)).*/\\?>", "", ""}; /** Patterns for cleanup. */ Pattern[] m_clearStyle; /** Regular expressions for paragraph replacements -- additionally remove leading and trailing breaks. */ String[] m_replaceParagraphPatterns = { "\n
", "\n
", "


", "

", "
(\\s)* (\\s)*

", "

", "

", "^
", "
$"}; /** Values for paragraph replacements. */ String[] m_replaceParagraphValues = {"", "", "
", "
", "
", "
", "
", "", ""}; /** Regular expression for replace. */ String[] m_replacePatterns = { " ", "(\\r\\n){2,}", "\u2013", "(\\n){2,}", "\\(\\r\\n<", "\\(\\n<", "\\(\\r\\n(\\ ){1,}<", "\\(\\n(\\ ){1,}<", "\\r\\n */ public CmsHtmlConverterJTidy() { super(null, MODES_DEFAULT); } /** * Constructor, creates a new CmsHtmlConverterJTidy.

* * Possible values for the conversion mode are:

    *
  • {@link CmsHtmlConverter#PARAM_DISABLED}: The conversion is disabled. *
  • {@link CmsHtmlConverter#PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only. *
  • {@link CmsHtmlConverter#PARAM_XHTML}: Conversion from HTML to XHTML is enabled. *
  • {@link CmsHtmlConverter#PARAM_WORD}: Cleanup of word like HTML tags is enabled. *
  • {@link CmsHtmlConverter#PARAM_REPLACE_PARAGRAPHS}: Cleanup of paragraphs and leading/trailing line breaks is enabled. * *
* * @param encoding the encoding used for the HTML code conversion * @param modes the conversion modes to use */ public CmsHtmlConverterJTidy(String encoding, List modes) { super(encoding, modes); } /** * Converts the given HTML code according to the settings of this converter.

* * @param htmlInput HTML input stored in a string * @return string containing the converted HTML * * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported */ @Override public String convertToString(String htmlInput) throws UnsupportedEncodingException { // initialize the modes initModes(); // only do parsing if the mode is not set to disabled if (m_modeEnabled) { // do a maximum of 10 loops int max = m_modeWord ? 10 : 1; int count = 0; // we may have to do several parsing runs until all tags are removed int oldSize = htmlInput.length(); String workHtml = regExp(htmlInput); while (count < max) { count++; // first add the optional header if in word mode if (m_modeWord) { workHtml = adjustHtml(workHtml); } // now use tidy to parse and format the HTML workHtml = parse(workHtml); if (m_modeWord) { // cut off the line separator, which is always appended workHtml = workHtml.substring(0, workHtml.length() - m_lineSeparatorLength); } if (workHtml.length() == oldSize) { // no change in HTML code after last processing loop workHtml = regExp(workHtml); break; } oldSize = workHtml.length(); workHtml = regExp(workHtml); } if (LOG.isDebugEnabled()) { LOG.debug( Messages.get().getBundle().key( Messages.LOG_PARSING_RUNS_2, this.getClass().getName(), new Integer(count))); } htmlInput = workHtml; } return htmlInput; } /** * Adjusts the HTML input code in WORD mode if necessary.

* * When in WORD mode, the HTML tag must contain the xmlns:o="urn:schemas-microsoft-com:office:office" * attribute, otherwise tide will not remove the WORD tags from the document. * * @param htmlInput the HTML input * @return adjusted HTML input */ private String adjustHtml(String htmlInput) { // check if we have some opening and closing HTML tags if ((htmlInput.toLowerCase().indexOf("") == -1) && (htmlInput.toLowerCase().indexOf("") == -1)) { // add a correct HTML tag for word generated HTML StringBuffer tmp = new StringBuffer(); tmp.append(""); tmp.append(htmlInput); tmp.append(""); htmlInput = tmp.toString(); } return htmlInput; } /** * Initializes the JTidy modes.

*/ private void initModes() { // set all internal modes to disabled m_modeEnabled = false; m_modeReplaceParagraphs = false; m_modeWord = false; m_modeXhtml = false; // extract all operation modes List modes = getModes(); // configure the tidy depending on the operation mode if (modes.contains(CmsHtmlConverter.PARAM_ENABLED)) { m_modeEnabled = true; } if (modes.contains(CmsHtmlConverter.PARAM_XHTML)) { m_modeEnabled = true; m_modeXhtml = true; } if (modes.contains(CmsHtmlConverter.PARAM_WORD)) { m_modeEnabled = true; m_modeWord = true; } if (modes.contains(CmsHtmlConverter.PARAM_REPLACE_PARAGRAPHS)) { m_modeEnabled = true; m_modeReplaceParagraphs = true; } // get line separator length m_lineSeparatorLength = System.getProperty("line.separator").length(); // we need this only if the conversion is enabled if (m_modeEnabled) { // create the main tidy object m_tidy = new Tidy(); // set specified word, XHTML conversion settings m_tidy.setXHTML(m_modeXhtml); m_tidy.setWord2000(m_modeWord); // add additional tags // those are required to handle word 2002 (and newer) documents Properties additionalTags = new Properties(); additionalTags.put("new-empty-tags", "o:smarttagtype"); additionalTags.put("new-inline-tags", "o:smarttagtype"); m_tidy.getConfiguration().addProps(additionalTags); // set the default tidy configuration // set the tidy encoding m_tidy.setInputEncoding(getEncoding()); m_tidy.setOutputEncoding(getEncoding()); // disable the tidy meta element in output m_tidy.setTidyMark(false); // disable clean mode m_tidy.setMakeClean(false); // enable numeric entities m_tidy.setNumEntities(true); // create output of the body only m_tidy.setPrintBodyOnly(true); // disable URI fixing, because it breaks domain names with special characters (IDNs) in links when used in HTML fields m_tidy.setFixUri(false); // force output creation even if there are tidy errors m_tidy.setForceOutput(true); // set tidy to quiet mode to prevent output m_tidy.setQuiet(true); // disable warning output m_tidy.setShowWarnings(false); // allow comments in the output m_tidy.setHideComments(false); // set no line break before a
m_tidy.setBreakBeforeBR(false); // don't wrap attribute values m_tidy.setWrapAttVals(false); // warp lines after 100 chars m_tidy.setWraplen(100); // no indentation m_tidy.setSpaces(0); if (m_modeWord) { // create the regular expression for cleanup, only used in word clean mode m_clearStyle = new Pattern[m_cleanupPatterns.length]; for (int i = 0; i < m_cleanupPatterns.length; i++) { m_clearStyle[i] = Pattern.compile(m_cleanupPatterns[i]); } } // add paragraph replacement regular expression and values if needed if (m_modeReplaceParagraphs) { // add the regular expression and values for paragraph replacements String[] newPatterns = new String[m_replacePatterns.length + m_replaceParagraphPatterns.length]; String[] newValues = new String[m_replacePatterns.length + m_replaceParagraphPatterns.length]; System.arraycopy(m_replacePatterns, 0, newPatterns, 0, m_replacePatterns.length); System.arraycopy( m_replaceParagraphPatterns, 0, newPatterns, m_replacePatterns.length, m_replaceParagraphPatterns.length); System.arraycopy(m_replaceValues, 0, newValues, 0, m_replacePatterns.length); System.arraycopy( m_replaceParagraphValues, 0, newValues, m_replacePatterns.length, m_replaceParagraphPatterns.length); m_replacePatterns = newPatterns; m_replaceValues = newValues; } // create the regular expression for replace m_replaceStyle = new Pattern[m_replacePatterns.length]; for (int i = 0; i < m_replacePatterns.length; i++) { m_replaceStyle[i] = Pattern.compile(m_replacePatterns[i]); } } } /** * Parses a byte array containing HTML code with different parsing modes.

* * @param htmlInput a byte array containing raw HTML code * * @return parsed and cleared HTML code * * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported */ private String parse(String htmlInput) throws UnsupportedEncodingException { // prepare the streams ByteArrayInputStream in = new ByteArrayInputStream(htmlInput.getBytes(getEncoding())); ByteArrayOutputStream out = new ByteArrayOutputStream(); // do the parsing m_tidy.parse(in, out); // return the result byte[] result = out.toByteArray(); return new String(result, getEncoding()); } /** * Parses the htmlInput with regular expressions for cleanup purposes.

* * @param htmlInput the HTML input * * @return the processed HTML */ private String regExp(String htmlInput) { String parsedHtml = htmlInput.trim(); if (m_modeWord) { // process all cleanup regular expressions for (int i = 0; i < m_cleanupPatterns.length; i++) { parsedHtml = m_clearStyle[i].matcher(parsedHtml).replaceAll(""); } } // process all replace regular expressions for (int i = 0; i < m_replacePatterns.length; i++) { parsedHtml = m_replaceStyle[i].matcher(parsedHtml).replaceAll(m_replaceValues[i]); } return parsedHtml; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy