org.opencms.util.CmsHtmlStripper Maven / Gradle / Ivy
Show all versions of opencms-core Show documentation
/*
* This library is part of OpenCms -
* the Open Source Content Management System
*
* Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* For further information about Alkacon Software GmbH & Co. KG, please see the
* company website: http://www.alkacon.com
*
* For further information about OpenCms, please see the
* project website: http://www.opencms.org
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.opencms.util;
import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsLog;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.htmlparser.Attribute;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
/**
* Simple html tag stripper that allows configuration of html tag names that are allowed.
*
*
* All tags that are not explicitly allowed via invocation of one of the
* addPreserve...
methods will be missing in the result of the method
* {@link #stripHtml(String)}
.
*
* Instances are reusable but not shareable (multithreading). If configuration should be changed
* between subsequent invocations of {@link #stripHtml(String)}
method
* {@link #reset()}
has to be called.
*
*
* @since 6.9.2
*
*/
public final class CmsHtmlStripper {
/** The log object for this class. */
private static final Log LOG = CmsLog.getLog(CmsHtmlStripper.class);
/** A tag factory that is able to make tags invisible to visitors. */
private CmsHtmlTagRemoveFactory m_nodeFactory;
/** Flag to control whether tidy is used. */
private boolean m_useTidy;
/**
* Default constructor that turns echo on and uses the settings for replacing tags.
*
*/
public CmsHtmlStripper() {
reset();
}
/**
* Creates an instance with control whether tidy is used.
*
* @param useTidy if true tidy will be used
*/
public CmsHtmlStripper(final boolean useTidy) {
this();
m_useTidy = useTidy;
}
/**
* Adds a tag that will be preserved by {@link #stripHtml(String)}
.
*
* @param tagName the name of the tag to keep (case insensitive)
*
* @return true if the tagName was added correctly to the internal engine
*/
public boolean addPreserveTag(final String tagName) {
Vector attributeList = new Vector(1);
Attribute tagNameAttribute = new Attribute();
tagNameAttribute.setName(tagName.toLowerCase());
attributeList.add(tagNameAttribute);
Tag keepTag = m_nodeFactory.createTagNode(null, 0, 0, attributeList);
boolean result = m_nodeFactory.addTagPreserve(keepTag);
return result;
}
/**
* Convenience method for adding several tags to preserve.
*
* @param preserveTags a List<String>
with the case-insensitive tag names of the tags to preserve
*
* @see #addPreserveTag(String)
*/
public void addPreserveTagList(List preserveTags) {
for (Iterator it = preserveTags.iterator(); it.hasNext();) {
addPreserveTag(it.next());
}
}
/**
* Convenience method for adding several tags to preserve
* in form of a delimiter-separated String.
*
* The String will be {@link CmsStringUtil#splitAsList(String, char, boolean)}
* with tagList
as the first argument, separator
as the
* second argument and the third argument set to true (trimming - support).
*
* @param tagList a delimiter-separated String with case-insensitive tag names to preserve by
* {@link #stripHtml(String)}
* @param separator the delimiter that separates tag names in the tagList
argument
*
* @see #addPreserveTag(String)
*/
public void addPreserveTags(final String tagList, final char separator) {
List tags = CmsStringUtil.splitAsList(tagList, separator, true);
addPreserveTagList(tags);
}
/**
* Resets the configuration of the tags to preserve.
*
* This is called from the constructor and only has to be called if this
* instance is reused with a differen configuration (of tags to keep).
*
*/
public void reset() {
m_nodeFactory = new CmsHtmlTagRemoveFactory();
}
/**
* Extracts the text from the given html content, assuming the given html encoding.
*
* Additionally tags are replaced / removed according to the configuration of this instance.
*
*
*
Please note:
* There are static process methods in the superclass that will not do the replacements /
* removals. Don't mix them up with this method.
*
*
* @param html the content to extract the plain text from.
*
* @return the text extracted from the given html content.
*
* @throws ParserException if something goes wrong.
*/
public String stripHtml(final String html) throws ParserException {
String content = html;
if (m_useTidy) {
content = tidy(content);
}
// initialize a parser with the given charset
Parser parser = new Parser();
parser.setNodeFactory(m_nodeFactory);
Lexer lexer = new Lexer();
Page page = new Page(content);
lexer.setPage(page);
parser.setLexer(lexer);
// process the page using a string collection wizard
// echo on
CmsHtmlParser visitor = new CmsHtmlParser(true);
parser.visitAllNodesWith(visitor);
// return the result
return visitor.getResult();
}
/**
* Internally tidies with cleanup and XHTML.
*
* @param content HTML to clean
*
* @return the tidy HTML
*/
private String tidy(final String content) {
CmsHtmlConverter converter = new CmsHtmlConverter(
CmsEncoder.ENCODING_UTF_8,
new StringBuffer(CmsHtmlConverter.PARAM_WORD).append(";").append(CmsHtmlConverter.PARAM_XHTML).toString());
String result = content;
try {
result = converter.convertToString(content);
} catch (UnsupportedEncodingException e) {
// should never happen
if (LOG.isWarnEnabled()) {
LOG.warn(Messages.get().getBundle().key(Messages.LOG_WARN_TIDY_FAILURE_0), e);
}
}
return result;
}
}