// Source listing of com.msl.pdfier.commons.html.HTMLSanitizer, from the
// pdfier-commons artifact ("PDFIER commons components").
// Note: a newer version of this artifact exists: 9.1.20.
package com.msl.pdfier.commons.html;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.EndTagType;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.HTMLElements;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.Tag;

/**
 * Provides facilities to sanitize HTML containing unwanted or invalid tags into
 * clean HTML.
 */
public class HTMLSanitizer {
	private HTMLSanitizer() {
	} // not instantiable

	// list of HTML attributes that will be retained in the final output:
	private static final Set NON_VALID_ATTRIBUTE_NAMES = new HashSet(
			Arrays.asList(new String[] { "de_momento_ninguno" }));

	private static final Object VALID_MARKER = new Object();

	/**
	 * Returns a sanitized version of the specified HTML, encoding any unwanted
	 * tags.
	 * 
	 * @param pseudoHTML The potentially invalid HTML to sanitize.
	 * @return a sanitized version of the specified HTML, encoding any unwanted
	 *         tags.
	 */
	public static String encodeInvalidMarkup(String pseudoHTML) {
		return encodeInvalidMarkup(pseudoHTML, false);
	}

	/**
	 * Returns a sanitized version of the specified HTML, encoding any unwanted
	 * tags.
	 * 
	 * Encoding unwanted and invalid tags results in them appearing verbatim in the
	 * rendered output, helping to highlight the problem so that the source HTML can
	 * be fixed.
	 * 
	 * Specifying a value of true as an argument to the
	 * formatWhiteSpace parameter results in the formatting of white
	 * space as described in the sanitization process in the class description
	 * above.
	 * 
	 * @param pseudoHTML       The potentially invalid HTML to sanitize.
	 * @param formatWhiteSpace Specifies whether white space should be marked up in
	 *                         the output.
	 * @return a sanitized version of the specified HTML, encoding any unwanted
	 *         tags.
	 */
	public static String encodeInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) {
		return sanitize(pseudoHTML, formatWhiteSpace, false, null);
	}

	/**
	 * Returns a sanitized version of the specified HTML, stripping any unwanted
	 * tags.
	 * 
	 * @param pseudoHTML The potentially invalid HTML to sanitize.
	 * @param elementsToStrip elements To Strip
	 * @return a sanitized version of the specified HTML, stripping any unwanted
	 *         tags.
	 */
	public static String stripInvalidMarkup(String pseudoHTML, Set elementsToStrip) {
		return sanitize(pseudoHTML, false, true, elementsToStrip);
	}

	/**
	 * Returns a sanitized version of the specified HTML, stripping any unwanted
	 * tags.
	 * 
	 * @param pseudoHTML       The potentially invalid HTML to sanitize.
	 * @param formatWhiteSpace Specifies whether white space should be marked up in
	 *                         the output.
	 * @return a sanitized version of the specified HTML, stripping any unwanted
	 *         tags.
	 */
	public static String stripInvalidMarkup(String pseudoHTML, boolean formatWhiteSpace) {
		return sanitize(pseudoHTML, formatWhiteSpace, true, null);
	}

	private static String sanitize(String pseudoHTML, boolean formatWhiteSpace, boolean stripInvalidElements,
			Set elementsToStrip) {
		Source source = new Source(pseudoHTML);
		source.fullSequentialParse();
		OutputDocument outputDocument = new OutputDocument(source);
		List tags = source.getAllTags();
		int pos = 0;
		for (Tag tag : tags) {
			if (tag.getBegin() < pos)
				continue; // this might happen if a script element was
							// encountered
			if (processTag(tag, outputDocument, elementsToStrip)) {
				tag.setUserData(VALID_MARKER);
			} else {
				if (!stripInvalidElements)
					continue; // element will be encoded along with surrounding
								// text
				if (tag.getName() == HTMLElementName.SCRIPT && tag.getTagType() == StartTagType.NORMAL) {
					EndTag endTag = tag.getElement().getEndTag();
					if (endTag == null) {
						// script has no end tag, remove everything to the end
						// of the source and don't process any more tags
						outputDocument.remove(tag.getBegin(), source.getEnd());
						return outputDocument.toString();
					}
					// remove the whole script element including tags and
					// content
					outputDocument.remove(tag.getBegin(), endTag.getEnd());
					pos = endTag.getEnd();
					continue;
				}
				outputDocument.remove(tag);
			}
			reencodeTextSegment(source, outputDocument, pos, tag.getBegin(), formatWhiteSpace);
			pos = tag.getEnd();
		}
		reencodeTextSegment(source, outputDocument, pos, source.getEnd(), formatWhiteSpace);
		return outputDocument.toString();
	}

	private static boolean processTag(Tag tag, OutputDocument outputDocument, Set elementsToStrip) {
		String elementName = tag.getName();
		if (elementsToStrip != null && elementsToStrip.contains(elementName))
			return false;
		if (tag.getTagType() == StartTagType.NORMAL) {
			Element element = tag.getElement();
			if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
				if (element.getEndTag() == null)
					return false; // reject start tag if its required end tag is
									// missing
			} else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
				if (elementName == HTMLElementName.LI && !isValidLITag(tag))
					return false; // reject invalid LI tags
				if (element.getEndTag() == null)
					// insert optional end tag if it is missing
					outputDocument.insert(element.getEnd(), getEndTagHTML(elementName));
			}
			outputDocument.replace(tag, getStartTagHTML(element.getStartTag()));
		} else if (tag.getTagType() == EndTagType.NORMAL) {
			if (tag.getElement() == null)
				return false; // reject end tags that aren't associated with a
								// start tag
			if (elementName == HTMLElementName.LI && !isValidLITag(tag))
				return false; // reject invalid LI tags
			outputDocument.replace(tag, getEndTagHTML(elementName));
		} else {
			return false; // reject abnormal tags
		}
		return true;
	}

	private static boolean isValidLITag(Tag tag) {
		Element parentElement = tag.getElement().getParentElement();
		if (parentElement == null)
			return false; // ignore LI elements without a parent
		if (parentElement.getStartTag().getUserData() != VALID_MARKER)
			return false; // ignore LI elements who's parent is not valid
		// only accept LI tags who's immediate parent is UL or OL.
		return parentElement.getName() == HTMLElementName.UL || parentElement.getName() == HTMLElementName.OL;
	}

	private static void reencodeTextSegment(Source source, OutputDocument outputDocument, int begin, int end,
			boolean formatWhiteSpace) {
		if (begin >= end)
			return;
		Segment textSegment = new Segment(source, begin, end);
		String decodedText = CharacterReference.decode(textSegment);
		String encodedText = formatWhiteSpace ? CharacterReference.encodeWithWhiteSpaceFormatting(decodedText)
				: CharacterReference.encode(decodedText);
		outputDocument.replace(textSegment, encodedText);
	}

	private static CharSequence getStartTagHTML(StartTag startTag) {
		// tidies and filters out non-approved attributes
		StringBuilder sb = new StringBuilder();
		sb.append('<').append(startTag.getName());
		for (Attribute attribute : startTag.getAttributes()) {
			if (!NON_VALID_ATTRIBUTE_NAMES.contains(attribute.getKey())) {
				sb.append(' ').append(attribute.getName());
				if (attribute.getValue() != null) {
					sb.append("=\"");
					sb.append(CharacterReference.encode(attribute.getValue()));
					sb.append('"');
				}
			}
		}
		if (startTag.getElement().getEndTag() == null
				&& !HTMLElements.getEndTagOptionalElementNames().contains(startTag.getName()))
			sb.append(" /");
		sb.append('>');
		return sb;
	}

	private static String getEndTagHTML(String tagName) {
		return "';
	}

}