// ca.gc.aafc.dina.security.TextHtmlSanitizer (Maven / Gradle / Ivy artifact: dina-base-api)
// Base DINA API package for Java built on Spring Boot and Crnk.
package ca.gc.aafc.dina.security;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Safelist;
import java.util.Set;
/**
* Utility class to check and sanitize text received from the user in case it could be unsafe.
* Safe means safe to display on a html page.
*/
public final class TextHtmlSanitizer {
private static final Safelist NONE = Safelist.none();
private static final Safelist BASIC = Safelist.basic();
private static final int HTML_SHELL_SIZE = Document.createShell("").getAllElements().size();
private static final int MAX_ERROR_TRACKED = 5;
private static final Set CONDITIONAL_ACCEPTED_PARSE_ERROR =
Set.of("Unexpectedly reached end of file (EOF)",
"Invalid character reference: missing semicolon on [",
"Unexpected character ");
private TextHtmlSanitizer() {
//utility class
}
/**
* Sanitize value received from the user to make sure it is safe to return it.
* @param txt
* @return
*/
public static String sanitizeText(String txt) {
if (StringUtils.isBlank(txt)) {
return txt;
}
return Jsoup.clean(txt, NONE);
}
/**
* Check if the provided text can be considered as safe for html.
* allowUnescapedEntities is set to true
* @param txt
* @return
*/
public static boolean isSafeText(String txt) {
return isSafeText(txt, NONE, true);
}
public static boolean isSafeText(String txt, boolean allowUnescapedEntities) {
return isSafeText(txt, NONE, allowUnescapedEntities);
}
/**
* Check if the text is safe to use in HTML according to the Safelist.
* Optionally, the check can skip unescapedEntities (e.g. <, > ) if the text will be used in something else than html.
* @param txt the text input
* @param safelist JSoup Safelist instance
* @param allowUnescapedEntities should unescaped entities be identified as safe or no
* @return can the text be considered safe or not
*/
public static boolean isSafeText(String txt, Safelist safelist, boolean allowUnescapedEntities) {
if (StringUtils.isBlank(txt)) {
return true;
}
if(Jsoup.isValid(txt, safelist)) {
return true;
}
// make sure that the unescaped entities are not part of an unsafe html so, we sanitize the input first.
if(allowUnescapedEntities) {
return StringUtils.normalizeSpace(txt).equals(Parser.unescapeEntities(TextHtmlSanitizer.sanitizeText(txt), false));
}
return false;
}
/**
* Check if the text can be considered acceptable. Acceptable does NOT mean safe.
* The result should still be used with caution and proper escaping in html.
*
* Acceptable is defined by a text that only contains unclosed tag without creating additional html elements.
* If they were to prefix another html element it should only create an element from the Basic Safelist.
*
* @param txt
* @return
*/
public static boolean isAcceptableText(String txt) {
Parser p = Parser.htmlParser();
p.setTrackErrors(MAX_ERROR_TRACKED);
Document d = p.parseInput(txt, "");
// if a single element is added it should be rejected
if(d.getAllElements().size() != HTML_SHELL_SIZE) {
return false;
}
// if we reached the maximum number of errors it should be rejected
if(p.getErrors().size() == MAX_ERROR_TRACKED) {
return false;
}
// if some parsing errors are not in the accepted list it should be rejected
if (!p.getErrors().stream()
.allMatch(pe -> isAcceptableParseError(pe.getErrorMessage()))) {
return false;
}
// check the impact of prefixing the txt with a paragraph
return isFollowedByParagraphOnlyCreatesBasicElement(txt);
}
/**
* Checks if the errorMessage is part of the acceptable parse errors Set.
* The check is done on the prefix since the last part of the message is the concrete text.
* @param errorMessage
* @return
*/
private static boolean isAcceptableParseError(String errorMessage) {
return CONDITIONAL_ACCEPTED_PARSE_ERROR.stream()
.anyMatch( t -> StringUtils.startsWith(errorMessage, t));
}
/**
* Tries to evaluate the impact of having the acceptable text before a html paragraph.
* The browser may use the paragraph to close tags in the provided txt. We allow it if the impact is still passing the SafeList BASIC.
* @param txt
* @return
*/
private static boolean isFollowedByParagraphOnlyCreatesBasicElement(String txt) {
Parser p = Parser.htmlParser();
Document d = p.parseInput(txt + "abc
", "");
Cleaner c = new Cleaner(BASIC);
Document dd = c.clean(d);
// if a single element is cleaned it should be rejected
return d.getAllElements().size() == dd.getAllElements().size();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy