All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.itextpdf.styledxmlparser.jsoup.safety.Cleaner Maven / Gradle / Ivy

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2024 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see .
 */
package com.itextpdf.styledxmlparser.jsoup.safety;

import com.itextpdf.styledxmlparser.jsoup.helper.Validate;
import com.itextpdf.styledxmlparser.jsoup.nodes.Attribute;
import com.itextpdf.styledxmlparser.jsoup.nodes.Attributes;
import com.itextpdf.styledxmlparser.jsoup.nodes.DataNode;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import com.itextpdf.styledxmlparser.jsoup.nodes.TextNode;
import com.itextpdf.styledxmlparser.jsoup.parser.ParseErrorList;
import com.itextpdf.styledxmlparser.jsoup.parser.Parser;
import com.itextpdf.styledxmlparser.jsoup.parser.Tag;
import com.itextpdf.styledxmlparser.jsoup.select.NodeTraversor;
import com.itextpdf.styledxmlparser.jsoup.select.NodeVisitor;

import java.util.List;


/**
 The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
 that you are expecting; no junk, and no cross-site scripting attacks!
 

The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain HTML that is allowed by the safelist.

It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the canned safe-lists only allow body contained tags.

Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link com.itextpdf.styledxmlparser.jsoup.Jsoup}. */ public class Cleaner { private final Safelist safelist; /** * Create a new cleaner, that sanitizes documents using the supplied safelist. * * @param safelist safe-list to clean with */ public Cleaner(Safelist safelist) { Validate.notNull(safelist); this.safelist = safelist; } /** * Use {@link #Cleaner(Safelist)} instead. * * @deprecated as of 1.14.1. */ @Deprecated public Cleaner(Whitelist whitelist) { Validate.notNull(whitelist); this.safelist = whitelist; } /** * Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist. * The original document is not modified. Only elements from the dirty document's body are used. The * OutputSettings of the original document are cloned into the clean document. * * @param dirtyDocument Untrusted base document to clean. * @return cleaned document. */ public Document clean(Document dirtyDocument) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); copySafeNodes(dirtyDocument.body(), clean.body()); clean.outputSettings((Document.OutputSettings) dirtyDocument.outputSettings().clone()); return clean; } /** * Determines if the input document bodyis valid, against the safelist. It is considered valid if all the tags and attributes * in the input HTML are allowed by the safelist, and that there is no content in the head. *

* This method can be used as a validator for user input. An invalid document will still be cleaned successfully * using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document * to ensure enforced attributes are set correctly, and that the output is tidied. * * @param dirtyDocument document to test * @return true if no tags or attributes need to be removed; false if they do */ public boolean isValid(Document dirtyDocument) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); return numDiscarded == 0 && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head } public boolean isValidBodyHtml(String bodyHtml) { Document clean = Document.createShell(""); Document dirty = Document.createShell(""); ParseErrorList errorList = ParseErrorList.tracking(1); List nodes = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList); dirty.body().insertChildren(0, nodes); int numDiscarded = copySafeNodes(dirty.body(), clean.body()); return numDiscarded == 0 && errorList.isEmpty(); } /** * Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. */ private final class CleaningVisitor implements NodeVisitor { int numDiscarded = 0; private final Element root; private Element destination; // current element to append nodes to CleaningVisitor(Element root, Element destination) { this.root = root; this.destination = destination; } public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText()); destination.appendChild(destText); } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } } public void tail(Node source, int depth) { if (source instanceof Element && safelist.isSafeTag(source.nodeName())) { destination = (Element) destination.parent(); // would have descended, so pop destination stack } } } private int copySafeNodes(Element source, Element dest) { CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest); NodeTraversor.traverse(cleaningVisitor, source); return cleaningVisitor.numDiscarded; } private ElementMeta createSafeElement(Element sourceEl) { String sourceTag = sourceEl.tagName(); Attributes destAttrs = new Attributes(); Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) destAttrs.put(sourceAttr); else numDiscarded++; } Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag); destAttrs.addAll(enforcedAttrs); return new ElementMeta(dest, numDiscarded); } private static class ElementMeta { Element el; int numAttribsDiscarded; ElementMeta(Element el, int numAttribsDiscarded) { this.el = el; this.numAttribsDiscarded = numAttribsDiscarded; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy