com.itextpdf.styledxmlparser.node.impl.jsoup.JsoupHtmlParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of styled-xml-parser Show documentation
Styled XML parser is used by iText modules to parse HTML and XML
There is a newer version: 9.0.0
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2024 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see .
 */
package com.itextpdf.styledxmlparser.node.impl.jsoup;

import com.itextpdf.commons.utils.MessageFormatUtil;
import com.itextpdf.styledxmlparser.IXmlParser;
import com.itextpdf.styledxmlparser.jsoup.Jsoup;
import com.itextpdf.styledxmlparser.jsoup.nodes.Comment;
import com.itextpdf.styledxmlparser.jsoup.nodes.DataNode;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
import com.itextpdf.styledxmlparser.jsoup.nodes.DocumentType;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import com.itextpdf.styledxmlparser.jsoup.nodes.TextNode;
import com.itextpdf.styledxmlparser.node.IDocumentNode;
import com.itextpdf.styledxmlparser.node.INode;
import com.itextpdf.styledxmlparser.node.impl.jsoup.node.JsoupDataNode;
import com.itextpdf.styledxmlparser.node.impl.jsoup.node.JsoupDocumentNode;
import com.itextpdf.styledxmlparser.node.impl.jsoup.node.JsoupDocumentTypeNode;
import com.itextpdf.styledxmlparser.node.impl.jsoup.node.JsoupElementNode;
import com.itextpdf.styledxmlparser.node.impl.jsoup.node.JsoupTextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;

/**
 * Class that uses JSoup to parse HTML.
 */
public class JsoupHtmlParser implements IXmlParser {

    /** The logger. */
    private static Logger logger = LoggerFactory.getLogger(JsoupHtmlParser.class);

    /* (non-Javadoc)
     * @see com.itextpdf.styledxmlparser.html.IXmlParser#parse(java.io.InputStream, java.lang.String)
     */
    @Override
    public IDocumentNode parse(InputStream htmlStream, String charset) throws IOException {
        // Based on some brief investigations, it seems that Jsoup uses baseUri for resolving relative uri's into absolute
        // on user demand. We perform such resolving in ResourceResolver class, therefore it is not needed here.
        String baseUri = "";
        Document doc = Jsoup.parse(htmlStream, charset, baseUri);
        INode result = wrapJsoupHierarchy(doc);
        if (result instanceof IDocumentNode) {
            return (IDocumentNode) result;
        } else {
            throw new IllegalStateException();
        }
    }

    /* (non-Javadoc)
     * @see com.itextpdf.styledxmlparser.html.IXmlParser#parse(java.lang.String)
     */
    @Override
    public IDocumentNode parse(String html) {
        Document doc = Jsoup.parse(html);
        INode result = wrapJsoupHierarchy(doc);
        if (result instanceof IDocumentNode) {
            return (IDocumentNode) result;
        } else {
            throw new IllegalStateException();
        }
    }

    /**
     * Wraps JSoup nodes into pdfHTML {@link INode} classes.
     *
     * @param jsoupNode the JSoup node instance
     * @return the {@link INode} instance
     */
    private INode wrapJsoupHierarchy(Node jsoupNode) {
        INode resultNode = null;
        if (jsoupNode instanceof Document) {
            resultNode = new JsoupDocumentNode((Document) jsoupNode) ;
        } else if (jsoupNode instanceof TextNode) {
            resultNode = new JsoupTextNode((TextNode) jsoupNode);
        } else if (jsoupNode instanceof Element) {
            resultNode = new JsoupElementNode((Element) jsoupNode);
        } else if (jsoupNode instanceof DataNode) {
            resultNode = new JsoupDataNode((DataNode) jsoupNode);
        } else if (jsoupNode instanceof DocumentType) {
            resultNode = new JsoupDocumentTypeNode((DocumentType) jsoupNode);
        } else if (jsoupNode instanceof Comment) {
        } else {
            logger.error(MessageFormatUtil.format("Could not map node type: {0}", jsoupNode.getClass()));
        }

        for (Node node : jsoupNode.childNodes()) {
            INode childNode = wrapJsoupHierarchy(node);
            if (childNode != null) {
                resultNode.addChild(childNode);
            }
        }

        return resultNode;
    }
}