All downloads are free. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.skeleton.crawl.parse.html.JsoupUtils.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.skeleton.crawl.parse.html

import com.google.common.collect.Sets
import org.apache.commons.lang3.StringUtils
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.jsoup.select.NodeTraversor
import java.util.*
import java.util.function.Consumer

/**
 * Created by vincent on 17-8-9.
 * Copyright @ 2013-2023 Platon AI. All rights reserved
 */
object JsoupUtils {
    /**
     * Strips unwanted nodes from [doc] in place, then annotates the elements that remain.
     *
     * Steps, in order:
     * 1. Collect every node named `title`, `base`, `script`, `meta` or `iframe`, plus every
     *    `link` node whose `rel` attribute is `icon` or `shortcut icon`. When [pithy] is true,
     *    additionally drop the `style` attribute from every visited node and collect every
     *    `style`, `link` and `head` node.
     * 2. Remove the collected nodes from the tree.
     * 3. For every remaining element that has an id or a class name, store its CSS selector in
     *    the `pulsar-selector` attribute and add the `has-selector` class.
     * 4. Set the id of each `html`, `head` and `body` element still present to `pulsarHtml`,
     *    `pulsarHead` and `pulsarBody` respectively.
     *
     * @param doc the document to clean; it is mutated
     * @param pithy whether to also strip styling-related nodes and inline `style` attributes
     * @return the same [doc] instance that was passed in
     */
    fun sanitize(doc: Document, pithy: Boolean): Document {
        val unsafeNodeNames: Set<String> = setOf("title", "base", "script", "meta", "iframe")
        // Favicon links were previously listed among the node names as CSS selectors
        // ("link[ref=icon]"); nodeName() can never equal a selector, so they are matched
        // explicitly by tag name and `rel` value instead.
        val faviconRels: Set<String> = setOf("icon", "shortcut icon")
        val obsoleteNodeNames: Set<String> = setOf("style", "link", "head")

        // Removal is deferred until the walk is finished so the tree is not restructured
        // while it is being traversed.
        val obsoleteNodes: MutableSet<Node> = HashSet()
        NodeTraversor.traverse({ node: Node, _: Int ->
            val nodeName = node.nodeName()
            val isFaviconLink = nodeName == "link" && node.attr("rel").trim().let { rel ->
                faviconRels.any { it.equals(rel, ignoreCase = true) }
            }
            if (nodeName in unsafeNodeNames || isFaviconLink) {
                obsoleteNodes.add(node)
            }
            if (pithy) {
                node.removeAttr("style")
                if (nodeName in obsoleteNodeNames) {
                    obsoleteNodes.add(node)
                }
            }
        }, doc)

        obsoleteNodes.forEach { it.remove() }

        NodeTraversor.traverse({ node: Node?, _: Int ->
            val ele = node as? Element ?: return@traverse
            // Only elements identifiable by id or class get a selector annotation.
            if (ele.id().isEmpty() && ele.className().isEmpty()) {
                return@traverse
            }
            // Compute the selector before adding the marker class to this element.
            val selector = ele.cssSelector()
            ele.attr("pulsar-selector", selector)
            ele.addClass("has-selector")
        }, doc)

        // Assigned last, after all selectors above have already been computed.
        for (ele in doc.select("html,head,body")) {
            ele.attr("id", "pulsar" + StringUtils.capitalize(ele.nodeName()))
        }
        return doc
    }

    fun toHtmlPiece(doc_: Document, pithy: Boolean): String {
        var doc = doc_
        doc = sanitize(doc, pithy)
        var content = doc.toString()
        val pos = StringUtils.indexOf(content, " 0) {
            content = content.substring(pos)
        }
        content = content
                .replaceFirst("




© 2015 - 2024 Weber Informatics LLC | Privacy Policy