All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.dom.model.Models.kt Maven / Gradle / Ivy

package ai.platon.pulsar.dom.model

import ai.platon.pulsar.dom.nodes.node.ext.height
import ai.platon.pulsar.dom.nodes.node.ext.width
import org.apache.commons.lang3.StringUtils
import org.jsoup.nodes.Element
import java.util.*

/**
 * Image file name suffixes, written without the leading dot.
 */
val imageSuffixes: Array<String> = arrayOf(
        "jpg",
        "jpeg",
        "png",
        "gif",
        "webp"
)

/**
 * The image that represents a "loading" state.
 *
 * NOTE(review): the value is an empty string in this file; presumably an image
 * location or data URI was intended here - confirm.
 */
const val imageLoading: String = ""

/**
 * The image that represents a dot. `createImage` writes it into the `src`
 * attribute of images created in lazy mode.
 *
 * NOTE(review): the value is an empty string in this file; presumably an image
 * location or data URI was intended here - confirm.
 */
const val imageDot: String = ""

/**
 * The image model.
 *
 * @property attributes the attribute name to attribute value pairs of the image.
 * A [TreeMap] keeps them ordered by name, so [toString] renders them in a
 * deterministic order.
 */
data class Image(val attributes: TreeMap<String, String> = TreeMap()) {
    /**
     * Renders every attribute as `name='value'`, separated by single spaces.
     * An image without attributes renders as an empty string.
     */
    override fun toString(): String {
        return attributes.entries.joinToString(" ") { (name, value) -> "$name='$value'" }
    }
}

/**
 * The link model.
 *
 * @property text the text of the link
 * @property attributes the attribute name to attribute value pairs of the link,
 * kept ordered by name by the [TreeMap]
 * @property image the image of the link, or null if there is none
 */
data class Link(
        var text: String = "",
        val attributes: TreeMap<String, String> = TreeMap(),
        var image: Image? = null
) {
    /**
     * Renders the image (if any) followed by the text, and then every attribute
     * as `name='value'`, separated by single spaces.
     *
     * NOTE(review): the image and the text are emitted directly in front of the
     * first attribute without a separator - confirm this is the intended format.
     */
    override fun toString(): String {
        val imageString = image?.toString() ?: ""
        return attributes.entries
                .joinToString(" ", "$imageString$text") { (name, value) -> "$name='$value'" }
    }
}

/**
 * The website model.
 *
 * @property domain the domain of the website
 * @property name the name of the website
 */
data class Website(
        val domain: String,
        val name: String
)

/**
 * Create a [Link] from an anchor element, with an optional image.
 *
 * Lazy attributes are compatible with https://appelsiini.net/projects/lazyload/v1/
 *
 * @param ele the element to convert; anything other than an `a` element yields an empty [Link]
 * @param keepMetadata whether the remaining attributes of the anchor are copied to the link;
 * also passed on to [createImage] for the first nested `img`
 * @param lazy passed on to [createImage] for the first nested `img`
 * @return the link; its `href` attribute is the absolute URL whenever one can be resolved
 */
fun createLink(ele: Element, keepMetadata: Boolean = true, lazy: Boolean = false): Link {
    val link = Link()

    if (!ele.tagName().equals("a", ignoreCase = true)) {
        return link
    }

    // only the first image inside the anchor is used
    val image = ele.getElementsByTag("img").first()
    if (image != null) {
        link.image = createImage(image, keepMetadata = keepMetadata, lazy = lazy)
    }

    link.text = sniffLinkText(ele, image)

    if (keepMetadata) {
        buildAttributes(ele, link)
    }

    // The absolute href is written last: buildAttributes copies the raw attribute value,
    // and a short relative href (e.g. "/about") would otherwise replace the absolute URL.
    val href = ele.absUrl("href")
    if (href.isNotBlank()) {
        // keep the same ".js" -> ".js.rename" rewrite that buildAttributes applies to copied values
        link.attributes["href"] = if (keepMetadata && href.endsWith(".js")) "$href.rename" else href
    }

    return link
}

/**
 * Create an [Image] from an `img` element.
 *
 * Lazy attributes are compatible with https://appelsiini.net/projects/lazyload/v1/
 *
 * The image location is chosen in this order: a non-blank attribute whose name contains
 * `data-src`, then a non-blank attribute whose name contains `lazy`, then the `src` attribute.
 *
 * @param ele the element to convert; anything other than an `img` element yields an empty [Image]
 * @param keepMetadata whether the attributes of the element (except `id`, `class` and `style`)
 * are copied to the image
 * @param lazy if true, the chosen location is stored in `data-original` and `src` is set to
 * [imageDot]; otherwise the chosen location is stored in `src`
 * @return the image model
 */
fun createImage(ele: Element, keepMetadata: Boolean = true, lazy: Boolean = false): Image {
    val image = Image()

    if (ele.tagName() != "img") {
        return image
    }

    val ignoredAttrs = setOf("id", "class", "style")

    // Candidates for the image location. They are tracked outside of image.attributes
    // so that they are still known when keepMetadata is false.
    var actualSrc: String? = null
    var dataSrc: String? = null
    var lazyAttrSrc: String? = null

    for (attr in ele.attributes()) {
        val name = attr.key
        var value = attr.value

        if (name in ignoredAttrs) {
            continue
        }

        if (name.startsWith("data-") && value == "0") {
            continue
        }

        // TODO : this is tricky to found out lazy and actual image src
        if (maybeUrl(name, value)) {
            val absUrl = ele.absUrl(name)

            if (name.contains("lazy")) {
                lazyAttrSrc = absUrl
            }

            // data-src is a better candidate than lazy, so it is remembered separately
            // and preferred below regardless of the attribute order
            if (name.contains("data-src")) {
                dataSrc = absUrl
            }

            value = absUrl
        }

        if (value.endsWith(".js")) {
            value += ".rename"
        }

        if (name == "src" && value.isNotEmpty()) {
            actualSrc = value
        }

        if (keepMetadata && name.isNotEmpty() && value.isNotEmpty()) {
            image.attributes[name] = value
        }
    }

    val lazySrc = dataSrc?.takeIf { it.isNotBlank() } ?: lazyAttrSrc?.takeIf { it.isNotBlank() }
    // fall back to the real src, so an image without any lazy attribute keeps its location
    val src = lazySrc ?: actualSrc ?: ""

    if (lazy) {
        // compatible with https://appelsiini.net/projects/lazyload/v1/
        val w = ele.width
        val h = ele.height
        image.attributes["alt"] = "img[$w x $h]"
        image.attributes["class"] = image.attributes["class"]?.let { "$it lazy" } ?: "lazy"
        if (src.isNotBlank()) {
            image.attributes["data-original"] = src
        }
        image.attributes["src"] = imageDot
        image.attributes["style"] = "min-width:10px; min-height:10px"
    } else {
        image.attributes["src"] = src
    }

    return image
}

/**
 * Copy the attributes of [ele] into the attributes of [link].
 *
 * Presentation attributes (`id`, `class`, `style`, `_target`, `target`, `title`) and
 * `data-*` attributes whose value is `"0"` are skipped. A value that may be a URL is
 * replaced by its absolute form, a value ending with `.js` gets `.rename` appended,
 * and entries with an empty name or an empty final value are not stored.
 */
private fun buildAttributes(ele: Element, link: Link) {
    val skippedNames = setOf("id", "class", "style", "_target", "target", "title")

    for (attribute in ele.attributes()) {
        val key = attribute.key
        val raw = attribute.value

        // TODO : tricky? site specified?
        val isZeroDataAttr = key.startsWith("data-") && raw == "0"
        if (key in skippedNames || isZeroDataAttr) {
            continue
        }

        // TODO : better sniff strategy
        val resolved = if (maybeUrl(key, raw)) ele.absUrl(key) else raw
        val stored = if (resolved.endsWith(".js")) "$resolved.rename" else resolved

        if (key.isNotEmpty() && stored.isNotEmpty()) {
            link.attributes[key] = stored
        }
    }
}

/**
 * Find a text for a link.
 *
 * The first candidate that is not empty after trimming wins: the text of [link],
 * then its `title` attribute, then the `alt` attribute of [image] (if an image is given).
 *
 * @return the trimmed text, or an empty string if no candidate has any content
 */
fun sniffLinkText(link: Element, image: Element?): String =
        StringUtils.trimToNull(link.text())
                ?: StringUtils.trimToNull(link.attr("title"))
                ?: image?.let { StringUtils.trimToNull(it.attr("alt")) }
                ?: ""

/**
 * Guess whether an attribute holds a URL.
 *
 * The attribute is considered a URL if any of the following is true:
 * - its name is one of `src`, `url`, `data-src`, `data-url`
 * - its value contains `http://` or `https://`
 * - its value contains more than three `/` characters
 *
 * @param attrName the attribute name
 * @param attrValue the attribute value
 * @return true if the attribute may hold a URL
 */
fun maybeUrl(attrName: String, attrValue: String): Boolean {
    when (attrName) {
        "src", "url", "data-src", "data-url" -> return true
    }

    // a short https URL such as "https://host/a.png" has only three slashes,
    // so the scheme has to be checked explicitly
    if (attrValue.contains("http://") || attrValue.contains("https://")) {
        return true
    }

    return attrValue.count { it == '/' } > 3
}

/**
 * Check whether a href is a pseudo link, i.e. it does not lead to another document.
 *
 * pseudo links :
 * href="#comment"
 * href="javascript:;"
 * href="void:;"
 *
 * @param href the raw href value
 * @return true if [href] starts with `#`, `javascript:` or `void`
 */
fun isPseudoLink(href: String): Boolean {
    // The three forms are alternatives, so they are combined with "or"; the script prefix
    // includes the colon so relative links such as "javadoc/index.html" are not matched.
    return href.startsWith("#") || href.startsWith("javascript:") || href.startsWith("void")
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy