All downloads are FREE. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.skeleton.common.collect.HyperlinkExtractors.kt Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version
package ai.platon.pulsar.skeleton.common.collect

import ai.platon.pulsar.skeleton.session.PulsarSession
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.Strings
import ai.platon.pulsar.common.urls.Hyperlink
import ai.platon.pulsar.common.urls.StatefulHyperlink
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.common.urls.preprocess.UrlNormalizer
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.dom.nodes.node.ext.bestElement
import ai.platon.pulsar.dom.nodes.node.ext.isAnchor
import ai.platon.pulsar.dom.select.appendSelectorIfMissing
import ai.platon.pulsar.dom.select.collectNotNull
import ai.platon.pulsar.persist.WebPage
import org.slf4j.Logger
import org.slf4j.LoggerFactory

/**
 * Collects [Hyperlink]s from the anchors of [document] selected by [selector].
 *
 * @property page the page being processed; its url is attached to every link as the referrer
 * @property document the parsed document the anchors are selected from
 * @property selector a CSS selector; it is passed through `appendSelectorIfMissing(selector, "a")`
 *   before being applied to the document
 * @property normalizer optional url normalizer; when it is absent, or yields `null` for a url,
 *   the raw href is used as the link's url
 */
open class HyperlinkExtractor(
    val page: WebPage,
    val document: FeaturedDocument,
    val selector: String,
    val normalizer: UrlNormalizer? = null
) {
    private val log = LoggerFactory.getLogger(HyperlinkExtractor::class.java)

    /**
     * Extracts the links into a new [LinkedHashSet].
     *
     * @return the newly created collection holding the extracted links
     */
    fun extract(): MutableCollection<Hyperlink> = extractTo(LinkedHashSet())

    /**
     * Extracts the links and adds them to [fetchUrls].
     *
     * Only anchors whose `abs:href` attribute is accepted by [UrlUtils.isStandard] produce a link.
     * A summary of the run is logged afterwards.
     *
     * @param fetchUrls the destination collection; it is modified in place
     * @return the same [fetchUrls] instance
     */
    fun extractTo(fetchUrls: MutableCollection<Hyperlink>): MutableCollection<Hyperlink> {
        val anchorSelector = appendSelectorIfMissing(selector, "a")

        // Running counter over the accepted links only; rejected anchors do not consume a number.
        // It is passed as the third Hyperlink argument (presumably the link order - confirm).
        var order = 0
        val parsedUrls = document.select(anchorSelector).mapNotNull { element ->
            val href = element.attr("abs:href").takeIf { UrlUtils.isStandard(it) }
                ?: return@mapNotNull null
            // Keep the raw href alongside the (possibly normalized) url.
            val url = normalizer?.invoke(href) ?: href
            Hyperlink(url, element.text(), order++, referrer = page.url, href = href)
        }
        parsedUrls.toCollection(fetchUrls)

        reportHyperlink(page, parsedUrls, fetchUrls, log)

        return fetchUrls
    }
}

/**
 * Collects [StatefulHyperlink]s from the anchors found inside a restricted section of [document],
 * keeping only those whose url entirely matches [urlPattern].
 *
 * @property page the page being processed; its url is attached to every link as the referrer
 * @property document the parsed document to search
 * @property restrictCss a CSS selector; only the first element it selects is searched for anchors
 * @property urlPattern a regular expression a url must match as a whole to be accepted;
 *   it is compiled once when the extractor is constructed
 */
open class RegexHyperlinkExtractor(
    val page: WebPage,
    val document: FeaturedDocument,
    val restrictCss: String,
    val urlPattern: String
) {
    private val log = LoggerFactory.getLogger(RegexHyperlinkExtractor::class.java)

    private val urlRegex = urlPattern.toRegex()

    /**
     * Creates an extractor whose document is obtained by parsing [page] with [session].
     */
    constructor(
        session: PulsarSession,
        page: WebPage,
        restrictCss: String,
        urlPattern: String
    ) : this(page, session.parse(page), restrictCss, urlPattern)

    /**
     * Extracts the links into a new [LinkedHashSet].
     *
     * @return the newly created collection holding the extracted links
     */
    fun extract(): MutableCollection<StatefulHyperlink> = extractTo(LinkedHashSet())

    /**
     * Extracts the links and adds them to [fetchUrls].
     *
     * When [restrictCss] selects nothing, a warning is logged and [fetchUrls] is returned untouched.
     * Otherwise a summary of the run is logged after the links have been added.
     *
     * @param fetchUrls the destination collection; it is modified in place
     * @return the same [fetchUrls] instance
     */
    fun extractTo(fetchUrls: MutableCollection<StatefulHyperlink>): MutableCollection<StatefulHyperlink> {
        val restrictedSection = document.document.selectFirst(restrictCss)
        if (restrictedSection == null) {
            log.warn("There is no restricted section <{}> | {}", restrictCss, page.url)
            return fetchUrls
        }

        // Running counter over the accepted links only; rejected nodes do not consume a number.
        // It is passed as the third StatefulHyperlink argument (presumably the link order - confirm).
        var order = 0
        val parsedUrls = restrictedSection.collectNotNull { node ->
            val href = node.takeIf { it.isAnchor }?.attr("abs:href")
                ?.takeIf { UrlUtils.isStandard(it) && it.matches(urlRegex) }
                ?: return@collectNotNull null
            StatefulHyperlink(href, node.bestElement.text(), order++, referrer = page.url)
        }
        parsedUrls.toCollection(fetchUrls)

        reportHyperlink(page, parsedUrls, fetchUrls, log)

        return fetchUrls
    }
}

/**
 * Logs a one-line summary of a link extraction run on a portal page.
 *
 * When the page's content length is below 100 (presumably bytes - confirm), the page is reported
 * as too small and no statistics are logged. Otherwise the number of links found, the formatted
 * content length, the size of the destination collection and a symbolic link for the page url
 * are logged.
 *
 * Only the sizes of the collections are read, so their element type is irrelevant.
 *
 * @param page the portal page the links were extracted from
 * @param links the links found by this extraction run
 * @param fetchUrls the destination collection after the links have been added
 * @param log the logger to write the summary to
 */
private fun reportHyperlink(
    page: WebPage,
    links: Collection<*>,
    fetchUrls: Collection<*>,
    log: Logger
) {
    if (page.contentLength < 100) {
        log.info("Portal page is illegal (too small) | {}", page.url)
        return
    }

    val exportLink = AppPaths.uniqueSymbolicLinkForUri(page.url)
    val readableBytes = Strings.compactFormat(page.contentLength)
    log.info(
        "{}. There are {} links in portal page ({}), total {} fetch urls | file://{} | {}",
        page.id, links.size, readableBytes, fetchUrls.size, exportLink, page.url
    )
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy