All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.parse.html.JsoupParser.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.skeleton.crawl.parse.html

import ai.platon.pulsar.common.config.AppConstants.PULSAR_DOCUMENT_NORMALIZED_URI
import ai.platon.pulsar.common.config.AppConstants.PULSAR_META_INFORMATION_SELECTOR
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.skeleton.common.persist.ext.options
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.dom.select.selectFirstOrNull
import ai.platon.pulsar.persist.WebPage
import org.jsoup.Jsoup
import org.slf4j.LoggerFactory
import java.io.IOException
import java.util.concurrent.atomic.AtomicInteger

/**
 * Created by vincent on 16-9-14.
 */
class JsoupParser(
        private val page: WebPage,
        private val conf: ImmutableConfig
) {
    companion object {
        val numJsoupParses = AtomicInteger()
        val numJsoupParsed = AtomicInteger()
    }

    val LOG = LoggerFactory.getLogger(JsoupParser::class.java)

    var document = FeaturedDocument.NIL
        private set

    fun parse(): FeaturedDocument {
        numJsoupParses.incrementAndGet()

        if (page.encoding == null) {
            val primerParser = PrimerParser(conf)
            primerParser.detectEncoding(page)
        }

        try {
            val mutableDocument = Jsoup.parse(page.contentAsInputStream, page.encoding, page.baseUrl)
            updateMetaInfos(page, mutableDocument)

            // Calculate features for each node in the constructor
            document = FeaturedDocument(mutableDocument)
            return document
        } catch (e: IOException) {
            LOG.warn("Failed to parse page {}", page.url)
            LOG.warn(e.toString())
        }

        numJsoupParsed.incrementAndGet()

        return document
    }

    private fun updateMetaInfos(page: WebPage, document: org.jsoup.nodes.Document) {
        // the node is created by injected javascript
        val selector = PULSAR_META_INFORMATION_SELECTOR
        val metadata = document.selectFirstOrNull(selector) ?: return

        val urls = mutableMapOf(PULSAR_DOCUMENT_NORMALIZED_URI to page.url)

        page.href?.takeIf { UrlUtils.isStandard(it) }?.let {
            urls.put("href", it)
        }
        page.referrer.takeIf { UrlUtils.isStandard(it) }?.let {
            urls.put("referrer", it)
        }

        val head = document.head()
        urls.forEach { (rel, href) ->
            head.append("")
            metadata.attr(rel, href)
        }

        // deprecated, use body head link[rel=normalizedURI] instead
        metadata.attr("normURL", page.url)
        // deprecated, use body head link[rel=normalizedURI] instead
        metadata.attr("normalizedUrl", page.url)

        val options = page.options
        metadata.attr("label", options.label)
        metadata.attr("taskId", options.taskId)
        metadata.attr("taskTime", options.taskTime.toString())
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy