ai.platon.pulsar.skeleton.crawl.parse.html.PrimerHtmlParser.kt Maven / Gradle / Ivy

Go to download
package ai.platon.pulsar.skeleton.crawl.parse.html

import ai.platon.pulsar.common.config.CapabilityTypes.PARSE_DEFAULT_ENCODING
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.Params
import ai.platon.pulsar.skeleton.common.persist.ext.loadEvent
import ai.platon.pulsar.skeleton.crawl.parse.ParseFilters
import ai.platon.pulsar.skeleton.crawl.parse.ParseResult
import ai.platon.pulsar.skeleton.crawl.parse.Parser
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.metadata.ParseStatusCodes
import ai.platon.pulsar.skeleton.common.persist.ext.loadEventHandlers
import ai.platon.pulsar.skeleton.crawl.GlobalEventHandlers
import org.slf4j.LoggerFactory
import java.net.MalformedURLException
import java.time.Duration
import java.util.concurrent.atomic.AtomicInteger

/**
 * Primer HTML parser is a built-in parser while others should be a protocol-plugin.
 * Primer HTML parser uses Jsoup to parse the page content to a HTML document.
 * The parser can also register [ai.platon.pulsar.skeleton.crawl.parse.ParseFilter]s to extend the system.
 */
class PrimerHtmlParser(
    private val parseFilters: ParseFilters? = null,
    private val conf: ImmutableConfig,
) : Parser {
    companion object {
        val numHtmlParses = AtomicInteger()
        val numHtmlParsed = AtomicInteger()
    }

    private val logger = LoggerFactory.getLogger(PrimerHtmlParser::class.java)
    private val tracer = logger.takeIf { it.isDebugEnabled }
    private val defaultCharEncoding = conf.get(PARSE_DEFAULT_ENCODING, "utf-8")
    private val primerParser = PrimerParser(conf)
    /**
     * Optimized for html content
     * To parse non-html content, the parser might run into an endless loop,
     * run it in a separate coroutine to protect the process
     * */
    override val timeout = Duration.ZERO

    init {
        logger.info(params.formatAsLine())
    }

    constructor(conf: ImmutableConfig): this(null, conf)

    override fun getParams(): Params {
        return Params.of(
            "className", this.javaClass.simpleName,
            "defaultCharEncoding", defaultCharEncoding,
            "parseFilters", parseFilters?.javaClass?.simpleName ?: "null",
        )
    }

    override fun parse(page: WebPage): ParseResult {
        return try {
            // The base url is set by protocol, it might be different from the page url
            // if the request redirects.
            onWillParseHTMLDocument(page)

            val parseContext = primerParser.parseHTMLDocument(page)

            parseFilters?.filter(parseContext)

            parseContext.document?.let { onHTMLDocumentParsed(page, it) }

            parseContext.parseResult
        } catch (e: MalformedURLException) {
            ParseResult.failed(ParseStatusCodes.FAILED_MALFORMED_URL, e.message)
        } catch (e: Exception) {
            ParseResult.failed(ParseStatusCodes.FAILED_INVALID_FORMAT, e.message)
        }
    }

    /**
     *
     * */
    private fun onWillParseHTMLDocument(page: WebPage) {
        numHtmlParses.incrementAndGet()

        try {
            // notice the calling order.
            // The more specific handlers has the opportunity to override the result of more general handlers.
            page.loadEventHandlers?.onWillParseHTMLDocument?.invoke(page)
            GlobalEventHandlers.pageEventHandlers?.loadEventHandlers?.onWillParseHTMLDocument?.invoke(page)
        } catch (e: Throwable) {
            logger.warn("Failed to invoke onWillParseHTMLDocument | ${page.configuredUrl}", e)
        }
    }

    /**
     *
     * */
    private fun onHTMLDocumentParsed(page: WebPage, document: FeaturedDocument) {
        try {
            // The more specific handlers has the opportunity to override the result of more general handlers.
            page.loadEventHandlers?.onHTMLDocumentParsed?.invoke(page, document)
            GlobalEventHandlers.pageEventHandlers?.loadEventHandlers?.onHTMLDocumentParsed?.invoke(page, document)
        } catch (e: Throwable) {
            logger.warn("Failed to invoke onHTMLDocumentParsed | ${page.configuredUrl}", e)
        } finally {
            numHtmlParsed.incrementAndGet()
        }
    }
}