ai.platon.pulsar.rest.api.common.XSQLScrapeHyperlink.kt

package ai.platon.pulsar.rest.api.common

import ai.platon.pulsar.skeleton.session.PulsarSession
import ai.platon.pulsar.common.*
import ai.platon.pulsar.common.PulsarParams.VAR_IS_SCRAPE
import ai.platon.pulsar.skeleton.crawl.event.impl.DefaultLoadEventHandlers
import ai.platon.pulsar.skeleton.crawl.PageEventHandlers
import ai.platon.pulsar.skeleton.crawl.common.GlobalCacheFactory
import ai.platon.pulsar.skeleton.crawl.common.url.CompletableListenableHyperlink
import ai.platon.pulsar.skeleton.crawl.event.impl.PageEventHandlersFactory
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.ql.context.AbstractSQLContext
import ai.platon.pulsar.ql.common.ResultSets
import ai.platon.pulsar.ql.h2.utils.ResultSetUtils
import ai.platon.pulsar.rest.api.entities.ScrapeRequest
import ai.platon.pulsar.rest.api.entities.ScrapeResponse
import org.h2.jdbc.JdbcSQLException
import java.sql.Connection
import java.sql.ResultSet
import java.time.Instant
import java.util.*
import kotlin.system.measureTimeMillis

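/**
 * Load-event handlers that wire a scrape task into the load pipeline: they mark
 * the page as a scrape target before parsing, extract the result set once the
 * HTML document is parsed, and complete the hyperlink when the load finishes.
 */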
class ScrapeLoadEventHandlers(
    val hyperlink: XSQLScrapeHyperlink,
    val response: ScrapeResponse,
) : DefaultLoadEventHandlers() {
    init {
        onWillLoad.addLast {
            response.pageStatusCode = ResourceStatus.SC_PROCESSING
            null
        }
        onWillParseHTMLDocument.addLast { page ->
            page.variables[VAR_IS_SCRAPE] = true
            null
        }
        onHTMLDocumentParsed.addLast { page, document ->
            require(page.hasVar(VAR_IS_SCRAPE))
            hyperlink.extract(page, document)
        }
        onLoaded.addLast { page ->
            hyperlink.complete(page)
        }
    }
}

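/**
 * A completable, listenable hyperlink carrying an X-SQL scrape task. The crawl
 * loop loads the page; once the HTML document is parsed, the task's SQL is
 * executed against a pooled H2 connection and the extracted entities are
 * written into [response].
 */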
open class XSQLScrapeHyperlink(
    val request: ScrapeRequest,
    val sql: NormXSQL,
    val session: PulsarSession,
    val globalCacheFactory: GlobalCacheFactory,
    val uuid: String = UUID.randomUUID().toString()
) : CompletableListenableHyperlink<ScrapeResponse>(sql.url) {

    private val logger = getLogger(XSQLScrapeHyperlink::class)

    private val sqlContext get() = session.context as AbstractSQLContext
    private val connectionPool get() = sqlContext.connectionPool
    private val randomConnection get() = sqlContext.randomConnection

    val response = ScrapeResponse()

    override var args: String? = "-parse ${sql.args}"
    override var event: PageEventHandlers =
        PageEventHandlersFactory().create(loadEventHandlers = ScrapeLoadEventHandlers(this, response))

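    /** Execute the scrape SQL for this task and return the raw result set. */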
    open fun executeQuery(): ResultSet = executeQuery(request, response)

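    /**
     * Record the page's transfer metrics into the response, then run the
     * extraction; unexpected exceptions are logged and rethrown.
     */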
    open fun extract(page: WebPage, document: FeaturedDocument) {
        try {
            response.pageContentBytes = page.contentLength.toInt()
            response.pageStatusCode = page.protocolStatus.minorCode

            doExtract(page, document)
        } catch (t: Throwable) {
            // Log the unexpected failure, then rethrow so the crawl loop can handle it
            logger.warn("Unexpected exception", t)
            throw t
        }
    }

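    /** Mark the response as finished and complete this hyperlink's future with it. */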
    open fun complete(page: WebPage) {
        response.uuid = uuid
        response.isDone = true
        response.finishTime = Instant.now()

        complete(response)
    }

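    /**
     * Skip the query and return an empty result set when the page was not
     * fetched successfully or has no content to extract from.
     */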
    protected open fun doExtract(page: WebPage, document: FeaturedDocument): ResultSet {
        if (!page.protocolStatus.isSuccess ||
            page.contentLength == 0L || page.persistedContentLength == 0L ||
            page.content == null
        ) {
            response.statusCode = ResourceStatus.SC_NO_CONTENT
            return ResultSets.newSimpleResultSet()
        }

        return executeQuery(request, response)
    }

    protected open fun executeQuery(request: ScrapeRequest, response: ScrapeResponse): ResultSet {
        var rs: ResultSet = ResultSets.newSimpleResultSet()

        try {
            response.statusCode = ResourceStatus.SC_OK

            rs = executeQuery(sql.sql)
            val resultSet = mutableListOf<Map<String, Any?>>()
            ResultSetUtils.getEntitiesFromResultSetTo(rs, resultSet)
            response.resultSet = resultSet
        } catch (e: JdbcSQLException) {
            response.statusCode = ResourceStatus.SC_EXPECTATION_FAILED
            logger.warn("Failed to execute sql #${response.uuid}\n{}", e.brief())
        } catch (e: Throwable) {
            response.statusCode = ResourceStatus.SC_EXPECTATION_FAILED
            logger.warn("Failed to execute sql #${response.uuid}\n{}", e.brief())
        }

        return rs
    }

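    // Borrow a connection from the pool (falling back to a random connection)
    // and return it to the pool once the query completes.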
    private fun executeQuery(sql: String): ResultSet {
        val connection = connectionPool.poll() ?: randomConnection
        return executeQuery(sql, connection).also { connectionPool.offer(connection) }
    }

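    // Execute the query on a read-only, scrollable statement. The result set is
    // copied eagerly so it stays readable after the statement is closed and the
    // connection is returned to the pool.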
    private fun executeQuery(sql: String, conn: Connection): ResultSet {
        var result: ResultSet? = null
        val millis = measureTimeMillis {
            conn.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_READ_ONLY)?.use { st ->
                try {
                    st.executeQuery(sql)?.use { rs ->
                        result = ResultSetUtils.copyResultSet(rs)
                    }
                } catch (e: JdbcSQLException) {
                    val message = e.toString()
                    if (message.contains("Syntax error in SQL statement")) {
                        response.statusCode = ResourceStatus.SC_BAD_REQUEST
                        logger.warn("Syntax error in SQL statement #${response.uuid}>>>\n{}\n<<<", e.sql)
                    } else {
                        response.statusCode = ResourceStatus.SC_EXPECTATION_FAILED
                        logger.warn("Failed to execute scrape task #${response.uuid}\n{}", e.stringify())
                    }
                }
            }
        }

        return result ?: ResultSets.newResultSet()
    }
}
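
/*
 * A minimal usage sketch (hypothetical; the service-side wiring is not shown in
 * this file). CompletableListenableHyperlink is assumed here to expose
 * CompletableFuture-style completion, matching the complete(response) call
 * above, and the URL-pool submission API is likewise an assumption:
 *
 *   val hyperlink = XSQLScrapeHyperlink(request, sql, session, globalCacheFactory)
 *   globalCacheFactory.globalCache.urlPool.add(hyperlink)    // assumed submission API
 *   hyperlink.thenAccept { res -> println(res.resultSet) }   // completes with ScrapeResponse
 */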