ai.platon.pulsar.rest.api.common.XSQLScrapeHyperlink.kt
package ai.platon.pulsar.rest.api.common

import ai.platon.pulsar.common.*
import ai.platon.pulsar.common.PulsarParams.VAR_IS_SCRAPE
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.ql.common.ResultSets
import ai.platon.pulsar.ql.context.AbstractSQLContext
import ai.platon.pulsar.ql.h2.utils.ResultSetUtils
import ai.platon.pulsar.rest.api.entities.ScrapeRequest
import ai.platon.pulsar.rest.api.entities.ScrapeResponse
import ai.platon.pulsar.skeleton.crawl.PageEventHandlers
import ai.platon.pulsar.skeleton.crawl.common.GlobalCacheFactory
import ai.platon.pulsar.skeleton.crawl.common.url.CompletableListenableHyperlink
import ai.platon.pulsar.skeleton.crawl.event.impl.DefaultLoadEventHandlers
import ai.platon.pulsar.skeleton.crawl.event.impl.PageEventHandlersFactory
import ai.platon.pulsar.skeleton.session.PulsarSession
import org.h2.jdbc.JdbcSQLException
import java.sql.Connection
import java.sql.ResultSet
import java.time.Instant
import java.util.*
import kotlin.system.measureTimeMillis
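
/**
 * Load event handlers that wire a scrape request into the page loading pipeline:
 * they mark the response as in progress, flag the page as a scrape task, and
 * delegate extraction and completion back to the owning [XSQLScrapeHyperlink].
 */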
class ScrapeLoadEventHandlers(
    val hyperlink: XSQLScrapeHyperlink,
    val response: ScrapeResponse,
) : DefaultLoadEventHandlers() {
    init {
        onWillLoad.addLast {
            // Mark the response as being processed as soon as loading starts
            response.pageStatusCode = ResourceStatus.SC_PROCESSING
            null
        }
        onWillParseHTMLDocument.addLast { page ->
            // Flag the page as a scrape task so later handlers can verify it
            page.variables[VAR_IS_SCRAPE] = true
            null
        }
        onHTMLDocumentParsed.addLast { page, document ->
            require(page.hasVar(VAR_IS_SCRAPE))
            hyperlink.extract(page, document)
        }
        onLoaded.addLast { page ->
            hyperlink.complete(page)
        }
    }
}
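
/**
 * A completable, listenable hyperlink that carries an X-SQL scrape request.
 * Once the target page is loaded and parsed, the attached event handlers run
 * the configured X-SQL query against it and record the result in [response].
 */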
open class XSQLScrapeHyperlink(
    val request: ScrapeRequest,
    val sql: NormXSQL,
    val session: PulsarSession,
    val globalCacheFactory: GlobalCacheFactory,
    val uuid: String = UUID.randomUUID().toString()
) : CompletableListenableHyperlink<ScrapeResponse>(sql.url) {

    private val logger = getLogger(XSQLScrapeHyperlink::class)

    private val sqlContext get() = session.context as AbstractSQLContext
    private val connectionPool get() = sqlContext.connectionPool
    private val randomConnection get() = sqlContext.randomConnection

    val response = ScrapeResponse()

    override var args: String? = "-parse ${sql.args}"
    override var event: PageEventHandlers =
        PageEventHandlersFactory().create(loadEventHandlers = ScrapeLoadEventHandlers(this, response))
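
    /**
     * Execute the X-SQL query bound to this hyperlink.
     */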
    open fun executeQuery(): ResultSet = executeQuery(request, response)
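
    /**
     * Record the page status in [response], then run the X-SQL extraction over
     * the parsed [document]. Unexpected exceptions are logged and rethrown.
     */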
    open fun extract(page: WebPage, document: FeaturedDocument) {
        try {
            response.pageContentBytes = page.contentLength.toInt()
            response.pageStatusCode = page.protocolStatus.minorCode
            doExtract(page, document)
        } catch (t: Throwable) {
            // Log the exception and rethrow it so the caller can handle the failure
            logger.warn("Unexpected exception", t)
            throw t
        }
    }
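
    /**
     * Mark the scrape as finished and complete this hyperlink with [response].
     */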
    open fun complete(page: WebPage) {
        response.uuid = uuid
        response.isDone = true
        response.finishTime = Instant.now()

        complete(response)
    }
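
    /**
     * Run the query only if the page was fetched successfully and has content,
     * otherwise report SC_NO_CONTENT and return an empty result set.
     */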
    protected open fun doExtract(page: WebPage, document: FeaturedDocument): ResultSet {
        if (!page.protocolStatus.isSuccess ||
            page.contentLength == 0L || page.persistedContentLength == 0L ||
            page.content == null
        ) {
            response.statusCode = ResourceStatus.SC_NO_CONTENT
            return ResultSets.newSimpleResultSet()
        }

        return executeQuery(request, response)
    }
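
    /**
     * Execute the X-SQL statement and record its status and result in [response].
     */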
    protected open fun executeQuery(request: ScrapeRequest, response: ScrapeResponse): ResultSet {
        var rs: ResultSet = ResultSets.newSimpleResultSet()

        try {
            response.statusCode = ResourceStatus.SC_OK

            rs = executeQuery(sql.sql)
            val resultSet = mutableListOf<Map<String, Any?>>()