// Source: ai.platon.pulsar.skeleton.session.AbstractPulsarSession.kt
// (captured from a Maven / Gradle / Ivy artifact listing; newest published version)
package ai.platon.pulsar.skeleton.session
import ai.platon.pulsar.boilerpipe.extractors.ArticleExtractor
import ai.platon.pulsar.boilerpipe.sax.SAXInput
import ai.platon.pulsar.common.AppFiles
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.AppPaths.WEB_CACHE_DIR
import ai.platon.pulsar.common.config.VolatileConfig
import ai.platon.pulsar.common.extractor.TextDocument
import ai.platon.pulsar.common.urls.PlainUrl
import ai.platon.pulsar.common.urls.UrlAware
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.common.warnForClose
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.dom.select.firstTextOrNull
import ai.platon.pulsar.dom.select.selectFirstOrNull
import ai.platon.pulsar.external.ModelResponse
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.skeleton.common.IllegalApplicationStateException
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.common.urls.NormURL
import ai.platon.pulsar.skeleton.context.support.AbstractPulsarContext
import ai.platon.pulsar.skeleton.crawl.PageEvent
import ai.platon.pulsar.skeleton.crawl.PageEventHandlers
import ai.platon.pulsar.skeleton.crawl.common.FetchEntry
import ai.platon.pulsar.skeleton.crawl.common.url.ListenableHyperlink
import org.jsoup.nodes.Element
import org.slf4j.LoggerFactory
import org.xml.sax.InputSource
import java.io.StringReader
import java.nio.ByteBuffer
import java.nio.file.Path
import java.time.Instant
import java.util.concurrent.CompletableFuture
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.atomic.AtomicInteger
import java.util.concurrent.atomic.AtomicLong
/**
* Created by vincent on 18-1-17.
* Copyright @ 2013-2023 Platon AI. All rights reserved
*/
abstract class AbstractPulsarSession(
/**
* The pulsar context
* */
override val context: AbstractPulsarContext,
/**
* The session scope volatile config, every setting is supposed to be changed at any time and any place
* */
override val sessionConfig: VolatileConfig,
/**
* The session id. Session id is expected to be set by the container, e.g. the h2 database runtime
* */
override val id: Int
) : PulsarSession {
companion object {
    /** Size of the id range `[ID_START, ID_END]`. */
    const val ID_CAPACITY = 1_000_000

    /** Offset that [generateNextId] adds to its internal counter. */
    const val ID_START = 1_000_000

    /** Upper bound of the id range. [generateNextId] does not check its result against this value. */
    const val ID_END = ID_START + (ID_CAPACITY - 1)

    /** Process-wide counter behind [generateNextId]. */
    private val idGen = AtomicInteger(0)

    /** Shared page-cache hit counter; it is not updated anywhere in the companion itself. */
    val pageCacheHits = AtomicLong(0L)

    /** Shared document-cache hit counter; it is not updated anywhere in the companion itself. */
    val documentCacheHits = AtomicLong(0L)

    /**
     * Returns the next id: [ID_START] plus the incremented counter.
     * The first value handed out is therefore `ID_START + 1`.
     */
    fun generateNextId(): Int = idGen.incrementAndGet() + ID_START
}
// Logger bound to AbstractPulsarSession itself, not to the concrete subclass.
private val logger = LoggerFactory.getLogger(AbstractPulsarSession::class.java)
// Read from the context on every access.
override val unmodifiedConfig get() = context.unmodifiedConfig
// Display name of the session: the session id rendered as a string.
override val display get() = "$id"
// Closed flag consulted by isActive; the code that sets it is outside this excerpt.
private val closed = AtomicBoolean()
// The session is active only while it is not closed AND its context is still active.
val isActive get() = !closed.get() && context.isActive
// NOTE(review): `ConcurrentHashMap()` has no type arguments and no expected type to infer them from;
// they appear to have been lost from this copy of the file - restore them from the original source.
private val dataCache = ConcurrentHashMap()
// When false, load()/loadDeferred() bypass the cache-first path and go straight to the context.
// Cleared by disablePDCache(). "PD" presumably stands for page/document - TODO confirm.
private var enablePDCache = true
// Cache accessors that delegate to the context unconditionally (no isActive check).
override val globalCache get() = context.globalCache
override val pageCache get() = globalCache.pageCache
override val documentCache get() = globalCache.documentCache
// Null-safe variants: each yields null once the session is no longer active.
private val contextOrNull get() = if (isActive) context else null
private val globalCacheFactoryOrNull get() = contextOrNull?.globalCacheFactory
private val pageCacheOrNull get() = globalCacheFactoryOrNull?.globalCache?.pageCache
private val documentCacheOrNull get() = globalCacheFactoryOrNull?.globalCache?.documentCache
/**
 * Objects handed to [registerClosable].
 *
 * The element type is spelled out because a bare `mutableSetOf()` gives the compiler nothing to infer it from
 * and does not compile. `AutoCloseable` is the type [registerClosable] adds to this set.
 */
private val closableObjects = mutableSetOf<AutoCloseable>()

/**
 * Adds [closable] to the set of objects tracked by this session.
 *
 * The insertion runs inside `ensureActive`, so its result is whatever `ensureActive` returns for the
 * `Set.add` call. Presumably the tracked objects are closed together with the session; that code is
 * not part of this excerpt.
 */
fun registerClosable(closable: AutoCloseable) = ensureActive { closableObjects.add(closable) }
/**
 * Turns off the cache-first path: once called, `load(NormURL)` and `loadDeferred(NormURL)`
 * delegate directly to the context.
 */
override fun disablePDCache() {
    enablePDCache = false
}
/** Parses [args] into load options without attaching an event handler. */
override fun options(args: String) =
    options(args, null)

/**
 * Parses [args] into [LoadOptions], using a volatile config derived from the session config.
 *
 * @param args the load arguments to parse
 * @param event optional handlers; when non-null they are stored in the options' `rawEvent`
 * @return the parsed options
 */
override fun options(args: String, event: PageEventHandlers?): LoadOptions =
    LoadOptions.parse(args, sessionConfig.toVolatileConfig()).also { parsed ->
        if (event != null) {
            parsed.rawEvent = event
        }
    }
/**
 * Looks up [name] in the session config first and falls back to the unmodified config.
 *
 * @return the configured value, or whatever the unmodified config yields when the session config has none
 */
override fun property(name: String): String? {
    val sessionValue = sessionConfig[name]
    return sessionValue ?: unmodifiedConfig[name]
}

/** Stores [value] under [name] in the session config; the unmodified config is not touched. */
override fun property(name: String, value: String) {
    sessionConfig[name] = value
}
/** Normalizes [url] with empty load arguments. */
override fun normalize(url: String) =
    normalize(url, "")

/** Parses [args] into load options, then lets the context normalize [url] with them. */
override fun normalize(url: String, args: String, toItemOption: Boolean) =
    options(args).let { parsed -> context.normalize(url, parsed, toItemOption) }

/** Delegates to the context; [options] and [toItemOption] are passed through unchanged. */
override fun normalize(url: String, options: LoadOptions, toItemOption: Boolean) =
    context.normalize(
        url,
        options,
        toItemOption
    )

/** Nullable-input variant; delegates to the context's `normalizeOrNull`. */
override fun normalizeOrNull(url: String?, options: LoadOptions, toItemOption: Boolean) =
    context.normalizeOrNull(
        url,
        options,
        toItemOption
    )
/**
 * Normalizes every URL in [urls] with the session's default options and `toItemOption = false`.
 *
 * The element type is declared explicitly: Kotlin has no raw types, so a bare `Iterable` does not compile.
 * NOTE(review): `String` mirrors the single-URL `normalize(url: String)` overloads; confirm against the
 * `PulsarSession` interface.
 */
override fun normalize(urls: Iterable<String>) = normalize(urls, options(), false)

/** Parses [args] into load options and normalizes every URL in [urls] with them. */
override fun normalize(urls: Iterable<String>, args: String, toItemOption: Boolean) =
    normalize(urls, options(args), toItemOption)

/** Delegates to the context; [options] and [toItemOption] are passed through unchanged. */
override fun normalize(urls: Iterable<String>, options: LoadOptions, toItemOption: Boolean) =
    context.normalize(urls, options, toItemOption)
/** Normalizes [url] with the session's default load options. */
override fun normalize(url: UrlAware) =
    normalize(url, options())

/** Parses [args] into load options and normalizes [url] with them. */
override fun normalize(url: UrlAware, args: String, toItemOption: Boolean) =
    options(args).let { parsed -> normalize(url, parsed, toItemOption) }

/** Delegates to the context; [options] and [toItemOption] are passed through unchanged. */
override fun normalize(url: UrlAware, options: LoadOptions, toItemOption: Boolean) =
    context.normalize(
        url,
        options,
        toItemOption
    )

/** Nullable-input variant; delegates to the context's `normalizeOrNull`. */
override fun normalizeOrNull(url: UrlAware?, options: LoadOptions, toItemOption: Boolean) =
    context.normalizeOrNull(
        url,
        options,
        toItemOption
    )
/**
 * Normalizes every URL in [urls] with the session's default options and `toItemOption = false`.
 *
 * The element type is declared explicitly: Kotlin has no raw types, so a bare `Collection` does not compile.
 * NOTE(review): `UrlAware` mirrors the single-URL `normalize(url: UrlAware)` overloads; confirm against the
 * `PulsarSession` interface.
 */
override fun normalize(urls: Collection<UrlAware>) = normalize(urls, options(), false)

/** Parses [args] into load options and normalizes every URL in [urls] with them. */
override fun normalize(urls: Collection<UrlAware>, args: String, toItemOption: Boolean) =
    normalize(urls, options(args), toItemOption)

/** Delegates to the context; [options] and [toItemOption] are passed through unchanged. */
override fun normalize(urls: Collection<UrlAware>, options: LoadOptions, toItemOption: Boolean) =
    context.normalize(urls, options, toItemOption)
/** Fetches the page for [url] from the context, inside `ensureActive`. */
override fun get(url: String): WebPage {
    return ensureActive { context.get(url) }
}

/** Same as `get(url)`, forwarding the requested [fields] to the context. */
override fun get(url: String, vararg fields: String): WebPage {
    return ensureActive { context.get(url, *fields) }
}

/** Returns the context's `getOrNull(url)`, or null when the session is no longer active. */
override fun getOrNull(url: String): WebPage? {
    val activeContext = contextOrNull ?: return null
    return activeContext.getOrNull(url)
}

/** Field-restricted variant of `getOrNull(url)`; null when the session is no longer active. */
override fun getOrNull(url: String, vararg fields: String): WebPage? {
    val activeContext = contextOrNull ?: return null
    return activeContext.getOrNull(url, *fields)
}
/** Returns the context's content for [url] as a buffer, or null when the session is no longer active. */
override fun getContent(url: String): ByteBuffer? {
    val activeContext = contextOrNull ?: return null
    return activeContext.getContent(url)
}

/** Returns the context's content for [url] as text, or null when the session is no longer active. */
override fun getContentAsString(url: String): String? {
    val activeContext = contextOrNull ?: return null
    return activeContext.getContentAsString(url)
}
/** Asks the context whether a page for [url] exists; the check runs inside `ensureActive`. */
override fun exists(url: String): Boolean {
    return ensureActive { context.exists(url) }
}
/**
 * Returns the context's fetch state for [page] under [options].
 * The call goes straight to the context and is not wrapped in `ensureActive`.
 */
override fun fetchState(page: WebPage, options: LoadOptions) =
    context.fetchState(page, options)
/** Loads [url] with the `-refresh` argument. */
override fun open(url: String): WebPage {
    return load(url, "-refresh")
}

/** Loads [url] with the `-refresh` argument and [event] attached to the load options. */
override fun open(url: String, event: PageEvent): WebPage {
    val refreshOptions = options("-refresh", event)
    return load(url, refreshOptions)
}
/** Loads [url] with the session's default load options. */
override fun load(url: String): WebPage {
    return load(url, options())
}

/** Loads [url] with options parsed from [args]. */
override fun load(url: String, args: String): WebPage {
    return load(url, options(args))
}

/** Normalizes [url] with [options] and loads the result. */
override fun load(url: String, options: LoadOptions): WebPage {
    return load(normalize(url, options))
}

/** Loads [url] with the session's default load options. */
override fun load(url: UrlAware): WebPage {
    return load(normalize(url, options()))
}

/** Loads [url] with options parsed from [args]. */
override fun load(url: UrlAware, args: String): WebPage {
    return load(normalize(url, options(args)))
}

/** Normalizes [url] with [options] and loads the result. */
override fun load(url: UrlAware, options: LoadOptions): WebPage {
    return load(normalize(url, options))
}

/**
 * Loads the page for an already normalized URL.
 *
 * With the cache path enabled, a page built by `createPageWithCachedCoreOrNull` is returned when available,
 * otherwise `loadAndCache` is used. With the cache path disabled the context loads the page directly.
 */
override fun load(normURL: NormURL): WebPage =
    if (enablePDCache) {
        createPageWithCachedCoreOrNull(normURL) ?: loadAndCache(normURL)
    } else {
        context.load(normURL)
    }
/** Suspending load of [url] with options parsed from [args]. */
override suspend fun loadDeferred(url: String, args: String) =
    options(args).let { parsed -> loadDeferred(normalize(url, parsed)) }

/** Suspending load of [url]: normalizes it with [options] first. */
override suspend fun loadDeferred(url: String, options: LoadOptions) =
    loadDeferred(
        normalize(url, options)
    )

/** Suspending load of [url] with options parsed from [args]. */
override suspend fun loadDeferred(url: UrlAware, args: String): WebPage {
    return loadDeferred(normalize(url, options(args)))
}

/** Suspending load of [url]: normalizes it with [options] first. */
override suspend fun loadDeferred(url: UrlAware, options: LoadOptions): WebPage {
    return loadDeferred(normalize(url, options))
}

/**
 * Suspending load of an already normalized URL.
 *
 * With the cache path enabled, a page built by `createPageWithCachedCoreOrNull` is returned when available,
 * otherwise `loadAndCacheDeferred` is used. With the cache path disabled the context loads the page directly.
 */
override suspend fun loadDeferred(url: NormURL): WebPage =
    if (enablePDCache) {
        createPageWithCachedCoreOrNull(url) ?: loadAndCacheDeferred(url)
    } else {
        context.loadDeferred(url)
    }
/**
 * Loads every URL in [urls] with the session's default load options.
 *
 * Type arguments are declared explicitly throughout this overload family: Kotlin has no raw types, so a bare
 * `Iterable`, `Collection` or `List` does not compile.
 * NOTE(review): the element types mirror the single-URL `load` overloads (`String`, `UrlAware`, `NormURL`);
 * confirm against the `PulsarSession` interface.
 */
override fun loadAll(urls: Iterable<String>) = loadAll(urls, options())

/** Loads every URL in [urls] with options parsed from [args]. */
override fun loadAll(urls: Iterable<String>, args: String) = loadAll(urls, options(args))

/** Normalizes [urls] with [options] and loads the results. */
override fun loadAll(urls: Iterable<String>, options: LoadOptions) = loadAll(normalize(urls, options))

/** Loads every URL in [urls] with the session's default load options. */
override fun loadAll(urls: Collection<UrlAware>) = loadAll(urls, options())

/** Loads every URL in [urls] with options parsed from [args]. */
override fun loadAll(urls: Collection<UrlAware>, args: String) = loadAll(urls, options(args))

/** Normalizes [urls] with [options] and loads the results. */
override fun loadAll(urls: Collection<UrlAware>, options: LoadOptions) = loadAll(normalize(urls, options))

/** Hands the already normalized URLs to the context. This path does not go through the session's cache-first `load`. */
override fun loadAll(normUrls: List<NormURL>) = context.loadAll(normUrls)
/** Asynchronously loads [url] after normalizing it with empty load arguments. */
override fun loadAsync(url: String) =
    loadAsync(normalize(url))

/** Asynchronously loads [url] after normalizing it with [args]. */
override fun loadAsync(url: String, args: String) =
    loadAsync(normalize(url, args))

/** Asynchronously loads [url] after normalizing it with [options]. */
override fun loadAsync(url: String, options: LoadOptions) =
    loadAsync(normalize(url, options))

/** Asynchronously loads [url] after normalizing it with the session's default load options. */
override fun loadAsync(url: UrlAware) =
    loadAsync(normalize(url))

/** Asynchronously loads [url] after normalizing it with [args]. */
override fun loadAsync(url: UrlAware, args: String) =
    loadAsync(normalize(url, args))

/** Asynchronously loads [url] after normalizing it with [options]. */
override fun loadAsync(url: UrlAware, options: LoadOptions) =
    loadAsync(normalize(url, options))

/** Hands the already normalized URL to the context's asynchronous loader. */
override fun loadAsync(url: NormURL) =
    context.loadAsync(url)
/**
 * Asynchronously loads every URL in [urls] after normalizing them with the session's default load options.
 *
 * Type arguments are declared explicitly throughout this overload family: Kotlin has no raw types, so a bare
 * `Iterable`, `Collection` or `List` does not compile.
 * NOTE(review): the element types mirror the single-URL `loadAsync` overloads (`String`, `UrlAware`,
 * `NormURL`); confirm against the `PulsarSession` interface.
 */
override fun loadAllAsync(urls: Iterable<String>) = loadAllAsync(normalize(urls))

/** Asynchronously loads every URL in [urls] after normalizing them with [args]. */
override fun loadAllAsync(urls: Iterable<String>, args: String) = loadAllAsync(normalize(urls, args))

/** Asynchronously loads every URL in [urls] after normalizing them with [options]. */
override fun loadAllAsync(urls: Iterable<String>, options: LoadOptions) = loadAllAsync(normalize(urls, options))

/** Asynchronously loads every URL in [urls] after normalizing them with the session's default load options. */
override fun loadAllAsync(urls: Collection<UrlAware>) = loadAllAsync(normalize(urls))

/** Asynchronously loads every URL in [urls] after normalizing them with [args]. */
override fun loadAllAsync(urls: Collection<UrlAware>, args: String) = loadAllAsync(normalize(urls, args))

/** Asynchronously loads every URL in [urls] after normalizing them with [options]. */
override fun loadAllAsync(urls: Collection<UrlAware>, options: LoadOptions) = loadAllAsync(normalize(urls, options))

/** Hands the already normalized URLs to the context's asynchronous loader. */
override fun loadAllAsync(urls: List<NormURL>) = context.loadAllAsync(urls)
/** Wraps [url] in a [PlainUrl] and submits it. */
override fun submit(url: String) =
    submit(PlainUrl(url))

/** Wraps [url] and [args] in a [PlainUrl] and submits it. */
override fun submit(url: String, args: String) =
    submit(PlainUrl(url, args))

/**
 * Submits [url] as a [ListenableHyperlink] that carries the string form of [options] as its arguments
 * and the options' event.
 */
override fun submit(url: String, options: LoadOptions) =
    ListenableHyperlink(url, args = options.toString(), event = options.event).let { link -> submit(link) }

/** Submits [url] without additional arguments. */
override fun submit(url: UrlAware) =
    submit(url, "")

/**
 * Combines the URL's own arguments with [args] via `LoadOptions.normalize`, writes the result back into
 * [url], and submits it to the context.
 *
 * @return this session
 */
override fun submit(url: UrlAware, args: String) = also {
    url.args = LoadOptions.normalize(url.args, args)
    context.submit(url)
}
/**
 * Wraps every URL string in a [PlainUrl] and submits the batch.
 *
 * Type arguments are declared explicitly throughout this overload family: Kotlin has no raw types, so a bare
 * `Iterable` or `Collection` does not compile. `String` follows from the elements being passed to
 * `PlainUrl(...)` exactly like the `url: String` parameter of `submit`; `UrlAware` follows from the elements'
 * `args` property being rewritten exactly like in `submit(url: UrlAware, args)`.
 */
override fun submitAll(urls: Iterable<String>) = submitAll(urls.map { PlainUrl(it) })

/** Wraps every URL string together with [args] in a [PlainUrl] and submits the batch. */
override fun submitAll(urls: Iterable<String>, args: String) = submitAll(urls.map { PlainUrl(it, args) })

/**
 * Submits every URL string as a [ListenableHyperlink] that carries the string form of [options] as its
 * arguments and the options' event.
 */
override fun submitAll(urls: Iterable<String>, options: LoadOptions) =
    submitAll(urls.map { ListenableHyperlink(it, args = options.toString(), event = options.event) })

/**
 * Submits [urls] to the context unchanged.
 *
 * @return this session
 */
override fun submitAll(urls: Collection<UrlAware>) = also { context.submitAll(urls) }

/**
 * Combines each URL's own arguments with [args] via `LoadOptions.normalize`, writes the result back into
 * the URL, then submits the batch to the context.
 *
 * @return this session
 */
override fun submitAll(urls: Collection<UrlAware>, args: String) =
    also { context.submitAll(urls.onEach { it.args = LoadOptions.normalize(it.args, args) }) }
/** Loads the out pages of [portalUrl] with options parsed from [args]. */
override fun loadOutPages(portalUrl: String, args: String) =
    loadOutPages(portalUrl, options(args))

/** Wraps [portalUrl] in a [PlainUrl] and loads its out pages with [options]. */
override fun loadOutPages(portalUrl: String, options: LoadOptions) =
    loadOutPages(PlainUrl(portalUrl), options)

/** Loads the out pages of [portalUrl] with options parsed from [args]. */
override fun loadOutPages(portalUrl: UrlAware, args: String) =
    loadOutPages(portalUrl, options(args))

/** Terminal overload: the work is done by `loadOutPages0`. */
override fun loadOutPages(portalUrl: UrlAware, options: LoadOptions) =
    loadOutPages0(portalUrl, options)
/** Submits the out pages of [portalUrl] with options parsed from [args]. */
override fun submitForOutPages(portalUrl: String, args: String) =
    submitForOutPages(portalUrl, options(args))

/** Wraps [portalUrl] in a [PlainUrl] and submits its out pages with [options]. */
override fun submitForOutPages(portalUrl: String, options: LoadOptions) =
    PlainUrl(portalUrl).let { portal -> submitForOutPages(portal, options) }

/** Submits the out pages of [portalUrl] with options parsed from [args]. */
override fun submitForOutPages(portalUrl: UrlAware, args: String) =
    submitForOutPages(portalUrl, options(args))

/** Terminal overload: the work is done by `submitForOutPages0`. */
override fun submitForOutPages(portalUrl: UrlAware, options: LoadOptions) =
    submitForOutPages0(portalUrl, options)
/** Asynchronously loads the out pages of [portalUrl] with options parsed from [args]. */
override fun loadOutPagesAsync(portalUrl: String, args: String) =
    loadOutPagesAsync(portalUrl, options(args))

/** Terminal overload: the work is done by `loadOutPagesAsync0`. */
override fun loadOutPagesAsync(portalUrl: String, options: LoadOptions) =
    loadOutPagesAsync0(portalUrl, options)
/** Loads [url] as a resource with the session's default load options. */
override fun loadResource(url: String, referrer: String) =
    loadResource(url, referrer, options())

/** Loads [url] as a resource with options parsed from [args]. */
override fun loadResource(url: String, referrer: String, args: String) =
    loadResource(url, referrer, options(args))

/**
 * Marks [options] as a resource load, records [referrer] on it, and loads [url].
 * The passed [options] instance itself is modified.
 */
override fun loadResource(url: String, referrer: String, options: LoadOptions) =
    load(url, options.also { opts ->
        opts.isResource = true
        opts.referrer = referrer
    })
/** Suspending resource load of [url] with the session's default load options. */
override suspend fun loadResourceDeferred(url: String, referrer: String) =
    loadResourceDeferred(url, referrer, options())

/** Suspending resource load of [url] with options parsed from [args]. */
override suspend fun loadResourceDeferred(url: String, referrer: String, args: String) =
    loadResourceDeferred(url, referrer, options(args))

/**
 * Marks [options] as a resource load, records [referrer] on it, and loads [url] via `loadDeferred`.
 * The passed [options] instance itself is modified.
 */
override suspend fun loadResourceDeferred(url: String, referrer: String, options: LoadOptions) =
    loadDeferred(url, options.also { opts ->
        opts.isResource = true
        opts.referrer = referrer
    })
/** Parses [page] via `parse0`, passing `false` for its second (no-cache) argument. */
override fun parse(page: WebPage) =
    parse0(page, false)

/** Parses [page] via `parse0`, forwarding [noCache] unchanged. */
override fun parse(page: WebPage, noCache: Boolean) =
    parse0(page, noCache)
/** Loads [url] and parses the resulting page. */
override fun loadDocument(url: String) =
    load(url).let { page -> parse(page) }

/** Loads [url] with [args] and parses the resulting page. */
override fun loadDocument(url: String, args: String) =
    load(url, args).let { page -> parse(page) }

/** Loads [url] with [options] and parses the resulting page. */
override fun loadDocument(url: String, options: LoadOptions) =
    load(url, options).let { page -> parse(page) }

/** Loads [url] and parses the resulting page. */
override fun loadDocument(url: UrlAware) =
    load(url).let { page -> parse(page) }

/** Loads [url] with [args] and parses the resulting page. */
override fun loadDocument(url: UrlAware, args: String) =
    load(url, args).let { page -> parse(page) }

/** Loads [url] with [options] and parses the resulting page. */
override fun loadDocument(url: UrlAware, options: LoadOptions) =
    load(url, options).let { page -> parse(page) }

/** Loads the already normalized [url] and parses the resulting page. */
override fun loadDocument(url: NormURL) =
    load(url).let { page -> parse(page) }
/**
 * Loads [url] with options parsed from [args] and extracts one text value per selector.
 *
 * Type arguments are declared explicitly throughout this overload family: Kotlin has no raw types, so a bare
 * `Iterable` or `Map` does not compile. Selectors are strings because each one is handed to
 * `selectFirstOrNull`; values are `String?` because a selector without a match yields null.
 * NOTE(review): confirm the exact type arguments against the `PulsarSession` interface.
 */
override fun scrape(url: String, args: String, fieldSelectors: Iterable<String>): Map<String, String?> =
    scrape(url, options(args), fieldSelectors)

/**
 * Loads and parses [url], then maps every selector to the text of its first matching element.
 *
 * @return selector -> text of the first match, or null where nothing matches
 */
override fun scrape(url: String, options: LoadOptions, fieldSelectors: Iterable<String>): Map<String, String?> {
    val document = loadDocument(url, options)
    return fieldSelectors.associateWith { document.selectFirstOrNull(it)?.text() }
}

/**
 * Loads [url] with options parsed from [args] and extracts one text value per map entry of [fieldSelectors].
 * NOTE(review): the `String` key type of [fieldSelectors] is assumed; confirm against `PulsarSession`.
 */
override fun scrape(url: String, args: String, fieldSelectors: Map<String, String>): Map<String, String?> =
    scrape(url, options(args), fieldSelectors)

/**
 * Loads and parses [url], then maps every key of [fieldSelectors] to the text of the first element matched
 * by that entry's selector (the entry's value).
 *
 * @return key -> text of the first match, or null where nothing matches
 */
override fun scrape(url: String, options: LoadOptions, fieldSelectors: Map<String, String>): Map<String, String?> {
    val document = loadDocument(url, options)
    return fieldSelectors.entries.associate { it.key to document.selectFirstOrNull(it.value)?.text() }
}
override fun scrape(
url: String, args: String, restrictSelector: String, fieldSelectors: Iterable
): List