All downloads are free. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.skeleton.session.AbstractPulsarSession.kt Maven / Gradle / Ivy

package ai.platon.pulsar.skeleton.session

import ai.platon.pulsar.boilerpipe.extractors.ArticleExtractor
import ai.platon.pulsar.boilerpipe.sax.SAXInput
import ai.platon.pulsar.common.AppFiles
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.AppPaths.WEB_CACHE_DIR
import ai.platon.pulsar.common.config.VolatileConfig
import ai.platon.pulsar.common.extractor.TextDocument
import ai.platon.pulsar.common.urls.PlainUrl
import ai.platon.pulsar.common.urls.UrlAware
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.common.warnForClose
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.dom.select.firstTextOrNull
import ai.platon.pulsar.dom.select.selectFirstOrNull
import ai.platon.pulsar.external.ModelResponse
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.skeleton.common.IllegalApplicationStateException
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.common.urls.NormURL
import ai.platon.pulsar.skeleton.context.support.AbstractPulsarContext
import ai.platon.pulsar.skeleton.crawl.PageEvent
import ai.platon.pulsar.skeleton.crawl.PageEventHandlers
import ai.platon.pulsar.skeleton.crawl.common.FetchEntry
import ai.platon.pulsar.skeleton.crawl.common.url.ListenableHyperlink
import org.jsoup.nodes.Element
import org.slf4j.LoggerFactory
import org.xml.sax.InputSource
import java.io.StringReader
import java.nio.ByteBuffer
import java.nio.file.Path
import java.time.Instant
import java.util.concurrent.CompletableFuture
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.atomic.AtomicInteger
import java.util.concurrent.atomic.AtomicLong

/**
 * Created by vincent on 18-1-17.
 * Copyright @ 2013-2023 Platon AI. All rights reserved
 */
abstract class AbstractPulsarSession(
    /**
     * The pulsar context
     * */
    override val context: AbstractPulsarContext,
    /**
     * The session scope volatile config, every setting is supposed to be changed at any time and any place
     * */
    override val sessionConfig: VolatileConfig,
    /**
     * The session id. Session id is expected to be set by the container, e.g. the h2 database runtime
     * */
    override val id: Int
) : PulsarSession {
    
    companion object {
        /** Size of the session id range. */
        const val ID_CAPACITY = 1_000_000

        /** Base value added to the internal counter when generating ids. */
        const val ID_START = 1_000_000

        /** Upper bound of the id range, i.e. `ID_START + ID_CAPACITY - 1`. */
        const val ID_END = ID_START + ID_CAPACITY - 1

        // Shared counter behind generateNextId(); starts at zero.
        private val idGen = AtomicInteger(0)

        /** Counts pages served from the page cache, across all sessions. */
        val pageCacheHits = AtomicLong(0L)

        /** Counts documents served from the document cache, across all sessions. */
        val documentCacheHits = AtomicLong(0L)

        /**
         * Returns the next session id: the incremented counter offset by [ID_START].
         *
         * NOTE(review): the result is not checked against [ID_END].
         */
        fun generateNextId(): Int = idGen.incrementAndGet() + ID_START
    }
    
    private val logger = LoggerFactory.getLogger(AbstractPulsarSession::class.java)

    /** The context's unmodified configuration. */
    override val unmodifiedConfig get() = context.unmodifiedConfig

    /** Short display name of the session: its id rendered as a string. */
    override val display get() = "$id"

    // Flipped to true exactly once by close().
    private val closed = AtomicBoolean()

    /** True while this session has not been closed and the context is still active. */
    val isActive get() = !closed.get() && context.isActive

    // Session-scoped name -> value store behind data(name) / data(name, value).
    private val dataCache = ConcurrentHashMap<String, Any>()

    // When false, load()/loadDeferred() bypass the page cache and go straight to the context.
    private var enablePDCache = true
    override val globalCache get() = context.globalCache
    override val pageCache get() = globalCache.pageCache
    override val documentCache get() = globalCache.documentCache

    // Null-returning accessors: each yields null once the session is no longer active.
    private val contextOrNull get() = if (isActive) context else null
    private val globalCacheFactoryOrNull get() = contextOrNull?.globalCacheFactory
    private val pageCacheOrNull get() = globalCacheFactoryOrNull?.globalCache?.pageCache
    private val documentCacheOrNull get() = globalCacheFactoryOrNull?.globalCache?.documentCache

    // Resources registered via registerClosable(); closed together with the session.
    private val closableObjects = mutableSetOf<AutoCloseable>()
    
    /**
     * Registers [closable] so that it is closed when this session is closed.
     *
     * @return the result of adding [closable] to the internal set
     * @throws IllegalApplicationStateException if the session is not active
     */
    fun registerClosable(closable: AutoCloseable): Boolean {
        return ensureActive { closableObjects.add(closable) }
    }
    
    /** Turns off the page cache shortcut used by `load` and `loadDeferred` for this session. */
    override fun disablePDCache() {
        enablePDCache = false
    }
    
    /** Parses [args] into load options without event handlers. */
    override fun options(args: String) = options(args, null)

    /**
     * Parses [args] into load options backed by a volatile copy of the session config.
     *
     * @param event handlers to attach as the raw event; left untouched when null
     */
    override fun options(args: String, event: PageEventHandlers?): LoadOptions =
        LoadOptions.parse(args, sessionConfig.toVolatileConfig()).also { parsed ->
            event?.let { parsed.rawEvent = it }
        }
    
    /** Looks [name] up in the session config first, then in the unmodified config. */
    override fun property(name: String): String? = sessionConfig[name] ?: unmodifiedConfig[name]

    /** Stores [value] under [name] in the session config. */
    override fun property(name: String, value: String) {
        sessionConfig.set(name, value)
    }
    
    // --- normalize: single url given as a string ---

    /** Normalizes [url] with empty load arguments. */
    override fun normalize(url: String) = normalize(url, "")

    override fun normalize(url: String, args: String, toItemOption: Boolean) =
        context.normalize(url, options(args), toItemOption)

    override fun normalize(url: String, options: LoadOptions, toItemOption: Boolean) =
        context.normalize(url, options, toItemOption)

    override fun normalizeOrNull(url: String?, options: LoadOptions, toItemOption: Boolean) =
        context.normalizeOrNull(url, options, toItemOption)

    // --- normalize: many urls given as strings ---
    // The element type was lost from the scraped source; String matches the
    // submitAll(Iterable) overloads, which build PlainUrl(it) from each element.

    override fun normalize(urls: Iterable<String>) = normalize(urls, options(), false)

    override fun normalize(urls: Iterable<String>, args: String, toItemOption: Boolean) =
        normalize(urls, options(args), toItemOption)

    override fun normalize(urls: Iterable<String>, options: LoadOptions, toItemOption: Boolean) =
        context.normalize(urls, options, toItemOption)

    // --- normalize: single UrlAware ---

    override fun normalize(url: UrlAware) = normalize(url, options())

    override fun normalize(url: UrlAware, args: String, toItemOption: Boolean) =
        normalize(url, options(args), toItemOption)

    override fun normalize(url: UrlAware, options: LoadOptions, toItemOption: Boolean) =
        context.normalize(url, options, toItemOption)

    override fun normalizeOrNull(url: UrlAware?, options: LoadOptions, toItemOption: Boolean) =
        context.normalizeOrNull(url, options, toItemOption)

    // --- normalize: many UrlAware ---

    override fun normalize(urls: Collection<UrlAware>) = normalize(urls, options(), false)

    override fun normalize(urls: Collection<UrlAware>, args: String, toItemOption: Boolean) =
        normalize(urls, options(args), toItemOption)

    override fun normalize(urls: Collection<UrlAware>, options: LoadOptions, toItemOption: Boolean) =
        context.normalize(urls, options, toItemOption)
    
    /**
     * Gets the page for [url] from the context.
     *
     * @throws IllegalApplicationStateException if the session is not active
     */
    override fun get(url: String): WebPage = ensureActive { context.get(url) }
    
    /**
     * Gets the page for [url] from the context, passing [fields] through.
     *
     * @throws IllegalApplicationStateException if the session is not active
     */
    override fun get(url: String, vararg fields: String): WebPage = ensureActive { context.get(url, *fields) }
    
    /** Like [get], but yields null instead of throwing when the session is not active. */
    override fun getOrNull(url: String): WebPage? = contextOrNull?.getOrNull(url)
    
    /** Like [get] with [fields], but yields null instead of throwing when the session is not active. */
    override fun getOrNull(url: String, vararg fields: String): WebPage? = contextOrNull?.getOrNull(url, *fields)
    
    /** Returns the content the context holds for [url]; null when the session is not active. */
    override fun getContent(url: String): ByteBuffer? = contextOrNull?.getContent(url)
    
    /** Returns the content the context holds for [url] as a string; null when the session is not active. */
    override fun getContentAsString(url: String): String? = contextOrNull?.getContentAsString(url)
    
    /**
     * Asks the context whether [url] exists.
     *
     * @throws IllegalApplicationStateException if the session is not active
     */
    override fun exists(url: String): Boolean = ensureActive { context.exists(url) }
    
    /** Delegates to the context; note that no active-session check is made here. */
    override fun fetchState(page: WebPage, options: LoadOptions) = context.fetchState(page, options)
    
    /** Loads [url] with the `-refresh` argument. */
    override fun open(url: String): WebPage = load(url, "-refresh")
    
    /** Loads [url] with the `-refresh` argument and [event] attached to the options. */
    override fun open(url: String, event: PageEvent): WebPage = load(url, options("-refresh", event))
    
    /** Loads [url] with default options. */
    override fun load(url: String): WebPage = load(url, options())
    
    /** Loads [url] with options parsed from [args]. */
    override fun load(url: String, args: String): WebPage = load(url, options(args))
    
    /** Normalizes [url] with [options] and loads the result. */
    override fun load(url: String, options: LoadOptions): WebPage = load(normalize(url, options))
    
    /** Normalizes [url] with default options and loads the result. */
    override fun load(url: UrlAware): WebPage = load(normalize(url, options()))
    
    /** Normalizes [url] with options parsed from [args] and loads the result. */
    override fun load(url: UrlAware, args: String): WebPage = load(normalize(url, options(args)))
    
    /** Normalizes [url] with [options] and loads the result. */
    override fun load(url: UrlAware, options: LoadOptions): WebPage = load(normalize(url, options))
    
    /**
     * Loads a normalized url.
     *
     * With the page cache enabled, a page rebuilt from the cache is preferred; otherwise the page
     * is loaded through the context and then stored in the page cache. With the cache disabled,
     * the context is called directly.
     */
    override fun load(normURL: NormURL): WebPage = when {
        enablePDCache -> createPageWithCachedCoreOrNull(normURL) ?: loadAndCache(normURL)
        else -> context.load(normURL)
    }
    
    /** Suspending load of [url] with options parsed from [args]. */
    override suspend fun loadDeferred(url: String, args: String) = loadDeferred(normalize(url, options(args)))
    
    /** Suspending load of [url] with [options]. */
    override suspend fun loadDeferred(url: String, options: LoadOptions) = loadDeferred(normalize(url, options))
    
    /** Suspending load of [url] with options parsed from [args]. */
    override suspend fun loadDeferred(url: UrlAware, args: String): WebPage =
        loadDeferred(normalize(url, options(args)))
    
    /** Suspending load of [url] with [options]. */
    override suspend fun loadDeferred(url: UrlAware, options: LoadOptions): WebPage =
        loadDeferred(normalize(url, options))
    
    /**
     * Suspending counterpart of `load(NormURL)`.
     *
     * With the page cache enabled, a page rebuilt from the cache is preferred; otherwise the page
     * is loaded through the context and then stored in the page cache. With the cache disabled,
     * the context is called directly.
     */
    override suspend fun loadDeferred(url: NormURL): WebPage = when {
        enablePDCache -> createPageWithCachedCoreOrNull(url) ?: loadAndCacheDeferred(url)
        else -> context.loadDeferred(url)
    }
    
    // Type arguments below were missing from the scraped source and are restored so that the
    // overloads are well-formed and distinguishable.

    /** Loads every url in [urls] with default options. */
    override fun loadAll(urls: Iterable<String>) = loadAll(urls, options())

    /** Loads every url in [urls] with options parsed from [args]. */
    override fun loadAll(urls: Iterable<String>, args: String) = loadAll(urls, options(args))

    /** Normalizes [urls] with [options] and loads the results. */
    override fun loadAll(urls: Iterable<String>, options: LoadOptions) = loadAll(normalize(urls, options))

    /** Loads every url in [urls] with default options. */
    override fun loadAll(urls: Collection<UrlAware>) = loadAll(urls, options())

    /** Loads every url in [urls] with options parsed from [args]. */
    override fun loadAll(urls: Collection<UrlAware>, args: String) = loadAll(urls, options(args))

    /** Normalizes [urls] with [options] and loads the results. */
    override fun loadAll(urls: Collection<UrlAware>, options: LoadOptions) = loadAll(normalize(urls, options))

    /** Hands the already-normalized urls to the context. */
    override fun loadAll(normUrls: List<NormURL>) = context.loadAll(normUrls)
    
    /** Normalizes [url] and starts an asynchronous load. */
    override fun loadAsync(url: String) = loadAsync(normalize(url))
    
    /** Normalizes [url] with [args] and starts an asynchronous load. */
    override fun loadAsync(url: String, args: String) = loadAsync(normalize(url, args))
    
    /** Normalizes [url] with [options] and starts an asynchronous load. */
    override fun loadAsync(url: String, options: LoadOptions) = loadAsync(normalize(url, options))
    
    /** Normalizes [url] and starts an asynchronous load. */
    override fun loadAsync(url: UrlAware) = loadAsync(normalize(url))
    
    /** Normalizes [url] with [args] and starts an asynchronous load. */
    override fun loadAsync(url: UrlAware, args: String) = loadAsync(normalize(url, args))
    
    /** Normalizes [url] with [options] and starts an asynchronous load. */
    override fun loadAsync(url: UrlAware, options: LoadOptions) = loadAsync(normalize(url, options))
    
    /** Delegates to the context; unlike `load(NormURL)`, the session page cache is not consulted here. */
    override fun loadAsync(url: NormURL) = context.loadAsync(url)
    
    // Type arguments below were missing from the scraped source and are restored so that the
    // overloads are well-formed and distinguishable.

    /** Normalizes [urls] and starts asynchronous loads for them. */
    override fun loadAllAsync(urls: Iterable<String>) = loadAllAsync(normalize(urls))

    /** Normalizes [urls] with [args] and starts asynchronous loads for them. */
    override fun loadAllAsync(urls: Iterable<String>, args: String) = loadAllAsync(normalize(urls, args))

    /** Normalizes [urls] with [options] and starts asynchronous loads for them. */
    override fun loadAllAsync(urls: Iterable<String>, options: LoadOptions) = loadAllAsync(normalize(urls, options))

    /** Normalizes [urls] and starts asynchronous loads for them. */
    override fun loadAllAsync(urls: Collection<UrlAware>) = loadAllAsync(normalize(urls))

    /** Normalizes [urls] with [args] and starts asynchronous loads for them. */
    override fun loadAllAsync(urls: Collection<UrlAware>, args: String) = loadAllAsync(normalize(urls, args))

    /** Normalizes [urls] with [options] and starts asynchronous loads for them. */
    override fun loadAllAsync(urls: Collection<UrlAware>, options: LoadOptions) = loadAllAsync(normalize(urls, options))

    /** Hands the already-normalized urls to the context. */
    override fun loadAllAsync(urls: List<NormURL>) = context.loadAllAsync(urls)
    
    /** Submits [url] wrapped in a [PlainUrl]. */
    override fun submit(url: String) = submit(PlainUrl(url))

    /** Submits [url] wrapped in a [PlainUrl] carrying [args]. */
    override fun submit(url: String, args: String) = submit(PlainUrl(url, args))

    /** Submits [url] as a [ListenableHyperlink] carrying the stringified [options] and their event. */
    override fun submit(url: String, options: LoadOptions) =
        submit(ListenableHyperlink(url, args = options.toString(), event = options.event))

    /** Submits [url] without extra arguments. */
    override fun submit(url: UrlAware) = submit(url, "")

    /**
     * Merges [args] into the url's own arguments (mutating [url]), hands the url to the context,
     * and returns this session.
     */
    override fun submit(url: UrlAware, args: String) = apply {
        url.args = LoadOptions.normalize(url.args, args)
        context.submit(url)
    }
    
    // Type arguments below were missing from the scraped source. String is required by
    // PlainUrl(it); UrlAware is required by the `it.args` assignment in the last overload.

    /** Submits every url in [urls] wrapped in a [PlainUrl]. */
    override fun submitAll(urls: Iterable<String>) = submitAll(urls.map { PlainUrl(it) })

    /** Submits every url in [urls] wrapped in a [PlainUrl] carrying [args]. */
    override fun submitAll(urls: Iterable<String>, args: String) = submitAll(urls.map { PlainUrl(it, args) })

    /** Submits every url in [urls] as a [ListenableHyperlink] carrying the stringified [options] and their event. */
    override fun submitAll(urls: Iterable<String>, options: LoadOptions) =
        submitAll(urls.map { ListenableHyperlink(it, args = options.toString(), event = options.event) })

    /** Hands [urls] to the context and returns this session. */
    override fun submitAll(urls: Collection<UrlAware>) = also { context.submitAll(urls) }

    /** Merges [args] into each url's own arguments (mutating the urls), then hands them to the context. */
    override fun submitAll(urls: Collection<UrlAware>, args: String) =
        also { context.submitAll(urls.onEach { it.args = LoadOptions.normalize(it.args, args) }) }
    
    // Out-page entry points: each overload converts its arguments and forwards to one of the
    // private helpers loadOutPages0 / submitForOutPages0 / loadOutPagesAsync0.

    override fun loadOutPages(portalUrl: String, args: String) = loadOutPages(portalUrl, options(args))
    
    override fun loadOutPages(portalUrl: String, options: LoadOptions) = loadOutPages(PlainUrl(portalUrl), options)
    
    override fun loadOutPages(portalUrl: UrlAware, args: String) = loadOutPages(portalUrl, options(args))
    
    override fun loadOutPages(portalUrl: UrlAware, options: LoadOptions) = loadOutPages0(portalUrl, options)
    
    override fun submitForOutPages(portalUrl: String, args: String) = submitForOutPages(portalUrl, options(args))
    
    override fun submitForOutPages(portalUrl: String, options: LoadOptions) =
        submitForOutPages(PlainUrl(portalUrl), options)
    
    override fun submitForOutPages(portalUrl: UrlAware, args: String) = submitForOutPages(portalUrl, options(args))
    
    override fun submitForOutPages(portalUrl: UrlAware, options: LoadOptions) = submitForOutPages0(portalUrl, options)
    
    override fun loadOutPagesAsync(portalUrl: String, args: String) = loadOutPagesAsync(portalUrl, options(args))
    
    override fun loadOutPagesAsync(portalUrl: String, options: LoadOptions) = loadOutPagesAsync0(portalUrl, options)
    
    /** Loads [url] as a resource with default options. */
    override fun loadResource(url: String, referrer: String) = loadResource(url, referrer, options())
    
    /** Loads [url] as a resource with options parsed from [args]. */
    override fun loadResource(url: String, referrer: String, args: String) = loadResource(url, referrer, options(args))
    
    /**
     * Loads [url] as a resource.
     *
     * Note: the passed-in [options] instance itself is modified (`isResource` and `referrer` are set on it).
     */
    override fun loadResource(url: String, referrer: String, options: LoadOptions) =
        load(url, options.apply { isResource = true }.also { it.referrer = referrer })
    
    /** Suspending variant of [loadResource] with default options. */
    override suspend fun loadResourceDeferred(url: String, referrer: String) =
        loadResourceDeferred(url, referrer, options())
    
    /** Suspending variant of [loadResource] with options parsed from [args]. */
    override suspend fun loadResourceDeferred(url: String, referrer: String, args: String) =
        loadResourceDeferred(url, referrer, options(args))
    
    /**
     * Suspending variant of [loadResource].
     *
     * Note: the passed-in [options] instance itself is modified (`isResource` and `referrer` are set on it).
     */
    override suspend fun loadResourceDeferred(url: String, referrer: String, options: LoadOptions) =
        loadDeferred(url, options.apply { isResource = true }.also { it.referrer = referrer })
    
    /** Parses [page], allowing the document cache to be used. */
    override fun parse(page: WebPage) = parse0(page, false)
    
    /** Parses [page]; when [noCache] is true the document cache is skipped. */
    override fun parse(page: WebPage, noCache: Boolean) = parse0(page, noCache)
    
    // loadDocument: load the page with the matching load(...) overload, then parse it.

    override fun loadDocument(url: String) = parse(load(url))
    
    override fun loadDocument(url: String, args: String) = parse(load(url, args))
    
    override fun loadDocument(url: String, options: LoadOptions) = parse(load(url, options))
    
    override fun loadDocument(url: UrlAware) = parse(load(url))
    
    override fun loadDocument(url: UrlAware, args: String) = parse(load(url, args))
    
    override fun loadDocument(url: UrlAware, options: LoadOptions) = parse(load(url, options))
    
    override fun loadDocument(url: NormURL) = parse(load(url))
    
    // Type arguments below were missing from the scraped source (leaving fragments such as
    // `List>`); they are restored from what the bodies produce: selector/name -> nullable text.

    /** Scrapes [url] using options parsed from [args]; see the [LoadOptions] overload. */
    override fun scrape(url: String, args: String, fieldSelectors: Iterable<String>): Map<String, String?> =
        scrape(url, options(args), fieldSelectors)

    /**
     * Loads and parses [url], then maps each selector to the text of its first match
     * (null when nothing matches).
     */
    override fun scrape(url: String, options: LoadOptions, fieldSelectors: Iterable<String>): Map<String, String?> {
        val document = loadDocument(url, options)
        return fieldSelectors.associateWith { document.selectFirstOrNull(it)?.text() }
    }

    /** Scrapes [url] using options parsed from [args]; see the [LoadOptions] overload. */
    override fun scrape(url: String, args: String, fieldSelectors: Map<String, String>): Map<String, String?> =
        scrape(url, options(args), fieldSelectors)

    /**
     * Loads and parses [url], then maps each key of [fieldSelectors] to the text of the first
     * match of its selector (null when nothing matches).
     */
    override fun scrape(url: String, options: LoadOptions, fieldSelectors: Map<String, String>): Map<String, String?> {
        val document = loadDocument(url, options)
        return fieldSelectors.entries.associate { it.key to document.selectFirstOrNull(it.value)?.text() }
    }

    /** Scrapes [url] using options parsed from [args]; see the [LoadOptions] overload. */
    override fun scrape(
        url: String, args: String, restrictSelector: String, fieldSelectors: Iterable<String>
    ): List<Map<String, String?>> = scrape(url, options(args), restrictSelector, fieldSelectors)

    /**
     * Loads and parses [url]; for every element matching [restrictSelector], maps each selector
     * to the text of its first match inside that element.
     */
    override fun scrape(
        url: String, options: LoadOptions, restrictSelector: String, fieldSelectors: Iterable<String>
    ): List<Map<String, String?>> {
        return loadDocument(url, options).select(restrictSelector).map { ele ->
            fieldSelectors.associateWith { ele.selectFirstOrNull(it)?.text() }
        }
    }

    /** Scrapes [url] using options parsed from [args]; see the [LoadOptions] overload. */
    override fun scrape(
        url: String, args: String, restrictSelector: String, fieldSelectors: Map<String, String>
    ): List<Map<String, String?>> = scrape(url, options(args), restrictSelector, fieldSelectors)

    /**
     * Loads and parses [url]; for every element matching [restrictSelector], maps each key of
     * [fieldSelectors] to the text of the first match of its selector inside that element.
     */
    override fun scrape(
        url: String, options: LoadOptions, restrictSelector: String, fieldSelectors: Map<String, String>
    ): List<Map<String, String?>> {
        return loadDocument(url, options).select(restrictSelector).map { ele ->
            fieldSelectors.entries.associate { it.key to ele.selectFirstOrNull(it.value)?.text() }
        }
    }
    
    // Type arguments below were missing from the scraped source and are restored. The `args`
    // overloads used to repeat the pipeline of the `options` overloads; since
    // loadOutPages(portalUrl, args) is loadOutPages(portalUrl, options(args)), they now delegate.

    /** Scrapes the out pages of [portalUrl], restricted to each page's `:root`. */
    override fun scrapeOutPages(portalUrl: String, args: String, fieldSelectors: Iterable<String>) =
        scrapeOutPages(portalUrl, args, ":root", fieldSelectors)

    /** Scrapes the out pages of [portalUrl], restricted to each page's `:root`. */
    override fun scrapeOutPages(portalUrl: String, options: LoadOptions, fieldSelectors: Iterable<String>) =
        scrapeOutPages(portalUrl, options, ":root", fieldSelectors)

    /** Same as the [LoadOptions] overload, with options parsed from [args]. */
    override fun scrapeOutPages(
        portalUrl: String, args: String, restrictSelector: String, fieldSelectors: Iterable<String>
    ): List<Map<String, String?>> = scrapeOutPages(portalUrl, options(args), restrictSelector, fieldSelectors)

    /**
     * Loads the out pages of [portalUrl], parses each one, and for every page that has an element
     * matching [restrictSelector] maps each selector to the first text found under that element.
     * Pages without a matching element are skipped.
     */
    override fun scrapeOutPages(
        portalUrl: String, options: LoadOptions, restrictSelector: String, fieldSelectors: Iterable<String>
    ): List<Map<String, String?>> {
        return loadOutPages(portalUrl, options).asSequence().map { parse(it) }
            .mapNotNull { it.selectFirstOrNull(restrictSelector) }
            .map { ele -> fieldSelectors.associateWith { ele.firstTextOrNull(it) } }
            .toList()
    }

    /** Scrapes the out pages of [portalUrl], restricted to each page's `:root`. */
    override fun scrapeOutPages(portalUrl: String, args: String, fieldSelectors: Map<String, String>) =
        scrapeOutPages(portalUrl, args, ":root", fieldSelectors)

    /** Scrapes the out pages of [portalUrl], restricted to each page's `:root`. */
    override fun scrapeOutPages(
        portalUrl: String, options: LoadOptions, fieldSelectors: Map<String, String>
    ): List<Map<String, String?>> = scrapeOutPages(portalUrl, options, ":root", fieldSelectors)

    /** Same as the [LoadOptions] overload, with options parsed from [args]. */
    override fun scrapeOutPages(
        portalUrl: String, args: String, restrictSelector: String, fieldSelectors: Map<String, String>
    ): List<Map<String, String?>> = scrapeOutPages(portalUrl, options(args), restrictSelector, fieldSelectors)

    /**
     * Loads the out pages of [portalUrl], parses each one, and for every page that has an element
     * matching [restrictSelector] maps each key of [fieldSelectors] to the first text found for
     * its selector under that element. Pages without a matching element are skipped.
     */
    override fun scrapeOutPages(
        portalUrl: String, options: LoadOptions, restrictSelector: String, fieldSelectors: Map<String, String>
    ): List<Map<String, String?>> {
        return loadOutPages(portalUrl, options).asSequence().map { parse(it) }
            .mapNotNull { it.selectFirstOrNull(restrictSelector) }
            .map { ele -> fieldSelectors.entries.associate { it.key to ele.firstTextOrNull(it.value) } }
            .toList()
    }
    
    /** Loads [url] with [args] and extracts its text with [engine]. */
    override fun harvest(url: String, args: String, engine: String): TextDocument = harvest(load(url, args), engine)
    
    /**
     * Extracts the text of [page] with [engine].
     *
     * @throws IllegalArgumentException if [engine] is anything other than "boilerpipe"
     */
    override fun harvest(page: WebPage, engine: String): TextDocument = harvest0(page, engine)
    
    /**
     * Sends [prompt] to the model configured by the session properties `llm.name` and `llm.apiKey`.
     *
     * @throws IllegalArgumentException if either property is missing from the session config
     */
    override fun chat(prompt: String): ModelResponse {
        val modelName = requireNotNull(sessionConfig["llm.name"]) { "No LLM model name specified." }
        val key = requireNotNull(sessionConfig["llm.apiKey"]) { "No LLM API key specified." }
        return context.chat(prompt, modelName, key)
    }

    /** Chats with [prompt] followed by a blank line and the given [context] text. */
    override fun chat(context: String, prompt: String): ModelResponse {
        // TODO: config the template to generate the final prompt
        return chat("$prompt\n\n$context")
    }

    /** Chats with [prompt] followed by a blank line and the page content as a string. */
    override fun chat(page: WebPage, prompt: String): ModelResponse =
        chat("$prompt\n\n${page.contentAsString}")

    /** Chats with [prompt] followed by a blank line and the document text. */
    override fun chat(document: FeaturedDocument, prompt: String): ModelResponse =
        chat("$prompt\n\n${document.text}")

    /** Chats with [prompt] followed by a blank line and the element text. */
    override fun chat(element: Element, prompt: String): ModelResponse =
        chat("$prompt\n\n${element.text()}")
    
    /** Returns the session-scoped value stored under [name], or null if there is none. */
    override fun data(name: String): Any? = dataCache[name]

    /** Stores [value] under [name] in the session-scoped data cache. */
    override fun data(name: String, value: Any) {
        dataCache[name] = value
    }
    
    // Each of the following throws IllegalApplicationStateException when the session is not active.

    /** Asks the context to delete [url]. */
    override fun delete(url: String) = ensureActive { context.delete(url) }
    
    /** Flushes the context's web database. */
    override fun flush() = ensureActive { context.webDb.flush() }
    
    /** Puts [page] into the context's web database. */
    override fun persist(page: WebPage) = ensureActive { context.webDb.put(page) }
    
    /** Exports [page] with an empty [ident]. */
    override fun export(page: WebPage) = export(page, "")

    /**
     * Saves the page content to `WEB_CACHE_DIR/export/<ident>/<name derived from the page url>.htm`.
     *
     * @return the value returned by [AppFiles.saveTo]
     */
    override fun export(page: WebPage, ident: String): Path {
        val fileName = AppPaths.fromUri(page.url, "", ".htm")
        val exportDir = WEB_CACHE_DIR.resolve("export").resolve(ident)
        return AppFiles.saveTo(page.contentAsString, exportDir.resolve(fileName), true)
    }

    /** Saves the page content to [path]. */
    override fun exportTo(page: WebPage, path: Path): Path = AppFiles.saveTo(page.contentAsString, path, true)

    /** Exports [doc] with an empty [ident]. */
    override fun export(doc: FeaturedDocument) = export(doc, "")

    /**
     * Saves the document's outer HTML to `WEB_CACHE_DIR/export/<ident>/<name derived from the base URI>.htm`.
     *
     * @return the value returned by [AppFiles.saveTo]
     */
    override fun export(doc: FeaturedDocument, ident: String): Path {
        val fileName = AppPaths.fromUri(doc.baseURI, "", ".htm")
        val exportDir = WEB_CACHE_DIR.resolve("export").resolve(ident)
        return AppFiles.saveTo(doc.outerHtml, exportDir.resolve(fileName), true)
    }

    /** Saves the document's outer HTML, as bytes, to [path]. */
    override fun exportTo(doc: FeaturedDocument, path: Path): Path {
        val bytes = doc.outerHtml.toByteArray()
        return AppFiles.saveTo(bytes, path, true)
    }
    
    /** Two sessions are equal when they are the same instance or share the same id. */
    override fun equals(other: Any?): Boolean = when {
        other === this -> true
        other is PulsarSession -> other.id == id
        else -> false
    }

    /** The session id doubles as the hash code, consistent with [equals]. */
    override fun hashCode(): Int {
        return id
    }

    /** Renders the session as `#<id>`. */
    override fun toString(): String {
        return "#$id"
    }
    
    /**
     * Closes the session once: every registered closable is closed (failures are reported through
     * [warnForClose] and do not stop the loop), the registry is cleared, and the event is logged.
     * Subsequent calls do nothing.
     */
    override fun close() {
        // Only the first caller to flip the flag performs the cleanup.
        if (!closed.compareAndSet(false, true)) {
            return
        }

        for (resource in closableObjects) {
            runCatching { resource.close() }.onFailure { cause -> warnForClose(this, cause) }
        }
        closableObjects.clear()

        logger.info("PulsarSession is closed | #{} | {}#{}", display, this.javaClass.name, hashCode())
    }
    
    /**
     * Parses [page] into a document.
     *
     * A nil page yields [FeaturedDocument.NIL]. Unless [noCache] is set, a document cached under
     * the page url is returned (and counted as a cache hit) before falling back to the context
     * parser. A null parser result is mapped to [FeaturedDocument.NIL].
     */
    private fun parse0(page: WebPage, noCache: Boolean = false): FeaturedDocument {
        if (page.isNil) {
            return FeaturedDocument.NIL
        }

        if (!noCache) {
            documentCacheOrNull?.getDatum(page.url)?.let { cached ->
                documentCacheHits.incrementAndGet()
                return cached
            }
        }

        return context.parse(page) ?: FeaturedDocument.NIL
    }
    
    /** Harvests the text of [page] from its url and string content. */
    private fun harvest0(page: WebPage, engine: String) = harvest0(page.url, page.contentAsString, engine)

    /** Wraps [html] in an [InputSource] and harvests it. */
    private fun harvest0(url: String, html: String, engine: String): TextDocument {
        val source = InputSource(StringReader(html))
        return harvest0(url, source, engine)
    }

    /**
     * Runs the boilerpipe article extractor over [inputSource].
     *
     * @return a [TextDocument] carrying the extracted titles, text and (if any) fields, or a
     *   [TextDocument] holding only [url] when the extractor reports failure
     * @throws IllegalArgumentException if [engine] is not "boilerpipe"
     */
    private fun harvest0(url: String, inputSource: InputSource, engine: String): TextDocument {
        require(engine == "boilerpipe") { "Unsupported engine: $engine" }

        val parsed = SAXInput().parse(url, inputSource)
        if (!ArticleExtractor().process(parsed)) {
            return TextDocument(url)
        }

        // An empty field map is passed on as null.
        val extraFields = parsed.fields.takeIf { it.isNotEmpty() }
        return TextDocument(
            url,
            pageTitle = parsed.pageTitle,
            contentTitle = parsed.contentTitle,
            textContent = parsed.textContent,
            additionalFields = extraFields
        )
    }
    
    // Empty placeholder; nothing in this class calls it.
    private fun chat0() {
    
    }
    
    /**
     * Loads [normURL] through the context and stores the result in the page cache under the
     * page's url. Nothing is cached when the page cache is unavailable.
     */
    private fun loadAndCache(normURL: NormURL): WebPage {
        val loaded = context.load(normURL)
        pageCacheOrNull?.putDatum(loaded.url, loaded)
        return loaded
    }

    /** Suspending counterpart of [loadAndCache]. */
    private suspend fun loadAndCacheDeferred(normURL: NormURL): WebPage {
        val loaded = context.loadDeferred(normURL)
        pageCacheOrNull?.putDatum(loaded.url, loaded)
        return loaded
    }
    
    /**
     * Builds a fresh page shell that shares the core of a cached page, without its metadata.
     *
     * The cache is bypassed (null is returned) when the load is not read-only, since it might
     * modify the page status, or when event handlers are attached, since the page then has to
     * be rendered in the browser again.
     *
     * TODO: handle the session cache and the FetchComponent cache
     *
     * @return the cache-backed page, or null when the cache must not or cannot be used
     * */
    private fun createPageWithCachedCoreOrNull(normURL: NormURL): WebPage? {
        // Not read-only, or events to handle: do not use the cached version.
        if (!normURL.options.readonly || normURL.options.rawEvent != null) {
            return null
        }

        val cached = getCachedPageOrNull(normURL)
        val shell = FetchEntry.createPageShell(normURL)
        if (cached == null) {
            return null
        }

        // The cached page may or may not have been persisted. A page served from the cache keeps
        // its content unchanged and should not be persisted to the database.
        shell.unsafeSetGPage(cached.unbox())
        shell.isCached = true
        shell.tmpContent = cached.tmpContent
        shell.args = normURL.args
        return shell
    }
    
    /**
     * Looks [normURL] up in the page cache.
     *
     * @return the cached page, or null when a refresh is requested, the cache is unavailable or
     *   has no entry, or the entry is expired according to the load options
     */
    private fun getCachedPageOrNull(normURL: NormURL): WebPage? {
        val (spec, opts) = normURL

        // A refresh never takes the cached version.
        if (opts.refresh) {
            return null
        }

        val now = Instant.now()
        val cached = pageCacheOrNull?.getDatum(spec, opts.expires, now) ?: return null
        if (opts.isExpired(cached.prevFetchTime)) {
            return null
        }

        pageCacheHits.incrementAndGet()
        return cached
    }
    
    /**
     * Extracts the absolute `href` of [ele] as a link to follow.
     *
     * @param normalize when true, the link is replaced by its normalized spec
     * @param ignoreQuery when true, url parameters are removed
     * @return the link cut before its last `#`, or null when the href does not start with
     *   "http" or normalization yields nothing
     */
    private fun parseNormalizedLink(ele: Element, normalize: Boolean = false, ignoreQuery: Boolean = false): String? {
        val href = ele.attr("abs:href")
        if (!href.startsWith("http")) {
            return null
        }

        val normalized = when {
            normalize -> normalizeOrNull(href)?.spec ?: return null
            else -> href
        }

        val withoutQuery = if (ignoreQuery) UrlUtils.getUrlWithoutParameters(normalized) else normalized
        return withoutQuery.substringBeforeLast("#")
    }
    
    /**
     * Loads the portal page, collects the links matched by the out-link selector, and loads them.
     *
     * Links are de-duplicated, limited to `topLinks`, and loaded with the item options derived
     * from the portal options.
     *
     * @return the loaded out pages, or an empty list when no out-link selector is configured
     * @throws IllegalArgumentException if the event handlers are not preserved by normalization
     *   or by the item options
     */
    private fun loadOutPages0(portalUrl: UrlAware, options: LoadOptions): List<WebPage> {
        val normURL = normalize(portalUrl, options)
        val opts = normURL.options

        val selector = opts.outLinkSelectorOrNull ?: return listOf()
        val itemOpts = normURL.options.createItemOptions()

        // Sanity checks: the portal and item event handlers must survive the conversions above.
        require(normURL.options.rawEvent == options.rawEvent) {
            "The normalized options must keep the raw event of the given options"
        }
        require(options.rawItemEvent == itemOpts.rawEvent) {
            "The item options must carry the raw item event of the given options"
        }

        val links = loadDocument(normURL)
            .select(selector) { parseNormalizedLink(it, !opts.noNorm, opts.ignoreUrlQuery) }
            .mapNotNullTo(mutableSetOf()) { it }
            .take(opts.topLinks)

        return loadAll(links, itemOpts)
    }
    
    /**
     * Loads the portal page, collects the links matched by the out-link selector, and submits
     * them as [ListenableHyperlink]s carrying the item options and, if present, their raw event.
     *
     * @return this session; nothing is submitted when no out-link selector is configured
     */
    private fun submitForOutPages0(portalUrl: UrlAware, options: LoadOptions): AbstractPulsarSession {
        val normURL = normalize(portalUrl, options)
        val opts = normURL.options
        val selector = opts.outLinkSelectorOrNull ?: return this
        val itemOpts = normURL.options.createItemOptions()

        // De-duplicate through a set before applying the topLinks limit.
        val uniqueLinks = loadDocument(normURL)
            .select(selector) { parseNormalizedLink(it, !opts.noNorm, opts.ignoreUrlQuery) }
            .mapNotNullTo(mutableSetOf()) { it }

        val hyperlinks = uniqueLinks.take(opts.topLinks).map { spec ->
            ListenableHyperlink("$spec $itemOpts")
        }
        for (hyperlink in hyperlinks) {
            itemOpts.rawEvent?.let { hyperlink.event = it }
        }

        submitAll(hyperlinks)

        return this
    }
    
    /**
     * Loads the portal page, collects the links matched by the out-link selector, and starts
     * asynchronous loads for them using the item options derived from the portal options.
     *
     * @return one future per out link, or an empty list when no out-link selector is configured
     */
    private fun loadOutPagesAsync0(portalUrl: String, options: LoadOptions): List<CompletableFuture<WebPage>> {
        val normURL = normalize(portalUrl, options)
        val opts = normURL.options
        val selector = opts.outLinkSelectorOrNull ?: return listOf()
        // Created only after the selector guard, matching loadOutPages0 and submitForOutPages0.
        val itemOpts = normURL.options.createItemOptions()

        val outLinks = loadDocument(normURL)
            .select(selector) { parseNormalizedLink(it, !opts.noNorm, opts.ignoreUrlQuery) }
            .mapNotNullTo(mutableSetOf()) { it }
            .take(opts.topLinks)
            .map { NormURL(it, itemOpts) }

        return loadAllAsync(outLinks)
    }
    
    /**
     * Runs [action] and returns its result, provided the session is active.
     *
     * @throws IllegalApplicationStateException if the session is not active
     */
    private fun <T> ensureActive(action: () -> T): T =
        if (isActive) action() else throw IllegalApplicationStateException("Pulsar session is not alive")

    /**
     * Runs [action] and returns its result if the session is active; otherwise returns
     * [defaultValue] without running [action].
     *
     * The previous `takeIf`/elvis form fell through to [action] whenever [defaultValue] was null,
     * so the action could run on an inactive session.
     */
    private fun <T> ensureActive(defaultValue: T, action: () -> T): T =
        if (isActive) action() else defaultValue
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy