package ai.platon.pulsar.skeleton.crawl.component
import ai.platon.pulsar.common.CheckState
import ai.platon.pulsar.common.PulsarParams.VAR_FETCH_STATE
import ai.platon.pulsar.common.PulsarParams.VAR_PREV_FETCH_TIME_BEFORE_UPDATE
import ai.platon.pulsar.common.Strings
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.CapabilityTypes.*
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.measure.ByteUnitConverter
import ai.platon.pulsar.persist.WebDb
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.gora.generated.GWebPage
import ai.platon.pulsar.persist.model.ActiveDOMStat
import ai.platon.pulsar.skeleton.common.AppStatusTracker
import ai.platon.pulsar.skeleton.common.message.PageLoadStatusFormatter
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.common.persist.ext.loadEvent
import ai.platon.pulsar.skeleton.common.persist.ext.loadEventHandlers
import ai.platon.pulsar.skeleton.common.urls.NormURL
import ai.platon.pulsar.skeleton.crawl.GlobalEventHandlers
import ai.platon.pulsar.skeleton.crawl.common.FetchEntry
import ai.platon.pulsar.skeleton.crawl.common.FetchState
import ai.platon.pulsar.skeleton.crawl.common.GlobalCacheFactory
import ai.platon.pulsar.skeleton.crawl.common.url.CompletableHyperlink
import ai.platon.pulsar.skeleton.crawl.common.url.toCompletableListenableHyperlink
import ai.platon.pulsar.skeleton.crawl.parse.ParseResult
import kotlinx.coroutines.*
import org.slf4j.LoggerFactory
import java.net.URL
import java.time.Duration
import java.time.Instant
import java.util.concurrent.CompletableFuture
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.atomic.AtomicInteger
import java.util.concurrent.atomic.AtomicLong
/**
* Created by vincent on 17-7-15.
* Copyright @ 2013-2023 Platon AI. All rights reserved
*
* The load component is the core component of the Pulsar framework. It is responsible for loading pages from
* local storage or fetching them from the Internet.
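*
* A minimal usage sketch (hedged; the collaborators are assumed to be provided by the enclosing
* application context, and the [LoadOptions] value is constructed elsewhere):
*
* ```kotlin
* val component = LoadComponent(webDb, globalCacheFactory, fetchComponent, parseComponent, updateComponent, conf)
* val page = component.load(URL("https://example.com"), options)
* ```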
*/
class LoadComponent(
val webDb: WebDb,
val globalCacheFactory: GlobalCacheFactory,
val fetchComponent: BatchFetchComponent,
val parseComponent: ParseComponent,
val updateComponent: UpdateComponent,
val immutableConfig: ImmutableConfig,
val statusTracker: AppStatusTracker? = null,
) : AutoCloseable {
companion object {
private const val VAR_REFRESH = "refresh"
val pageCacheHits = AtomicLong()
val dbGetCount = AtomicLong()
var IGNORED_PAGE_FIELDS = setOf(
GWebPage.Field.PAGE_MODEL,
)
var LAZY_PAGE_FIELDS = setOf(
GWebPage.Field.PAGE_MODEL,
GWebPage.Field.CONTENT
)
var PAGE_FIELDS = GWebPage.Field.entries.toSet() - LAZY_PAGE_FIELDS
}
private val logger = LoggerFactory.getLogger(LoadComponent::class.java)
private val taskLogger = LoggerFactory.getLogger(LoadComponent::class.java.name + ".Task")
private val tracer = logger.takeIf { it.isTraceEnabled }
private val loadStrategy = immutableConfig.get(LOAD_STRATEGY, "SIMPLE")
private val deactivateFetchComponent1 = immutableConfig.getBoolean(LOAD_DEACTIVATE_FETCH_COMPONENT, false)
@Deprecated("Use LOAD_DEACTIVATE_FETCH_COMPONENT instead")
private val deactivateFetchComponent2 = immutableConfig.getBoolean(LOAD_DISABLE_FETCH, false)
/**
* Deactivate the fetch component, ensuring that all pages are loaded exclusively from storage
* and never fetched from the Internet.
*
* If a page is not found in the local storage, return WebPage.NIL.
* */
private val deactivateFetchComponent = deactivateFetchComponent1 || deactivateFetchComponent2
val globalCache get() = globalCacheFactory.globalCache
val pageCache get() = globalCache.pageCache
val documentCache get() = globalCache.documentCache
private val coreMetrics get() = fetchComponent.coreMetrics
private val closed = AtomicBoolean()
private val isActive get() = !closed.get()
@Volatile
private var numWrite = 0
private val abnormalPage get() = WebPage.NIL.takeIf { !isActive }
private var reportCount = 0
private val batchTaskCount = AtomicInteger()
/**
* Retrieve the fetch state of a page, which determines whether the page should be fetched from the Internet.
*
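* For example (a hedged sketch using [FetchState.refreshCodes], which this class also consults):
* ```kotlin
* val state = loadComponent.fetchState(page, options)
* if (state.code in FetchState.refreshCodes) {
*     // the page is new, expired, or incomplete and should be (re-)fetched
* }
* ```
*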
* @param page The page to be fetched
* @param options The load options
* @return The fetch state
* */
fun fetchState(page: WebPage, options: LoadOptions): CheckState {
val protocolStatus = page.protocolStatus
return when {
closed.get() -> CheckState(FetchState.DO_NOT_FETCH, "closed")
page.isNil -> CheckState(FetchState.NEW_PAGE, "nil")
page.isInternal -> CheckState(FetchState.DO_NOT_FETCH, "internal")
protocolStatus.isNotFetched -> CheckState(FetchState.NEW_PAGE, "not fetched")
protocolStatus.isTempMoved -> CheckState(FetchState.TEMP_MOVED, "temp moved")
else -> getFetchStateForExistPage(page, options)
}
}
/**
* Load a page specified by [url] with the given arguments.
*
* This method initially verifies the presence of the page in the local store. If the page exists and meets the
* specified requirements, it returns the local version. Otherwise, it fetches the page from the Internet.
*
* Other fetch conditions can be specified by load arguments:
*
* 1. expiration
* 2. page size requirement
* 3. fields requirement
* 4. other
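*
* For example (a hedged sketch; the `LoadOptions.parse` call and the argument syntax are assumptions
* based on common load arguments, not taken from this file):
* ```kotlin
* // Re-fetch if the local copy is older than one day, and require at least 100 KiB of content
* val options = LoadOptions.parse("-expires 1d -requireSize 102400", conf)
* val page = loadComponent.load(URL("https://example.com"), options)
* ```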
*
* @param url The url of the page
* @param options The load options
* @return The page
* */
@Throws(Exception::class)
fun load(url: URL, options: LoadOptions): WebPage {
return abnormalPage ?: loadWithRetry(NormURL(url, options))
}
/**
* Load a page specified by [normURL].
*
* This method initially verifies the presence of the page in the local store. If the page exists and meets the
* specified requirements, it returns the local version. Otherwise, it fetches the page from the Internet.
*
* Other fetch conditions can be specified by load arguments:
*
* 1. expiration
* 2. page size requirement
* 3. fields requirement
* 4. other
*
* @param normURL The normalized url of the page
* @return The page
* */
@Throws(Exception::class)
fun load(normURL: NormURL): WebPage {
return abnormalPage ?: loadWithRetry(normURL)
}
/**
* Load a page specified by [normURL].
*
* This method initially verifies the presence of the page in the local store. If the page exists and meets the
* specified requirements, it returns the local version. Otherwise, it fetches the page from the Internet.
*
* This method is a coroutine version of [load].
*
* Other fetch conditions can be specified by load arguments:
*
* 1. expiration
* 2. page size requirement
* 3. fields requirement
* 4. other
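*
* A hedged coroutine sketch:
* ```kotlin
* val page = runBlocking { loadComponent.loadDeferred(normURL) }
* ```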
*
* @param normURL The normalized url of the page
* @return The page
* */
@Throws(Exception::class)
suspend fun loadDeferred(normURL: NormURL): WebPage {
return abnormalPage ?: loadWithRetryDeferred(normURL)
}
@Throws(Exception::class)
fun loadWithRetry(normURL: NormURL): WebPage {
if (normURL.isNil) {
return WebPage.NIL
}
var page = load0(normURL)
var n = normURL.options.nJitRetry
while (page.protocolStatus.isRetry && n-- > 0) {
page = load0(normURL)
}
return page
}
@Throws(Exception::class)
suspend fun loadWithRetryDeferred(normURL: NormURL): WebPage {
if (normURL.isNil) {
return WebPage.NIL
}
var page = loadDeferred0(normURL)
var n = normURL.options.nJitRetry
while (page.protocolStatus.isRetry && n-- > 0) {
page = loadDeferred0(normURL)
}
return page
}
/**
* Load all pages specified by [normUrls], wait until all pages are loaded or timeout
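*
* For example (a hedged sketch; the urls are assumed to be normalized elsewhere):
* ```kotlin
* val pages = loadComponent.loadAll(normUrls)
* pages.forEach { page -> println("${page.protocolStatus} <- ${page.url}") }
* ```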
* */
fun loadAll(normUrls: Iterable<NormURL>): List<WebPage> {
if (!normUrls.iterator().hasNext()) {
return listOf()
}
val batchId = batchTaskCount.incrementAndGet()
val futures = loadAllAsync(normUrls.filter { !it.isNil })
logger.info("Waiting for {} completable links | #{}", futures.size, batchId)
val future = CompletableFuture.allOf(*futures.toTypedArray())
future.join()
val pages = futures.mapNotNull { it.get().takeIf { it.isNotNil } }
logger.info("Finished {}/{} pages | #{}", pages.size, futures.size, batchId)
return pages
}
/**
* Load a page specified by [normURL]
*
* @param normURL The normalized url
* @return A completable future of webpage
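*
* For example (a hedged sketch):
* ```kotlin
* loadComponent.loadAsync(normURL).thenAccept { page -> println(page.url) }
* ```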
* */
fun loadAsync(normURL: NormURL): CompletableFuture<WebPage> {
val link = normURL.toCompletableListenableHyperlink()
globalCache.urlPool.add(link)
return link
}
/**
* Load all pages specified by [normUrls], wait until all pages are loaded or timeout
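*
* A hedged sketch that mirrors what [loadAll] does internally:
* ```kotlin
* val futures = loadComponent.loadAllAsync(normUrls)
* CompletableFuture.allOf(*futures.toTypedArray()).join()
* val pages = futures.map { it.get() }.filter { it.isNotNil }
* ```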
* */
fun loadAllAsync(normUrls: Iterable<NormURL>): List<CompletableFuture<WebPage>> {
if (!normUrls.iterator().hasNext()) {
return listOf()
}
val linkFutures = normUrls.asSequence().filter { !it.isNil }.distinctBy { it.spec }
.map { it.toCompletableListenableHyperlink() }
.toList()
globalCache.urlPool.addAll(linkFutures)
return linkFutures
}
/**
* Load a webpage from local storage, or if it doesn't exist in local storage,
* fetch it from the Internet, unless the fetch component is disabled.
* */
@Throws(Exception::class)
private fun load0(normURL: NormURL): WebPage {
val page = createPageShell(normURL)
if (deactivateFetchComponent && shouldFetch(page)) {
return WebPage.NIL
}
return load1(normURL, page)
}
@Throws(Exception::class)
private fun load1(normURL: NormURL, page: WebPage): WebPage {
onWillLoad(normURL, page)
fetchContentIfNecessary(normURL, page)
onLoaded(page, normURL)
return page
}
@Throws(Exception::class)
private suspend fun loadDeferred0(normURL: NormURL): WebPage {
val page = createPageShell(normURL)
if (deactivateFetchComponent && shouldFetch(page)) {
return WebPage.NIL
}
return loadDeferred1(normURL, page)
}
@Throws(Exception::class)
private suspend fun loadDeferred1(normURL: NormURL, page: WebPage): WebPage {
onWillLoad(normURL, page)
fetchContentIfNecessaryDeferred(normURL, page)
onLoaded(page, normURL)
return page
}
@Throws(Exception::class)
private fun fetchContentIfNecessary(normURL: NormURL, page: WebPage) {
if (page.isInternal) {
return
}
if (page.removeVar(VAR_REFRESH) != null) {
fetchContent(page, normURL)
}
}
@Throws(Exception::class)
private suspend fun fetchContentIfNecessaryDeferred(normURL: NormURL, page: WebPage) {
if (page.removeVar(VAR_REFRESH) != null) {
fetchContentDeferred(page, normURL)
}
}
/**
* Create a page shell; the page shell is the processing unit for most tasks.
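*
* A hedged configuration note: when LOAD_STRATEGY is set to "PARTIAL_LAZY", this method loads only
* [PAGE_FIELDS] eagerly and defers the content and page model fields to [LazyFieldLoader].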
* */
private fun createPageShell(normURL: NormURL): WebPage {
val cachedPage = getCachedPageOrNull(normURL)
var page = FetchEntry.createPageShell(normURL)
if (cachedPage != null) {
pageCacheHits.incrementAndGet()
page.isCached = true
// the cached page may or may not have been persisted; there is no guarantee
// if a page is loaded from the cache, its content remains unchanged and should not be persisted to the database
// TODO: clone the underlying data or not?
page.unsafeCloneGPage(cachedPage)
page.clearPersistContent()
page.tmpContent = cachedPage.content
// TODO: test the dirty flag
// do not persist this copy
page.unbox().clearDirty()
assert(!page.isFetched)
assert(page.isNotInternal)
} else {
// Get the metadata of the page from the database; this is very fast for a crawler.
// Load the page content and page model lazily: if we loaded them on every request,
// the underlying storage might crash under the stress.
val loadedPage = when (loadStrategy) {
"PARTIAL_LAZY" -> {
webDb.getOrNull(normURL.spec, fields = PAGE_FIELDS)?.also {
it.setLazyFieldLoader(LazyFieldLoader(normURL.spec, webDb))
}
}
else -> {
webDb.getOrNull(normURL.spec)
}
}
dbGetCount.incrementAndGet()
if (loadedPage != null) {
// override the old variables: args, href, etc
FetchEntry.initWebPage(loadedPage, normURL.options, normURL.hrefSpec, normURL.referrer)
page = loadedPage
}
initFetchState(normURL, page, loadedPage)
}
return page
}
private fun initFetchState(normURL: NormURL, page: WebPage, loadedPage: WebPage?): CheckState {
val options = normURL.options
val state = when {
loadedPage == null -> CheckState(FetchState.NEW_PAGE, "nil 1")
loadedPage.isNil -> CheckState(FetchState.NEW_PAGE, "nil 2")
loadedPage.isInternal -> CheckState(FetchState.DO_NOT_FETCH, "internal 1")
else -> fetchState(page, options)
}
page.setVar(VAR_FETCH_STATE, state)
val refresh = state.code in FetchState.refreshCodes
if (refresh) {
page.setVar(VAR_REFRESH, state)
}
return state
}
private fun onWillLoad(normURL: NormURL, page: WebPage) {
if (page.isInternal) {
return
}
val options = normURL.options
shouldBe(options.conf, page.conf) { "Conf should be the same \n${options.conf} \n${page.conf}" }
try {
GlobalEventHandlers.pageEventHandlers?.loadEventHandlers?.onWillLoad?.invoke(page.url)
// The more specific handlers have the opportunity to override the result of the more general handlers.
page.loadEventHandlers?.onWillLoad?.invoke(page.url)
} catch (e: Throwable) {
logger.warn("Failed to invoke beforeLoad | ${page.configuredUrl}", e)
}
}
private fun onLoaded(page: WebPage, normURL: NormURL) {
if (page.isInternal) {
return
}
val options = normURL.options
val status = page.protocolStatus
// handle page content
if (!page.isCached) {
// processPageContent(page, normURL)
}
// handle cache
if (!options.readonly) {
if (page.isFetched && page.protocolStatus.isSuccess) {
documentCache.remove(page.url)
}
pageCache.putDatum(page.url, page)
}
// TODO: Too many cancels in 1.10.x, so do not report canceled pages; this will be improved in a future version
if (!page.isCached && !page.isCanceled) {
report(page)
}
// We might use the cached page's content in the parse phase
if (options.parse) {
// TODO: do we need page.protocolStatus.isSuccess?
if (!page.isCanceled) {
parse(page, normURL.options)
}
}
try {
val detail = normURL.detail
// we might use the cached page's content in after load handler
if (detail is CompletableHyperlink<*>) {
require(page.loadEventHandlers?.onLoaded?.isNotEmpty == true) {
"A completable link must have a onLoaded handler"
}
}
GlobalEventHandlers.pageEventHandlers?.loadEventHandlers?.onLoaded?.invoke(page)
// The more specific handlers have the opportunity to override the result of the more general handlers.
page.loadEventHandlers?.onLoaded?.invoke(page)
} catch (e: Throwable) {
logger.warn("Failed to invoke onLoaded | ${page.configuredUrl}", e)
}
if (options.persist && !page.isCanceled && !options.readonly) {
persist(page, options)
}
}
private fun parse(page: WebPage, options: LoadOptions): ParseResult? {
val parser = parseComponent.takeIf { options.parse } ?: return null
val parseResult = parser.parse(page, options.reparseLinks, options.noFilter)
tracer?.trace("ParseResult: {} ParseReport: {}", parseResult, parser.getTraceInfo())
return parseResult
}
/**
* Because the content is large (a typical webpage is up to 2 MB), we do not load it from the database unless we have to.
*
* If the page was just fetched, the content has been set by the fetch component, so we do not load it from the database.
* If the protocol status is not success, the content is useless and is not loaded.
* */
private fun processPageContent(page: WebPage, normURL: NormURL) {
val options = normURL.options
if (page.protocolStatus.isSuccess && page.content == null) {
shouldBe(false, page.isFetched) { "Page should not be fetched | ${page.configuredUrl}" }
// load the content of the page
val contentPage = webDb.getOrNull(page.url, GWebPage.Field.CONTENT)
if (contentPage != null) {
page.content = contentPage.content
// TODO: test the dirty flag
page.unbox().clearDirty(GWebPage.Field.CONTENT.index)
}
}
shouldBe(options.conf, page.conf) { "Conf should be the same \n${options.conf} \n${page.conf}" }
}
private fun report(page: WebPage) {
if (taskLogger.isInfoEnabled) {
val verbose = taskLogger.isDebugEnabled
val report = PageLoadStatusFormatter(page, withSymbolicLink = verbose, withOptions = true).toString()
taskLogger.info(report)
if (reportCount == 0) {
val logExplainUrl = "https://github.com/platonai/PulsarRPA/blob/master/docs/log-format.md"
taskLogger.info("Log explanation: $logExplainUrl")
}
++reportCount
}
}
private fun getCachedPageOrNull(normURL: NormURL): WebPage? {
val (url, options) = normURL
if (options.refresh) {
// refresh the page; do not take the cached version
return null
}
val now = Instant.now()
val cachedPage = pageCache.getDatum(url, options.expires, now)
if (cachedPage != null && !options.isExpired(cachedPage.prevFetchTime)) {
// TODO: properly handle page conf; a page might be used in different contexts which have different confs
// TODO: properly handle ListenableHyperlink
// There is complex logic for a ScrapingHyperlink: the page has event handlers, and the page can
// also be loaded inside an event handler. We must handle such situations very carefully.
// page.conf = normURL.options.conf
// page.args = normURL.args
return cachedPage
}
return null
}
private fun shouldFetch(page: WebPage): Boolean {
return page.hasVar(VAR_REFRESH)
}
private fun beforeFetch(page: WebPage, options: LoadOptions) {
// require(page.options == options)
page.setVar(VAR_PREV_FETCH_TIME_BEFORE_UPDATE, page.prevFetchTime)
globalCache.fetchingCache.add(page.url)
logger.takeIf { it.isDebugEnabled }?.debug("Loading url | {} {}", page.url, page.args)
}
@Throws(Exception::class)
private fun fetchContent(page: WebPage, normURL: NormURL) {
try {
beforeFetch(page, normURL.options)
require(page.conf == normURL.options.conf)
// require(normURL.options.eventHandler != null)
// require(page.conf.getBeanOrNull(PulsarEventHandler::class) != null)
fetchComponent.fetchContent(page)
} finally {
afterFetch(page, normURL.options)
}
}
@Throws(Exception::class)
private suspend fun fetchContentDeferred(page: WebPage, normURL: NormURL) {
try {
beforeFetch(page, normURL.options)
fetchComponent.fetchContentDeferred(page)
} finally {
afterFetch(page, normURL.options)
}
}
private fun afterFetch(page: WebPage, options: LoadOptions) {
// the metadata of the page was loaded from the database, but the content is not cached,
// so load the content again
updateFetchSchedule(page, options)
globalCache.fetchingCache.remove(page.url)
}
/**
* TODO: FetchSchedule.shouldFetch, crawlStatus, and FetchReason should be kept consistent
* */
private fun getFetchStateForExistPage(page: WebPage, options: LoadOptions): CheckState {
// TODO: crawl status is better to decide the fetch reason
val crawlStatus = page.crawlStatus
val protocolStatus = page.protocolStatus
if (options.refresh) {
page.fetchRetries = 0
return CheckState(FetchState.REFRESH, "refresh")
}
val ignoreFailure = options.ignoreFailure
if (protocolStatus.isRetry) {
return CheckState(FetchState.RETRY, "retry")
} else if (protocolStatus.isFailed && !ignoreFailure) {
// The last fetch failed, possibly because the page is gone.
// In such a case, do not fetch it even if it's expired, unless the ignoreFailure flag is set.
return CheckState(FetchState.DO_NOT_FETCH, "failed")
}
val now = Instant.now()
// Fetch a page already fetched before if it's expired
val prevFetchTime = page.prevFetchTime
if (prevFetchTime.isBefore(AppConstants.TCP_IP_STANDARDIZED_TIME)) {
statusTracker?.messageWriter?.debugIllegalLastFetchTime(page)
}
// if (expireAt in prevFetchTime..now || now > prevFetchTime + expires), it's expired
if (options.isExpired(prevFetchTime)) {
return CheckState(FetchState.EXPIRED, "expired 1")
}
val duration = Duration.between(page.fetchTime, now)
val days = duration.toDays()
if (duration.toMillis() > 0 && days < 3) {
return CheckState(FetchState.SCHEDULED, "scheduled")
}
// no content
if (page.persistedContentLength == 0L) {
// do not enable this feature by default
// return CheckState(FetchState.NO_CONTENT, "no content")
}
// TODO: page.persistedContentLength or page.originalContentLength?
if (page.persistedContentLength < options.requireSize) {
return CheckState(FetchState.SMALL_CONTENT, "small content")
}
val domStats = page.activeDOMStatTrace
val (ni, na) = domStats["lastStat"] ?: ActiveDOMStat()
if (ni < options.requireImages) {
return CheckState(FetchState.MISS_FIELD, "miss image")
}
if (na < options.requireAnchors) {
return CheckState(FetchState.MISS_FIELD, "miss anchor")
}
return CheckState(FetchState.DO_NOT_FETCH, "unknown")
}
private fun updateFetchSchedule(page: WebPage, options: LoadOptions) {
if (page.isInternal) {
logger.warn("Unexpected internal page [updateFetchSchedule]")
return
}
// If the page was canceled or merely loaded from the database, do not update the fetch schedule
if (page.isCanceled || !page.isFetched) {
return
}
updateComponent.updateFetchSchedule(page)
require(page.isFetched)
}
private fun persist(page: WebPage, options: LoadOptions) {
// Remove the page content if dropContent is set or storeContent is false. The content was set earlier
// so the PageParser could parse it; now we can clear it, since it's usually very large.
if (options.dropContent || !options.storeContent) {
page.clearPersistContent()
}
// The content was loaded from the cache and remains unchanged, so do not persist it
// TODO: check the logic again
if (page.isCached) {
page.unbox().clearDirty(GWebPage.Field.CONTENT.index)
assert(!page.unbox().isContentDirty)
}
webDb.put(page)
++numWrite
collectPersistMetrics(page)
if (numWrite < 200) {
flush()
} else if (!options.lazyFlush || numWrite % 20 == 0) {
flush()
}
}
private fun collectPersistMetrics(page: WebPage) {
val metrics = coreMetrics
if (metrics != null) {
metrics.persists.mark()
val bytes = page.content?.array()?.size ?: 0
if (bytes > 0) {
metrics.contentPersists.mark()
metrics.persistContentMBytes.inc(ByteUnitConverter.convert(bytes, "M").toLong())
}
}
tracer?.trace("Persisted {} | {}", Strings.compactFormat(page.contentLength), page.url)
}
fun flush() = webDb.flush()
override fun close() {
closed.compareAndSet(false, true)
}
private fun assertSame(a: Any?, b: Any?, lazyMessage: () -> String) {
require(a === b) { lazyMessage() }
}
private fun shouldBe(expected: Any?, actual: Any?, lazyMessage: () -> String) {
if (actual != expected) {
logger.warn(lazyMessage())
}
}
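/**
* Loads a single page field lazily from the backing storage; used by the "PARTIAL_LAZY" load strategy above.
* */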
class LazyFieldLoader(
val url: String,
val db: WebDb
): java.util.function.Function<String, GWebPage?> {
override fun apply(field: String): GWebPage? {
return db.get0(url, false, arrayOf(field))
}
}
}