ai.platon.pulsar.skeleton.crawl.component.BatchFetchComponent.kt Maven / Gradle / Ivy

Go to download
package ai.platon.pulsar.skeleton.crawl.component

import ai.platon.pulsar.common.AppContext
import ai.platon.pulsar.common.config.CapabilityTypes
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.common.urls.Hyperlink
import ai.platon.pulsar.skeleton.common.urls.NormURL
import ai.platon.pulsar.skeleton.crawl.CoreMetrics
import ai.platon.pulsar.skeleton.crawl.common.FetchEntry
import ai.platon.pulsar.skeleton.crawl.common.GlobalCacheFactory
import ai.platon.pulsar.skeleton.crawl.protocol.Protocol
import ai.platon.pulsar.skeleton.crawl.protocol.ProtocolFactory
import ai.platon.pulsar.skeleton.crawl.protocol.Response
import ai.platon.pulsar.persist.WebDb
import ai.platon.pulsar.persist.WebPage
import com.google.common.collect.Iterables
import kotlinx.coroutines.flow.asFlow
import kotlinx.coroutines.flow.map
import kotlinx.coroutines.flow.toList
import kotlinx.coroutines.runBlocking
import org.slf4j.LoggerFactory

class BatchFetchComponent(
    val webDb: WebDb,
    val globalCacheFactory: GlobalCacheFactory,
    coreMetrics: CoreMetrics? = null,
    protocolFactory: ProtocolFactory,
    immutableConfig: ImmutableConfig
) : FetchComponent(coreMetrics, protocolFactory, immutableConfig) {
    private val logger = LoggerFactory.getLogger(BatchFetchComponent::class.java)

    constructor(webDb: WebDb, immutableConfig: ImmutableConfig) : this(
        webDb, GlobalCacheFactory(immutableConfig), null, ProtocolFactory(immutableConfig), immutableConfig)

    val globalCache get() = globalCacheFactory.globalCache

    /**
     * Fetch all the urls, config property 'fetch.concurrency' controls the concurrency level.
     * If concurrency level is not great than 1, fetch all urls in the caller thread
     *
     * Eager fetch only some urls to response as soon as possible, the rest urls will be fetched in background later
     *
     * @param urls    The urls to fetch
     * @param options The options
     * @return The fetch result
     */
    fun fetchAll(urls: Iterable, options: LoadOptions): Collection {
        return fetchAllInternal(urls, options)
    }

    /**
     * Parallel fetch all the urls
     *
     * Eager fetch only some urls to response as soon as possible, the rest urls will be fetched in background later
     *
     * @param urls    The urls to fetch
     * @param options The options
     * @return The fetch result
     */
    fun parallelFetchAll(urls: Iterable, options: LoadOptions): Collection {
        val protocol = protocolFactory.getProtocol(options.fetchMode)
                ?: return parallelFetchAllGroupedBySchema(urls, options)
        return parallelFetchAll0(urls, protocol, options)
    }

    /**
     * Group all urls by URL schema, and parallel fetch each group.
     *
     * Eager fetch only some urls to response as soon as possible, the rest urls will be fetched in background later
     *
     * @param urls    The urls to fetch
     * @param options The options
     * @return The fetch result
     */
    fun parallelFetchAllGroupedBySchema(urls: Iterable, options: LoadOptions): Collection {
        val pages: MutableList = ArrayList()
        val groupedUrls = optimizeBatchSize(urls, options).groupBy { it.substringBefore("://") }
        groupedUrls.forEach { (key, gUrls) ->
            val protocol = protocolFactory.getProtocol(key)
            if (protocol != null) {
                pages.addAll(parallelFetchAll0(gUrls, protocol, options))
            } else {
                coreMetrics?.trackFailedUrls(gUrls)
            }
        }
        return pages
    }

    /**
     * Fetch all urls, if allowParallel is true and the config suggests parallel is preferred, parallel fetch all items
     *
     * Eager fetch only some urls to response as soon as possible, the rest urls will be fetched in background later
     *
     * If the protocol supports native parallel, use the protocol's batch fetch method,
     * Or else parallel fetch pages in a ExecutorService
     */
    private fun fetchAllInternal(urls: Iterable, options: LoadOptions): Collection {
        return parallelFetchAll(urls, options)
    }

    /**
     * Parallel fetch all urls.
     * If the protocol supports native parallel, use the protocol's native parallel fetch method,
     * Or else parallel fetch pages in a ExecutorService.
     */
    private fun parallelFetchAll0(urls: Iterable, protocol: Protocol, options: LoadOptions): Collection {
        val optimizedUrls = optimizeBatchSize(urls, options)
        return if (protocol.supportParallel) {
            protocolParallelFetchAll(optimizedUrls, protocol, options)
        } else {
            manualParallelFetchAll(optimizedUrls, options)
        }
    }

    private fun protocolParallelFetchAll(urls: Iterable, protocol: Protocol, options: LoadOptions): Collection {
        coreMetrics?.markFetchTaskStart(Iterables.size(urls))
        return urls.map { FetchEntry(it, options).page }
                .let { protocol.getResponses(it, options.conf) }
                .map { getProtocolOutput(protocol, it, it.page) }
    }

    /**
     * TODO: add to fetch queue instead of invoke new threads
     * */
    private fun manualParallelFetchAll(urls: Iterable, options: LoadOptions): Collection {
        val size = Iterables.size(urls)
        coreMetrics?.markFetchTaskStart(size)
        return runBlocking { urls.asFlow().map { fetch(it, options) }.toList(mutableListOf()) }
    }

    /**
     * Forward previous fetched response to protocol for further process: retry, status processing, etc
     */
    private fun getProtocolOutput(protocol: Protocol, response: Response, page: WebPage): WebPage {
        // forward a response
        protocol.setResponse(response)
        // run protocol.getProtocolOutput so the page have a chance to perform PROTOCOL scope retry if necessary
        // TODO: RetryScope.PROTOCOL does not work since the response is forwarded
        return processProtocolOutput(page, protocol.getProtocolOutput(page))
    }

    private fun optimizeBatchSize(urls: Iterable, options: LoadOptions): Collection {
        return if (urls is Collection<*>) {
            optimizeBatchSize(urls as Collection, options)
        } else optimizeBatchSize(urls.toList(), options)
    }

    /**
     * If there are too many urls to fetch, just fetch some of them in the foreground and
     * fetch the rest in the background
     */
    private fun optimizeBatchSize(urls: Collection, options: LoadOptions): Collection {
        val conf = options.conf
        val parallelLevel = conf.getUint(CapabilityTypes.FETCH_CONCURRENCY, AppContext.NCPU)
        if (urls.size <= parallelLevel) {
            return urls
        }

        val eagerTasks: MutableList = ArrayList(parallelLevel)
        val lazyTasks: MutableList = ArrayList(0.coerceAtLeast(urls.size - parallelLevel))
        for ((i, url) in urls.withIndex()) {
            if (i < parallelLevel) {
                eagerTasks.add(url)
            } else {
                lazyTasks.add(url)
            }
        }

        if (lazyTasks.isNotEmpty()) {
            val mode = options.fetchMode
            val links = lazyTasks.map { NormURL(it, options) }.map { Hyperlink(it.spec, args = it.args) }
            globalCache.urlPool.normalCache.nReentrantQueue.addAll(links)
            if (logger.isDebugEnabled) {
                logger.debug("Committed {} lazy tasks in mode {}", lazyTasks.size, mode)
            }
        }

        return eagerTasks
    }
}