All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.protocol.browser.emulator.impl.BrowserEmulatedFetcherImpl.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.protocol.browser.emulator.impl

import ai.platon.pulsar.common.AppContext
import ai.platon.pulsar.common.brief
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.VolatileConfig
import ai.platon.pulsar.common.stringify
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.protocol.browser.driver.WebDriverPoolManager
import ai.platon.pulsar.protocol.browser.emulator.BrowserEmulatedFetcher
import ai.platon.pulsar.protocol.browser.emulator.BrowserEmulator
import ai.platon.pulsar.skeleton.common.persist.ext.browseEventHandlers
import ai.platon.pulsar.skeleton.crawl.fetch.FetchResult
import ai.platon.pulsar.skeleton.crawl.fetch.FetchTask
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriver
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriverCancellationException
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriverException
import ai.platon.pulsar.skeleton.crawl.fetch.privacy.PrivacyManager
import ai.platon.pulsar.skeleton.crawl.protocol.ForwardingResponse
import ai.platon.pulsar.skeleton.crawl.protocol.Response
import kotlinx.coroutines.runBlocking
import org.slf4j.LoggerFactory
import java.util.concurrent.atomic.AtomicBoolean

/**
 * Created by vincent on 18-1-1.
 * Copyright @ 2013-2023 Platon AI. All rights reserved
 */
open class BrowserEmulatedFetcherImpl(
    override val privacyManager: PrivacyManager,
    override val driverPoolManager: WebDriverPoolManager,
    override val browserEmulator: BrowserEmulator,
    private val immutableConfig: ImmutableConfig,
    private val closeCascaded: Boolean = false
): BrowserEmulatedFetcher {
    private val logger = LoggerFactory.getLogger(BrowserEmulatedFetcherImpl::class.java)!!

    private val closed = AtomicBoolean()
    private val illegalState = AtomicBoolean()
    private val isActive get() = !illegalState.get() && !closed.get() && AppContext.isActive

    enum class EventType {
        willFetch,
        fetched
    }

    @Throws(Exception::class)
    override fun fetch(url: String) = fetchContent(WebPage.newWebPage(url, immutableConfig.toVolatileConfig()))

    @Throws(Exception::class)
    override fun fetch(url: String, conf: VolatileConfig) = fetchContent(WebPage.newWebPage(url, conf))

    /**
     * Fetch page content
     * */
    @Throws(Exception::class)
    override fun fetchContent(page: WebPage): Response = runBlocking {
        fetchContentDeferred(page)
    }

    @Throws(Exception::class)
    override suspend fun fetchDeferred(url: String) =
        fetchContentDeferred(WebPage.newWebPage(url, immutableConfig.toVolatileConfig()))

    @Throws(Exception::class)
    override suspend fun fetchDeferred(url: String, volatileConfig: VolatileConfig) =
        fetchContentDeferred(WebPage.newWebPage(url, volatileConfig))

    /**
     * Fetch page content
     * */
    @Throws(Exception::class)
    override suspend fun fetchContentDeferred(page: WebPage): Response {
        if (!isActive) {
            return ForwardingResponse.canceled(page)
        }

        if (page.isInternal) {
            logger.warn("Unexpected internal page | {}", page.url)
            return ForwardingResponse.canceled(page)
        }

        val task = FetchTask.create(page)
        return fetchTaskDeferred(task)
    }

    /**
     * Fetch page content
     * */
    @Throws(Exception::class)
    private suspend fun fetchTaskDeferred(task: FetchTask): Response {
        // TODO: it's a temporary solution to specify the web driver to fetch the page
        val driver = task.page.getVar("WEB_DRIVER") as? WebDriver
        if (driver != null) {
            return doFetch(task, driver).response
        }

        // @Throws(ProxyException::class, Exception::class)
        return privacyManager.run(task) { _, driver2 -> doFetch(task, driver2) }.response
    }

    @Throws(Exception::class)
    private suspend fun doFetch(task: FetchTask, driver: WebDriver): FetchResult {
        if (!isActive) {
            return FetchResult.canceled(task, "Browser fetcher is not active")
        }

        emit(EventType.willFetch, task.page, driver)

//        val result = try {
//            browserEmulator.visit(task, driver)
//        } catch (e: Throwable) {
//            logger.warn(e.stringify("[Unexpected] Failed to visit page | ${task.url}\n"))
//            FetchResult.failed(task, e)
//        }
        val result = browserEmulator.visit(task, driver)

        emit(EventType.fetched, task.page, driver)

        return result
    }

    override fun reset() {
        TODO("Not implemented")
    }

    override fun cancel(page: WebPage) {
        TODO("Not implemented")
    }

    override fun cancelAll() {
        TODO("Not implemented")
    }

    override fun close() {
        if (closed.compareAndSet(false, true)) {
            if (closeCascaded) {
                browserEmulator.close()
                driverPoolManager.close()
                privacyManager.close()
            }
        }
    }

    private suspend fun emit(type: EventType, page: WebPage, driver: WebDriver) {
        val event = page.browseEventHandlers ?: return
        when(type) {
            EventType.willFetch -> notify(type.name) { event.onWillFetch(page, driver) }
            EventType.fetched -> notify(type.name) { event.onFetched(page, driver) }
            else -> {}
        }
    }

    private suspend fun notify(name: String, action: suspend () -> Unit) {
        if (!isActive) {
            return
        }

        val e = kotlin.runCatching { action() }.exceptionOrNull()

        if (e != null && isActive) {
            handleEventException(name, e)
        }
    }

    private fun handleEventException(name: String, e: Throwable) {
        when (e) {
            is WebDriverCancellationException -> logger.info("Web driver is cancelled")
            is WebDriverException -> logger.warn(e.brief("[Ignored][$name] "))
            is Exception -> logger.warn(e.brief("[Ignored][$name] "))
            else -> logger.error(e.stringify("[Unexpected][$name] "))
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy