All downloads are free. The search and download functionality uses the official Maven repository.

ai.platon.pulsar.protocol.browser.emulator.context.ProxyContext.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.protocol.browser.emulator.context

import ai.platon.pulsar.common.DateTimes
import ai.platon.pulsar.common.config.CapabilityTypes
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.skeleton.common.metrics.MetricsSystem
import ai.platon.pulsar.common.proxy.*
import ai.platon.pulsar.skeleton.crawl.fetch.FetchResult
import ai.platon.pulsar.skeleton.crawl.fetch.FetchTask
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriver
import ai.platon.pulsar.skeleton.crawl.fetch.privacy.PrivacyAgent
import com.codahale.metrics.Gauge
import org.slf4j.LoggerFactory
import java.time.Duration
import java.time.Instant
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.atomic.AtomicInteger
import kotlin.random.Random

/**
 * Runs fetch tasks through a [WebDriverContext] on behalf of an optional [ProxyEntry].
 *
 * Before each task the context checks whether the proxy is about to expire or has served
 * too many pages; in either case the context is marked as closing and a
 * [ProxyRetiredException] is raised, which is converted into a privacy-retry result.
 * After each task the per-proxy success/failure counters are updated.
 *
 * The companion object holds process-wide counters shared by all contexts.
 *
 * @property proxyEntry the proxy used by this context, or `null` to run without a proxy entry
 */
open class ProxyContext(
    var proxyEntry: ProxyEntry? = null,
    private val proxyPoolManager: ProxyPoolManager,
    private val driverContext: WebDriverContext,
    private val conf: ImmutableConfig
): AutoCloseable {

    companion object {
        /**
         * Counts "no proxy available" events. It is incremented whenever a proxy is missing and
         * decremented (never below zero by this class) whenever a proxy is obtained or a task completes.
         */
        val numProxyAbsence = AtomicInteger()
        /**
         * Reference time used by [checkProxyAbsence] for its day comparison.
         * It is initialized at class load and only updated when the absence counter is reset.
         */
        var lastProxyAbsentTime = Instant.now()
        /** The number of tasks currently between [beforeTaskStart] and [afterTaskFinished]. */
        val numRunningTasks = AtomicInteger()
        /**
         * The absence count above which [checkProxyAbsence] starts to fail.
         * Note: every new [ProxyContext] instance overwrites this value from its configuration.
         */
        var maxAllowedProxyAbsence = 200

        init {
            mapOf(
                "proxyAbsences" to Gauge { numProxyAbsence.get() },
                "runningTasks" to Gauge { numRunningTasks.get() }
            ).forEach { MetricsSystem.reg.register(this, it.key, it.value) }
        }

        /**
         * Takes a proxy from the pool and creates a [ProxyContext] bound to it.
         *
         * @param id the privacy agent whose `contextDir` keys the active proxy entry
         * @param driverContext the driver context the new proxy context delegates to
         * @param proxyPoolManager the manager that owns the proxy pool and the active entries
         * @param conf the configuration passed to the new context
         * @return a new context whose proxy entry has been marked as working
         * @throws NoProxyException if the pool returns no proxy
         * @throws ProxyVendorUntrustedException if proxies have been absent too often, see [checkProxyAbsence]
         */
        @Throws(ProxyException::class)
        fun create(
            id: PrivacyAgent,
            driverContext: WebDriverContext,
            proxyPoolManager: ProxyPoolManager,
            conf: ImmutableConfig
        ): ProxyContext {
            val proxyPool = proxyPoolManager.proxyPool
            val proxy = proxyPool.take()

            if (proxy != null) {
                decrementProxyAbsence()

                // NOTE(review): if an entry is already registered for this context dir, the freshly
                // taken `proxy` is not used here — confirm whether it has to be returned to the pool.
                val proxyEntry0 = proxyPoolManager.activeProxyEntries.computeIfAbsent(id.contextDir) { proxy }
                proxyEntry0.startWork()
                return ProxyContext(proxyEntry0, proxyPoolManager, driverContext, conf)
            } else {
                numProxyAbsence.incrementAndGet()
                checkProxyAbsence()
                throw NoProxyException("No proxy found in pool ${proxyPool.javaClass.simpleName} | $proxyPool")
            }
        }

        /**
         * Fails if proxies have been absent more than [maxAllowedProxyAbsence] times.
         *
         * When the threshold is exceeded and the day of month of the current time differs from the
         * day of month of [lastProxyAbsentTime], the counter is reset and [lastProxyAbsentTime] is
         * set to now; otherwise a [ProxyVendorUntrustedException] is thrown.
         *
         * NOTE(review): only the day of month is compared, so the same day number in a different
         * month is treated as "the same day" — confirm this is acceptable.
         *
         * @throws ProxyVendorUntrustedException if the threshold is exceeded within the same day of month
         */
        fun checkProxyAbsence() {
            if (numProxyAbsence.get() > maxAllowedProxyAbsence) {
                val now = Instant.now()
                val day1 = DateTimes.dayOfMonth(lastProxyAbsentTime)
                val day2 = DateTimes.dayOfMonth(now)
                if (day2 != day1) {
                    // clear the proxy absence counter once the day of month has changed
                    numProxyAbsence.set(0)
                    lastProxyAbsentTime = now
                } else {
                    throw ProxyVendorUntrustedException("No proxy available, the vendor is untrusted." +
                            " Proxy is absent for $numProxyAbsence times from $lastProxyAbsentTime")
                }
            }
        }

        /**
         * Decrements [numProxyAbsence] if it is positive.
         *
         * The check and the decrement are performed as one atomic update, so concurrent callers
         * cannot push the counter below zero.
         */
        private fun decrementProxyAbsence() {
            numProxyAbsence.updateAndGet { n -> if (n > 0) n - 1 else n }
        }
    }

    private val logger = LoggerFactory.getLogger(ProxyContext::class.java)
    /**
     * If the number of successful pages exceeds [maxFetchSuccess] (plus a random jitter),
     * a [ProxyRetiredException] is raised, which leads to a privacy-retry result.
     * */
    private val maxFetchSuccess = conf.getInt(CapabilityTypes.PROXY_MAX_FETCH_SUCCESS, Int.MAX_VALUE / 10)
    /** A proxy that will expire within this duration is not used for new tasks. */
    private val minTimeToLive = Duration.ofSeconds(30)
    /** Set once the proxy has been found unusable for further tasks. */
    private val closing = AtomicBoolean()
    /** Set once [close] has been called. */
    private val closed = AtomicBoolean()

    /** Whether the proxy pool manager is enabled. */
    val isEnabled get() = proxyPoolManager.isEnabled
    /**
     * Whether the proxy entry is retired; always `false` when there is no proxy entry.
     *
     * Reading this property has a side effect: an expired entry is retired first.
     */
    val isRetired: Boolean get() {
        val p = proxyEntry ?: return false
        if (p.isExpired) {
            p.retire()
        }
        return p.isRetired
    }
    /** Whether the proxy pool manager is active and this context is neither closing nor closed. */
    val isActive get() = proxyPoolManager.isActive && !closing.get() && !closed.get()
    /** Whether the proxy entry (if any) is ready, not retired, and this context is active. */
    val isReady: Boolean get() {
        // read the mutable property only once so both checks see the same entry
        val p = proxyEntry
        val isProxyReady = p == null || p.isReady == true
        return isProxyReady && !isRetired && isActive
    }

    init {
        // Every new instance overwrites the shared threshold with the configured value
        maxAllowedProxyAbsence = conf.getInt(CapabilityTypes.PROXY_MAX_ALLOWED_PROXY_ABSENCE, 10)
    }

    /**
     * Runs [browseFun] for [task] using the proxy of this context.
     *
     * @param task the fetch task to run
     * @param browseFun the function that performs the actual browsing with a web driver
     * @return a canceled result if this context is not active, otherwise the result of the task
     *  or a retry result if a recoverable [ProxyException] occurred
     * @throws ProxyVendorUntrustedException if proxies have been absent too often
     * @throws ProxyInsufficientBalanceException if the proxy layer reports insufficient balance
     */
    @Throws(ProxyException::class, Exception::class)
    suspend fun run(task: FetchTask, browseFun: suspend (FetchTask, WebDriver) -> FetchResult): FetchResult {
        return checkAbnormalResult(task) ?: run0(task, browseFun)
    }

    /**
     * Performs periodic maintenance: retires the proxy entry if it is expired.
     */
    open fun maintain() {
        val p = proxyEntry
        if (p != null && p.isExpired) {
            p.retire()
        }
    }

    /**
     * Runs the task with the proxy, converting proxy exceptions into fetch results and
     * keeping the task counters balanced in all cases.
     */
    @Throws(ProxyException::class, Exception::class)
    private suspend fun run0(
        task: FetchTask, browseFun: suspend (FetchTask, WebDriver) -> FetchResult
    ): FetchResult {
        var success = false
        return try {
            beforeTaskStart(task)
            proxyPoolManager.runWith(proxyEntry) { driverContext.run(task, browseFun) }.also { result ->
                success = result.response.protocolStatus.isSuccess
                result.response.pageDatum.proxyEntry = proxyEntry
                decrementProxyAbsence()
            }
        } catch (e: ProxyException) {
            handleProxyException(task, e)
        } finally {
            // beforeTaskStart is inside the try block, so the running-task counter is always decremented
            afterTaskFinished(task, success)
        }
    }

    /**
     * Returns a canceled result if this context is inactive, or `null` if the task can proceed.
     *
     * @throws ProxyVendorUntrustedException if proxies have been absent too often
     */
    private fun checkAbnormalResult(task: FetchTask): FetchResult? {
        if (!isActive) {
            return FetchResult.canceled(task, "PROXY CX INACTIVE")
        }

        checkProxyAbsence()

        return null
    }

    /**
     * Maps a [ProxyException] to a fetch result.
     *
     * - [ProxyInsufficientBalanceException] is rethrown.
     * - [NoProxyException] is counted as a proxy absence and yields a crawl-retry result.
     * - [ProxyRetiredException] and every other proxy exception yield a privacy-retry result.
     */
    private fun handleProxyException(task: FetchTask, e: ProxyException): FetchResult {
        return when (e) {
            is ProxyInsufficientBalanceException -> {
                throw e
            }
            is ProxyRetiredException -> {
                logger.warn("{}, context reset will be triggered | {}", e.message, task.proxyEntry?:"")
                FetchResult.privacyRetry(task, e)
            }
            is NoProxyException -> {
                numProxyAbsence.incrementAndGet()
                checkProxyAbsence()
                logger.warn("No proxy available temporary the {}th times, cause: {}", numProxyAbsence, e.message)
                FetchResult.crawlRetry(task, "No proxy")
            }
            else -> {
                logger.warn("Task failed with proxy {}, cause: {}", proxyEntry, e.message)
                FetchResult.privacyRetry(task, e)
            }
        }
    }

    /**
     * Registers the task as running and verifies that the proxy can still be used.
     *
     * @throws ProxyRetiredException if the proxy expires within [minTimeToLive] or has served too
     *  many pages, and this call is the one that switches the context to the closing state
     */
    private fun beforeTaskStart(task: FetchTask) {
        numRunningTasks.incrementAndGet()

        proxyEntry?.also { proxy ->
            task.proxyEntry = proxy
            proxy.lastActiveTime = Instant.now()

            // The proxy is about to be unusable, so the context has to be reset
            if (proxy.willExpireAfter(minTimeToLive)) {
                if (closing.compareAndSet(false, true)) {
                    throw ProxyRetiredException("The proxy is expired ($minTimeToLive)")
                }
            }

            val successPages = proxy.numSuccessPages.get()
            // Add a random number to disturb the anti-spider.
            // Random.nextInt(from, until) requires from < until, so no jitter is applied for a
            // small (or non-positive) maxFetchSuccess where delta is not positive.
            val delta = (0.25 * maxFetchSuccess).toInt()
            val jitter = if (delta > 0) Random.nextInt(-delta, delta) else 0
            // computed as Long so that a very large maxFetchSuccess can not overflow
            val limit = maxFetchSuccess.toLong() + jitter
            if (successPages > limit) {
                // If a proxy served too many pages, the target site may track the finger print of the crawler
                // and also maxFetchSuccess can be used for test purpose
                logger.info("Served too many pages ($successPages/$maxFetchSuccess) | {}", proxy)
                if (closing.compareAndSet(false, true)) {
                    throw ProxyRetiredException("Served too many pages")
                }
            }
        }
    }

    /**
     * Unregisters the task as running and updates the statistics of the proxy entry, if any.
     *
     * @param success whether the task finished with a successful protocol status
     */
    private fun afterTaskFinished(task: FetchTask, success: Boolean) {
        numRunningTasks.decrementAndGet()
        proxyEntry?.apply {
            if (success) {
                refresh()
                numSuccessPages.incrementAndGet()
                lastTarget = task.url
                servedDomains.add(task.domain)
            } else {
                numFailedPages.incrementAndGet()
            }
        }
    }

    /**
     * Closes this context and removes its entry from the active proxy entries.
     * Only the first call has an effect.
     *
     * NOTE(review): the entry is registered in [create] under `id.contextDir` but removed here
     * under `driverContext.browserId.userDataDir` — confirm both refer to the same key.
     * */
    override fun close() {
        if (closed.compareAndSet(false, true)) {
            proxyPoolManager.activeProxyEntries.remove(driverContext.browserId.userDataDir)
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy