All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.fetch.privacy.PrivacyContext.kt Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version
package ai.platon.pulsar.skeleton.crawl.fetch.privacy

import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.HtmlIntegrity
import ai.platon.pulsar.common.browser.BrowserFiles.computeNextSequentialContextDir
import ai.platon.pulsar.common.browser.BrowserFiles.computeRandomTmpContextDir
import ai.platon.pulsar.common.config.AppConstants.FETCH_TASK_TIMEOUT_DEFAULT
import ai.platon.pulsar.common.config.CapabilityTypes.*
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.proxy.ProxyException
import ai.platon.pulsar.common.proxy.ProxyRetiredException
import ai.platon.pulsar.common.readable
import ai.platon.pulsar.persist.RetryScope
import ai.platon.pulsar.skeleton.common.metrics.MetricsSystem
import ai.platon.pulsar.skeleton.crawl.fetch.FetchResult
import ai.platon.pulsar.skeleton.crawl.fetch.FetchTask
import ai.platon.pulsar.skeleton.crawl.fetch.driver.BrowserErrorPageException
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriver
import com.google.common.annotations.Beta
import org.slf4j.LoggerFactory
import java.nio.file.Path
import java.time.Duration
import java.time.Instant
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.atomic.AtomicInteger

/**
 * A privacy context is a unique context of a privacy agent to the target website,
 * it will be closed once it is leaked.
 *
 * One of the biggest difficulties in web scraping tasks is the bot stealth.
 *
 * For web scraping tasks, the website should have no idea whether a visit is
 * from a human being or a bot. Once a page visit is suspected by the website,
 * which we call a privacy leak, the privacy context has to be dropped,
 * and Pulsar will visit the page in another privacy context.
 * */
abstract class PrivacyContext(
    val privacyAgent: PrivacyAgent,
    val conf: ImmutableConfig
) : Comparable, AutoCloseable {
    companion object {
        private val SEQUENCER = AtomicInteger()

        // The prefix for all temporary privacy contexts. System context, prototype context and default context are not
        // required to start with the prefix.
        const val CONTEXT_DIR_PREFIX = "cx."

        // The default context directory, if you need a permanent and isolate context, use this one.
        // NOTE: the user-default context is not a default context.
        val DEFAULT_CONTEXT_DIR: Path = AppPaths.CONTEXT_DEFAULT_DIR
        // A random context directory, if you need a random temporary context, use this one
        val NEXT_SEQUENTIAL_CONTEXT_DIR get() = computeNextSequentialContextDir()
        // A random context directory, if you need a random temporary context, use this one
        val RANDOM_CONTEXT_DIR get() = computeRandomTmpContextDir()
        // The prototype context directory, all privacy contexts copies browser data from the prototype.
        // A typical prototype data dir is: ~/.pulsar/browser/chrome/prototype/google-chrome/
        val PROTOTYPE_DATA_DIR: Path = AppPaths.CHROME_DATA_DIR_PROTOTYPE
        // A context dir is the dir which contains the browser data dir, and supports different browsers.
        // For example: ~/.pulsar/browser/chrome/prototype/
        val PROTOTYPE_CONTEXT_DIR: Path = AppPaths.CHROME_DATA_DIR_PROTOTYPE.parent

        val PRIVACY_CONTEXT_IDLE_TIMEOUT_DEFAULT: Duration = Duration.ofMinutes(30)

        val globalMetrics by lazy { PrivacyContextMetrics() }
    }

    private val logger = LoggerFactory.getLogger(PrivacyContext::class.java)

    val id get() = privacyAgent.id
    val seq = SEQUENCER.incrementAndGet()
    val display get() = privacyAgent.display
    val baseDir get() = privacyAgent.contextDir

    protected val numRunningTasks = AtomicInteger()
    val minimumThroughput = if (privacyAgent.isPermanent) 0f else conf.getFloat(PRIVACY_CONTEXT_MIN_THROUGHPUT, 0.3f)
    val maximumWarnings = if (privacyAgent.isPermanent) 100000 else conf.getInt(PRIVACY_MAX_WARNINGS, 8)
    val minorWarningFactor = conf.getInt(PRIVACY_MINOR_WARNING_FACTOR, 5)
    val privacyLeakWarnings = AtomicInteger()
    val privacyLeakMinorWarnings = AtomicInteger()

    private val registry = MetricsSystem.defaultMetricRegistry
    private val sms = MetricsSystem.SHADOW_METRIC_SYMBOL
    val meterTasks = registry.meter(this, "$SEQUENCER$sms", "tasks")
    val meterSuccesses = registry.meter(this, "$SEQUENCER$sms", "successes")
    val meterFinishes = registry.meter(this, "$SEQUENCER$sms", "finishes")
    val meterSmallPages = registry.meter(this, "$SEQUENCER$sms", "smallPages")
    val smallPageRate get() = 1.0 * meterSmallPages.count / meterTasks.count.coerceAtLeast(1)
    val successRate = meterSuccesses.count.toFloat() / meterTasks.count
    /**
     * The rate of failures. Failure rate is meaningless when there are few tasks.
     * */
    val failureRate get() = 1 - successRate
    val failureRateThreshold = conf.getFloat(PRIVACY_CONTEXT_FAILURE_RATE_THRESHOLD, 0.6f)
    /**
     * Check if failure rate is too high.
     * High failure rate make sense only when there are many tasks.
     * */
    val isHighFailureRate get() = meterTasks.count > 100 && failureRate > failureRateThreshold
    /**
     * The start time of the privacy context.
     * */
    val startTime = Instant.now()
    /**
     * The last active time of the privacy context.
     * */
    var lastActiveTime = Instant.now()
        private set
    /**
     * The elapsed time of the privacy context since it's started.
     * */
    val elapsedTime get() = Duration.between(startTime, Instant.now())

    private val fetchTaskTimeout
        get() = conf.getDuration(FETCH_TASK_TIMEOUT, FETCH_TASK_TIMEOUT_DEFAULT)
    private val privacyContextIdleTimeout
        get() = conf.getDuration(PRIVACY_CONTEXT_IDLE_TIMEOUT, PRIVACY_CONTEXT_IDLE_TIMEOUT_DEFAULT)
    private val idleTimeout: Duration get() = privacyContextIdleTimeout.coerceAtLeast(fetchTaskTimeout)
    /**
     * The privacy context is retired, and should be closed soon.
     * */
    protected var retired = false
    /**
     * The idle time of the privacy context.
     * */
    val idelTime get() = Duration.between(lastActiveTime, Instant.now())
    /**
     * Whether the privacy context is idle.
     * */
    open val isIdle get() = idelTime > idleTimeout

//    val historyUrls = PassiveExpiringMap()
    /**
     * Whether the privacy context is closed.
     * */
    protected val closed = AtomicBoolean()
    /**
     * Check whether the privacy context works fine and the fetch speed is qualified.
     * */
    open val isGood get() = meterSuccesses.meanRate >= minimumThroughput
    /**
     * Check whether the privacy has been leaked since there are too many warnings about privacy leakage.
     * */
    open val isLeaked get() = !privacyAgent.isPermanent && privacyLeakWarnings.get() >= maximumWarnings
    /**
     * Check whether the privacy context works fine and the fetch speed is qualified.
     * */
    open val isRetired get() = retired
    /**
     * Check whether the privacy context is active.
     * An active privacy context can be used to serve tasks, and an inactive one should be closed.
     *
     * An active privacy context has to meet the following requirements:
     * 1. not closed
     * 2. not leaked
     * 3. not retired
     *
     * Note: this flag does not guarantee consistency, and can change immediately after it's read
     * */
    open val isActive get() = !isLeaked && !isRetired && !isClosed
    /**
     * Check whether the privacy context is closed.
     * */
    open val isClosed get() = closed.get()
    /**
     * A ready privacy context is ready to serve tasks.
     *
     * A ready privacy context has to meet the following requirements:
     * 1. not closed
     * 2. not leaked
     * 3. [requirement removed] not idle
     * 4. not retired
     * 5. if there is a proxy, the proxy has to be ready
     * 6. the associated driver pool promises to provide an available driver, ether one of the following:
     *    1. it has slots to create new drivers
     *    2. it has standby drivers
     *
     * Note: this flag does not guarantee consistency, and can change immediately after it's read
     * */
    open val isReady get() = hasWebDriverPromise() && isActive
    /**
     * Check whether the privacy context is at full capacity. If the privacy context is indeed at full capacity, it
     * should not be used for processing new tasks, and the underlying services may potentially refuse to provide service.
     *
     * A privacy context is running at full capacity when the underlying webdriver pool is full capacity,
     * so the webdriver pool can not provide a webdriver for new tasks.
     *
     * Note that if a driver pool is retired or closed, it's not full capacity.
     *
     * @return True if the privacy context is running at full capacity, false otherwise.
     * */
    open val isFullCapacity = false

    /**
     * Check if the privacy context is running under loaded.
     * */
    open val isUnderLoaded get() = !isFullCapacity

    /**
     * Get the readable privacy context state.
     * */
    open val readableState: String get() {
        return listOf(
            "closed" to isClosed, "leaked" to isLeaked, "active" to isActive,
            "highFailure" to isHighFailureRate, "idle" to isIdle, "good" to isGood,
            "ready" to isReady, "retired" to isRetired
        ).filter { it.second }.joinToString(",") { it.first }
    }

    init {
        globalMetrics.contexts.mark()
    }

    /**
     * The promised worker count.
     *
     * The implementation has to tell the caller how many workers it can provide.
     * The number of workers can change immediately after reading, so the caller only has promises
     * but no guarantees.
     *
     * @return the number of workers promised.
     * */
    abstract fun promisedWebDriverCount(): Int

    /**
     * Check if the privacy context promises at least one worker to provide.
     * */
    fun hasWebDriverPromise() = promisedWebDriverCount() > 0

    @Beta
    abstract fun subscribeWebDriver(): WebDriver?

    /**
     * Mark a success task.
     * */
    fun markSuccess() {
        privacyLeakWarnings.takeIf { it.get() > 0 }?.decrementAndGet()
        meterSuccesses.mark()
        globalMetrics.successes.mark()
    }

    /**
     * Mark a warning.
     * */
    fun markWarning() {
        privacyLeakWarnings.incrementAndGet()
        globalMetrics.leakWarnings.mark()
    }

    /**
     * Mark n warnings.
     * */
    fun markWarning(n: Int) {
        privacyLeakWarnings.addAndGet(n)
        globalMetrics.leakWarnings.mark(n.toLong())
    }

    /**
     * Mark a minor warnings.
     * */
    fun markMinorWarning() {
        privacyLeakMinorWarnings.incrementAndGet()
        globalMetrics.minorLeakWarnings.mark()
        if (privacyLeakMinorWarnings.get() > minorWarningFactor) {
            privacyLeakMinorWarnings.set(0)
            markWarning()
        }
    }

    /**
     * Mark the privacy context as leaked. A leaked privacy context should not serve anymore,
     * and will be closed soon.
     * */
    fun markLeaked() {
        if (privacyAgent.isPermanent) {
            // never mark a permanent privacy context as leaked
        } else {
            require(maximumWarnings in 1..1000000) {
                "The maximum warnings should be set to a reasonable value, but not $maximumWarnings"
            }
            privacyLeakWarnings.addAndGet(maximumWarnings)
        }
    }

    /**
     * Run a task in the privacy context and record the status.
     *
     * @param task the fetch task
     * @param fetchFun the fetch function
     * @return the fetch result
     * */
    @Throws(ProxyException::class, Exception::class)
    open suspend fun run(task: FetchTask, fetchFun: suspend (FetchTask, WebDriver) -> FetchResult): FetchResult {
        beforeRun(task)
        val result = doRun(task, fetchFun)
        afterRun(result)
        return result
    }

    /**
     * Run a task in the privacy context.
     *
     * @param task the fetch task
     * @param fetchFun the fetch function
     * @return the fetch result
     * */
    @Throws(ProxyException::class)
    abstract suspend fun doRun(task: FetchTask, fetchFun: suspend (FetchTask, WebDriver) -> FetchResult): FetchResult

    fun takeSnapshot(): String {
        return "$readableState | promised drivers: ${promisedWebDriverCount()}"
    }
    /**
     * Dismiss the privacy context and mark it as be retired, so it should be closed later.
     * */
    fun dismiss() {
        retired = true
    }
    /**
     * Do the maintaining jobs.
     * */
    abstract fun maintain()

    override fun compareTo(other: PrivacyContext) = id.compareTo(other.id)

    override fun equals(other: Any?) = other is PrivacyContext && other.id == id

    override fun hashCode() = id.hashCode()

    protected fun beforeRun(task: FetchTask) {
        lastActiveTime = Instant.now()
        meterTasks.mark()
        globalMetrics.tasks.mark()

        numRunningTasks.incrementAndGet()
    }

    protected fun afterRun(result: FetchResult) {
        numRunningTasks.decrementAndGet()
//        historyUrls.add(result.task.url)

        lastActiveTime = Instant.now()
        meterFinishes.mark()
        globalMetrics.finishes.mark()

        val status = result.status
        when {
            status.isRetry(RetryScope.PRIVACY, ProxyRetiredException::class.java) -> markLeaked()
            status.isRetry(RetryScope.PRIVACY, HtmlIntegrity.FORBIDDEN) -> markLeaked()

            status.isRetry(RetryScope.PRIVACY, HtmlIntegrity.ROBOT_CHECK) -> markWarning()
            status.isRetry(RetryScope.PRIVACY, HtmlIntegrity.ROBOT_CHECK_2) -> markWarning(2)
            status.isRetry(RetryScope.PRIVACY, HtmlIntegrity.ROBOT_CHECK_3) -> markWarning(3)

            status.isRetry(RetryScope.PRIVACY, HtmlIntegrity.WRONG_LANG) -> markWarning(2)
            status.isRetry(RetryScope.PRIVACY, HtmlIntegrity.WRONG_DISTRICT) -> markWarning(2)
            status.isRetry(RetryScope.PRIVACY, HtmlIntegrity.WRONG_COUNTRY) -> markWarning(2)

            status.isRetry(RetryScope.PRIVACY, BrowserErrorPageException::class.java) -> markWarning(3)
            status.isRetry(RetryScope.PRIVACY) -> markWarning()
            status.isRetry(RetryScope.CRAWL) -> markMinorWarning()
            status.isSuccess -> markSuccess()
        }

        if (result.isSmall) {
            meterSmallPages.mark()
            globalMetrics.smallPages.mark()
        }

        if (isLeaked) {
            globalMetrics.contextLeaks.mark()
        }
    }

    open fun getReport(): String {
        return String.format("Privacy context #%s has lived for %s", SEQUENCER, elapsedTime.readable())
    }

    open fun report() {
        logger.info("Privacy context #{} has lived for {}", SEQUENCER, elapsedTime.readable())
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy