All downloads are free. The search and download functionality uses the official Maven repository.

ai.platon.pulsar.skeleton.crawl.fetch.privacy.PrivacyContext.kt Maven / Gradle / Ivy

/**
 * Copyright (c) Vincent Zhang, [email protected], Platon.AI.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.skeleton.crawl.fetch.privacy

import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.browser.BrowserFiles
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.crawl.fetch.FetchResult
import ai.platon.pulsar.skeleton.crawl.fetch.FetchTask
import ai.platon.pulsar.skeleton.crawl.fetch.driver.WebDriver
import java.nio.file.Path
import java.time.Duration

/**
 * The unique context through which a privacy agent talks to a target website.
 *
 * Bot stealth is one of the hardest problems in web scraping: the target website should never be able to
 * tell whether a visit comes from a human being or from a bot. As soon as the website becomes suspicious
 * of a page visit -- which is called a *privacy leak* -- the privacy context has to be dropped (it is
 * closed once it is leaked), and Pulsar visits the page again in another privacy context.
 */
interface PrivacyContext : AutoCloseable {

    // ---- Identity ----------------------------------------------------------------------------------

    /** The identifier of this context, expressed as a [PrivacyAgentId]. */
    val id: PrivacyAgentId

    /** The [PrivacyAgent] associated with this context. */
    val privacyAgent: PrivacyAgent

    // ---- State flags -------------------------------------------------------------------------------
    // NOTE(review): the exact conditions behind each flag are defined by the implementations, which are
    // not visible here; the names suggest their meaning but confirm against the implementing classes.

    val isGood: Boolean
    val isReady: Boolean
    val isActive: Boolean
    val isIdle: Boolean

    /** Presumably true once a privacy leak (see the interface documentation) was detected -- TODO confirm. */
    val isLeaked: Boolean
    val isRetired: Boolean
    val isClosed: Boolean
    val isFullCapacity: Boolean
    val isUnderLoaded: Boolean
    val isHighFailureRate: Boolean

    // ---- Metrics -----------------------------------------------------------------------------------

    /** NOTE(review): presumably a ratio of failed tasks; the value range is not visible here. */
    val failureRate: Float
    val idleTime: Duration
    val elapsedTime: Duration

    // ---- Human-readable descriptions ---------------------------------------------------------------

    val display: String
    val readableState: String

    fun takeSnapshot(): String

    fun buildReport(): String

    // ---- Web driver promises -----------------------------------------------------------------------

    fun promisedWebDriverCount(): Int

    fun hasWebDriverPromise(): Boolean

    // ---- Fetching ----------------------------------------------------------------------------------
    // Every entry point suspends and yields a FetchResult. Where a `fetchFun` is accepted, it is a
    // suspending callback that receives a FetchTask together with a WebDriver and produces the result.

    suspend fun open(url: String): FetchResult

    suspend fun open(url: String, options: LoadOptions): FetchResult

    suspend fun open(
        url: String,
        fetchFun: suspend (FetchTask, WebDriver) -> FetchResult
    ): FetchResult

    suspend fun run(
        task: FetchTask,
        fetchFun: suspend (FetchTask, WebDriver) -> FetchResult
    ): FetchResult

    // NOTE(review): how `run` and `doRun` relate is not visible in this file -- presumably `run` wraps
    // `doRun`; verify against the implementations.
    suspend fun doRun(
        task: FetchTask,
        fetchFun: suspend (FetchTask, WebDriver) -> FetchResult
    ): FetchResult

    // ---- Lifecycle ---------------------------------------------------------------------------------

    fun maintain()

    fun dismiss()

    companion object {
        /**
         * The name prefix shared by all temporary privacy contexts. The system context, the prototype
         * context and the default context are not required to start with this prefix.
         */
        const val CONTEXT_DIR_PREFIX = "cx."

        /**
         * The default context directory; use it when a permanent and isolated context is needed.
         *
         * NOTE: the user-default context is not a default context.
         */
        val DEFAULT_CONTEXT_DIR: Path = AppPaths.CONTEXT_DEFAULT_DIR

        /**
         * The prototype data directory; all privacy contexts copy their browser data from the prototype.
         *
         * A typical prototype data dir is: ~/.pulsar/browser/chrome/prototype/google-chrome/
         */
        val PROTOTYPE_DATA_DIR: Path = AppPaths.CHROME_DATA_DIR_PROTOTYPE

        /**
         * The prototype context directory. A context dir is the dir that contains the browser data dir,
         * and it supports different browsers.
         *
         * For example: ~/.pulsar/browser/chrome/prototype/
         */
        val PROTOTYPE_CONTEXT_DIR: Path = AppPaths.CHROME_DATA_DIR_PROTOTYPE.parent

        /**
         * The next sequential context directory, computed anew on every access by
         * `BrowserFiles.computeNextSequentialContextDir()`.
         */
        val NEXT_SEQUENTIAL_CONTEXT_DIR
            get() = BrowserFiles.computeNextSequentialContextDir()

        /**
         * A random temporary context directory, computed anew on every access by
         * `BrowserFiles.computeRandomTmpContextDir()`; use it when a random temporary context is needed.
         */
        val RANDOM_CONTEXT_DIR
            get() = BrowserFiles.computeRandomTmpContextDir()
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy