All downloads are free. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.skeleton.common.options.deprecated.CrawlOptions.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.skeleton.common.options.deprecated

import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.CapabilityTypes
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.Params
import ai.platon.pulsar.skeleton.common.options.CommonOptions
import ai.platon.pulsar.skeleton.common.options.DurationConverter
import ai.platon.pulsar.skeleton.common.options.LinkOptions
import ai.platon.pulsar.skeleton.common.options.WeightedKeywordsConverter
import com.beust.jcommander.Parameter
import org.apache.commons.lang3.StringUtils
import java.text.DecimalFormat
import java.time.Duration
import java.time.ZoneId
import java.util.*

/**
 * Command-line style options describing a crawl task.
 *
 * Every `@Parameter`-annotated property is bound to the option names listed in its annotation.
 * A [LinkOptions] instance is registered together with this object through `addObjects`, and its
 * params are merged into the result of [getParams].
 *
 * Created by vincent on 17-3-18.
 * Copyright @ 2013-2023 Platon AI. All rights reserved
 *
 * @param argv the argument vector handed to [CommonOptions]
 */
class CrawlOptions(argv: Array<String>) : CommonOptions(argv) {

    /** Log level for this crawl task. */
    @Parameter(names = ["-verbose"], description = "Log level for this crawl task")
    var verbose = 0

    /** Fetch interval; defaults to one hour. */
    @Parameter(names = ["-i", "--fetch-interval"], converter = DurationConverter::class, description = "Fetch interval")
    var fetchInterval = Duration.ofHours(1)

    /** Fetch priority; defaults to [AppConstants.FETCH_PRIORITY_DEFAULT]. */
    @Parameter(names = ["-p", "--fetch-priority"], description = "Fetch priority")
    var fetchPriority = AppConstants.FETCH_PRIORITY_DEFAULT

    /** Injected score. */
    @Parameter(names = ["-s", "--score"], description = "Injected score")
    var score = 0

    /** Maximum crawl depth. */
    @Parameter(names = ["-d", "--depth"], description = "Max crawl depth. Do not crawl anything deeper")
    var depth = 1

    /** Zone id of the crawled website; defaults to the id of the system default zone. */
    @Parameter(names = ["-z", "--zone-id"], description = "The zone id of the website we crawl")
    var zoneId = ZoneId.systemDefault().id

    /** Keywords mapped to their weight, produced by [WeightedKeywordsConverter]. */
    @Parameter(names = ["-w", "--keywords"], converter = WeightedKeywordsConverter::class, description = "Keywords with weight, ")
    var keywords: Map<String, Double> = HashMap()

    /** Indexer url; empty by default. */
    @Parameter(names = ["-idx", "--indexer-url"], description = "Indexer url")
    var indexerUrl: String = ""

    /** Link related options; replaced by a config-backed instance when a config is supplied. */
    var linkOptions = LinkOptions()
        private set

    init {
        addObjects(this, linkOptions)
    }

    /** Creates options from an empty argument line. */
    constructor() : this("")

    /**
     * Creates options from a single argument line.
     *
     * Every `=` in [args] is replaced by a space before the line is split, so `-d=3` and `-d 3`
     * yield the same tokens.
     */
    constructor(args: String) : this(split(args.replace("=", " ")))

    /**
     * Creates options from an argument line and then applies defaults read from [conf].
     */
    constructor(args: String, conf: ImmutableConfig) : this(args) {
        this.init(conf)
    }

    /**
     * Creates options from an argument vector and then applies defaults read from [conf].
     */
    constructor(argv: Array<String>, conf: ImmutableConfig) : this(argv) {
        this.init(conf)
    }

    /**
     * Overwrites the fetch interval, score and depth with the values found in [conf], falling back
     * to the current values, and replaces [linkOptions] with a config-backed instance.
     */
    private fun init(conf: ImmutableConfig) {
        this.fetchInterval = conf.getDuration(CapabilityTypes.FETCH_INTERVAL, fetchInterval)
        this.score = conf.getInt(CapabilityTypes.INJECT_SCORE, score)
        this.depth = conf.getUint(CapabilityTypes.CRAWL_MAX_DISTANCE, depth)
        // NOTE(review): the LinkOptions instance passed to addObjects() in the init block is not this
        // new one; presumably link arguments parsed later land in the old instance - confirm against
        // CommonOptions.
        this.linkOptions = LinkOptions("", conf)
    }

    /**
     * Renders [keywords] as `keyword^weight` items joined by `", "`.
     * Weights are formatted with the pattern `##.#`.
     */
    private fun formatKeywords(): String {
        val weightFormat = DecimalFormat("##.#")
        return keywords.entries.joinToString { "${it.key}^${weightFormat.format(it.value)}" }
    }

    /**
     * Collects the options of this object, drops entries whose value renders as an empty string,
     * and merges in the params of [linkOptions].
     */
    override fun getParams(): Params {
        return Params.of(
                // The key matches the option name declared on [verbose].
                "-verbose", verbose,
                "-i", fetchInterval,
                "-p", fetchPriority,
                "-s", score,
                "-d", depth,
                "-z", zoneId,
                "-w", formatKeywords(),
                "-idx", indexerUrl
        )
                .filter { p -> StringUtils.isNotEmpty(p.value.toString()) }
                .merge(linkOptions.params)
    }

    /**
     * Formats the params as a single line with a space between key and value, collapsing every
     * run of whitespace into one space.
     */
    override fun toString(): String {
        return params.withKVDelimiter(" ").formatAsLine().replace("\\s+".toRegex(), " ")
    }

    companion object {

        /** Options built from an empty argument line. */
        @JvmField
        val DEFAULT = CrawlOptions()

        /**
         * Builds options from [args] and [conf].
         *
         * A blank [args] yields options built from an empty argument vector without invoking the
         * parser; otherwise the options are constructed from the line and parsed.
         */
        fun parse(args: String, conf: ImmutableConfig): CrawlOptions {
            if (args.isBlank()) {
                return CrawlOptions(arrayOf(), conf)
            }

            return CrawlOptions(args, conf).apply { parse() }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy