All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.inject.SeedBuilder.kt Maven / Gradle / Ivy

package ai.platon.pulsar.skeleton.crawl.inject

import ai.platon.pulsar.common.DateTimes
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.Parameterized
import ai.platon.pulsar.common.config.Params
import ai.platon.pulsar.skeleton.common.options.deprecated.CrawlOptions
import ai.platon.pulsar.skeleton.crawl.scoring.ScoringFilters
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.metadata.Mark
import org.apache.commons.lang3.tuple.Pair
import org.slf4j.LoggerFactory
import java.time.Instant

/**
 * Created by vincent on 16-9-24.
 * Copyright @ 2013-2016 Platon AI. All rights reserved
 */
class SeedBuilder(
        private val scoreFilters: ScoringFilters,
        private val conf: ImmutableConfig
) : Parameterized {

    constructor(conf: ImmutableConfig): this(ScoringFilters(conf), conf)

    override fun getParams(): Params {
        return Params.of("injectTime", DateTimes.format(Instant.now()))
    }

    fun create(urlArgs: Pair): WebPage {
        return create(urlArgs.key, urlArgs.value)
    }

    /**
     * @param url  The seed url
     * A configured url is a string contains the url and arguments.
     * @param args The args
     * @return The created WebPage.
     * If the url is an invalid url or an internal url, return ai.platon.pulsar.persistWebPage.NIL
     */
    fun create(url: String, args: String): WebPage {
        if (url.isEmpty()) {
            return WebPage.NIL
        }
        val page = WebPage.newWebPage(url, conf.toVolatileConfig())
        return if (makeSeed(url, args, page)) page else WebPage.NIL
    }

    fun makeSeed(page: WebPage): Boolean {
        return makeSeed(page.url, page.args, page)
    }

    private fun makeSeed(url: String, args: String, page: WebPage): Boolean {
//        if (page.isSeed) {
//            return false
//        }

        if (page.isInternal) {
            return false
        }

        val options = CrawlOptions.parse(args, conf)
        val now = Instant.now()
        page.distance = 0
        if (page.createTime.isBefore(AppConstants.TCP_IP_STANDARDIZED_TIME)) {
            page.createTime = now
        }
        page.markSeed()
        page.score = options.score.toFloat()
        scoreFilters.injectedScore(page)
        page.fetchTime = now
        page.fetchInterval = options.fetchInterval
        page.fetchPriority = options.fetchPriority
        page.marks.put(Mark.INJECT, AppConstants.YES_STRING)
        return true
    }

    companion object {
        val LOG = LoggerFactory.getLogger(SeedBuilder::class.java)
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy