All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.skeleton.crawl.component.InjectComponent.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.skeleton.crawl.component

import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.config.Parameterized
import ai.platon.pulsar.common.config.Params
import ai.platon.pulsar.skeleton.common.urls.NormURL
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.common.warnForClose
import ai.platon.pulsar.skeleton.crawl.common.WeakPageIndexer
import ai.platon.pulsar.skeleton.crawl.inject.SeedBuilder
import ai.platon.pulsar.persist.WebDBException
import ai.platon.pulsar.persist.WebDb
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.metadata.Mark
import org.slf4j.LoggerFactory
import java.util.concurrent.atomic.AtomicBoolean

/**
 * Created by vincent on 17-5-14.
 * Copyright @ 2013-2023 Platon AI. All rights reserved
 */
class InjectComponent(
        private val seedBuilder: SeedBuilder,
        val webDb: WebDb,
        val conf: ImmutableConfig
) : Parameterized, AutoCloseable {
    private val seedIndexer: WeakPageIndexer = WeakPageIndexer(AppConstants.SEED_HOME_URL, webDb)
    private val closed = AtomicBoolean(false)

    constructor(webDb: WebDb, conf: ImmutableConfig): this(SeedBuilder(conf), webDb, conf)

    override fun getParams(): Params {
        return seedBuilder.params
    }

    @Throws(WebDBException::class)
    fun inject(urlArgs: Pair): WebPage {
        return inject(urlArgs.first, urlArgs.second)
    }

    @Throws(WebDBException::class)
    fun inject(normURL: NormURL): WebPage {
        return inject(normURL.spec, normURL.args)
    }

    @Throws(WebDBException::class)
    fun inject(url: String, args: String): WebPage {
        var page = webDb.get(url, false)

        if (page.isNil) {
            page = seedBuilder.create(url, args)
            if (page.isSeed) {
                webDb.put(page)
                seedIndexer.index(page.url)
            }
            return page
        }

        // already exist in db, update the status and mark it as a seed
        page.args = args
        return if (inject(page)) page else WebPage.NIL
    }

    @Throws(WebDBException::class)
    fun inject(page: WebPage): Boolean {
        val success = seedBuilder.makeSeed(page)
        if (success) {
            webDb.put(page)
            seedIndexer.index(page.url)
            return true
        }

        return false
    }

    @Throws(WebDBException::class)
    fun injectAll(vararg configuredUrls: String): List {
        return configuredUrls.map { inject(UrlUtils.splitUrlArgs(it)) }
    }

    @Throws(WebDBException::class)
    fun injectAll(pages: Collection): List {
        val injectedPages = pages.onEach { inject(it) }.filter { it.isSeed }
        seedIndexer.indexAll(injectedPages.map { it.url })
        LOG.info("Injected " + injectedPages.size + " seeds out of " + pages.size + " pages")
        return injectedPages
    }

    @Throws(WebDBException::class)
    fun unInject(url: String): WebPage {
        val page = webDb.get(url)
        if (page.isSeed) {
            unInject(page)
        }
        return page
    }

    @Throws(WebDBException::class)
    fun unInject(page: WebPage): WebPage {
        if (!page.isSeed) {
            return page
        }
        page.unmarkSeed()
        page.marks.remove(Mark.INJECT)
        seedIndexer.remove(page.url)
        webDb.put(page)
        return page
    }

    @Throws(WebDBException::class)
    fun unInjectAll(vararg urls: String): List {
        val pages = urls.mapNotNull { webDb.getOrNull(it) }.filter { it.isSeed }.onEach { unInject(it) }
        LOG.debug("UnInjected " + pages.size + " urls")
        seedIndexer.removeAll(pages.map { it.url })
        return pages
    }

    @Throws(WebDBException::class)
    fun unInjectAll(pages: Collection): Collection {
        pages.forEach { this.unInject(it) }
        return pages
    }

    fun report(): String {
        val seedHome = webDb.get(AppConstants.SEED_PAGE_1_URL)
        if (seedHome.isNil) {
            val count = seedHome.liveLinks.size
            return "Total " + count + " seeds in store " + webDb.schemaName
        }
        return "No home page"
    }

    @Throws(WebDBException::class)
    fun commit() {
        webDb.flush()
        seedIndexer.commit()
    }

    override fun close() {
        if (closed.compareAndSet(false, true)) {
            runCatching { commit() }.onFailure { warnForClose(this, it) }
        }
    }

    companion object {
        val LOG = LoggerFactory.getLogger(InjectComponent::class.java)
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy