// All downloads are free. The search and download functionality uses the official Maven repository.

// ai.platon.pulsar.skeleton.crawl.common.WeakPageIndexer.kt (Maven / Gradle / Ivy)

package ai.platon.pulsar.skeleton.crawl.common

import ai.platon.pulsar.persist.HyperlinkPersistable
import ai.platon.pulsar.persist.WebDb
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.WebPageExt
import com.google.common.collect.Lists
import org.slf4j.LoggerFactory

/**
 * Created by vincent on 17-6-18.
 * Copyright @ 2013-2023 Platon AI. All rights reserved
 *
 * A weak, [WebPage]-based index of urls.
 *
 * The index is stored as internal pages in [webDb]: a home page at `homeUrl` whose vivid links reference
 * numbered index pages at `homeUrl/<pageNo>`, and each index page keeps the indexed urls as the keys of its
 * own vivid links. Every update is a plain read-modify-write cycle against [webDb], so the index does not
 * support ACID semantics.
 *
 * Methods that read-modify-write an index page are `@Synchronized` on this instance.
 *
 * NOTE(review): while `temporaryDeleteAllPage` is `true`, every lookup of an index page deletes the stored
 * page and starts from a freshly created one, so previously indexed urls are not seen by later calls.
 * Confirm whether this temporary switch is still intended.
 */
class WeakPageIndexer(homeUrl: CharSequence, private val webDb: WebDb) {
    private val LOG = LoggerFactory.getLogger(WeakPageIndexer::class.java)
    private val homeUrl = homeUrl.toString()

    // TODO: temporary use only
    // When true, getIndex() deletes the stored index page and recreates it on every call.
    private val temporaryDeleteAllPage = true

    /**
     * The home page of the index; it is created if it does not exist yet and (re)written to the database.
     */
    @get:Synchronized
    private val home: WebPage
        get() {
            var homePage = webDb.get(homeUrl)
            if (homePage.isNil) {
                homePage = WebPage.newInternalPage(homeUrl, "Web Page Index Home")
                LOG.debug("Creating weak index home: {}", homeUrl)
            }
            // The home page is written back on every access, not only when it was just created.
            webDb.put(homePage)
            // webDb.flush()
            return homePage
        }

    /**
     * Return the home page of the index, creating it if necessary.
     */
    fun home(): WebPage = home

    /**
     * Return index page [pageNo], creating it if necessary.
     */
    operator fun get(pageNo: Int): WebPage = getIndex(pageNo)

    /**
     * Add [url] to index page 1.
     */
    fun index(url: CharSequence) = indexAll(1, listOf(url))

    /**
     * Add all [urls] to index page 1.
     */
    fun indexAll(urls: Iterable<CharSequence>) = indexAll(1, urls)

    /**
     * Add all [urls] to index page [pageNo].
     */
    fun indexAll(pageNo: Int, urls: Iterable<CharSequence>) = updateAll(pageNo, urls, false)

    /**
     * Return the urls indexed in page [pageNo], i.e. the key set of the page's vivid links.
     * No copy is made here.
     */
    fun getAll(pageNo: Int) = get(pageNo).vividLinks.keys

    /**
     * Remove at most [n] urls from index page [pageNo] and return them.
     *
     * The page is written back and the database is flushed even when no url was taken.
     *
     * @param pageNo the number of the index page to take urls from
     * @param n the maximum number of urls to take; nothing is taken if it is not positive
     * @return the urls that were removed from the page, in the iteration order of the page's vivid links
     */
    @Synchronized
    fun takeN(pageNo: Int, n: Int): Set<CharSequence> {
        val page = get(pageNo)
        val taken = mutableSetOf<CharSequence>()
        // The element type is inferred from vividLinks so it is not restated (or misstated) here.
        val links = page.vividLinks.entries.iterator()

        var remaining = n
        while (remaining-- > 0 && links.hasNext()) {
            taken.add(links.next().key)
            // Removing through the iterator also removes the entry from the page's vivid links.
            links.remove()
        }

        if (taken.isNotEmpty() && LOG.isDebugEnabled) {
            LOG.debug("Taken {} urls from page {}", taken.size, page.url)
        }

        webDb.put(page)
        webDb.flush()
        return taken
    }

    /**
     * Remove all urls from index page [pageNo] and return them.
     */
    @Synchronized
    fun takeAll(pageNo: Int) = takeN(pageNo, Int.MAX_VALUE)

    /**
     * Remove [url] from index page 1.
     */
    fun remove(url: String) = remove(1, url)

    /**
     * Remove all [urls] from index page 1.
     */
    fun removeAll(urls: Iterable<CharSequence>) = removeAll(1, urls)

    /**
     * Remove [url] from index page [pageNo].
     */
    fun remove(pageNo: Int, url: CharSequence) = updateAll(pageNo, listOf(url), true)

    /**
     * Remove all [urls] from index page [pageNo].
     */
    fun removeAll(pageNo: Int, urls: Iterable<CharSequence>) = updateAll(pageNo, urls, true)

    /**
     * Flush the underlying database.
     */
    fun commit() = webDb.flush()

    /**
     * Add or remove the url of [newHyperlinks] to/from index page [pageNo].
     */
    private fun update(pageNo: Int, newHyperlinks: HyperlinkPersistable, remove: Boolean) {
        updateAll(pageNo, listOf<CharSequence>(newHyperlinks.url), remove)
    }

    /**
     * Add [urls] to, or remove them from, index page [pageNo], then write the page back without flushing.
     *
     * Does nothing, not even loading the index page, if [urls] is empty.
     *
     * @param pageNo the number of the index page to update
     * @param urls the urls to add or remove
     * @param remove `true` to remove the urls, `false` to add them
     */
    @Synchronized
    private fun updateAll(pageNo: Int, urls: Iterable<CharSequence>, remove: Boolean) {
        if (!urls.iterator().hasNext()) {
            return
        }

        val indexPage = getIndex(pageNo)
        val indexPageExt = WebPageExt(indexPage)

        val vividLinks = indexPage.vividLinks
        if (remove) {
            urls.forEach { vividLinks.remove(it) }
        } else {
            // Only the key matters; the value is left empty.
            urls.forEach { vividLinks[it] = "" }
        }

        // The summary is also stored as the text of the index page.
        val message = "Total ${vividLinks.size} indexed links"
        indexPageExt.setTextCascaded(message)
        if (LOG.isTraceEnabled) {
            LOG.trace("$message, indexed in ${indexPage.url}")
        }

        // webDb.put(indexPage.getUrl(), indexPage, true);
        webDb.put(indexPage)
        // webDb.flush()
    }

    /**
     * Return index page [pageNo] with the default title, creating it if necessary.
     */
    private fun getIndex(pageNo: Int) = getIndex(pageNo, "Web Page Index $pageNo")

    /**
     * Return the index page at `homeUrl/pageNo`.
     *
     * If the page does not exist (or was just deleted because `temporaryDeleteAllPage` is set), a new
     * internal page titled [pageTitle] is created, linked from the home page, and written to the database.
     */
    @Synchronized
    private fun getIndex(pageNo: Int, pageTitle: String): WebPage {
        val url = "$homeUrl/$pageNo"
        var indexPage = webDb.get(url)

        if (temporaryDeleteAllPage) {
            // Temporary behaviour: discard whatever is stored and start from an empty index page.
            webDb.delete(url)
            webDb.flush()
            indexPage = WebPage.NIL
        }

        if (indexPage.isNil) {
            // Register the new index page in the home page's vivid links.
            val homePage = home
            homePage.vividLinks[url] = ""
            webDb.put(homePage)

            indexPage = WebPage.newInternalPage(url, pageTitle)
            webDb.put(indexPage)
            // webDb.flush()
            // log.debug("Created weak index: " + url);
        }
        return indexPage
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy