// ai.platon.pulsar.common.collect.UrlCache.kt (artifact page header: Maven / Gradle / Ivy)
package ai.platon.pulsar.common.collect
import ai.platon.pulsar.common.Priority13
import ai.platon.pulsar.common.collect.queue.*
import ai.platon.pulsar.common.urls.UrlAware
import com.google.common.cache.LoadingCache
import java.time.Instant
import java.util.*
import java.util.concurrent.ConcurrentLinkedQueue
/**
 * The url cache holds urls.
 *
 * A url cache contains several queues for different purposes: reentrant, non-reentrant and n-reentrant.
 * A reentrant queue accepts the same url multiple times, a non-reentrant queue accepts the same url only once,
 * and an n-reentrant queue accepts the same url for n times at most.
 *
 * The URL cache is expected to be very large and items may be loaded from external sources such as MongoDB.
 */
interface UrlCache {
    /**
     * The cache name.
     */
    val name: String

    /**
     * The priority of this cache.
     */
    val priority: Int

    /**
     * A non-reentrant queue accepts the same url only once.
     */
    val nonReentrantQueue: Queue<UrlAware>

    /**
     * An n-reentrant queue accepts the same url for n times at most.
     */
    val nReentrantQueue: Queue<UrlAware>

    /**
     * A reentrant queue accepts the same url multiple times.
     */
    val reentrantQueue: Queue<UrlAware>

    /**
     * All the queues of this cache, in the order: non-reentrant, n-reentrant, reentrant.
     *
     * The default implementation builds a new list on every access.
     */
    val queues: List<Queue<UrlAware>>
        get() = listOf(nonReentrantQueue, nReentrantQueue, reentrantQueue)

    /**
     * The total size of all the queues.
     */
    val size: Int get() = queues.sumOf { it.size }

    /**
     * The precise count of urls in the external source. Since the external source can be very large,
     * retrieving the precise size can be very slow for some external sources.
     *
     * The default implementation has no external source and returns 0.
     */
    val externalSize: Int get() = 0

    /**
     * The estimated, imprecise count of urls in the external source; it should be very fast.
     *
     * The default implementation has no external source and returns 0.
     */
    val estimatedExternalSize: Int get() = 0

    /**
     * The estimated, imprecise count of all urls, both in the local cache and in the external source.
     */
    val estimatedSize: Int get() = size + estimatedExternalSize

    /**
     * Remove dead urls.
     */
    fun removeDeceased()

    /**
     * Clear the local cache.
     */
    fun clear()

    /**
     * Clear both the local cache and the external source.
     *
     * The default implementation only delegates to [clear].
     */
    fun deepClear() = clear()
}
/**
 * Base implementation of [UrlCache] which provides the queue-wide maintenance operations
 * on top of the queues supplied by subclasses.
 *
 * @param name The cache name
 * @param priority The priority of the cache
 */
abstract class AbstractUrlCache(
    override val name: String,
    override val priority: Int
) : UrlCache {
    /**
     * Remove from every queue the urls whose deadline is earlier than the current instant.
     * The current instant is read once and shared by all queues.
     */
    override fun removeDeceased() {
        val now = Instant.now()
        for (queue in queues) {
            queue.removeIf { url -> url.deadline < now }
        }
    }

    /**
     * Clear every queue of this cache.
     */
    override fun clear() {
        for (queue in queues) {
            queue.clear()
        }
    }
}
/**
 * A [UrlCache] whose three queues are the concurrent queue implementations instantiated below.
 *
 * @param name The cache name, empty by default
 * @param priority The priority of the cache, [Priority13.NORMAL] by default
 */
open class ConcurrentUrlCache(
    name: String = "",
    priority: Int = Priority13.NORMAL.value
) : AbstractUrlCache(name, priority) {
    // The element type must be spelled out: the property type is inferred from the initializer,
    // not from the overridden declaration.
    override val nonReentrantQueue = ConcurrentNonReentrantQueue<UrlAware>()

    // Constructed with n = 3; per the UrlCache contract an n-reentrant queue accepts the same url n times at most.
    override val nReentrantQueue = ConcurrentNEntrantQueue(3)

    override val reentrantQueue = ConcurrentLinkedQueue<UrlAware>()
}
/**
 * Contains a set of loading queues which can load urls from an external source using [urlLoader].
 *
 * Each queue is created with its own [UrlTopic], built from the cache name, a queue group id,
 * the cache priority and [capacity].
 *
 * @param name The cache name
 * @param priority The priority of the cache
 */
class LoadingUrlCache(
    name: String,
    priority: Int,
    /**
     * A loader to load urls from external sources
     */
    val urlLoader: ExternalUrlLoader,
    /**
     * The capacity for each queue
     */
    val capacity: Int = LoadingQueue.DEFAULT_CAPACITY,
) : AbstractUrlCache(name, priority), Loadable<UrlAware> {
    companion object {
        /** The group id used for the topic of the non-reentrant queue. */
        const val G_NON_REENTRANT = 1

        /** The group id used for the topic of the n-reentrant queue. */
        const val G_N_ENTRANT = 2

        /** The group id used for the topic of the reentrant queue. */
        const val G_REENTRANT = 3
    }

    override val nonReentrantQueue = ConcurrentNonReentrantLoadingQueue(urlLoader, topic(G_NON_REENTRANT))

    // Constructed with n = 3; per the UrlCache contract an n-reentrant queue accepts the same url n times at most.
    override val nReentrantQueue = ConcurrentNEntrantLoadingQueue(urlLoader, topic(G_N_ENTRANT), 3)

    override val reentrantQueue = ConcurrentLoadingQueue(urlLoader, topic(G_REENTRANT))

    /**
     * All the queues of this cache, in the order: non-reentrant, n-reentrant, reentrant.
     */
    override val queues: List<Queue<UrlAware>>
        get() = listOf(nonReentrantQueue, nReentrantQueue, reentrantQueue)

    /**
     * The sum of the `externalSize` of every loading queue.
     */
    override val externalSize: Int
        get() = loadingQueues.sumOf { it.externalSize }

    /**
     * The sum of the `estimatedExternalSize` of every loading queue.
     */
    override val estimatedExternalSize: Int
        get() = loadingQueues.sumOf { it.estimatedExternalSize }

    /**
     * The queues of this cache which are [LoadingQueue]s; only those can talk to the external source.
     */
    private val loadingQueues: List<LoadingQueue<UrlAware>>
        get() = queues.filterIsInstance<LoadingQueue<UrlAware>>()

    /**
     * Ask every loading queue to load urls by calling its `load()`.
     */
    override fun load() {
        loadingQueues.forEach { it.load() }
    }

    /**
     * Call `loadNow()` on every loading queue.
     *
     * @return The urls returned by all the queues, concatenated in queue order
     */
    override fun loadNow(): Collection<UrlAware> {
        return loadingQueues.flatMap { it.loadNow() }
    }

    /**
     * Call `deepClear()` on every loading queue.
     */
    override fun deepClear() {
        loadingQueues.forEach { it.deepClear() }
    }

    /**
     * Create the topic of the queue which belongs to [group], see the `G_*` constants.
     */
    private fun topic(group: Int) = UrlTopic(name, group, priority, capacity)
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (artifact page footer, not part of the source)