// ai.platon.pulsar.persist.WebDb.kt Maven / Gradle / Ivy
package ai.platon.pulsar.persist
import ai.platon.pulsar.common.brief
import ai.platon.pulsar.common.config.AppConstants.UNICODE_LAST_CODE_POINT
import ai.platon.pulsar.common.config.ImmutableConfig
import ai.platon.pulsar.common.stringify
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.common.urls.UrlUtils.reverseUrlOrNull
import ai.platon.pulsar.persist.gora.db.DbIterator
import ai.platon.pulsar.persist.gora.db.DbQuery
import ai.platon.pulsar.persist.gora.generated.GWebPage
import org.apache.gora.filter.Filter
import org.apache.gora.filter.FilterOp
import org.apache.gora.filter.SingleFieldValueFilter
import org.slf4j.LoggerFactory
import java.nio.ByteBuffer
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.atomic.AtomicLong
/**
 * A simple interface to query and store web pages.
 */
class WebDb(
val conf: ImmutableConfig,
): AutoCloseable {
// Process-wide datastore statistics shared by every WebDb instance.
companion object {
// Counter used as the divisor of dbGetAveMillis.
val dbGetCount = AtomicLong()
// Accumulated time spent in datastore reads, in nanoseconds.
val accumulateGetNanos = AtomicLong()
// Failure counter consulted by performDSAction to choose between verbose and brief log output.
val dbContinousFailureCount = AtomicLong()
// Accumulated read time converted to milliseconds, divided by the read counter (clamped to at least 1).
val dbGetAveMillis get() = TimeUnit.MILLISECONDS.convert(
accumulateGetNanos.get(), TimeUnit.NANOSECONDS) / dbGetCount.get().coerceAtLeast(1)
// Counter used as the divisor of dbPutAveMillis.
val dbPutCount = AtomicLong()
// Accumulated time spent in datastore writes, in nanoseconds.
val accumulatePutNanos = AtomicLong()
// Accumulated write time converted to milliseconds, divided by the write counter (clamped to at least 1).
val dbPutAveMillis get() = TimeUnit.MILLISECONDS.convert(
accumulatePutNanos.get(), TimeUnit.NANOSECONDS) / dbPutCount.get().coerceAtLeast(1)
private val logger = LoggerFactory.getLogger(WebDb::class.java)
// Non-null only if trace logging was enabled when this property was initialized,
// so trace statements can be skipped cheaply with `tracer?.trace(...)`.
private val tracer = logger.takeIf { it.isTraceEnabled }
// Flipped once by close() so the datastore is closed at most one time.
private val closed = AtomicBoolean()
// Optional datastore supplied by the caller; if set before the first access of dataStore,
// it is used instead of the auto-detected one.
var specifiedDataStore: DataStore? = null
// Lazily resolves the datastore: the specified one, or one created by AutoDetectStorageProvider.
private val dataStoreDelegate = lazy { specifiedDataStore ?: AutoDetectStorageProvider(conf).createPageStore() }
// The datastore in use; the first access triggers the lazy resolution above.
val dataStore: DataStore by dataStoreDelegate
// The datastore if it has already been resolved, otherwise null; never triggers the resolution.
val dataStoreOrNull: DataStore? get() = if (dataStoreDelegate.isInitialized()) dataStore else null
// Schema name of the resolved datastore, or a placeholder text while it is not resolved yet.
val schemaName: String get() = dataStoreOrNull?.schemaName?:"(unknown, not initialized)"
/**
 * Test if the WebDB can be connected.
 *
 * The lazily created [dataStore] is resolved inside the guarded block, so a failure
 * to create the store yields `false` instead of escaping as an exception.
 * The result only reflects whether `schemaExists()` completed without throwing,
 * not the value it returned.
 *
 * @return true if the WebDB can be connected.
 */
fun canConnect(): Boolean = runCatching { dataStore.schemaExists() }.isSuccess
/**
 * Loads the page stored for the given url, requesting a single field.
 *
 * @param originalUrl the original url of the page, it comes from user input, webpage parsing, etc
 * @param field the field required in the WebPage
 * @return the WebPage corresponding to the url, or null if it cannot be found
 */
fun getOrNull(originalUrl: String, field: GWebPage.Field): WebPage? =
    getOrNull(originalUrl, field.toString())
/**
 * Returns the WebPage corresponding to the given url, restricted to the given fields.
 * The url is not normalized by this overload.
 *
 * @param originalUrl the original url of the page, it comes from user input, webpage parsing, etc
 * @param fields the fields required in the WebPage
 * @return the WebPage corresponding to the url, or null if it cannot be found
 */
fun getOrNull(originalUrl: String, fields: Iterable<GWebPage.Field>): WebPage? {
    // The datastore API works with field names, so convert the enum constants first.
    val fieldNames = fields.map { it.toString() }.toTypedArray()
    return getOrNull(originalUrl, false, fieldNames)
}
/**
 * Loads the page stored for the given url, requesting a single field by name.
 * The url is not normalized by this overload.
 *
 * @param originalUrl the original url of the page, it comes from user input, webpage parsing, etc
 * @param field the name of the field required in the WebPage
 * @return the WebPage corresponding to the url, or null if it cannot be found
 */
fun getOrNull(originalUrl: String, field: String): WebPage? {
    val singleField = arrayOf(field)
    return getOrNull(originalUrl, false, singleField)
}
/**
 * Returns the WebPage corresponding to the given url.
 *
 * A page that is found is wrapped into a [WebPage] and marked as loaded.
 *
 * @param originalUrl the original url of the page, it comes from user input, webpage parsing, etc
 * @param norm whether the url should be normalized before the key is derived
 * @param fields the fields required in the WebPage. Pass null to retrieve all fields
 * @return the WebPage corresponding to the key or null if it cannot be found
 */
fun getOrNull(originalUrl: String, norm: Boolean = false, fields: Array<String>? = null): WebPage? {
    // TODO: consider the design again whether we need normalize the url here
    val (url, key) = UrlUtils.normalizedUrlAndKey(originalUrl, norm)

    val page = getOrNull0(originalUrl, norm, fields) ?: return null

    val p = WebPage.box(url, key, page, conf.toVolatileConfig()).also { it.isLoaded = true }
    tracer?.trace("Got {} {} {} {}", p.fetchCount, p.prevFetchTime, p.fetchTime, key)
    return p
}
/**
 * Loads the page stored for the given url, requesting a single field.
 *
 * @param originalUrl the original address of the page
 * @param field the field required in the WebPage
 * @return the WebPage corresponding to the url or [WebPage.NIL] if it cannot be found
 */
fun get(originalUrl: String, field: GWebPage.Field): WebPage {
    val page = getOrNull(originalUrl, field)
    return page ?: WebPage.NIL
}
/**
 * Returns the WebPage corresponding to the given url, restricted to the given fields.
 *
 * @param originalUrl the original address of the page
 * @param fields the fields required in the WebPage
 * @return the WebPage corresponding to the url or [WebPage.NIL] if it cannot be found
 */
fun get(originalUrl: String, fields: Iterable<GWebPage.Field>): WebPage =
    getOrNull(originalUrl, fields) ?: WebPage.NIL
/**
 * Loads the page stored for the given url, requesting a single field by name.
 *
 * @param originalUrl the original address of the page
 * @param field the name of the field required in the WebPage
 * @return the WebPage corresponding to the url or [WebPage.NIL] if it cannot be found
 */
fun get(originalUrl: String, field: String): WebPage {
    val page = getOrNull(originalUrl, field)
    return page ?: WebPage.NIL
}
/**
 * Returns the WebPage corresponding to the given url.
 *
 * @param originalUrl the original address of the page
 * @param norm whether the url should be normalized before the key is derived
 * @param fields the fields required in the WebPage. Pass null to retrieve all fields
 * @return the WebPage corresponding to the url or [WebPage.NIL] if it cannot be found
 */
fun get(originalUrl: String, norm: Boolean = false, fields: Array<String>? = null): WebPage {
    return getOrNull(originalUrl, norm, fields) ?: WebPage.NIL
}
/**
 * Returns the raw, unwrapped datastore record for the given url.
 *
 * @param originalUrl the original address of the page
 * @param norm whether the url should be normalized before the key is derived
 * @param fields the fields required in the record. Pass null to retrieve all fields
 * @return the stored [GWebPage] or null if it cannot be found
 */
fun get0(originalUrl: String, norm: Boolean = false, fields: Array<String>? = null): GWebPage? {
    return getOrNull0(originalUrl, norm, fields)
}
/**
 * Checks whether a page is stored for the given url.
 * Only the CREATE_TIME field is requested to keep the lookup light.
 *
 * @param originalUrl the original address of the page
 * @param norm whether the url should be normalized before the key is derived
 * @return true if a page can be loaded for the url
 */
fun exists(originalUrl: String, norm: Boolean = false): Boolean {
    val probeFields = arrayOf(GWebPage.Field.CREATE_TIME.toString())
    val page = getOrNull(originalUrl, norm, probeFields)
    return page != null
}
/**
 * Loads only the CONTENT field of the page stored for the given url.
 * The url is not normalized.
 *
 * @param originalUrl the original address of the page
 * @return the stored content, or null if the page is not found or has no content
 */
fun getContent(originalUrl: String): ByteBuffer? {
    val contentField = GWebPage.Field.CONTENT.toString()
    val record = getOrNull0(originalUrl, false, arrayOf(contentField))
    return record?.content
}
/**
 * Loads the content of the page stored for the given url and decodes it as text.
 *
 * Exactly the readable window of the buffer (position until limit) is decoded.
 * The bytes are read through a duplicate, so the position of the stored buffer is
 * left untouched and buffers without an accessible backing array are supported too.
 *
 * @param originalUrl the original address of the page
 * @return the decoded content, an empty string if the content is empty,
 * or null if no content is available
 */
fun getContentAsString(originalUrl: String): String? {
    val buffer = getContent(originalUrl) ?: return null
    if (!buffer.hasRemaining()) {
        return ""
    }

    val bytes = ByteArray(buffer.remaining())
    buffer.duplicate().get(bytes)
    return String(bytes)
}
/**
 * Stores the page into the datastore.
 *
 * @param page the page to store
 * @param replaceIfExists whether an existing record with the same key is deleted before the page is written
 * @return the result of the internal put operation
 */
fun put(page: WebPage, replaceIfExists: Boolean = false): Boolean {
    return putInternal(page, replaceIfExists)
}
/**
 * Writes the page into the datastore under its reversed url.
 *
 * Notice:
 * There is a source comment in gora-hbase-0.6.1 (line 259; presumably in HBaseStore.java,
 * the file name is missing from the original note) saying:
 * "HBase sometimes does not delete arbitrarily"
 *
 * Every completed write increments [dbPutCount] and adds its duration to
 * [accumulatePutNanos], so that [dbPutAveMillis] is a real average.
 *
 * @param page the page to store
 * @param replaceIfExists whether an existing record with the same key is deleted first
 * @return false if the page is NIL or has an empty key, true once the page has been written
 */
private fun putInternal(page: WebPage, replaceIfExists: Boolean): Boolean {
    // Never update NIL page
    if (page.isNil) {
        return false
    }

    val key = page.reversedUrl
    if (key.isEmpty()) {
        return false
    }

    if (replaceIfExists) {
        performDSAction("put") { dataStore.delete(key) }
    }

    tracer?.trace("Putting {} {} {} {}", page.fetchCount, page.prevFetchTime, page.fetchTime, key)

    val startTime = System.nanoTime()
    performDSAction("put") { dataStore.put(key, page.unbox()) }
    dbPutCount.incrementAndGet()
    accumulatePutNanos.addAndGet(System.nanoTime() - startTime)

    return true
}
/**
 * Stores every given page; existing records are not deleted before writing.
 *
 * @param pages the pages to store
 */
fun putAll(pages: Iterable<WebPage>) {
    for (page in pages) {
        put(page, replaceIfExists = false)
    }
}
/**
 * Removes the record stored for the given url.
 *
 * @param originalUrl the original address of the page
 * @param norm whether the url should be normalized before the key is derived
 * @return false if no usable key can be derived from the url, otherwise the result reported by the datastore
 */
fun delete(originalUrl: String, norm: Boolean = false): Boolean {
    val (_, reversedKey) = UrlUtils.normalizedUrlAndKey(originalUrl, norm)

    return if (reversedKey.isBlank()) {
        false
    } else {
        performDSAction("delete", originalUrl) { dataStore.delete(reversedKey) }
    }
}
/**
 * Truncates the schema of the datastore.
 *
 * Without [force], only temporary schemas can be truncated, i.e. schemas whose name
 * starts with `tmp_` or ends with `_tmp_webpage`.
 *
 * @param force truncate the schema regardless of its name
 * @return true if the schema has been truncated, false if the request was refused
 */
fun truncate(force: Boolean = false): Boolean {
    val schemaName = dataStore.schemaName
    val isTemporary = schemaName.startsWith("tmp_") || schemaName.endsWith("_tmp_webpage")

    if (!force && !isTemporary) {
        logger.info("Only schema name starts with tmp_ or ends with _tmp_webpage " +
                "can be truncated using this API")
        return false
    }

    performDSAction("truncate") { dataStore.truncateSchema() }
    logger.info("Schema $schemaName is truncated")
    return true
}
/**
 * Scan all pages whose url starts with [urlBase].
 *
 * @param urlBase The base url to start with
 * @return The iterator to retrieve pages
 */
fun scan(urlBase: String): Iterator<WebPage> {
    val query = dataStore.newQuery().apply {
        // TODO: key range does not working in MongoStore
        setKeyRange(reverseUrlOrNull(urlBase), reverseUrlOrNull(urlBase + UNICODE_LAST_CODE_POINT))
    }
    return DbIterator(dataStore.execute(query), conf)
}
/**
 * Scan all pages whose url starts with [urlBase], requesting the given fields.
 *
 * @param urlBase The base url to start with
 * @param fields The fields required in the pages
 * @return The iterator to retrieve pages
 */
fun scan(urlBase: String, fields: Iterable<GWebPage.Field>): Iterator<WebPage> {
    // The query API works with field names, so convert the enum constants first.
    val fieldNames = fields.map { it.toString() }.toTypedArray()
    return scan(urlBase, fieldNames)
}
/**
 * Scan all pages whose url starts with [urlBase], loading only the given fields.
 *
 * @param urlBase The base url to start with
 * @param fields The names of the fields to load for every page
 * @return The iterator to retrieve pages
 */
fun scan(urlBase: String, fields: Array<String>): Iterator<WebPage> {
    val query = dataStore.newQuery()
    // TODO: key range does not working in MongoStore
    query.setKeyRange(reverseUrlOrNull(urlBase), reverseUrlOrNull(urlBase + UNICODE_LAST_CODE_POINT))
    // Restrict the result to the requested fields; previously the parameter was ignored.
    query.setFields(*fields)
    val result = dataStore.execute(query)
    return DbIterator(result, conf)
}
/**
 * Scan all pages whose url starts with [urlBase] and which pass the [filter],
 * loading only the given fields.
 *
 * @param urlBase The base url to start with
 * @param fields The names of the fields to load for every page
 * @param filter The filter applied by the datastore query
 * @return The iterator to retrieve pages
 */
fun scan(urlBase: String, fields: Array<String>, filter: Filter<String, GWebPage>): Iterator<WebPage> {
    val query = dataStore.newQuery()
    query.filter = filter
    // TODO: key range does not working in MongoStore
    query.setKeyRange(reverseUrlOrNull(urlBase), reverseUrlOrNull(urlBase + UNICODE_LAST_CODE_POINT))
    // Restrict the result to the requested fields; previously the parameter was ignored.
    query.setFields(*fields)
    val result = dataStore.execute(query)
    return DbIterator(result, conf)
}
/**
 * Scan all pages matching the [query].
 *
 * The key range is derived from the start and end urls of the query. A batch id
 * filter is installed when the query carries a batch id, or when it asks to filter
 * on a null batch id.
 *
 * @param query The query
 * @return The iterator to retrieve pages
 */
fun query(query: DbQuery): Iterator<WebPage> {
    val goraQuery = dataStore.newQuery()

    val lastCodePoint = UNICODE_LAST_CODE_POINT.toString()
    val startKey = query.startUrl?.let { reverseUrlOrNull(it) }
    // The placeholder is used to mark the last character, it's required for serialization,
    // especially for json format. Both the real character and its escaped text form are handled.
    val endKey = query.endUrl?.let { reverseUrlOrNull(it) }
        ?.replace("\\uFFFF".toRegex(), lastCodePoint)
        ?.replace("\\\\uFFFF".toRegex(), lastCodePoint)

    // TODO: key range does not working in MongoStore
    goraQuery.startKey = startKey
    goraQuery.endKey = endKey

    val batchId = query.batchId
    if (batchId != null || query.filterNullBatchId) {
        goraQuery.filter = createBatchIdFilter(batchId, query.filterIfMissing)
    }

    val result = performDSAction("query") { dataStore.execute(goraQuery) }
    return DbIterator(result, conf)
}
/**
 * Flushes pending writes of the datastore.
 *
 * Nothing is done while the datastore has not been created yet: flushing must not
 * be the reason the lazy datastore gets created, which matches the guard in [close].
 *
 * @throws WebDBException if the flush fails
 */
fun flush() {
    if (!dataStoreDelegate.isInitialized()) {
        return
    }

    try {
        performDSAction("flush") { dataStore.flush() }
    } catch (e: IllegalStateException) {
        // Tolerated on purpose, but leave a trace instead of swallowing it silently.
        logger.warn("Ignored IllegalStateException while flushing | {}", e.message)
    } catch (e: Exception) {
        // TODO: Embedded MongoDB fails to shutdown gracefully #5487
        throw WebDBException("Failed to flush", e)
    }
}
/**
 * Closes the datastore. Only the first call has an effect, and a datastore
 * that was never created is not created just to be closed.
 */
override fun close() {
    val isFirstClose = closed.compareAndSet(false, true)
    if (!isFirstClose || !dataStoreDelegate.isInitialized()) {
        return
    }

    // flush()
    // Note: mongo store does not close actually
    performDSAction("close") { dataStore.close() }
    // GoraStorage.close()
}
/**
 * Returns the raw datastore record corresponding to the given url.
 *
 * Exactly one datastore read is issued per call: a miss on a field-restricted read
 * is reported as null and is not retried as a full read. Every completed read
 * increments [dbGetCount] and adds its duration to [accumulateGetNanos], so that
 * [dbGetAveMillis] is a real average.
 *
 * @param originalUrl the original url of the page, it comes from user input, webpage parsing, etc
 * @param norm whether the url should be normalized before the key is derived
 * @param fields the fields required in the record. Pass null to retrieve all fields
 * @return the record corresponding to the key or null if it cannot be found
 */
private fun getOrNull0(originalUrl: String, norm: Boolean = false, fields: Array<String>? = null): GWebPage? {
    val (_, key) = UrlUtils.normalizedUrlAndKey(originalUrl, norm)

    tracer?.trace("Getting $key")

    val startTime = System.nanoTime()
    val page: GWebPage? = performDSAction("get", originalUrl) {
        if (fields != null) dataStore.get(key, fields) else dataStore.get(key)
    }
    dbGetCount.incrementAndGet()
    accumulateGetNanos.addAndGet(System.nanoTime() - startTime)

    return page
}
/**
 * Creates a filter that matches records whose BATCH_ID field equals the given batch id.
 *
 * @param batchId the batch id to compare with; null produces a single null operand
 * @param filterIfMissing passed to the filter's `isFilterIfMissing` flag
 * @return the configured filter
 */
private fun createBatchIdFilter(
    batchId: CharSequence?, filterIfMissing: Boolean = false
): SingleFieldValueFilter<String, GWebPage> {
    val filter = SingleFieldValueFilter<String, GWebPage>()
    filter.fieldName = GWebPage.Field.BATCH_ID.toString()
    filter.filterOp = FilterOp.EQUALS
    // Always exactly one operand: the batch id itself, or null when none is given.
    filter.operands = if (batchId != null) listOf(batchId) else listOf(null)
    filter.isFilterIfMissing = filterIfMissing
    return filter
}
/**
 * Converts a set of field names into the array form used by the datastore.
 *
 * @param fields the requested field names
 * @return all fields of [GWebPage] if the set is empty, otherwise the given names as an array
 */
private fun prepareFields(fields: MutableSet<String>): Array<String> =
    if (fields.isEmpty()) GWebPage._ALL_FIELDS else fields.toTypedArray()
/**
 * Runs a datastore action and converts any failure into a [WebDBException].
 *
 * [dbContinousFailureCount] is incremented on every failure and decremented, never
 * below zero, on every success. The first few failures are logged with the full
 * exception text; once the counter reaches 5, only a brief form is logged to avoid
 * flooding the log.
 *
 * @param name the name of the action, used in the failure message
 * @param url the url the action is about, appended to the failure message if present
 * @param action the datastore action to run
 * @return the result of the action
 * @throws WebDBException if the action throws any exception
 */
private fun <T> performDSAction(name: String, url: String? = null, action: () -> T): T {
    // if (!AppContext.isActive) {
    // throw IllegalApplicationContextStateException("")
    // }

    try {
        return action().also {
            dbContinousFailureCount.updateAndGet { count -> (count - 1).coerceAtLeast(0L) }
        }
    } catch (e: Exception) {
        val failureCount = dbContinousFailureCount.incrementAndGet()

        var message = "Data storage failure | [$name]"
        // Append the url only when there actually is one.
        if (!url.isNullOrBlank()) {
            message = "$message | $url"
        }

        if (failureCount < 5) {
            logger.warn(e.stringify("$message - "))
        } else {
            logger.warn(e.brief("$message - "))
        }

        throw WebDBException(message, e)
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy