ai.platon.pulsar.rest.api.common.ScrapeAPIUtils.kt
package ai.platon.pulsar.rest.api.common

import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.ql.h2.utils.ResultSetUtils
import org.nibor.autolink.LinkExtractor
import org.nibor.autolink.LinkType
import java.util.*

object ScrapeAPIUtils {

    // Load option names that clients may pass in a scrape request's argument string
    private val allowedArgs = LoadOptions.apiPublicOptionNames
    // Scrape UDF names, compared with underscores removed and lower-cased
    private val allowedScrapeUDFs = arrayOf("loadandselect", "loadoutpages")

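    /**
     * Normalize the X-SQL: extract the configured URL, split it into the plain URL
     * and its load arguments, and erase expire related options from the SQL.
     * */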
    @Throws(IllegalArgumentException::class)
    fun normalize(rawSql: String?): NormXSQL {
        if (rawSql == null) {
            throw IllegalArgumentException("SQL is required")
        }
        val configuredUrl = extractUrl(rawSql) ?: throw IllegalArgumentException("No url found in sql: >>>$rawSql<<<")

        val (url, args) = UrlUtils.splitUrlArgs(configuredUrl)
        val sql = eraseExpireOptions(rawSql)

        return NormXSQL(url, args, sql)
    }

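    /**
     * Ensure that every option in the argument string is an allowed, public load option.
     * */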
    @Throws(IllegalArgumentException::class)
    fun checkArgs(args: String?) {
        args?.split("\\s+".toRegex())?.filter { it.startsWith("-") }?.forEach { arg ->
            if (arg !in allowedArgs) {
                throw IllegalArgumentException("Argument is not allowed: <$arg>")
            }
        }
    }

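    /**
     * Check whether the SQL contains a scrape UDF such as load_and_select or load_out_pages.
     * */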
    fun isScrapeUDF(sql: String): Boolean {
        val s = sql.replace("_", "").lowercase(Locale.getDefault())
        return allowedScrapeUDFs.any { it in s }
    }

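    /**
     * Sanitize the SQL, rethrowing any sanitizer failure as an IllegalArgumentException.
     * */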
    @Throws(IllegalArgumentException::class)
    fun checkSql(sql: String): String {
        return try {
            APISQLUtils.sanitize(sql)
        } catch (e: Exception) {
            throw IllegalArgumentException(e.message)
        }
    }

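    /**
     * Replace the given load options in the SQL with the placeholder " -erased ".
     * */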
    fun eraseUrlOptions(sql: String, vararg fields: String): String {
        // note: the blanks around "|" are significant, they become part of the regex pattern
        val separator = " | "
        val optionNames = fields.flatMap { LoadOptions.getOptionNames(it) }.joinToString(separator)
        return sql.replace(optionNames.toRegex(), " -erased ")
    }

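    /**
     * Erase all expire related options (expires, expireAt, itemExpires, itemExpireAt) from the SQL.
     * */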
    fun eraseExpireOptions(sql: String): String {
        return eraseUrlOptions(sql, "expires", "expireAt", "itemExpires", "itemExpireAt")
    }

    /**
     * Extract the URL from the SQL; the URL may be a configured URL, i.e. carry load options.
     * */
    fun extractUrl(sql: String?): String? {
        if (sql == null) {
            return null
        }

        val sql0 = checkSql(sql).replace("\\s+".toRegex(), " ")
        return if (sql0.contains(" from ", ignoreCase = true)) {
            ResultSetUtils.extractUrlFromFromClause(sql0)
        } else {
            // TODO: this branch is deprecated
            val input = sql0
            val linkExtractor = LinkExtractor.builder()
                .linkTypes(EnumSet.of(LinkType.URL))
                .build()
            val links = linkExtractor.extractLinks(input).iterator()

            if (links.hasNext()) {
                val link = links.next()
                input.substring(link.beginIndex, link.endIndex)
            } else {
                null
            }
        }
    }

}
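A minimal usage sketch (not part of the library source), assuming NormXSQL in the same package exposes url and args properties, that load_and_select is available as an X-SQL UDF, and that APISQLUtils.sanitize accepts the query unchanged; the expected values in the comments are illustrative:

import ai.platon.pulsar.rest.api.common.ScrapeAPIUtils

fun main() {
    // An X-SQL query whose from-clause carries a configured URL: a URL followed by load options.
    val xsql = """
        select dom_first_text(dom, '#title') as title
        from load_and_select('https://www.example.com/item/123 -expires 1d', 'body')
    """.trimIndent()

    // "load_and_select" (underscores removed, lower-cased) matches an allowed scrape UDF.
    check(ScrapeAPIUtils.isScrapeUDF(xsql))

    // normalize() extracts the configured URL from the from-clause, splits off its
    // load arguments, and erases the expire options from the SQL text.
    val norm = ScrapeAPIUtils.normalize(xsql)
    println(norm.url)   // expected: https://www.example.com/item/123
    println(norm.args)  // expected: -expires 1d
    ScrapeAPIUtils.checkArgs(norm.args)  // throws if an option is not publicly allowed
}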