All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.ql.h2.DomToH2Queries.kt Maven / Gradle / Ivy

The newest version!
package ai.platon.pulsar.ql.h2

import ai.platon.pulsar.common.getLogger
import ai.platon.pulsar.common.math.vectors.get
import ai.platon.pulsar.common.math.vectors.isEmpty
import ai.platon.pulsar.common.sleepSeconds
import ai.platon.pulsar.common.urls.UrlUtils
import ai.platon.pulsar.dom.FeaturedDocument
import ai.platon.pulsar.dom.features.FeatureRegistry.registeredFeatures
import ai.platon.pulsar.dom.features.NodeFeature.Companion.isFloating
import ai.platon.pulsar.dom.nodes.GeoAnchor
import ai.platon.pulsar.dom.select.appendSelectorIfMissing
import ai.platon.pulsar.dom.select.select
import ai.platon.pulsar.dom.select.selectFirstOrNull
import ai.platon.pulsar.persist.WebPage
import ai.platon.pulsar.persist.model.WebPageFormatter
import ai.platon.pulsar.ql.common.ResultSets
import ai.platon.pulsar.ql.common.types.ValueDom
import ai.platon.pulsar.skeleton.common.options.LoadOptions
import ai.platon.pulsar.skeleton.common.urls.NormURL
import ai.platon.pulsar.skeleton.crawl.common.url.CompletableListenableHyperlink
import ai.platon.pulsar.skeleton.session.PulsarSession
import org.apache.commons.math3.linear.RealVector
import org.h2.api.ErrorCode
import org.h2.message.DbException
import org.h2.tools.SimpleResultSet
import org.h2.value.DataType
import org.h2.value.Value
import org.h2.value.ValueArray
import org.h2.value.ValueString
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import java.sql.ResultSet
import java.util.*
import java.util.concurrent.CompletableFuture
import java.util.concurrent.TimeUnit
import kotlin.math.min
import kotlin.math.pow
import kotlin.math.roundToInt
import kotlin.reflect.full.memberProperties
import kotlin.reflect.full.primaryConstructor

object DomToH2Queries {
    private val logger = getLogger(this::class)

    /**
     * Load the Web page(s) referenced by an H2 value.
     *
     * @param session The session used to normalize and load the urls
     * @param urls The urls to load, either a single string represented by a [ValueString]
     * or an array of strings represented by a [ValueArray]
     * @return The loaded [WebPage]s; empty when [urls] is an empty array
     * @throws DbException if [urls] is neither a [ValueString] nor a [ValueArray]
     */
    fun loadAll(session: PulsarSession, urls: Value): Collection<WebPage> {
        return when (urls) {
            is ValueString -> listOf(session.load(session.normalize(urls.string)))
            is ValueArray -> {
                // Collecting into a set collapses duplicated urls before loading.
                val specs = urls.list.mapTo(mutableSetOf()) { it.string }
                if (specs.isEmpty()) emptyList<WebPage>() else session.loadAll(specs)
            }
            // Report the runtime type of the offending argument rather than the static Value class.
            else -> throw DbException.get(ErrorCode.METHOD_NOT_FOUND_1, "Unsupported type ${urls::class}")
        }
    }

    /**
     * Load the documents referenced by an H2 value and translate each of them using the given transformer.
     *
     * [restrictCss], [offset] and [limit] are handed to [transformer] unchanged for every document,
     * so they apply per document and not to the combined result.
     *
     * @param session        The session used to load the documents
     * @param configuredUrls The configured urls, either a single string represented by a [ValueString],
     *                       or an array of strings represented by a [ValueArray]
     * @param restrictCss    The css query passed to the transformer
     * @param offset         The offset passed to the transformer
     * @param limit          The limit passed to the transformer
     * @param transformer    The transformer used to translate a document into a collection of O
     * @return A collection of O, in the order the urls were given
     * @throws DbException if [configuredUrls] is neither a [ValueString] nor a [ValueArray]
     */
    fun <O> loadAll(
        session: PulsarSession,
        configuredUrls: Value, restrictCss: String, offset: Int, limit: Int,
        transformer: (Element, String, Int, Int) -> Collection<O>
    ): Collection<O> {
        return when (configuredUrls) {
            is ValueString -> {
                val doc = session.loadDocument(configuredUrls.string)
                transformer(doc.document, restrictCss, offset, limit)
            }
            is ValueArray -> configuredUrls.list.flatMap { configuredUrl ->
                val doc = session.loadDocument(configuredUrl.string)
                transformer(doc.document, restrictCss, offset, limit)
            }
            else -> throw DbException.get(ErrorCode.FUNCTION_NOT_FOUND_1, "Unknown custom type")
        }
    }

    /**
     * Load the pages linked from a portal page.
     *
     * The portal document is loaded, links are extracted from the elements matching [restrictCss],
     * links for which [UrlUtils.isInternal] is true are dropped, and the remaining distinct links are loaded.
     *
     * @param session     The session used to normalize urls and load pages
     * @param portalUrl   The url of the portal page
     * @param restrictCss The css query restricting where links are extracted from
     * @param offset      The offset passed to the link extractor
     * @param limit       The maximum number of links to load; further capped by the `topLinks` option of [portalUrl]
     * @param normalize   Whether to normalize every extracted link, dropping those that cannot be normalized
     * @param ignoreQuery Whether to extract links with [getLinksIgnoreQuery] instead of [getLinks]
     * @return The loaded out pages
     */
    fun loadOutPages(
        session: PulsarSession,
        portalUrl: String, restrictCss: String,
        offset: Int = 1, limit: Int = Int.MAX_VALUE,
        normalize: Boolean = true, ignoreQuery: Boolean = false
    ): Collection<WebPage> {
        val extractLinks = if (ignoreQuery) this::getLinksIgnoreQuery else this::getLinks

        val normURL = session.normalize(portalUrl)
        val limit2 = min(limit, normURL.options.topLinks)

        val document = session.loadDocument(normURL)
        // The limit is applied after filtering and de-duplication, so every candidate is extracted here.
        val candidates = extractLinks(document.document, restrictCss, offset, Int.MAX_VALUE)
            .filterNot { UrlUtils.isInternal(it) }
        val links = if (normalize) candidates.mapNotNull { session.normalizeOrNull(it)?.spec } else candidates

        val itemOptions = normURL.options.createItemOptions()
        val distinctLinks = session.normalize(links.toSet().take(limit2), itemOptions)

        return loadAll(session, distinctLinks)
    }

    /**
     * Load all pages specified by [normUrls] and block until every load has completed.
     *
     * Urls sharing the same `spec` are loaded once. No timeout is applied here; the wait ends only
     * when every future returned by `session.loadAllAsync` has completed.
     *
     * @param session  The session used to load the pages
     * @param normUrls The normalized urls to load
     * @return The loaded pages, excluding null results and pages for which `isNotInternal` is false
     */
    private fun loadAll(
        session: PulsarSession,
        normUrls: Iterable<NormURL>
    ): List<WebPage> {
        if (normUrls.none()) {
            return listOf()
        }

        val futures = session.loadAllAsync(normUrls.distinctBy { it.spec })

        logger.info("Waiting for {} completable hyperlinks | @{}", futures.size, futures.hashCode())

        CompletableFuture.allOf(*futures.toTypedArray()).join()

        val pages = futures.mapNotNull { it.get() }.filter { it.isNotInternal }

        logger.info("Finished {}/{} pages | @{}", pages.size, futures.size, futures.hashCode())

        return pages
    }

    /**
     * Load all pages specified by [normUrls] by submitting completable hyperlinks to the url pool
     * and polling until they are done or the polling window is exhausted.
     *
     * Every hyperlink is set to complete with [WebPage.NIL] once `options.pageLoadTimeout` plus one second
     * has elapsed. The polling loop runs at most 90 rounds, sleeping one second between rounds.
     * Hyperlinks that are still pending afterwards are left out of the result.
     *
     * @param session  The session whose global cache provides the url pool
     * @param normUrls The normalized urls to load
     * @param options  The load options; only `pageLoadTimeout` is read here
     * @return The pages of all finished hyperlinks, excluding null results and pages for which
     * `isNotInternal` is false
     */
    private fun loadAll2(session: PulsarSession, normUrls: Iterable<NormURL>, options: LoadOptions): Collection<WebPage> {
        val globalCache = session.globalCache
        val queue = globalCache.urlPool.higher3Cache.reentrantQueue
        val timeoutSeconds = options.pageLoadTimeout.seconds + 1
        val links = normUrls.map { normUrl ->
            // NOTE(review): the <WebPage> type argument is inferred from how the results are used below — confirm
            CompletableListenableHyperlink<WebPage>(normUrl.spec, args = normUrl.args, href = normUrl.hrefSpec).also {
                it.completeOnTimeout(WebPage.NIL, timeoutSeconds, TimeUnit.SECONDS)
            }
        }

        queue.addAll(links)
        logger.info("Waiting for {} completable hyperlinks, {}@{}, {}", links.size,
            globalCache.javaClass, globalCache.hashCode(), globalCache.urlPool.hashCode())

        var remainingRounds = 90
        val pendingLinks = links.toMutableList()
        while (remainingRounds-- > 0 && pendingLinks.isNotEmpty()) {
            val finishedCount = pendingLinks.count { it.isDone }
            if (finishedCount > 0) {
                logger.debug("Has finished {} links", finishedCount)
            }

            if (remainingRounds % 30 == 0) {
                logger.debug("Still {} pending links", pendingLinks.size)
            }

            pendingLinks.removeIf { it.isDone }
            // Nothing left to wait for: skip the sleep instead of idling for another second.
            if (pendingLinks.isNotEmpty()) {
                sleepSeconds(1)
            }
        }

        // Links that did not finish within the polling window are dropped rather than awaited.
        return links.filter { it.isDone }.mapNotNull { it.get() }.filter { it.isNotInternal }
    }

    /**
     * Select the elements matching [cssQuery] inside [dom] and return their transformed values as an array.
     *
     * TODO: any type support, only array of strings are supported now
     *
     * @param dom       The DOM to select from
     * @param cssQuery  The css query
     * @param transform Maps a matched element to a value; the value is stored as its `toString()` form
     * @return A [ValueArray] holding one [ValueString] per matched element
     */
    fun <O> select(dom: ValueDom, cssQuery: String, transform: (Element) -> O): ValueArray {
        val strings = dom.element
            .select(cssQuery) { element -> ValueString.get(transform(element).toString()) }
            .toTypedArray()
        return ValueArray.get(strings)
    }

    /**
     * Select the first element matching [cssQuery] inside [dom] and transform it.
     *
     * @param dom         The DOM to select from
     * @param cssQuery    The css query
     * @param transformer Maps the matched element to the result
     * @return Whatever the `selectFirstOrNull` element extension returns for [cssQuery] and [transformer],
     * which may be null
     */
    fun <O> selectFirstOrNull(dom: ValueDom, cssQuery: String, transformer: (Element) -> O): O? {
        // TODO(feature): mark the first element matching the query, e.g. with an "sf-match" attribute
        return dom.element.selectFirstOrNull(cssQuery, transformer)
    }

    /**
     * Select the n-th element matching [cssQuery] inside [dom] and transform it.
     *
     * @param dom       The DOM to select from
     * @param cssQuery  The css query
     * @param n         The position of the wanted element; it is passed as the offset to the underlying
     * selection with a limit of 1 (NOTE(review): presumably 1-based — confirm against `select`)
     * @param transform Maps the matched element to the result
     * @return The transformed element, or null if there is no such element
     */
    fun <O> selectNthOrNull(dom: ValueDom, cssQuery: String, n: Int, transform: (Element) -> O): O? {
        // TODO(feature): mark the n-th element matching the query, e.g. with an "sn-match" attribute
        return dom.element.select(cssQuery, n, 1).firstOrNull()?.let(transform)
    }

    /**
     * Get the text of every element selected by [restrictCss] inside [ele].
     *
     * @param ele         The element to select from
     * @param restrictCss The css query
     * @param offset      The offset passed to the selection
     * @param limit       The limit passed to the selection
     * @return The texts of the selected elements
     */
    fun getTexts(ele: Element, restrictCss: String, offset: Int, limit: Int): Collection<String> =
        ele.select(restrictCss, offset, limit) { element -> element.text() }

    /**
     * Get the absolute `href` of every element selected inside [ele].
     *
     * The query actually used is [restrictCss] passed through `appendSelectorIfMissing` with `"a"`.
     *
     * @param ele         The element to select from
     * @param restrictCss The css query restricting where links are taken from
     * @param offset      The offset passed to the selection
     * @param limit       The limit passed to the selection
     * @return The absolute urls of the selected elements
     */
    fun getLinks(ele: Element, restrictCss: String, offset: Int, limit: Int): Collection<String> {
        val anchorQuery = appendSelectorIfMissing(restrictCss, "a")
        return ele.select(anchorQuery, offset, limit) { anchor -> anchor.absUrl("href") }
    }

    /**
     * Get the absolute `href` of every element selected inside [ele], with the query string removed.
     *
     * The query actually used is [restrictCss] passed through `appendSelectorIfMissing` with `"a"`.
     * Urls rejected by [UrlUtils.isStandard] are dropped; for the others everything from the first `?` on is cut off.
     *
     * @param ele         The element to select from
     * @param restrictCss The css query restricting where links are taken from
     * @param offset      The offset passed to the selection
     * @param limit       The limit passed to the selection
     * @return The absolute urls of the selected elements, without query strings
     */
    fun getLinksIgnoreQuery(ele: Element, restrictCss: String, offset: Int, limit: Int): Collection<String> {
        val anchorQuery = appendSelectorIfMissing(restrictCss, "a")
        val urls = ele.select(anchorQuery, offset, limit) { anchor ->
            anchor.absUrl("href").takeIf { url -> UrlUtils.isStandard(url) }?.substringBefore("?")
        }
        return urls.filterNotNull()
    }

    /**
     * Get the feature vector of every element selected by [restrictCss] inside [ele].
     *
     * @param ele         The element to select from
     * @param restrictCss The css query
     * @param offset      The offset passed to the selection
     * @param limit       The limit passed to the selection
     * @return The `extension.features` vectors of the selected elements
     */
    fun getFeatures(ele: Element, restrictCss: String, offset: Int, limit: Int): Collection<RealVector> =
        ele.select(restrictCss, offset, limit) { element -> element.extension.features }

    /**
     * Convert the given elements into a [ValueArray], one entry per element, in the same order.
     *
     * @param elements The elements to convert
     * @return A [ValueArray] whose entries are produced by `ValueDom.getOrNil`
     */
    fun toValueArray(elements: Elements): ValueArray {
        val values = Array<Value>(elements.size) { i -> ValueDom.getOrNil(elements[i]) }
        return ValueArray.get(values)
    }

    /**
     * Get a result set with a single column named [colName] and one row per item of [collection].
     *
     * If [colName] is "DOM" (ignoring case), the column has the [ValueDom] type and the items are added as they are.
     * Otherwise the column is a string column and every item is added as a [ValueString] of its `toString()` form.
     *
     * @param colName    The name of the only column
     * @param collection The items to add, one row each
     * @return The result set
     */
    fun <E> toResultSet(colName: String, collection: Iterable<E>): ResultSet {
        val rs = ResultSets.newSimpleResultSet()
        val colType = if (colName.equals("DOM", ignoreCase = true)) ValueDom.type else Value.STRING
        rs.addColumn(colName, DataType.convertTypeToSQLType(colType), 0, 0)

        if (colType == ValueDom.type) {
            for (item in collection) {
                rs.addRow(item)
            }
        } else {
            for (item in collection) {
                rs.addRow(ValueString.get(item.toString()))
            }
        }

        return rs
    }

    /**
     * Get a result set with two columns, DOM and DOC, and one row per element.
     *
     * The DOM cell holds the element and the DOC cell holds the [ValueDom] of [document],
     * which is the same value in every row.
     *
     * @param document The document the elements belong to
     * @param elements The elements to add, one row each
     * @return The result set
     */
    fun toDOMResultSet(document: FeaturedDocument, elements: Collection<ValueDom>): ResultSet {
        // NOTE(review): the element type is inferred from the ValueDom column type — confirm against callers
        val rs = ResultSets.newSimpleResultSet()
        val sqlType = DataType.convertTypeToSQLType(ValueDom.type)
        rs.addColumn("DOM", sqlType, 0, 0)
        rs.addColumn("DOC", sqlType, 0, 0)

        val docDOM = ValueDom.get(document)
        for (element in elements) {
            rs.addRow(element, docDOM)
        }

        return rs
    }

    /**
     * Get a result set with one row per anchor.
     *
     * The columns are URL, TEXT, PATH, LEFT, TOP, WIDTH and HEIGHT, filled from the anchor's
     * properties of the same names.
     *
     * @param anchors The anchors to add, one row each
     * @return The result set
     */
    fun toResultSet(anchors: Collection<GeoAnchor>): ResultSet {
        val rs = SimpleResultSet()
        rs.addColumns("URL", "TEXT", "PATH", "LEFT", "TOP", "WIDTH", "HEIGHT")

        for (anchor in anchors) {
            rs.addRow(anchor.url, anchor.text, anchor.path, anchor.left, anchor.top, anchor.width, anchor.height)
        }

        return rs
    }

    /**
     * Get a result set with one row per object and one column per primary constructor parameter.
     *
     * The columns are derived from the class of the first object: each primary constructor parameter
     * becomes a column named after the parameter in upper case, or `C<position>` if the parameter has no name.
     * Each cell is the `toString()` form of the member property with the parameter's name; a parameter
     * without such a property yields the string "null".
     *
     * All objects are read through the properties of the first object's class.
     *
     * TODO: test is required
     *
     * @param objects The objects to add, one row each
     * @return The result set; it has no columns if [objects] is empty or the class has no primary constructor
     */
    fun toResultSet(objects: Iterable<Any>): ResultSet {
        val rs = SimpleResultSet()
        val first = objects.firstOrNull() ?: return rs
        val primaryConstructor = first::class.primaryConstructor ?: return rs

        val propertyNames = primaryConstructor.parameters.mapIndexed { i, parameter ->
            parameter.name ?: "C${1 + i}"
        }
        propertyNames.forEach { name ->
            rs.addColumn(name.uppercase(Locale.getDefault()))
        }

        // The order of memberProperties is unspecified, so look every property up by name and
        // emit the values in column order; this also keeps each row as wide as the column list.
        val propertiesByName = first::class.memberProperties.associateBy { it.name }
        val columnProperties = propertyNames.map { propertiesByName[it] }

        objects.forEach { obj ->
            val values = columnProperties
                .map { property -> property?.getter?.call(obj).toString() }
                .toTypedArray()
            rs.addRow(*values)
        }

        return rs
    }

    /**
     * Get a two-column (KEY, VALUE) result set with one row for each field of a Web page.
     *
     * The fields are taken from the map produced by [WebPageFormatter]; every value is stored
     * as its `toString()` form.
     *
     * @param page The Web page to format
     * @return The result set
     */
    fun toResultSet(page: WebPage): ResultSet {
        val rs = SimpleResultSet()
        rs.addColumns("KEY", "VALUE")

        for ((name, fieldValue) in WebPageFormatter(page).toMap()) {
            rs.addRow(name, fieldValue.toString())
        }

        return rs
    }

    /**
     * Get a row of data containing the DOM itself and all its feature values.
     *
     * Slot 0 holds the [ValueDom] of [ele] and slot `j` (1-based) holds registered feature `j - 1`.
     * Floating features are rounded to 2 fraction digits, all other features are converted to integers.
     * The row has one more slot than there are registered features; that last slot is never assigned here.
     * If the element has no features, only slot 0 is assigned.
     *
     * @param ele The element to read the features from
     * @return The row
     */
    fun getFeatureRow(ele: Element): Array<Any?> {
        val columnCount = 1 + registeredFeatures.size + 1
        val values = arrayOfNulls<Any>(columnCount)
        values[0] = ValueDom.get(ele)

        val features = ele.extension.features
        if (features.isEmpty) {
            return values
        }

        // TODO: configurable
        val base = 10f
        val fractionDigits = 2
        val factor = base.pow(fractionDigits)
        for (key in 0 until registeredFeatures.size) {
            val v = features[key]
            // Scale, round and scale back to keep a fixed number of fraction digits.
            values[key + 1] = if (isFloating(key)) 1.0 * (factor * v).roundToInt() / factor else v.toInt()
        }

        return values
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy