// ai.platon.pulsar.dom.FeaturedDocument.kt (Maven / Gradle / Ivy)
//
// Go to download
package ai.platon.pulsar.dom

import ai.platon.pulsar.common.AppFiles
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.config.AppConstants.INTERNAL_URL_PREFIX
import ai.platon.pulsar.common.math.vectors.isNotEmpty
import ai.platon.pulsar.common.urls.Hyperlink
import ai.platon.pulsar.dom.nodes.*
import ai.platon.pulsar.dom.nodes.node.ext.*
import ai.platon.pulsar.dom.select.*
import org.apache.commons.math3.linear.RealVector
import org.jsoup.nodes.*
import org.jsoup.select.NodeTraversor
import java.awt.Dimension
import java.nio.file.Path
import java.util.*
import java.util.concurrent.atomic.AtomicInteger

/**
 * An HTML Document.
 *
 * A ‘featured document’ refers to a ‘very important document.’ Additionally, the numerical features of each node are
 * calculated, and these numerical features can be used to locate nodes or utilized by machine learning algorithms.
 *
 * [FeaturedDocument] is a wrapper for [org.jsoup.nodes.Document], every node's numerical features are
 * calculated by a [ai.platon.pulsar.dom.features.FeatureCalculator] which can be customized.
 *
 * [FeaturedDocument] provides a set of powerful methods to select elements, text contexts, attributes
 * and so on:
 *
 * * [select]: retrieves a list of elements matching the CSS query.
 * * [selectFirstOrNull]: retrieves the first matching element.
 * * [selectFirstTextOrNull]: retrieves the text content of the first matching element.
 * * [selectFirstAttributeOrNull]: retrieves the attribute value associated to the given name of the first matching element.
 * * [selectHyperlinks]: retrieves all hyperlinks of elements matching the CSS query.
 * * [selectAnchors]: retrieves all anchor elements matching the CSS query.
 * * [selectImages]: retrieves all image elements matching the CSS query.
 *
 * Other methods provided include DOM traversal, node counting, document attribute retrieval, export, and so on.
 *
 * @param document The underlying [org.jsoup.nodes.Document]
 *
 * @see org.jsoup.nodes.Document
 * */
open class FeaturedDocument(val document: Document) {
    companion object {
        /**
         * Process-wide counter, incremented once for every constructed [FeaturedDocument].
         * It backs both [sequence] and [globalNumDocuments].
         *
         * It must stay declared before [NIL]: constructing [NIL] already draws a sequence number from it.
         * */
        private val instanceSequencer = AtomicInteger()

        // NOTE(review): the four tuning values below are public and mutable. The code that reads them is
        // not visible in this part of the file, so the descriptions are assumptions to be confirmed.

        /** Presumably the tolerance applied when matching elements by bounding box — TODO confirm. */
        var SELECTOR_IN_BOX_DEVIATION = 25
        /** Presumably the cell size of the primary layout grid — TODO confirm. */
        var primaryGridDimension = Dimension(30, 15) // about 1 em
        /** Presumably the cell size of the finer, secondary layout grid — TODO confirm. */
        var secondaryGridDimension = Dimension(5, 5)
        /** Presumably the reference area used to normalize density features — TODO confirm. */
        var densityUnitArea = 400 * 400
        /**
         * The number of [FeaturedDocument] instances constructed in this process so far, [NIL] included.
         * */
        val globalNumDocuments get() = instanceSequencer.get()

        /**
         * The NIL document which is a wrapper for a nil [org.jsoup.nodes.Document]
         * */
        val NIL = FeaturedDocument(NILDocument)
        /**
         * The HTML content of a NIL document
         * */
        val NIL_DOC_HTML = NIL.unbox().outerHtml()
        /**
         * The length of a NIL document's HTML content
         * */
        val NIL_DOC_LENGTH = NIL_DOC_HTML.length

        /**
         * Create a shell document: an empty HTML skeleton whose head declares the given charset.
         *
         * @param baseURI The base URI of the new document
         * @param charset The charset name written into the `<meta charset>` declaration, "UTF-8" by default
         * @return A new [FeaturedDocument] wrapping the shell document
         * */
        fun createShell(baseURI: String, charset: String = "UTF-8"): FeaturedDocument {
            val document = Document.createShell(baseURI)
            // Declare the charset in the head; appending an empty string here would silently ignore [charset].
            document.head().append("<meta charset=\"$charset\">")
            return FeaturedDocument(document)
        }

        /**
         * Check if the given document is the [NIL] document.
         *
         * @param doc The document to check
         * @return true if [doc] equals [NIL]
         * */
        fun isNil(doc: FeaturedDocument) = doc == NIL

        /**
         * Check if the given document is internal, i.e. its location starts with [INTERNAL_URL_PREFIX].
         *
         * @param doc The document to check
         * @return true if the location of [doc] starts with [INTERNAL_URL_PREFIX]
         * */
        fun isInternal(doc: FeaturedDocument) = doc.location.startsWith(INTERNAL_URL_PREFIX)
    }
    
    /**
     * The sequence number of this document, unique within the process.
     *
     * It is drawn from the shared instance counter when the document is constructed.
     * */
    val sequence: Int = instanceSequencer.incrementAndGet()
    /**
     * The normalized URI of the underlying document.
     *
     * It is also the key used to retrieve the document from the database, and is always the same as
     * [ai.platon.pulsar.persist.WebPage].url.
     * */
    val normalizedURI
        get() = document.normalizedURI
    /**
     * The URL this document was parsed from.
     *
     * If the starting URL was redirected, this is the final URL the document was served from.
     */
    val location
        get() = document.location()
    /**
     * The base URI of the document: the URL where the HTML was retrieved from.
     *
     * It is used to resolve relative URLs to absolute URLs that occur before the HTML declares
     * a `<base href>` tag.
     *
     * @see Node.absUrl
     */
    val baseURI
        get() = document.baseUri()
    /**
     * The title of the underlying document.
     * */
    val title
        get() = document.title()
    /**
     * The `<head>` element of this document.
     *
     * As a side effect, if the document does not already have an HTML structure, it will be created.
     * If you do not want that, use `selectFirst("head")` instead.
     */
    val head: Element
        get() = document.head()
    
    /**
     * The `<body>` element of this document.
     *
     * As a side effect, if the document does not already have an HTML structure, it will be created
     * with a `<body>` element. If you do not want that, use `selectFirst("body")` instead.
     *
     * Per the jsoup documentation, the result is the `<body>` element for documents with a `<body>`,
     * a new `<body>` element if the document had no contents, or the outermost `<frameset>` element
     * for frameset documents.
     */
    val body: Element
        get() = document.body()
    
    /**
     * Gets the normalized, combined text of this element and all its children. Whitespace is normalized and
     * trimmed.
     * For example, given HTML {@code <p>Hello  <b>there</b> now! </p>}, {@code p.text()} returns {@code "Hello there
     * now!"}
     * If you do not want normalized text, use {@link #wholeText()}. If you want just the text of this document (and not
     * children), use {@link #ownText()}
     *
     * Note that this method returns the textual content that would be presented to a reader. The contents of data
     * nodes (such as {@code