// ai.platon.pulsar.dom.FeaturedDocument.kt (Maven / Gradle / Ivy)
package ai.platon.pulsar.dom
import ai.platon.pulsar.common.AppFiles
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.config.AppConstants.INTERNAL_URL_PREFIX
import ai.platon.pulsar.common.math.vectors.isNotEmpty
import ai.platon.pulsar.common.urls.Hyperlink
import ai.platon.pulsar.dom.nodes.*
import ai.platon.pulsar.dom.nodes.node.ext.*
import ai.platon.pulsar.dom.select.*
import org.apache.commons.math3.linear.RealVector
import org.jsoup.nodes.*
import org.jsoup.select.NodeTraversor
import java.awt.Dimension
import java.nio.file.Path
import java.util.*
import java.util.concurrent.atomic.AtomicInteger
/**
* An HTML Document.
*
* A 'featured document' refers to a 'very important document'. In addition, numerical features are calculated for
* each node of the document; these features can be used to locate nodes or be consumed by machine learning algorithms.
*
* [FeaturedDocument] is a wrapper for [org.jsoup.nodes.Document], every node's numerical features are
* calculated by a [ai.platon.pulsar.dom.features.FeatureCalculator] which can be customized.
*
* [FeaturedDocument] provides a set of powerful methods to select elements, text contents, attributes
* and so on:
*
* * [select]: retrieves a list of elements matching the CSS query.
* * [selectFirstOrNull]: retrieves the first matching element.
* * [selectFirstTextOrNull]: retrieves the text content of the first matching element.
* * [selectFirstAttributeOrNull]: retrieves the value of the attribute with the given name from the first matching element.
* * [selectHyperlinks]: retrieves all hyperlinks of elements matching the CSS query.
* * [selectAnchors]: retrieves all anchor elements matching the CSS query.
* * [selectImages]: retrieves all image elements matching the CSS query.
*
* Other methods provided include DOM traversal, node counting, document attribute retrieval, export, and so on.
*
* @param document The underlying [org.jsoup.nodes.Document]
*
* @see org.jsoup.nodes.Document
* */
open class FeaturedDocument(val document: Document) {
companion object {
/** Shared counter; every constructed [FeaturedDocument] takes the next value as its sequence number. */
private val instanceSequencer: AtomicInteger = AtomicInteger()

// NOTE(review): the consumers and the unit of the tunables below are not visible from here —
// presumably pixel-based layout parameters; confirm against the code that reads them.
var SELECTOR_IN_BOX_DEVIATION: Int = 25

// about 1 em
var primaryGridDimension: Dimension = Dimension(30, 15)
var secondaryGridDimension: Dimension = Dimension(5, 5)
var densityUnitArea: Int = 400 * 400

/** The current value of the instance counter, i.e. how many [FeaturedDocument] instances have been created. */
val globalNumDocuments: Int
    get() = instanceSequencer.get()
/**
 * The NIL document: a [FeaturedDocument] wrapping [NILDocument].
 *
 * Constructing it goes through the regular constructor, so it must be declared after the
 * instance counter it increments.
 * */
val NIL = FeaturedDocument(NILDocument)
/**
 * The outer HTML of the NIL document, computed once when the companion object is initialized.
 * */
val NIL_DOC_HTML = NIL.unbox().outerHtml()
/**
 * The length, in characters, of [NIL_DOC_HTML].
 * */
val NIL_DOC_LENGTH = NIL_DOC_HTML.length
/**
 * Create a shell document: an empty HTML skeleton whose head declares the character set.
 *
 * The skeleton is built by [Document.createShell]; a `<meta charset="...">` element is then appended
 * to its head. Only the meta element is written, the output settings of the document are not changed.
 *
 * @param baseURI The base URI of the new document, passed to [Document.createShell]
 * @param charset The character set name written into the `<meta charset>` element, `UTF-8` by default
 * @return A new [FeaturedDocument] wrapping the shell document
 * */
fun createShell(baseURI: String, charset: String = "UTF-8"): FeaturedDocument {
    val document = Document.createShell(baseURI)
    // Build the element through the DOM API instead of splicing [charset] into raw HTML,
    // so the value is stored as an attribute and escaped on output.
    document.head().appendElement("meta").attr("charset", charset)
    return FeaturedDocument(document)
}
/**
 * Check whether the given document is the [NIL] document.
 *
 * The comparison is done with `==`, i.e. with the `equals` of [doc].
 *
 * @param doc The document to check
 * @return true if [doc] equals [NIL]
 * */
fun isNil(doc: FeaturedDocument): Boolean {
    return doc == NIL
}
/**
 * Check whether the given document is an internal one, i.e. whether its [location]
 * starts with [INTERNAL_URL_PREFIX].
 *
 * @param doc The document to check
 * @return true if the location of [doc] starts with [INTERNAL_URL_PREFIX]
 * */
fun isInternal(doc: FeaturedDocument): Boolean {
    val url = doc.location
    return url.startsWith(INTERNAL_URL_PREFIX)
}
}
/**
 * The sequence number of this instance, unique within the process.
 *
 * It is taken from the shared instance counter when the instance is constructed.
 * */
val sequence: Int = instanceSequencer.incrementAndGet()
/**
 * The normalized URI of the document, read from the underlying [document].
 *
 * It is also the key used to retrieve the document from the database, and it is always the same as
 * [ai.platon.pulsar.persist.WebPage].url.
 * */
val normalizedURI get() = document.normalizedURI
/**
 * The URL this document was parsed from, read from the underlying [document] via `location()`.
 *
 * If the starting URL is a redirect, this is the final URL from which the document was served.
 */
val location get() = document.location()
/**
 * The URL where the HTML was retrieved from, read from the underlying [document] via `baseUri()`.
 *
 * Used to resolve relative URLs to absolute URLs that occur before the HTML declares
 * a `<base href>` tag.
 *
 * @return base URI
 * @see org.jsoup.nodes.Node.absUrl
 */
val baseURI get() = document.baseUri()
/**
 * The title of the document, read from the underlying [document] via `title()`.
 * */
val title get() = document.title()
/**
 * The `head` element of the underlying document.
 *
 * Side effect: when the document does not have an HTML structure yet, it is created.
 * To avoid that, use `selectFirst("head")` instead.
 *
 * @return head element.
 */
val head: Element
    get() {
        return document.head()
    }
/**
 * The `body` element of the underlying document.
 *
 * Side effect: when the document does not have an HTML structure yet, it is created together
 * with a `body` element. To avoid that, use `selectFirst("body")` instead.
 *
 * @return the `body` element for documents with a `body`, a new `body` element if the document had
 * no contents, or the outermost `frameset` element for frameset documents (per the jsoup
 * documentation of `Document.body()`).
 */
val body: Element
    get() {
        return document.body()
    }
/**
* Gets the normalized, combined text of this element and all its children. Whitespace is normalized and
* trimmed.
* For example, given HTML `<p>Hello <b>there</b> now! </p>`, `p.text()` returns `"Hello there now!"`.
* If you do not want normalized text, use {@link #wholeText()}. If you want just the text of this document (and not
* children), use {@link #ownText()}
*
* Note that this method returns the textual content that would be presented to a reader. The contents of data
* nodes (such as {@code