// ai.platon.pulsar.dom.nodes.node.ext.NodeExt.kt (Maven / Gradle / Ivy)
package ai.platon.pulsar.dom.nodes.node.ext
import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.SParser
import ai.platon.pulsar.common.Strings
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.AppConstants.*
import ai.platon.pulsar.common.math.geometric.str
import ai.platon.pulsar.common.math.geometric.str2
import ai.platon.pulsar.common.math.vectors.get
import ai.platon.pulsar.common.math.vectors.set
import ai.platon.pulsar.dom.features.FeatureEntry
import ai.platon.pulsar.dom.features.FeatureFormatter
import ai.platon.pulsar.dom.features.NodeFeature
import ai.platon.pulsar.dom.features.defined.*
import ai.platon.pulsar.dom.model.createLink
import ai.platon.pulsar.dom.nodes.*
import ai.platon.pulsar.dom.select.selectFirstOrNull
import org.apache.commons.lang3.StringUtils
import org.apache.commons.math3.linear.ArrayRealVector
import org.jsoup.nodes.*
import org.jsoup.select.NodeTraversor
import java.awt.Dimension
import java.awt.Point
import java.awt.Rectangle
import java.nio.file.Path
import java.util.*
import java.util.concurrent.ConcurrentSkipListSet
import java.util.concurrent.atomic.AtomicBoolean
import kotlin.reflect.KProperty
/**
 * Property delegate that maps a `Double` property of a [Node] onto one slot of the node's feature vector.
 *
 * @property name the index of the slot inside `extension.features` that this delegate reads and writes
 */
class DoubleFeature(val name: Int) {
    /** Reads the slot addressed by [name] from the feature vector of [thisRef]. */
    operator fun getValue(thisRef: Node, property: KProperty<*>): Double {
        val features = thisRef.extension.features
        return features[name]
    }

    /** Stores [value] into the slot addressed by [name] of the feature vector of [thisRef]. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: Double) {
        val features = thisRef.extension.features
        features[name] = value
    }
}
/**
 * Property delegate that maps an `Int` property of a [Node] onto one slot of the node's feature vector.
 *
 * The vector stores doubles, so values are converted with `toDouble()` on write and `toInt()` on read.
 *
 * @property name the index of the slot inside `extension.features` that this delegate reads and writes
 */
class IntFeature(val name: Int) {
    /** Reads the slot addressed by [name] and converts it to an `Int`. */
    operator fun getValue(thisRef: Node, property: KProperty<*>): Int {
        val features = thisRef.extension.features
        return features[name].toInt()
    }

    /** Converts [value] to a `Double` and stores it into the slot addressed by [name]. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: Int) {
        val features = thisRef.extension.features
        features[name] = value.toDouble()
    }
}
/**
 * Property delegate that keeps its value in the node's variable map, keyed by the property name.
 *
 * @param T the type of the stored value
 * @property initializer computes the value for a node when nothing is stored for the property yet
 */
class MapField<T>(val initializer: (Node) -> T) {
    /**
     * Returns the value stored under the property name. When nothing is stored, the value is computed
     * with [initializer], stored, and returned.
     */
    @Suppress("UNCHECKED_CAST")
    operator fun getValue(thisRef: Node, property: KProperty<*>): T {
        // T is erased at runtime, so the safe cast effectively only detects a missing (null) entry.
        val stored = thisRef.extension.variables[property.name] as? T
        return stored ?: setValue(thisRef, property, initializer(thisRef))
    }

    /** Stores [value] under the property name and returns it, so the call can be used as an expression. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: T): T {
        thisRef.extension.variables[property.name] = value
        return value
    }
}
/**
 * Property delegate that keeps a nullable value in the node's variable map, keyed by the property name.
 * Unlike [MapField] there is no initializer: reading a property that was never written yields `null`.
 *
 * @param T the type of the stored value
 */
class NullableMapField<T> {
    /** Returns whatever is stored under the property name, or `null` when nothing is stored. */
    @Suppress("UNCHECKED_CAST")
    operator fun getValue(thisRef: Node, property: KProperty<*>): T? {
        // Unchecked: T is erased at runtime, the entry is trusted to have been written through this delegate.
        return thisRef.extension.variables[property.name] as T?
    }

    /** Stores [value] (which may be `null`) under the property name. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: T?) {
        thisRef.extension.variables[property.name] = value
    }
}
/**
 * Creates a [MapField] delegate whose value is computed by [initializer] on first read.
 *
 * @param T the type of the delegated property
 * @param initializer computes the initial value from the node that owns the property
 */
fun <T> field(initializer: (Node) -> T): MapField<T> = MapField(initializer)

/**
 * Creates a [NullableMapField] delegate; the delegated property reads as `null` until it is assigned.
 *
 * @param T the type of the delegated property
 */
inline fun <reified T> nullableField(): NullableMapField<T> = NullableMapField()
/**
 * Resolves the paths under [BASE_DIR] to which the different views of a document are exported.
 *
 * Every view type gets its own sub-directory of [BASE_DIR]; inside it the file is named [filename].
 *
 * @property uri the URI of the document; the export file name is derived from it
 */
class ExportPaths(val uri: String) {
    // Cache of already resolved paths, keyed by view type.
    private val namedPath = mutableMapOf<String, Path>()

    /** The export file name, derived from [uri] with the `.htm` suffix. */
    val filename by lazy { AppPaths.fromUri(uri, "", ".htm") }

    /** Export path of the "portal" view. */
    val portal get() = byType("portal")

    /** Export path of the "annotated" view. */
    val annotatedView get() = byType("annotated")

    /** Export path of the "tile" view. */
    val tileView get() = byType("tile")

    /** Export path of the "entity" view. */
    val entityView get() = byType("entity")

    /** Returns the export path for the view [type], resolving and caching it on first use. */
    fun byType(type: String) = namedPath.computeIfAbsent(type) { get(type, filename) }

    companion object {
        /** The directory all export paths are resolved against. */
        val BASE_DIR = AppPaths.DOC_EXPORT_DIR

        /** Resolves [first] against [BASE_DIR]. */
        fun get(first: String) = BASE_DIR.resolve(first)

        /** Resolves [first] against [BASE_DIR], then [second] against the result. */
        fun get(first: String, second: String) = get(first).resolve(second)
    }
}
/** The location used for the nil document. */
const val NILLocation = NIL_PAGE_URL

/** The base URI of the nil document; identical to [NILLocation]. */
const val NILBaseUri = NILLocation

/** A shared shell document created for [NILBaseUri]; it is the "no document" placeholder. */
val NILDocument: Document = Document.createShell(NILBaseUri)

/** The body of [NILDocument]; it is the "no element" placeholder. */
val NILElement: Element = NILDocument.body()

/** The same object as [NILElement], typed as a [Node]; it is the "no node" placeholder. */
val NILNode: Node = NILElement

@Deprecated("Inappropriate name", ReplaceWith("NILDocument"))
val nilDocument: Document = NILDocument

@Deprecated("Inappropriate name", ReplaceWith("NILElement"))
val nilElement: Element = NILElement

@Deprecated("Inappropriate name", ReplaceWith("NILNode"))
val nilNode: Node = NILNode

/** Whether this document is the shared [NILDocument] instance (identity comparison). */
val Document.isNil: Boolean get() = this === NILDocument
/**
 * The element holding the meta information generated by PulsarRPA,
 * looked up by the id [PULSAR_META_INFORMATION_ID].
 */
val Document.pulsarMetaElement
    get() = this.getElementById(PULSAR_META_INFORMATION_ID)

/**
 * The element holding the script section generated by PulsarRPA to carry data,
 * looked up by the id [PULSAR_SCRIPT_SECTION_ID].
 */
val Document.pulsarScriptElement
    get() = this.getElementById(PULSAR_SCRIPT_SECTION_ID)

/**
 * The text of [pulsarScriptElement], or `null` when the document has no such element.
 */
val Document.pulsarScript
    get() = this.pulsarScriptElement?.text()
/**
 * The normalized URI of the document. It is also the key to retrieve the document from the database
 * and is always the same as [ai.platon.pulsar.persist.WebPage].url.
 *
 * For the nil document this is [NILBaseUri]. Otherwise the value is read from the
 * `link[rel=PULSAR_DOCUMENT_NORMALIZED_URI]` element in the head (added by JsoupParser), falling back
 * to the attribute of the same name on [pulsarMetaElement]. `null` when neither is present.
 */
val Document.normalizedURI: String?
    get() = if (isNil) {
        NILBaseUri
    } else {
        // The link element is added by JsoupParser.
        val linkHref = head().selectFirstOrNull("link[rel=$PULSAR_DOCUMENT_NORMALIZED_URI]")?.attr("href")
        linkHref ?: pulsarMetaElement?.attr(PULSAR_DOCUMENT_NORMALIZED_URI)
    }
/**
 * Flag recording whether the document is initialized. The flag object is created on first access
 * and kept in the document's variable map.
 */
var Document.isInitialized by field { AtomicBoolean() }

// NOTE(review): the element type was not recoverable from this file; Long is assumed because
// Java thread ids are longs. Confirm against the callers.
internal val Document.threadIds by field { ConcurrentSkipListSet<Long>() }
/**
 * The view port of the document, calculated on first access and then kept in the variable map.
 */
val Document.viewPort: Dimension by field { node -> node.calculateViewPort() }

// geometric grid, we have two grids, a bigger one and a smaller one

/**
 * The primary grid of the document; starts as a 0x0 dimension.
 */
var Document.primaryGrid: Dimension by field { Dimension(0, 0) }

/**
 * The secondary grid of the document; starts as a 0x0 dimension.
 */
var Document.secondaryGrid: Dimension by field { Dimension(0, 0) }

/**
 * The grid of the document; starts as a 0x0 dimension.
 */
var Document.grid: Dimension by field { Dimension(0, 0) }

/**
 * The unit area of the document; starts as 0.
 */
var Document.unitArea: Int by field { 0 }

/**
 * The paths used to export different views of the document, derived from the document's base URI.
 */
var Document.exportPaths: ExportPaths by field { node -> ExportPaths(node.baseUri()) }

/**
 * Whether the document is annotated; starts as `false`.
 */
var Document.annotated: Boolean by field { false }
/**
 * Whether this element is the shared [NILElement] instance (identity comparison).
 * TODO: check if this override Node.isNil or not?
 */
val Element.isNil: Boolean
    get() = this === NILElement
/**
 * Adds classes to the element. Each argument may hold several whitespace-separated class names;
 * every name is added on its own. Empty tokens (for example from leading whitespace or an empty
 * argument) are skipped.
 *
 * @param classNames class names, or whitespace-separated lists of class names
 * @return this element
 */
fun Element.addClasses(vararg classNames: String): Element {
    // Compile the pattern once per call instead of once per argument.
    val whitespace = "\\s+".toRegex()
    for (names in classNames) {
        names.split(whitespace)
            .filter { token -> token.isNotEmpty() }
            .forEach { token -> addClass(token) }
    }
    return this
}
/**
 * Adds every class name in [classNames] to the element. Unlike the vararg overload, the names are
 * passed to `addClass` as they are, without splitting on whitespace.
 *
 * @param classNames the class names to add
 * @return this element
 */
fun Element.addClasses(classNames: Iterable<String>): Element {
    for (className in classNames) {
        addClass(className)
    }
    return this
}
/**
 * Creates a slim copy of the element: the element is cloned, the feature vectors in the clone are
 * replaced with empty vectors, the clone's DOM is simplified and all attributes are cleared from
 * the clone and its descendants. The receiver itself is not modified.
 *
 * @return the slim clone
 */
fun Element.slimCopy(): Element {
    return clone().also { copy ->
        copy.forEach { it.extension.features = ArrayRealVector() }
        simplifyDOM(copy)
        copy.clearAttributesCascaded()
    }
}
/**
 * Creates a minimal copy of the element: the element is cloned and the clone's DOM is simplified.
 * Attribute removal is currently disabled, see the TODO below.
 *
 * @return the simplified clone
 */
fun Element.minimalCopy(): Element {
    val copy = clone()
    simplifyDOM(copy)
    // TODO: might have a bug because of concurrent modification, it can be re-produced by calling dom_minimal_html()
    // copy.removeUnnecessaryAttributesCascaded()
    return copy
}
/**
 * Returns the texts of the direct child text nodes of this element, in document order.
 * Children that are not text nodes are skipped.
 */
fun Element.ownTexts(): List<String> {
    return childNodes().mapNotNull { child -> (child as? TextNode)?.text() }
}
/**
 * Returns the class names of the element that are considered valuable: the class attribute is
 * split on whitespace and the result is filtered by `getValuableClassNames`.
 */
fun Element.valuableClassNames(): Set<String> {
    val classNames = className().split("\\s+".toRegex()).toMutableSet()
    return getValuableClassNames(classNames)
}

@Deprecated("Use Element.valuableClassNames() instead", ReplaceWith("valuableClassNames()"))
fun Element.qualifiedClassNames(): Set<String> = valuableClassNames()
/**
 * Sets the attribute [attributeKey] to the string form of [attributeValue].
 *
 * @return this element
 */
fun Element.anyAttr(attributeKey: String, attributeValue: Any): Element = apply {
    attr(attributeKey, attributeValue.toString())
}
/**
 * Removes the temporary attributes from this element and from all of its descendants.
 * An attribute is temporary when its key is in `TEMPORARY_ATTRIBUTES` or starts with "tv".
 *
 * @return this element
 */
fun Element.removeTemporaryAttributesCascaded(): Element {
    forEachElement(includeRoot = true) { element ->
        // Snapshot the keys first so that removal does not disturb the iteration over the attributes.
        element.attributes()
            .mapNotNull { it.key }
            .filter { it in TEMPORARY_ATTRIBUTES || it.startsWith("tv") }
            .forEach { element.removeAttr(it) }
    }
    return this
}
/**
 * Removes every attribute of this element whose key is not in `STANDARD_ATTRIBUTES`.
 * Descendants are not touched.
 *
 * @return this element
 */
fun Element.removeNonStandardAttributes(): Element {
    // Snapshot of the keys, taken before anything is removed.
    val keys = attributes().mapNotNull { it.key }
    for (key in keys) {
        if (key in STANDARD_ATTRIBUTES) continue
        removeAttr(key)
    }
    return this
}
/**
 * Removes every attribute of this element whose key is not in `VALUABLE_ATTRIBUTES`.
 * Descendants are not touched.
 *
 * @return this element
 */
fun Element.removeUnnecessaryAttributes(): Element {
    attributes()
        .mapNotNull { it.key }
        .filterNot { it in VALUABLE_ATTRIBUTES }
        .forEach { removeAttr(it) }
    return this
}
/**
 * Removes the non-standard attributes from this element and from every element visited by
 * `forEachElement` with the root included.
 *
 * @return this element
 */
fun Element.removeNonStandardAttributesCascaded(): Element = apply {
    forEachElement(includeRoot = true) { element -> element.removeNonStandardAttributes() }
}
/**
 * Removes the unnecessary attributes from this element and from every element visited by
 * `forEachElement` with the root included.
 *
 * @return this element
 */
fun Element.removeUnnecessaryAttributesCascaded(): Element = apply {
    forEachElement(includeRoot = true) { element -> element.removeUnnecessaryAttributes() }
}
/**
 * Clears all attributes from this element and from every element visited by
 * `forEachElement` with the root included.
 *
 * @return this element
 */
fun Element.clearAttributesCascaded(): Element = apply {
    forEachElement(includeRoot = true) { element -> element.clearAttributes() }
}
/**
 * Parses the `style` attribute of the element into its declarations.
 *
 * The attribute value is first passed through `Strings.stripNonChar` with ":;" as the characters
 * to keep, then split at ';'. Trailing empty entries are dropped.
 *
 * @return the declarations, one array entry per ';'-separated part
 */
fun Element.parseStyle(): Array<String> {
    return Strings.stripNonChar(attr("style"), ":;")
        .split(";".toRegex())
        .dropLastWhile { it.isEmpty() }
        .toTypedArray()
}
/**
 * Looks up the value of [styleKey] in the parsed `style` attribute of the element.
 */
fun Element.getStyle(styleKey: String): String {
    val declarations = parseStyle()
    return getStyle(declarations, styleKey)
}
/**
 * Whether this node is the shared [NILNode] instance (identity comparison).
 */
val Node.isNil: Boolean
    get() = this === NILNode

/**
 * The owner document of the node.
 * A `NullPointerException` is thrown when the extension has no owner document node.
 * TODO: should not call ownerDocument.extension, which is a recursive call
 */
val Node.ownerDocument: Document
    get() {
        val owner = Objects.requireNonNull(extension.ownerDocumentNode)
        return owner as Document
    }
/**
 * The URL the owner document was parsed from. If the starting URL is a redirect,
 * this is the final URL from which the document was served.
 *
 * Note: in most cases the base URL is simply the location of the document, but it can be affected
 * by many factors, including the `<base>` element in HTML and the `xml:base` attribute in XML.
 *
 * The base URL of a document is used to resolve relative URLs when the browser needs to obtain an
 * absolute URL, for example when processing the HTML `<img>` element's `src` attribute or the XML
 * `xlink:href` attribute.
 */
val Node.location: String
    get() = ownerDocument.location()

/**
 * The traversal depth of the node, stored in the `DEP` feature.
 */
var Node.depth: Int by IntFeature(DEP)

/**
 * The traversal sequence of the node, stored in the `SEQ` feature.
 */
val Node.sequence: Int by IntFeature(SEQ)

/**
 * An id built from the document location, the traversal sequence and the node's rectangle
 * (left, top, width, height).
 */
val Node.globalId: String
    get() = "$location " + listOf(sequence, left, top, width, height).joinToString("-")
//////////////////////////////////////////////////////////////////////////
// Geometric information

/**
 * The left position of the node, stored in the `LEFT` feature.
 */
var Node.left: Int by IntFeature(LEFT)

/**
 * The top position of the node, stored in the `TOP` feature.
 */
var Node.top: Int by IntFeature(TOP)

/**
 * The width of the node, stored in the `WIDTH` feature.
 */
var Node.width: Int by IntFeature(WIDTH)

/**
 * The height of the node, stored in the `HEIGHT` feature.
 */
var Node.height: Int by IntFeature(HEIGHT)

/**
 * The right position of the node: [left] plus [width].
 */
val Node.right: Int
    get() = left + width

/**
 * The bottom position of the node: [top] plus [height].
 */
val Node.bottom: Int
    get() = top + height
/**
 * The x coordinate of the node; an alias of [Node.left].
 *
 * The coordinate system:
 * ------------------> x
 * |
 * |
 * |
 * v
 * y
 *
 * @see Node.left
 */
val Node.x: Int
    get() = left

/**
 * The y coordinate of the node; an alias of [Node.top].
 *
 * The coordinate system:
 * ------------------> x
 * |
 * |
 * |
 * v
 * y
 *
 * @see Node.top
 */
val Node.y: Int
    get() = top

/**
 * The x2 coordinate of the node; an alias of [Node.right].
 *
 * @see Node.right
 */
val Node.x2: Int
    get() = right

/**
 * The y2 coordinate of the node; an alias of [Node.bottom].
 *
 * @see Node.bottom
 */
val Node.y2: Int
    get() = bottom

/**
 * The center x coordinate of the node, computed with integer division.
 */
val Node.centerX: Int
    get() = (x + x2) / 2

/**
 * The center y coordinate of the node, computed with integer division.
 */
val Node.centerY: Int
    get() = (y + y2) / 2
/**
 * The geometric location of the node: its top-left point. A new [Point] is created on every access.
 */
val Node.geoLocation: Point
    get() = Point(x, y)

/**
 * The dimension (width and height) of the node. A new [Dimension] is created on every access.
 */
val Node.dimension: Dimension
    get() = Dimension(width, height)

/**
 * The rectangle occupied by the node. A new [Rectangle] is created on every access.
 */
val Node.rectangle: Rectangle
    get() = Rectangle(geoLocation, dimension)

/**
 * The area of the node: [width] times [height].
 */
val Node.area: Int
    get() = width * height
/**
 * Whether the node carries the `PULSAR_ATTR_HIDDEN` attribute, i.e. the hidden flag set by javascript.
 */
val Node.hasHiddenFlag: Boolean
    get() = this.hasAttr(PULSAR_ATTR_HIDDEN)

/**
 * Whether the node carries the `PULSAR_ATTR_OVERFLOW_HIDDEN` attribute, i.e. the overflow hidden
 * flag set by javascript.
 */
val Node.hasOverflowHiddenFlag: Boolean
    get() = this.hasAttr(PULSAR_ATTR_OVERFLOW_HIDDEN)
/**
 * Whether the node is visible.
 *
 * A node with the hidden or the overflow-hidden flag is never visible. An element is visible when
 * its rectangle is non-empty and starts at non-negative coordinates; a text node is judged by its
 * parent; every other node counts as visible.
 *
 * TODO: there are bugs in this method and can not be used.
 */
val Node.isVisible: Boolean
    get() {
        if (hasHiddenFlag || hasOverflowHiddenFlag) {
            return false
        }
        return when (this) {
            is Element -> isVisibleElement(this)
            is TextNode -> isVisibleElement(parent())
            else -> true
        }
    }

/**
 * Whether [node] is an element whose rectangle is non-empty and has non-negative x and y.
 * Returns `false` for `null` and for nodes that are not elements.
 */
private fun isVisibleElement(node: Node?): Boolean {
    val element = node as? Element ?: return false
    val bounds = element.rectangle
    return !(bounds.x < 0 || bounds.y < 0 || bounds.isEmpty)
}
/**
 * Whether the node is hidden; the exact negation of [isVisible].
 */
val Node.isHidden: Boolean
    get() = !isVisible
/**
 * Whether the node is a [TextNode].
 */
val Node.isText: Boolean
    get() = this is TextNode

/**
 * Whether the node is a [TextNode] whose text is blank.
 */
val Node.isBlankText: Boolean
    get() = (this as? TextNode)?.isBlank == true

/**
 * Whether the node is a [TextNode] whose text is not blank.
 */
val Node.isNonBlankText: Boolean
    get() = (this as? TextNode)?.isBlank == false

/**
 * Whether the node is a regular text node, i.e. a visible text node with non-blank text.
 */
val Node.isRegularText: Boolean
    get() {
        if (!isVisible) return false
        return isNonBlankText
    }
/**
 * Whether the node is an image, i.e. its node name is "img".
 */
val Node.isImage: Boolean
    get() = "img" == nodeName()

/**
 * Whether the node is a regular image: an image that is visible and has a `src` attribute.
 */
val Node.isRegularImage: Boolean
    get() {
        if (!isImage) return false
        return isVisible && hasAttr("src")
    }

/**
 * Whether the node is an anchor image: an image that has an anchor among its ancestors.
 */
val Node.isAnchorImage: Boolean
    get() {
        if (!isImage) return false
        return hasAncestor { ancestor -> ancestor.isAnchor }
    }
/**
 * Whether the node is an anchor, i.e. its node name is "a".
 */
val Node.isAnchor: Boolean
    get() = "a" == nodeName()

/**
 * Whether the node is a regular anchor: an anchor that is visible.
 */
val Node.isRegularAnchor: Boolean
    get() {
        if (!isVisible) return false
        return isAnchor
    }

/**
 * Whether the node is an image anchor: an anchor whose image count ([numImages]) is exactly one.
 */
val Node.isImageAnchor: Boolean
    get() {
        if (!isAnchor) return false
        return numImages == 1
    }

/**
 * Whether the node is a regular image anchor: a regular anchor whose image count ([numImages])
 * is exactly one.
 */
val Node.isRegularImageAnchor: Boolean
    get() {
        if (!isRegularAnchor) return false
        return numImages == 1
    }
/**
 * Whether the node is a table, i.e. its node name is "table".
 */
val Node.isTable: Boolean
    get() = "table" == nodeName()

/**
 * Whether the node is a list, i.e. its node name is "ul" or "ol".
 */
val Node.isList: Boolean
    get() = when (nodeName()) {
        "ul", "ol" -> true
        else -> false
    }
/**
 * Whether the node is a regular text node whose clean text is 1 to 9 characters long.
 */
val Node.isShortText: Boolean
    get() = isRegularText && cleanText.length.let { it >= 1 && it <= 9 }

/**
 * Whether the node is a regular text node whose clean text is 1 to 20 characters long.
 * Note that every short text is also a medium text.
 */
val Node.isMediumText: Boolean
    get() = isRegularText && cleanText.length.let { it >= 1 && it <= 20 }

/**
 * Whether the node is a regular text node whose clean text is longer than 20 characters.
 */
val Node.isLongText: Boolean
    get() = isRegularText && cleanText.length > 20
/**
 * Whether the node is a short text whose clean text is one of the currency symbols "¥" or "$".
 */
val Node.isCurrencyUnit: Boolean
    get() = isShortText && cleanText.let { it == "¥" || it == "$" }

/**
 * Whether the node is a medium text for which `StringUtils.isNumeric` holds.
 */
val Node.isNumeric: Boolean
    get() = isMediumText && StringUtils.isNumeric(cleanText)

// TODO: "isShortText" should be in -2147483648 to 2147483647, it's mapped to java.lang.Integer.
// TODO: detect all SQL types

/**
 * Whether the node is a short text for which `StringUtils.isNumeric` holds.
 */
val Node.isInt: Boolean
    get() = isShortText && StringUtils.isNumeric(cleanText)

/**
 * Whether the node is a short text for which `Strings.isFloat` holds.
 */
val Node.isFloat: Boolean
    get() = isShortText && Strings.isFloat(cleanText)

/**
 * Whether the node is a medium text for which `Strings.isNumericLike` holds.
 */
val Node.isNumericLike: Boolean
    get() = isMediumText && Strings.isNumericLike(cleanText)

/**
 * Whether the node is a short text for which `Strings.isMoneyLike` holds.
 */
val Node.isMoneyLike: Boolean
    get() = isShortText && Strings.isMoneyLike(cleanText)
/**
 * The clean text of the node parsed as an integer; `Int.MIN_VALUE` is passed to the parser as the
 * default for text that is not a valid integer. The value is computed on first access and then
 * kept in the node's variable map.
 */
val Node.intValue by field { node -> SParser(node.cleanText).getInt(Int.MIN_VALUE) }

/**
 * The clean text of the node parsed as a double; `Double.NaN` is passed to the parser as the
 * default for text that is not a valid number. The value is computed on first access and then
 * kept in the node's variable map.
 */
val Node.doubleValue by field { node -> SParser(node.cleanText).getDouble(Double.NaN) }
//////////////////////////////////////////////////////////////////////////
// Distinguished features

/**
 * The number of characters of the node, stored in the `CH` feature.
 */
var Node.numChars: Int by IntFeature(CH)

/**
 * The number of siblings of the node, stored in the `SIB` feature.
 */
var Node.numSiblings: Int by IntFeature(SIB)

/**
 * The number of children of the node, stored in the `C` feature.
 */
var Node.numChildren: Int by IntFeature(C)

/**
 * The number of descendant text nodes, stored in the `TN` feature.
 */
var Node.numTextNodes: Int by IntFeature(TN)

/**
 * The number of descendant images, stored in the `IMG` feature.
 */
var Node.numImages: Int by IntFeature(IMG)

/**
 * The number of descendant anchors, stored in the `A` feature.
 */
var Node.numAnchors: Int by IntFeature(A)

/**
 * The text node density, stored in the `DNS` feature.
 */
var Node.textNodeDensity: Double by DoubleFeature(DNS)
//////////////////////////////////////////////////////////////////////////
// semantics

/**
 * The css selector of the node when it is an element, otherwise the node name.
 */
val Node.selectorOrName: String
    get() = if (this is Element) cssSelector() else nodeName()

/**
 * The caption of the node when it has one, otherwise the [name] of the node.
 */
val Node.captionOrName: String
    get() = if (hasCaption()) caption else name

/**
 * The caption of the node when it has one, otherwise [selectorOrName].
 */
val Node.captionOrSelectorOrName: String
    get() = if (hasCaption()) caption else selectorOrName
/**
 * The trimmed text of this node; the empty string for `null` and for nodes that are neither
 * text nodes nor elements.
 *
 * A text node's text is read from the stored `immutableText`, while an element's text is
 * accumulated from its subtree on every access. This balances time and space.
 */
val Node?.cleanText: String
    get() {
        val raw = when (this) {
            is TextNode -> extension.immutableText
            is Element -> accumulateText(this)
            else -> ""
        }
        return raw.trim()
    }
/**
 * Builds the trimmed text of this node, wrapped in [prefix] and [suffix].
 *
 * A text node contributes its stored `immutableText`; an element contributes the text accumulated
 * from its subtree, with [separator] inserted by `accumulateText`; any other node contributes the
 * empty string.
 *
 * @param separator passed to `accumulateText` when the node is an element
 * @param prefix put in front of the text
 * @param suffix put behind the text
 */
fun Node.joinToString(separator: String = " ", prefix: String = "", suffix: String = ""): String {
    val body = when (this) {
        is TextNode -> extension.immutableText
        is Element -> accumulateText(this, separator)
        else -> ""
    }.trim()
    // Empty prefix and suffix contribute nothing, so no special casing is needed.
    return "$prefix$body$suffix"
}
/**
 * The text representation of the node: the absolute `src` for an image, the absolute `href` for an
 * anchor, the clean text for any other text node or element, and the empty string otherwise.
 */
val Node.textRepresentation: String
    get() {
        if (isImage) return attr("abs:src")
        if (isAnchor) return attr("abs:href")
        return if (this is TextNode || this is Element) cleanText else ""
    }
/**
 * The slim html representation of the node, computed on first access and then kept in the
 * node's variable map.
 *
 * Atomic nodes (images, anchors, numeric-like or money-like texts, text nodes, `li`, `td`) are
 * rendered by `atomSlimHtml`; `ul`, `ol` and `tr` elements are rendered as their tag wrapped
 * around the atomic rendering of each child; any other element is rendered as the outer html
 * of its slim copy.
 *
 * TODO: slim table
 */
val Node.slimHtml by field { node ->
    val nm = node.nodeName().lowercase()
    when {
        node.isImage || node.isAnchor || node.isNumericLike || node.isMoneyLike
                || node is TextNode || nm == "li" || nm == "td" -> atomSlimHtml(node)

        node is Element && (nm == "ul" || nm == "ol" || nm == "tr") ->
            // The closing tag must be "</tag>".
            String.format("<$nm>%s</$nm>", node.children().joinToString("") { c -> atomSlimHtml(c) })

        node is Element -> node.slimCopy().outerHtml()
        // NOTE(review): the plain "%s" format may have lost surrounding markup — confirm against the upstream source.
        else -> String.format("%s", node.name)
    }
}
/**
 * The minimal html representation of the node, computed on first access and then kept in the
 * node's variable map.
 *
 * Atomic nodes (images, anchors, numeric-like or money-like texts, text nodes, `li`, `td`) are
 * rendered by `atomSlimHtml`; `ul`, `ol` and `tr` elements are rendered as their tag wrapped
 * around the atomic rendering of each child; any other element is rendered as the outer html
 * of its minimal copy.
 */
val Node.minimalHtml by field { node ->
    val nm = node.nodeName().lowercase()
    when {
        node.isImage || node.isAnchor || node.isNumericLike || node.isMoneyLike
                || node is TextNode || nm == "li" || nm == "td" -> atomSlimHtml(node)

        node is Element && (nm == "ul" || nm == "ol" || nm == "tr") ->
            // The closing tag must be "</tag>".
            String.format("<$nm>%s</$nm>", node.children().joinToString("") { c -> atomSlimHtml(c) })

        node is Element -> node.minimalCopy().outerHtml()
        // NOTE(review): the plain "%s" format may have lost surrounding markup — confirm against the upstream source.
        else -> String.format("%s", node.name)
    }
}
/**
 * Renders a node that is treated as atomic by the slim/minimal html properties.
 *
 * Text nodes and numeric-like or money-like nodes are rendered as their clean text, images through
 * `createSlimImageHtml`, anchors through `createLink`, `li`/`td`/`th` as their tag wrapped around
 * the clean text, other elements as their clean text, and anything else as its name.
 *
 * NOTE(review): the text, number and fallback branches emit bare text; markup that may once have
 * wrapped them is not recoverable from this file — confirm against the upstream source.
 */
private fun atomSlimHtml(node: Node): String {
    val nm = node.nodeName()
    return when {
        node is TextNode -> String.format("%s", node.cleanText)
        node.isImage -> createSlimImageHtml(node)
        node.isAnchor -> createLink(node as Element, keepMetadata = false, lazy = true).toString()
        node.isNumericLike || node.isMoneyLike -> "${node.cleanText}"
        // The closing tag must be "</tag>".
        nm == "li" || nm == "td" || nm == "th" -> String.format("<$nm>%s</$nm>", node.cleanText)
        node is Element -> node.cleanText
        else -> String.format("%s", node.name)
    }
}
/**
 * Renders an image node as a slim `img` tag carrying the absolute `src`, the `vi` attribute and
 * the `alt` attribute of the node.
 *
 * NOTE(review): the format string was empty, which made the function return "" regardless of its
 * three arguments. The tag below is rebuilt from the argument list; confirm the exact markup
 * against the upstream source. Attribute values are inserted without escaping.
 */
private fun createSlimImageHtml(node: Node): String = node.run {
    String.format(
        "<img src='%s' vi='%s' alt='%s'/>",
        absUrl("src"), attr("vi"), attr("alt")
    )
}
/**
 * The key of the node: the document location and the traversal sequence, joined with '#'.
 */
val Node.key: String
    get() = location + "#" + sequence
/**
 * The name of the node.
 *
 * - a document is named ":root";
 * - an element is named "#id" when it has an id, otherwise by its valuable class names, each
 *   prefixed with '.', otherwise by its node name;
 * - a text node is named after its best element, with "~siblingIndex" appended when it has more
 *   than one sibling node;
 * - any other node is named by its node name.
 */
val Node.name: String
    get() {
        // Document is checked before Element because a document is an element too.
        if (this is Document) {
            return ":root"
        }
        if (this is Element) {
            val id = id()
            if (id.isNotEmpty()) {
                return "#$id"
            }
            val classes = valuableClassNames()
            return if (classes.isNotEmpty()) {
                classes.joinToString(separator = ".", prefix = ".")
            } else {
                nodeName()
            }
        }
        if (this is TextNode) {
            val postfix = if (siblingNodes().size > 1) "~${siblingIndex()}" else ""
            return bestElement.name + postfix
        }
        return nodeName()
    }
/**
 * The canonical name of the node.
 *
 * - a document is named by its location;
 * - an element is named by its node name followed by "#id" when it has a non-blank id, otherwise
 *   followed by its valuable class names, each prefixed with '.';
 * - a text node is named after the canonical name of its best element, with "~siblingIndex"
 *   appended when it has more than one sibling node;
 * - any other node is named by its node name.
 */
val Node.canonicalName: String
    get() = when (this) {
        is Document -> location
        is Element -> {
            val trimmedId = id().trim()
            val qualifier = if (trimmedId.isNotEmpty()) {
                "#$trimmedId"
            } else {
                // Class names are only consulted when there is no id.
                valuableClassNames()
                    .takeIf { it.isNotEmpty() }
                    ?.joinToString(separator = ".", prefix = ".")
                    ?: ""
            }
            nodeName() + qualifier
        }
        is TextNode -> {
            val postfix = if (siblingNodes().size > 1) "~${siblingIndex()}" else ""
            // Recursive call
            bestElement.canonicalName + postfix
        }
        else -> nodeName()
    }
/**
 * The unique name of the node: the traversal sequence and the canonical name, joined with '-'.
 */
val Node.uniqueName: String
    get() = sequence.toString() + "-" + canonicalName

/**
 * The name of the node followed by '-' and the `str` form of its rectangle.
 */
val Node.namedRect: String
    get() = name + "-" + rectangle.str

/**
 * The name of the node followed by '-' and the `str2` form of its rectangle.
 */
val Node.namedRect2: String
    get() = name + "-" + rectangle.str2
/**
 * The parent of this node as an [Element]. The cast fails with an exception when the node has no
 * element parent, e.g. for a root node.
 */
val Node.parentElement: Element
    get() = parent() as Element

/**
 * The element that best represents this node: the node itself when it is an element,
 * otherwise its parent element.
 */
val Node.bestElement: Element
    get() = if (this is Element) this else parentElement

/**
 * The caption of the node: its caption words joined with ';'.
 */
val Node.caption: String
    get() = getCaptionWords().joinToString(separator = ";")
/**
 * Returns the value of the attribute [attrName], or `null` when the node is not an element or
 * the value is blank (which includes an absent attribute).
 */
fun Node.attrOrNull(attrName: String): String? {
    if (this !is Element) {
        return null
    }
    val value: String? = attr(attrName)
    return value?.takeIf { it.isNotBlank() }
}
/**
 * Returns the feature value stored at index [key] of the node's feature vector.
 */
fun Node.getFeature(key: Int): Double {
    return extension.features[key]
}

/**
 * Returns the feature value for the feature called [name]; the name is translated to its index
 * with `NodeFeature.getKey`.
 */
fun Node.getFeature(name: String): Double {
    val key = NodeFeature.getKey(name)
    return extension.features[key]
}

/**
 * Returns a [FeatureEntry] pairing [key] with the node's current value for it.
 */
fun Node.getFeatureEntry(key: Int): FeatureEntry {
    val value = getFeature(key)
    return FeatureEntry(key, value)
}
/**
 * Stores [value] at index [key] of the node's feature vector.
 */
fun Node.setFeature(key: Int, value: Double) {
    val features = extension.features
    features[key] = value
}

/**
 * Stores [value], converted to a double, at index [key] of the node's feature vector.
 */
fun Node.setFeature(key: Int, value: Int) {
    val features = extension.features
    features[key] = value.toDouble()
}

/**
 * Removes the feature at index [key]. The removal is done by setting the value to 0.0.
 *
 * @return this node
 */
fun Node.removeFeature(key: Int): Node = apply {
    extension.features[key] = 0.0
}

/**
 * Clears all the features of the node by replacing the feature vector with an empty vector.
 *
 * @return this node
 */
fun Node.clearFeatures(): Node = apply {
    extension.features = ArrayRealVector()
}
/**
 * Returns the temporary node variable called [name], or `null` when there is no such variable
 * or its value is not a [T].
 */
inline fun <reified T> Node.getVariable(name: String): T? {
    return extension.variables[name] as? T
}

/**
 * Returns the temporary node variable called [name], or [defaultValue] when there is no such
 * variable or its value is not a [T].
 */
inline fun <reified T> Node.getVariable(name: String, defaultValue: T): T {
    val value = extension.variables[name]
    return if (value is T) value else defaultValue
}

/**
 * Returns the temporary node variable called [name] when its value is a [T]. Otherwise the value
 * is computed with [mappingFunction], stored under [name] (replacing a value of another type, if
 * any) and returned.
 */
inline fun <reified T> Node.computeVariableIfAbsent(name: String, mappingFunction: (String) -> T): T {
    val existing = extension.variables[name]
    if (existing is T) {
        return existing
    }
    val computed = mappingFunction(name)
    extension.variables[name] = computed
    return computed
}
/**
 * Stores [value] as the temporary node variable called [name], replacing any previous value.
 */
fun Node.setVariable(name: String, value: Any) {
    val variables = extension.variables
    variables[name] = value
}

/**
 * Stores [value] as the temporary node variable called [name]; does nothing when [value] is `null`.
 */
fun Node.setVariableIfNotNull(name: String, value: Any?) {
    if (value == null) {
        return
    }
    extension.variables[name] = value
}

/**
 * Whether the node has a temporary variable called [name].
 */
fun Node.hasVariable(name: String): Boolean = extension.variables.containsKey(name)

/**
 * Removes the temporary variable called [name] and returns the result of the underlying `remove` call.
 */
fun Node.removeVariable(name: String): Any? = extension.variables.remove(name)
/**
 * Sets attribute [attributeKey] to the string form of [attributeValue].
 *
 * @return this node
 */
fun Node.anyAttr(attributeKey: String, attributeValue: Any): Node = apply {
    attr(attributeKey, attributeValue.toString())
}

/**
 * Sets attribute [attributeKey] to [attributeValue].
 *
 * @return [attributeValue]
 */
fun Node.rAttr(attributeKey: String, attributeValue: String): String {
    return attributeValue.also { attr(attributeKey, it) }
}

/**
 * Sets attribute [attributeKey] to the string form of [attributeValue].
 *
 * @return [attributeValue] itself, not its string form
 */
fun Node.rAnyAttr(attributeKey: String, attributeValue: Any): Any {
    return attributeValue.also { attr(attributeKey, it.toString()) }
}
/**
 * Appends [attributeValue] to the current value of attribute [attributeKey]. When the current
 * value is not empty, [separator] is put between the two. No guarantee is made for uniqueness.
 */
fun Node.appendAttr(attributeKey: String, attributeValue: String, separator: String = StringUtils.SPACE) {
    val current = attr(attributeKey)
    val combined = if (current.isEmpty()) {
        attributeValue
    } else {
        current + separator + attributeValue
    }
    attr(attributeKey, combined)
}
/**
 * Adds [item] to the tuple called [tupleName], creating the tuple when it does not exist yet.
 *
 * @return the result of adding the item to the tuple's list
 */
fun Node.addTupleItem(tupleName: String, item: Any): Boolean {
    val tuple = extension.tuples.computeIfAbsent(tupleName) { mutableListOf() }
    return tuple.add(item)
}

/**
 * Removes [item] from the tuple called [tupleName].
 *
 * @return `false` when there is no such tuple, otherwise the result of removing the item
 */
fun Node.removeTupleItem(tupleName: String, item: Any): Boolean {
    val tuple = extension.tuples[tupleName] ?: return false
    return tuple.remove(item)
}
/**
 * Returns the items of the tuple called [tupleName], or an empty list when there is no such tuple.
 */
fun Node.getTuple(tupleName: String): List<Any> {
    return extension.tuples[tupleName] ?: listOf()
}
/**
 * Whether the tuple called [tupleName] exists and contains [item].
 */
fun Node.hasTupleItem(tupleName: String, item: String): Boolean {
    val tuple = extension.tuples[tupleName] ?: return false
    return tuple.contains(item)
}

/**
 * Whether a tuple called [tupleName] exists, even if it is empty.
 */
fun Node.hasTuple(tupleName: String): Boolean = extension.tuples.containsKey(tupleName)

/**
 * Removes all items from the tuple called [tupleName]; the (now empty) tuple itself is kept.
 */
fun Node.clearTuple(tupleName: String) {
    val tuple = extension.tuples[tupleName] ?: return
    tuple.clear()
}

/**
 * Removes the tuple called [tupleName] together with its items.
 */
fun Node.removeTuple(tupleName: String) {
    extension.tuples.remove(tupleName)
}
/**
 * Adds [label], trimmed, to the labels of the node. Labels are unique: adding a label that the
 * node already has does nothing.
 */
fun Node.addLabel(label: String) {
    val trimmed = label.trim()
    // The tuple is list-backed, so uniqueness has to be enforced here.
    if (!hasTupleItem(A_LABELS, trimmed)) {
        addTupleItem(A_LABELS, trimmed)
    }
}

/**
 * Removes [label] from the labels of the node. The label is matched as given, without trimming.
 *
 * @return the result of removing the item from the label tuple, `false` when the node has no labels
 */
fun Node.removeLabel(label: String): Boolean {
    return removeTupleItem(A_LABELS, label)
}

/**
 * Returns the labels of the node as strings.
 */
fun Node.getLabels(): List<String> {
    return getTuple(A_LABELS).map { it.toString() }
}

/**
 * Whether the node has [label]. The label is matched as given, without trimming.
 */
fun Node.hasLabel(label: String): Boolean {
    return hasTupleItem(A_LABELS, label)
}

/**
 * Removes all labels of the node.
 */
fun Node.clearLabels() {
    removeTuple(A_LABELS)
}
/**
 * Adds [label], trimmed, to the ML labels of the node. ML labels are unique: adding a label that
 * the node already has does nothing.
 */
fun Node.addMlLabel(label: String) {
    val trimmed = label.trim()
    // The tuple is list-backed, so uniqueness has to be enforced here.
    if (!hasTupleItem(A_ML_LABELS, trimmed)) {
        addTupleItem(A_ML_LABELS, trimmed)
    }
}

/**
 * Removes [label] from the ML labels of the node. The label is matched as given, without trimming.
 *
 * @return the result of removing the item from the ML label tuple, `false` when the node has no ML labels
 */
fun Node.removeMlLabel(label: String): Boolean {
    return removeTupleItem(A_ML_LABELS, label)
}

/**
 * Returns the ML labels of the node as strings.
 */
fun Node.getMlLabels(): List<String> {
    return getTuple(A_ML_LABELS).map { it.toString() }
}

/**
 * Returns the first ML label of the node, or the empty string when it has none.
 */
fun Node.getMlLabel(): String {
    return getMlLabels().firstOrNull() ?: ""
}

/**
 * Whether the node has the ML label [label]. The label is matched as given, without trimming.
 */
fun Node.hasMlLabel(label: String): Boolean {
    return hasTupleItem(A_ML_LABELS, label)
}

/**
 * Removes all ML labels of the node.
 */
fun Node.clearMlLabels() {
    removeTuple(A_ML_LABELS)
}
/**
 * Adds [word] to the caption words of the node after passing it through `Strings.removeNonCJKChar`.
 */
fun Node.addCaptionWord(word: String) {
    addTupleItem(A_CAPTION, Strings.removeNonCJKChar(word))
}

/**
 * Removes [word] from the caption words of the node. The word is matched as given.
 *
 * @return the result of removing the item from the caption tuple, `false` when the node has no caption
 */
fun Node.removeCaptionWord(word: String): Boolean {
    return removeTupleItem(A_CAPTION, word)
}

/**
 * Returns the caption words of the node as strings.
 */
fun Node.getCaptionWords(): List<String> {
    return getTuple(A_CAPTION).map { it.toString() }
}

/**
 * Whether [word] is one of the caption words of the node. The word is matched as given.
 */
fun Node.hasCaptionWord(word: String): Boolean {
    return hasTupleItem(A_CAPTION, word)
}

/**
 * Whether the node has a caption tuple, even if it holds no words.
 */
fun Node.hasCaption(): Boolean {
    return hasTuple(A_CAPTION)
}

/**
 * Removes the caption of the node.
 */
fun Node.clearCaption() {
    removeTuple(A_CAPTION)
}
/**
 * Removes the attributes named by [attributeKeys] from the node. The special key "*" clears all
 * attributes of the node; processing continues with the remaining keys afterwards.
 *
 * @return this node
 */
fun Node.removeAttrs(vararg attributeKeys: String): Node {
    for (key in attributeKeys) {
        if (key == "*") {
            clearAttributes()
        } else {
            removeAttr(key)
        }
    }
    return this
}
/**
 * Removes the attributes named by [attributeKeys] from the node. The special key "*" clears all
 * attributes of the node; processing continues with the remaining keys afterwards.
 *
 * @return this node
 */
fun Node.removeAttrs(attributeKeys: Iterable<String>): Node {
    for (key in attributeKeys) {
        if (key == "*") {
            clearAttributes()
        } else {
            removeAttr(key)
        }
    }
    return this
}
/**
 * Removes every attribute of the node for which [filter] returns `true`.
 * The matching keys are collected first and then handed to [removeAttrs].
 *
 * @return this node
 */
fun Node.removeAttrsIf(filter: (Attribute) -> Boolean): Node {
    val matchingKeys = attributes().filter { filter(it) }.mapNotNull { it.key }
    removeAttrs(matchingKeys)
    return this
}
/**
 * Formats the features selected by [featureKeys] for this node and every node below it,
 * one line per node, in traversal order.
 */
fun Node.formatEachFeatures(vararg featureKeys: Int): String {
    val out = StringBuilder()
    val keys = featureKeys.asIterable()
    NodeTraversor.traverse({ node: Node, _ ->
        FeatureFormatter.format(node.extension.features, keys, sb = out)
        out.append('\n')
    }, this)
    return out.toString()
}

/**
 * Formats the features selected by [featureKeys] for this node only.
 */
fun Node.formatFeatures(vararg featureKeys: Int): String {
    val formatted = FeatureFormatter.format(extension.features, featureKeys.asIterable())
    return formatted.toString()
}

/**
 * Formats the variable map of this node and every node below it, one line per node,
 * in traversal order.
 */
fun Node.formatNamedFeatures(): String {
    val out = StringBuilder()
    NodeTraversor.traverse({ node, _ ->
        FeatureFormatter.format(node.extension.variables, out)
        out.append('\n')
    }, this)
    return out.toString()
}
/**
 * Returns the element ancestors of the node, nearest first. The walk stops at the first parent
 * that is not an element (including a missing parent).
 */
fun Node.ancestors(): List<Element> {
    val ancestors = mutableListOf<Element>()
    var p = this.parent()
    while (p is Element) {
        ancestors.add(p)
        p = p.parent()
    }
    return ancestors
}
/**
 * Whether the node has an ancestor that satisfies [predicate].
 */
fun Node.hasAncestor(predicate: (Element) -> Boolean): Boolean {
    return findFirstAncestor(predicate) != null
}

/**
 * Whether the node has an ancestor that satisfies [predicate]; [stop] is handed to
 * `findFirstAncestor` to bound the search.
 */
fun Node.hasAncestor(stop: (Node) -> Boolean, predicate: (Element) -> Boolean): Boolean {
    // Forward `stop`, as isAncestorOf(other, stop) does.
    return findFirstAncestor(stop, predicate) != null
}
/**
 * Whether this node is an ancestor of [other], i.e. one of the ancestors of [other] equals this node.
 */
fun Node.isAncestorOf(other: Node): Boolean {
    val match = other.findFirstAncestor { ancestor -> ancestor == this }
    return match != null
}

/**
 * Whether this node is an ancestor of [other]; [stop] is handed to `findFirstAncestor`
 * to bound the search.
 */
fun Node.isAncestorOf(other: Node, stop: (Node) -> Boolean): Boolean {
    val match = other.findFirstAncestor(stop) { ancestor -> ancestor == this }
    return match != null
}
/**
 * Concatenates the non-blank texts of all text nodes below [root], in traversal order.
 *
 * When a block element or a `br` is entered, [seperator] is appended — unless nothing has been
 * collected yet or the collected text already ends with the separator.
 */
private fun accumulateText(root: Element, seperator: String = " "): String {
    val buffer = StringBuilder()
    NodeTraversor.traverse({ node, _ ->
        val text = node.extension.immutableText
        if (node is TextNode) {
            if (text.isNotBlank()) {
                buffer.append(text)
            }
        } else if (node is Element) {
            val breaksLine = node.isBlock || node.tagName() == "br"
            if (buffer.isNotEmpty() && breaksLine && !buffer.endsWith(seperator)) {
                buffer.append(seperator)
            }
        }
    }, root)
    return buffer.toString()
}
/**
 * Strips class names that carry no meaning (the empty name, "clearfix", "left", "right", "l", "r")
 * from [classNames], modifying the set in place.
 *
 * The set is never emptied by the layout names alone: when removing one of them would leave the
 * set empty, that name is put back and the filtering stops.
 *
 * @param classNames the class names to filter; modified in place
 * @return [classNames] itself
 */
private fun getValuableClassNames(classNames: MutableSet<String>): MutableSet<String> {
    classNames.remove("")
    if (classNames.isEmpty()) return classNames

    for (layoutName in arrayOf("clearfix", "left", "right", "l", "r")) {
        classNames.remove(layoutName)
        if (classNames.isEmpty()) {
            // Keep the last remaining name rather than returning nothing.
            classNames.add(layoutName)
            break
        }
    }

    return classNames
}
/**
 * Reads the view port from the `view-port` attribute of the owner body, which is expected in the
 * form "WIDTHxHEIGHT".
 *
 * `AppConstants.DEFAULT_VIEW_PORT` is returned when there is no owner body or the attribute does
 * not consist of exactly two 'x'-separated parts; a part that is not an integer falls back to the
 * corresponding default width or height.
 */
private fun Node.calculateViewPort(): Dimension {
    val fallback = AppConstants.DEFAULT_VIEW_PORT
    val body = extension.ownerBody ?: return fallback

    val parts = body.attr("view-port").split("x")
    if (parts.size != 2) {
        return fallback
    }

    val width = parts[0].toIntOrNull() ?: fallback.width
    val height = parts[1].toIntOrNull() ?: fallback.height
    return Dimension(width, height)
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy