All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.platon.pulsar.dom.nodes.node.ext.NodeExt.kt Maven / Gradle / Ivy

package ai.platon.pulsar.dom.nodes.node.ext

import ai.platon.pulsar.common.AppPaths
import ai.platon.pulsar.common.SParser
import ai.platon.pulsar.common.Strings
import ai.platon.pulsar.common.config.AppConstants
import ai.platon.pulsar.common.config.AppConstants.*
import ai.platon.pulsar.common.math.geometric.str
import ai.platon.pulsar.common.math.geometric.str2
import ai.platon.pulsar.common.math.vectors.get
import ai.platon.pulsar.common.math.vectors.set
import ai.platon.pulsar.dom.features.FeatureEntry
import ai.platon.pulsar.dom.features.FeatureFormatter
import ai.platon.pulsar.dom.features.NodeFeature
import ai.platon.pulsar.dom.features.defined.*
import ai.platon.pulsar.dom.model.createLink
import ai.platon.pulsar.dom.nodes.*
import ai.platon.pulsar.dom.select.selectFirstOrNull
import org.apache.commons.lang3.StringUtils
import org.apache.commons.math3.linear.ArrayRealVector
import org.jsoup.nodes.*
import org.jsoup.select.NodeTraversor
import java.awt.Dimension
import java.awt.Point
import java.awt.Rectangle
import java.nio.file.Path
import java.util.*
import java.util.concurrent.ConcurrentSkipListSet
import java.util.concurrent.atomic.AtomicBoolean
import kotlin.reflect.KProperty

/**
 * Property delegate that exposes one slot of a node's feature vector as a [Double].
 *
 * @property name the index of the slot in `extension.features` backing the delegated property
 */
class DoubleFeature(val name: Int) {
    /** Reads the backing slot from the feature vector of [thisRef]. */
    operator fun getValue(thisRef: Node, property: KProperty<*>): Double {
        val features = thisRef.extension.features
        return features[name]
    }

    /** Writes [value] into the backing slot of the feature vector of [thisRef]. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: Double) {
        val features = thisRef.extension.features
        features[name] = value
    }
}

/**
 * Property delegate that exposes one slot of a node's feature vector as an [Int].
 *
 * The vector stores doubles, so reads convert with `toInt()` and writes convert with `toDouble()`.
 *
 * @property name the index of the slot in `extension.features` backing the delegated property
 */
class IntFeature(val name: Int) {
    /** Reads the backing slot from the feature vector of [thisRef] and converts it to an [Int]. */
    operator fun getValue(thisRef: Node, property: KProperty<*>): Int {
        val stored = thisRef.extension.features[name]
        return stored.toInt()
    }

    /** Converts [value] to a double and writes it into the backing slot of [thisRef]. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: Int) {
        val stored = value.toDouble()
        thisRef.extension.features[name] = stored
    }
}

/**
 * Property delegate that keeps its value in the node's `extension.variables` map, keyed by the
 * delegated property's name.
 *
 * On read, the stored value is returned when present; otherwise [initializer] is invoked with the
 * node and its result is stored and returned. Because the cast to [T] is unchecked (erased), a value
 * of a different type stored under the same key is not detected here. If the stored value is `null`,
 * the initializer runs again on the next read.
 *
 * @param T the type of the delegated property
 * @property initializer computes the initial value for a node that has no stored value yet
 */
class MapField<T>(val initializer: (Node) -> T) {
    @Suppress("UNCHECKED_CAST")
    operator fun getValue(thisRef: Node, property: KProperty<*>): T =
        thisRef.extension.variables[property.name] as? T ?: setValue(thisRef, property, initializer(thisRef))

    /** Stores [value] under the property's name and returns it, so it can be chained from [getValue]. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: T): T {
        thisRef.extension.variables[property.name] = value
        return value
    }
}

/**
 * Property delegate for an optional value kept in the node's `extension.variables` map, keyed by the
 * delegated property's name. There is no initializer: reading returns whatever is currently stored
 * under the key, cast (unchecked) to [T]?.
 *
 * @param T the non-null type of the delegated property
 */
class NullableMapField<T> {
    @Suppress("UNCHECKED_CAST")
    operator fun getValue(thisRef: Node, property: KProperty<*>): T? = thisRef.extension.variables[property.name] as T?

    /** Stores [value] (possibly `null`) under the property's name. */
    operator fun setValue(thisRef: Node, property: KProperty<*>, value: T?) {
        thisRef.extension.variables[property.name] = value
    }
}

/**
 * Creates a [MapField] delegate whose initial value is computed by [initializer].
 */
fun <T> field(initializer: (Node) -> T): MapField<T> {
    return MapField(initializer)
}

/**
 * Creates a [NullableMapField] delegate.
 */
@Suppress("NOTHING_TO_INLINE")
inline fun <T> nullableField(): NullableMapField<T> {
    return NullableMapField()
}

/**
 * Resolves the export locations for the different views of a document.
 *
 * Every view lives under `BASE_DIR/<type>/<filename>`, where the file name is derived from [uri].
 * Resolved paths are cached per view type.
 *
 * @property uri the URI of the document whose views are exported
 */
class ExportPaths(val uri: String) {
    // Cache of already resolved paths, keyed by view type.
    private val namedPath = mutableMapOf<String, Path>()

    /** The file name shared by all views, derived from [uri] with an `.htm` suffix. */
    val filename by lazy { AppPaths.fromUri(uri, "", ".htm") }
    val portal get() = byType("portal")
    val annotatedView get() = byType("annotated")
    val tileView get() = byType("tile")
    val entityView get() = byType("entity")

    /**
     * Returns the export path for the given view [type], resolving and caching it on first use.
     */
    fun byType(type: String) = namedPath.computeIfAbsent(type) { get(type, filename) }

    companion object {
        /** The directory under which all exported documents are written. */
        val BASE_DIR = AppPaths.DOC_EXPORT_DIR

        /** Resolves [first] against [BASE_DIR]. */
        fun get(first: String) = BASE_DIR.resolve(first)

        /** Resolves [first] against [BASE_DIR], then [second] against the result. */
        fun get(first: String, second: String) = get(first).resolve(second)
    }
}

/**
 * The location used for the nil (placeholder) document; an alias of `NIL_PAGE_URL`.
 * */
const val NILLocation: String = NIL_PAGE_URL

/**
 * The base URI of the nil document; the same value as [NILLocation].
 * */
const val NILBaseUri: String = NILLocation

/**
 * The shared nil document: an empty shell document created with [NILBaseUri] as its base URI.
 * */
val NILDocument: Document = Document.createShell(NILBaseUri)

/**
 * The shared nil element: the body of [NILDocument].
 * */
val NILElement: Element = NILDocument.body()

/**
 * The shared nil node: the very same object as [NILElement].
 * */
val NILNode: Node = NILElement

@Deprecated("Inappropriate name", ReplaceWith("NILDocument"))
val nilDocument = NILDocument

@Deprecated("Inappropriate name", ReplaceWith("NILElement"))
val nilElement = NILElement

@Deprecated("Inappropriate name", ReplaceWith("NILNode"))
val nilNode = NILNode

/**
 * Whether this document is the shared [NILDocument] instance (identity comparison).
 * */
val Document.isNil get() = this === NILDocument

/**
 * The element that holds the meta information generated by PulsarRPA, looked up by the id
 * `PULSAR_META_INFORMATION_ID`; null if the document has no such element.
 * */
val Document.pulsarMetaElement get() = getElementById(PULSAR_META_INFORMATION_ID)
/**
 * The element that holds a piece of script generated by PulsarRPA used to hold data, looked up by
 * the id `PULSAR_SCRIPT_SECTION_ID`; null if the document has no such element.
 * */
val Document.pulsarScriptElement get() = getElementById(PULSAR_SCRIPT_SECTION_ID)
/**
 * The script generated by PulsarRPA used to hold data: the text of [pulsarScriptElement],
 * or null if that element is absent.
 * */
val Document.pulsarScript get() = pulsarScriptElement?.text()
/**
 * The normalized URI of the document. It is also the key used to retrieve the document from the
 * database and is always the same as [ai.platon.pulsar.persist.WebPage].url.
 *
 * The nil document reports [NILBaseUri]. Otherwise the value is read from the `href` of the
 * `link[rel=PULSAR_DOCUMENT_NORMALIZED_URI]` element in the head, falling back to the attribute of
 * the same name on [pulsarMetaElement]. Returns null when neither source is present.
 * */
val Document.normalizedURI: String?
    get() = when {
        isNil -> NILBaseUri
        // The <link> element carrying the normalized URI is added by JsoupParser
        else -> head().selectFirstOrNull("link[rel=$PULSAR_DOCUMENT_NORMALIZED_URI]")?.attr("href")
            ?: pulsarMetaElement?.attr(PULSAR_DOCUMENT_NORMALIZED_URI)
    }

/**
 * Whether the document is initialized. The flag is an [AtomicBoolean] created lazily on first
 * access and stored in the document's variables.
 * */
var Document.isInitialized by field { AtomicBoolean() }

// NOTE(review): the element type argument of ConcurrentSkipListSet appears to be missing here
// (presumably lost when the source was extracted) — confirm against the upstream file.
internal val Document.threadIds by field { ConcurrentSkipListSet() }
/**
 * The view port of the document, computed once by `calculateViewPort()` and then cached.
 * */
val Document.viewPort by field { it.calculateViewPort() }

// geometric grid, we have two grids, a bigger one and a smaller one
/**
 * The primary grid of the document. Defaults to a 0x0 dimension.
 * */
var Document.primaryGrid by field { Dimension(0, 0) }
/**
 * The secondary grid of the document. Defaults to a 0x0 dimension.
 * */
var Document.secondaryGrid by field { Dimension(0, 0) }
/**
 * The grid of the document. Defaults to a 0x0 dimension.
 * */
var Document.grid by field { Dimension(0, 0) }
/**
 * The unit area of the document. Defaults to 0.
 * */
var Document.unitArea by field { 0 }
/**
 * Various paths to export different views of the document, derived from the document's base URI.
 * */
var Document.exportPaths by field { ExportPaths(it.baseUri()) }
/**
 * Whether the document is annotated. Defaults to false.
 * */
var Document.annotated by field { false }
/**
 * Whether this element is the shared [NILElement] instance (identity comparison).
 * TODO: check if this override Node.isNil or not?
 * */
val Element.isNil get() = this === NILElement
/**
 * Adds class names to the element. Each argument may itself contain several whitespace-separated
 * class names; every resulting name is passed to `addClass`.
 *
 * @param classNames class names, or whitespace-separated groups of class names
 * @return this element, for chaining
 * */
fun Element.addClasses(vararg classNames: String): Element {
    for (group in classNames) {
        for (className in group.split("\\s+".toRegex())) {
            addClass(className)
        }
    }
    return this
}
/**
 * Adds every class name in [classNames] to the element via `addClass`.
 *
 * Unlike the vararg overload, the names are passed through as-is and are not split on whitespace.
 *
 * @param classNames the class names to add
 * @return this element, for chaining
 * */
fun Element.addClasses(classNames: Iterable<String>): Element {
    classNames.forEach { addClass(it) }
    return this
}

/**
 * Makes a slim copy of the element: the element is cloned, the feature vector of each node visited
 * in the clone is reset to an empty vector, the clone is passed through `simplifyDOM`, and finally
 * all attributes are cleared on the clone and its descendant elements.
 *
 * @return the slimmed clone; the receiver is not modified by this function itself
 * */
fun Element.slimCopy(): Element = clone().also { copy ->
    copy.forEach { node -> node.extension.features = ArrayRealVector() }
    simplifyDOM(copy)

    copy.clearAttributesCascaded()
}
/**
 * Makes a minimal copy of the element: the element is cloned and the clone is passed through
 * `simplifyDOM`.
 *
 * Note that the removal of unnecessary attributes is currently disabled (see the TODO below), so the
 * copy keeps its attributes.
 *
 * @return the simplified clone
 * */
fun Element.minimalCopy(): Element = clone().also { copy ->
    simplifyDOM(copy)

    // TODO: might have a bug because of concurrent modification, it can be re-produced by calling dom_minimal_html()
    // copy.removeUnnecessaryAttributesCascaded()
}
/**
 * Returns the texts of the element's own text nodes, i.e. the text of every direct child node that
 * is a [TextNode], in document order. Child nodes of other kinds are skipped.
 * */
fun Element.ownTexts(): List<String> {
    return childNodes().mapNotNull { (it as? TextNode)?.text() }
}
/**
 * Returns the possibly valuable class names of the element: the `class` attribute is split on
 * whitespace and the result is filtered by `getValuableClassNames`.
 * */
fun Element.valuableClassNames(): Set<String> {
    val classNames = className().split("\\s+".toRegex()).toMutableSet()
    return getValuableClassNames(classNames)
}

@Deprecated("Use Element.valuableClassNames() instead", ReplaceWith("valuableClassNames()"))
fun Element.qualifiedClassNames(): Set<String> = valuableClassNames()
/**
 * Sets the attribute [attributeKey] to the string form of [attributeValue]; an existing attribute
 * with the same key is replaced.
 *
 * @return this element, for chaining
 * */
fun Element.anyAttr(attributeKey: String, attributeValue: Any): Element = apply {
    attr(attributeKey, attributeValue.toString())
}
/**
 * Removes the temporary attributes from the element: every attribute whose key is listed in
 * `TEMPORARY_ATTRIBUTES` or starts with `tv`.
 *
 * NOTE(review): despite the `Cascaded` suffix, only this element's own attributes are processed;
 * descendants are not visited. Confirm whether that is intended.
 *
 * @return this element, for chaining
 * */
fun Element.removeTemporaryAttributesCascaded(): Element {
    val temporaryKeys = attributes()
        .mapNotNull { it.key }
        .filter { key -> key in TEMPORARY_ATTRIBUTES || key.startsWith("tv") }
    for (key in temporaryKeys) {
        removeAttr(key)
    }
    return this
}
/**
 * Removes from this element every attribute whose key is not listed in `STANDARD_ATTRIBUTES`.
 *
 * @return this element, for chaining
 * */
fun Element.removeNonStandardAttributes(): Element {
    // Collect the keys first so that attributes are not removed while being iterated.
    val keys = attributes().mapNotNull { it.key }
    for (key in keys) {
        if (key in STANDARD_ATTRIBUTES) continue
        removeAttr(key)
    }
    return this
}
/**
 * Removes from this element every attribute whose key is not listed in `VALUABLE_ATTRIBUTES`.
 *
 * @return this element, for chaining
 * */
fun Element.removeUnnecessaryAttributes(): Element {
    // Collect the keys first so that attributes are not removed while being iterated.
    val unnecessaryKeys = attributes().mapNotNull { it.key }.filter { it !in VALUABLE_ATTRIBUTES }
    for (key in unnecessaryKeys) {
        removeAttr(key)
    }
    return this
}
/**
 * Removes all non-standard attributes from this element and from every element visited by
 * `forEachElement` with the root included.
 *
 * @return this element, for chaining
 * */
fun Element.removeNonStandardAttributesCascaded(): Element = apply {
    forEachElement(includeRoot = true) { element -> element.removeNonStandardAttributes() }
}
/**
 * Removes all unnecessary attributes from this element and from every element visited by
 * `forEachElement` with the root included.
 *
 * @return this element, for chaining
 * */
fun Element.removeUnnecessaryAttributesCascaded(): Element = apply {
    forEachElement(includeRoot = true) { element -> element.removeUnnecessaryAttributes() }
}
/**
 * Clears all attributes of this element and of every element visited by `forEachElement` with the
 * root included.
 *
 * @return this element, for chaining
 * */
fun Element.clearAttributesCascaded(): Element = apply {
    forEachElement(includeRoot = true) { element -> element.clearAttributes() }
}
/**
 * Parses the `style` attribute of the element into an array of declarations.
 *
 * The attribute value is first cleaned by `Strings.stripNonChar` (keeping `:` and `;`), then split
 * on `;`; trailing empty entries are dropped.
 *
 * @return the declarations, one array entry per `;`-separated part
 * */
fun Element.parseStyle(): Array<String> {
    return Strings.stripNonChar(attr("style"), ":;")
        .split(";".toRegex())
        .dropLastWhile { it.isEmpty() }
        .toTypedArray()
}
/**
 * Looks up the value of [styleKey] in the element's parsed `style` attribute (see [parseStyle]).
 * */
fun Element.getStyle(styleKey: String): String = getStyle(parseStyle(), styleKey)
/**
 * Whether this node is the shared [NILNode] instance (identity comparison).
 * */
val Node.isNil get() = this === NILNode

/**
 * The owner document of the node, read from `extension.ownerDocumentNode`.
 * Throws a NullPointerException if the owner document node is null, and a ClassCastException if
 * it is not a [Document].
 * TODO: should not call ownerDocument.extension, which is a recursive call
 * */
val Node.ownerDocument get() = Objects.requireNonNull(extension.ownerDocumentNode) as Document

/**
 * Get the URL this Document was parsed from. If the starting URL is a redirect,
 * this will return the final URL from which the document was served from.
 *
 * Note: In most cases the base URL is simply the location of the document, but it can be affected by many factors,
 * including the `<base>` element in HTML and the xml:base attribute in XML.
 *
 * The base URL of a document is used to resolve relative URLs when the browser needs to obtain an absolute URL,
 * for example when processing the HTML `<img>` element's src attribute or XML xlink:href attribute.
 *
 * @return the location of the node's owner document
 */
val Node.location: String get() = ownerDocument.location()
/**
 * The traversal depth of the node, stored in the `DEP` feature.
 * */
var Node.depth by IntFeature(DEP)
/**
 * The traversal sequence of the node, stored in the `SEQ` feature.
 * */
val Node.sequence by IntFeature(SEQ)
/**
 * A globally unique id of the node, built from the document location, the node's sequence and its
 * rectangle: `"<location> <sequence>-<left>-<top>-<width>-<height>"`.
 * */
val Node.globalId: String get() = "$location $sequence-$left-$top-$width-$height"

//////////////////////////////////////////////////////////////////////////
// Geometric information

/**
 * The left position of the node, stored in the `LEFT` feature.
 * */
var Node.left by IntFeature(LEFT)
/**
 * The top position of the node, stored in the `TOP` feature.
 * */
var Node.top by IntFeature(TOP)
/**
 * The width of the node, stored in the `WIDTH` feature.
 * */
var Node.width: Int by IntFeature(WIDTH)
/**
 * The height of the node, stored in the `HEIGHT` feature.
 * */
var Node.height: Int by IntFeature(HEIGHT)
/**
 * The right position of the node: `left + width`.
 * */
val Node.right: Int get() = left + width
/**
 * The bottom position of the node: `top + height`.
 * */
val Node.bottom: Int get() = top + height
/**
 * The x coordinate of the node. It is the same as the left position.
 *
 * The coordinate system:
 * ------------------> x
 * |
 * |
 * |
 * v
 * y
 *
 * @see Node.left
 * */
val Node.x get() = left
/**
 * The y coordinate of the node. It is the same as the top position.
 *
 * The coordinate system:
 * ------------------> x
 * |
 * |
 * |
 * v
 * y
 *
 * @see Node.top
 * */
val Node.y get() = top
/**
 * The x2 coordinate of the node. It is the same as the right position.
 *
 * @see Node.right
 * */
val Node.x2 get() = right
/**
 * The y2 coordinate of the node. It is the same as the bottom position.
 *
 * @see Node.bottom
 * */
val Node.y2 get() = bottom
/**
 * The center x coordinate of the node: `(x + x2) / 2`, using integer division.
 * */
val Node.centerX get() = (x + x2) / 2
/**
 * The center y coordinate of the node: `(y + y2) / 2`, using integer division.
 * */
val Node.centerY get() = (y + y2) / 2
/**
 * The geometric location of the node, it is the top-left point of the node.
 * A new [Point] is created on every access.
 * */
val Node.geoLocation get() = Point(x, y)
/**
 * The dimension of the node. A new [Dimension] is created on every access.
 * */
val Node.dimension get() = Dimension(width, height)
/**
 * The rectangle of the node. A new [Rectangle] is created on every access.
 * */
val Node.rectangle get() = Rectangle(x, y, width, height)
/**
 * The area of the node: `width * height`.
 * */
val Node.area get() = width * height

/**
 * Whether the hidden flag is set by javascript, i.e. the node carries the
 * `PULSAR_ATTR_HIDDEN` attribute.
 * */
val Node.hasHiddenFlag: Boolean get() = hasAttr(PULSAR_ATTR_HIDDEN)

/**
 * Whether the overflow hidden flag is set by javascript, i.e. the node carries the
 * `PULSAR_ATTR_OVERFLOW_HIDDEN` attribute.
 * */
val Node.hasOverflowHiddenFlag: Boolean get() = hasAttr(PULSAR_ATTR_OVERFLOW_HIDDEN)

/**
 * Whether the node is visible.
 *
 * A node carrying the hidden or overflow-hidden flag is never visible. Otherwise an element is
 * judged by its own rectangle, a text node by the rectangle of its parent, and any other kind of
 * node is considered visible.
 *
 * TODO: there are bugs in this method and can not be used.
 * */
val Node.isVisible: Boolean
    get() {
        if (hasHiddenFlag || hasOverflowHiddenFlag) {
            return false
        }
        if (this is Element) {
            return isVisibleElement(this)
        }
        if (this is TextNode) {
            return isVisibleElement(parent())
        }
        return true
    }

/**
 * Returns true when [node] is an [Element] whose rectangle is non-empty and has a non-negative
 * origin. Null and non-element nodes are reported as not visible.
 * */
private fun isVisibleElement(node: Node?): Boolean {
    val element = node as? Element ?: return false

    val bounds = element.rectangle
    val hasNonNegativeOrigin = bounds.x >= 0 && bounds.y >= 0
    return hasNonNegativeOrigin && !bounds.isEmpty
}

/**
 * Whether the node is hidden, i.e. not [isVisible].
 * */
val Node.isHidden: Boolean get() = !this.isVisible
/**
 * Whether the node is a TextNode.
 * */
val Node.isText: Boolean get() = this is TextNode
/**
 * Whether the node is a TextNode and the text is blank.
 * */
val Node.isBlankText: Boolean
    get() = this is TextNode && this.isBlank
/**
 * Whether the node is a TextNode and the text is non-blank.
 * */
val Node.isNonBlankText: Boolean
    get() = this is TextNode && !this.isBlank
/**
 * Whether the node is a regular TextNode. A regular TextNode is a TextNode that is visible and non-blank.
 * */
val Node.isRegularText: Boolean
    get() = isVisible && isNonBlankText
/**
 * Whether the node is an image, i.e. its node name is `img`.
 * */
val Node.isImage: Boolean get() = this.nodeName() == "img"
/**
 * Whether the node is a regular image. A regular image is an image that is visible and has a src attribute.
 * */
val Node.isRegularImage: Boolean
    get() = isImage && isVisible && hasAttr("src")
/**
 * Whether the node is an anchor image. An anchor image is an image that has an anchor among its
 * ancestors (not necessarily as its direct parent).
 * */
val Node.isAnchorImage: Boolean get() = isImage && this.hasAncestor { it.isAnchor }
/**
 * Whether the node is an anchor node, i.e. its node name is `a`.
 * */
val Node.isAnchor: Boolean get() = this.nodeName() == "a"
/**
 * Whether the node is a regular anchor. A regular anchor is an anchor that is visible.
 * */
val Node.isRegularAnchor: Boolean get() = isVisible && isAnchor
/**
 * Whether the node is an image anchor. An image anchor is an anchor whose [numImages] feature
 * is exactly 1.
 * */
val Node.isImageAnchor: Boolean get() = isAnchor && this.numImages == 1
/**
 * Whether the node is a regular image anchor. A regular image anchor is an image anchor that is a
 * regular anchor and has exactly one image descendant.
 * */
val Node.isRegularImageAnchor: Boolean get() = isRegularAnchor && this.numImages == 1
/**
 * Whether the node is a table, i.e. its node name is `table`.
 * */
val Node.isTable: Boolean get() = this.nodeName() == "table"
/**
 * Whether the node is a list, i.e. its node name is `ul` or `ol`.
 * */
val Node.isList: Boolean get() = this.nodeName() in arrayOf("ul", "ol")
/**
 * Whether the node is a regular text node and the text is short: its clean text has 1 to 9 characters.
 * */
val Node.isShortText get() = isRegularText && cleanText.length in 1..9
/**
 * Whether the node is a regular text node and the text is at most medium sized: its clean text has
 * 1 to 20 characters. Note that every short text is also a medium text.
 * */
val Node.isMediumText get() = isRegularText && cleanText.length in 1..20
/**
 * Whether the node is a regular text node and the text is long: its clean text has more than 20 characters.
 * */
val Node.isLongText get() = isRegularText && cleanText.length > 20
/**
 * Whether the node is a short regular text node and the text is one of the currency symbols `¥` or `$`.
 * */
val Node.isCurrencyUnit get() = isShortText && cleanText in arrayOf("¥", "$")
/**
 * Whether the node is a medium regular text node and the text is numeric according to
 * `StringUtils.isNumeric`.
 * */
val Node.isNumeric get() = isMediumText && StringUtils.isNumeric(cleanText)

// TODO: "isShortText" should be in -2147483648 to 2147483647, it's mapped to java.lang.Integer.
// TODO: detect all SQL types
/**
 * Whether the node is a short regular text node and the text is an integer number according to
 * `StringUtils.isNumeric`.
 * */
val Node.isInt get() = isShortText && StringUtils.isNumeric(cleanText)
/**
 * Whether the node is a short regular text node and the text is a float number.
 * */
val Node.isFloat get() = isShortText && Strings.isFloat(cleanText)
/**
 * Whether the node is a medium regular text node and the text looks like a number.
 * */
val Node.isNumericLike get() = isMediumText && Strings.isNumericLike(cleanText)
/**
 * Whether the node is a short regular text node and the text looks like a money value.
 * */
val Node.isMoneyLike get() = isShortText && Strings.isMoneyLike(cleanText)
/**
 * Try to parse the text of the node as an integer number. Int.MIN_VALUE is passed to the parser
 * as the default value for text that is not a valid integer number.
 * The value is computed on first access and then cached in the node's variables.
 * */
val Node.intValue by field { SParser(it.cleanText).getInt(Int.MIN_VALUE) }
/**
 * Try to parse the text of the node as a double number. Double.NaN is passed to the parser
 * as the default value for text that is not a valid number.
 * The value is computed on first access and then cached in the node's variables.
 * */
val Node.doubleValue by field { SParser(it.cleanText).getDouble(Double.NaN) }

//////////////////////////////////////////////////////////////////////////
// Distinguished features
/**
 * The number of characters of the node, stored in the `CH` feature.
 * */
var Node.numChars by IntFeature(CH)
/**
 * The number of siblings of the node, stored in the `SIB` feature.
 * */
var Node.numSiblings by IntFeature(SIB)
/**
 * The number of children of the node, stored in the `C` feature.
 * */
var Node.numChildren by IntFeature(C)
/**
 * The number of descendant text nodes, stored in the `TN` feature.
 * */
var Node.numTextNodes by IntFeature(TN)
/**
 * The number of descendant images, stored in the `IMG` feature.
 * */
var Node.numImages by IntFeature(IMG)
/**
 * The number of descendant anchors, stored in the `A` feature.
 * */
var Node.numAnchors by IntFeature(A)
/**
 * The text node density, stored in the `DNS` feature.
 * */
var Node.textNodeDensity by DoubleFeature(DNS)

//////////////////////////////////////////////////////////////////////////
// semantics

/**
 * The CSS selector of the node when it is an element, otherwise the node name.
 * */
val Node.selectorOrName: String
    get() = if (this is Element) this.cssSelector() else nodeName()
/**
 * The caption of the node when it has one, otherwise the [name] of the node.
 * */
val Node.captionOrName: String
    get() = if (hasCaption()) caption else name
/**
 * The caption of the node when it has one; otherwise the CSS selector when the node is an element;
 * otherwise the node name.
 * */
val Node.captionOrSelectorOrName: String
    get() = if (hasCaption()) caption else selectorOrName

/**
 * The trimmed text of this node; empty for null and for nodes that are neither text nodes nor elements.
 *
 * A text node's text is read from its stored immutable text, while an element's text is
 * accumulated from its subtree on every access. This trades time against space.
 * */
val Node?.cleanText: String
    get() {
        val raw = when (this) {
            is TextNode -> extension.immutableText
            is Element -> accumulateText(this)
            else -> ""
        }
        return raw.trim()
    }

/**
 * Joins the texts of this node into a single trimmed string, wrapped in [prefix] and [suffix].
 *
 * A text node contributes its stored immutable text. For an element the texts of its subtree are
 * accumulated, with [separator] inserted at block-level and `br` boundaries. Any other node
 * contributes an empty string.
 *
 * @param separator inserted between the accumulated texts of an element
 * @param prefix prepended to the trimmed text
 * @param suffix appended to the trimmed text
 * */
fun Node.joinToString(separator: String = " ", prefix: String = "", suffix: String = ""): String {
    val body = when (this) {
        is TextNode -> extension.immutableText
        is Element -> accumulateText(this, separator)
        else -> ""
    }.trim()

    // An empty prefix or suffix contributes nothing, so plain concatenation covers every case.
    return prefix + body + suffix
}
/**
 * The text representation of the node: the absolute `src` for an image, the absolute `href` for an
 * anchor, the clean text for any other text node or element, and an empty string otherwise.
 * */
val Node.textRepresentation: String
    get() {
        if (isImage) return attr("abs:src")
        if (isAnchor) return attr("abs:href")
        return if (this is TextNode || this is Element) cleanText else ""
    }

/**
 * The slim html representation of the node, computed on first access and then cached.
 *
 * Images, anchors, numeric-like and money-like texts, text nodes, `li` and `td` are rendered by
 * `atomSlimHtml`. `ul`, `ol` and `tr` elements are rendered as the element's tag wrapping the
 * atom representation of each child. Any other element is rendered as the outer html of its
 * [slimCopy].
 *
 * TODO: slim table
 * */
val Node.slimHtml by field {
    val nm = it.nodeName().lowercase()
    when {
        it.isImage || it.isAnchor || it.isNumericLike || it.isMoneyLike || it is TextNode || nm == "li" || nm == "td" ->
            atomSlimHtml(it)

        it is Element && (nm == "ul" || nm == "ol" || nm == "tr") -> {
            val items = it.children().joinToString("") { c -> atomSlimHtml(c) }
            // Emit the closing tag as well, so that the wrapper element is well-formed.
            "<$nm>$items</$nm>"
        }

        it is Element -> it.slimCopy().outerHtml()
        // NOTE(review): this format string is a bare "%s"; presumably it once wrapped the name in
        // some markup that was lost — confirm against the upstream file.
        else -> String.format("%s", it.name)
    }
}

/**
 * The minimal html representation of the node, computed on first access and then cached.
 *
 * Images, anchors, numeric-like and money-like texts, text nodes, `li` and `td` are rendered by
 * `atomSlimHtml`. `ul`, `ol` and `tr` elements are rendered as the element's tag wrapping the
 * atom representation of each child. Any other element is rendered as the outer html of its
 * [minimalCopy].
 * */
val Node.minimalHtml by field {
    val nm = it.nodeName().lowercase()
    when {
        it.isImage || it.isAnchor || it.isNumericLike || it.isMoneyLike || it is TextNode || nm == "li" || nm == "td" ->
            atomSlimHtml(it)

        it is Element && (nm == "ul" || nm == "ol" || nm == "tr") -> {
            val items = it.children().joinToString("") { c -> atomSlimHtml(c) }
            // Emit the closing tag as well, so that the wrapper element is well-formed.
            "<$nm>$items</$nm>"
        }

        it is Element -> it.minimalCopy().outerHtml()
        // NOTE(review): this format string is a bare "%s"; presumably it once wrapped the name in
        // some markup that was lost — confirm against the upstream file.
        else -> String.format("%s", it.name)
    }
}

/**
 * Renders [node] as a small, self-contained html fragment.
 *
 * Text nodes, numeric-like and money-like nodes and other elements are rendered as their clean
 * text; images via `createSlimImageHtml`; anchors via `createLink`; `li`, `td` and `th` as the
 * element's tag wrapping its clean text; anything else as the node's name.
 *
 * NOTE(review): several branches use a bare "%s" format or a bare string template; presumably they
 * once wrapped the text in markup that was lost — confirm against the upstream file.
 * */
private fun atomSlimHtml(node: Node): String {
    val nm = node.nodeName()
    return when {
        node is TextNode -> String.format("%s", node.cleanText)
        node.isImage -> createSlimImageHtml(node)
        node.isAnchor -> createLink(node as Element, keepMetadata = false, lazy = true).toString()
        node.isNumericLike || node.isMoneyLike -> "${node.cleanText}"
        // Emit the closing tag as well, so that the cell/item element is well-formed.
        nm == "li" || nm == "td" || nm == "th" -> "<$nm>${node.cleanText}</$nm>"
        node is Element -> node.cleanText
        else -> String.format("%s", node.name)
    }
}

/**
 * Renders an image node as an `img` tag carrying its absolute `src`, its `vi` attribute and its
 * `alt` text.
 *
 * The previous format string had a single placeholder for three arguments, so `vi` and `alt`
 * were silently dropped.
 *
 * NOTE(review): the attribute values are inserted without escaping; a value containing a single
 * quote would break the markup — confirm whether callers need escaping.
 * */
private fun createSlimImageHtml(node: Node): String = node.run {
    String.format(
        "<img src='%s' vi='%s' alt='%s'/>",
        absUrl("src"), attr("vi"), attr("alt")
    )
}
/**
 * The key of the node: the document location and the node's sequence, joined by `#`.
 * */
val Node.key: String get() = "$location#$sequence"
/**
 * The name of the node.
 *
 * - a document is named `:root`;
 * - an element is named `#id` when it has an id, otherwise by its valuable class names joined and
 *   prefixed with `.`, otherwise by its node name;
 * - a text node takes the name of its parent element, followed by `~<siblingIndex>` when it has
 *   more than one sibling node;
 * - any other node is named by its node name.
 * */
val Node.name: String
    get() {
        // Document must be tested before Element, since a Document is itself an Element.
        if (this is Document) {
            return ":root"
        }

        if (this is Element) {
            val id = id()
            if (id.isNotEmpty()) {
                return "#$id"
            }

            val cls = valuableClassNames()
            return if (cls.isNotEmpty()) cls.joinToString(".", ".") { it } else nodeName()
        }

        if (this is TextNode) {
            val postfix = if (siblingNodes().size > 1) "~" + siblingIndex() else ""
            return bestElement.name + postfix
        }

        return nodeName()
    }
/**
 * The canonical name of the node.
 *
 * - a document is named by its location;
 * - an element is named by its node name followed by `#id` when it has a non-blank id, or
 *   otherwise by its valuable class names joined and prefixed with `.`;
 * - a text node takes the canonical name of its parent element, followed by `~<siblingIndex>`
 *   when it has more than one sibling node;
 * - any other node is named by its node name.
 * */
val Node.canonicalName: String
    get() = when (this) {
        is Document -> location

        is Element -> {
            val trimmedId = id().trim()
            val idPart = if (trimmedId.isEmpty()) "" else "#$trimmedId"

            // Class names are only consulted when the element has no id.
            val classPart = if (idPart.isEmpty()) {
                valuableClassNames().takeIf { it.isNotEmpty() }?.joinToString(".", ".") { it } ?: ""
            } else {
                ""
            }

            "${nodeName()}$idPart$classPart"
        }

        is TextNode -> {
            val postfix = if (siblingNodes().size > 1) "~" + siblingIndex() else ""
            // Recursive call on the parent element
            bestElement.canonicalName + postfix
        }

        else -> nodeName()
    }
/**
 * The unique name of the node: its sequence and its [canonicalName], joined by `-`.
 * */
val Node.uniqueName: String get() = "$sequence-$canonicalName"
/**
 * The [name] of the node followed by its rectangle formatted with `str`.
 * */
val Node.namedRect: String get() = "$name-${rectangle.str}"
/**
 * The [name] of the node followed by its rectangle formatted with `str2`.
 * */
val Node.namedRect2: String get() = "$name-${rectangle.str2}"
/**
 * Get the parent element of this node, an exception is thrown if it's root
 * (the parent is cast to [Element] unconditionally).
 * */
val Node.parentElement get() = this.parent() as Element
/**
 * Returns a best element to represent this node: if the node itself is an element, returns itself
 * otherwise, returns its parent
 * */
val Node.bestElement get() = (this as? Element) ?: parentElement
/**
 * The caption of the node: its caption words joined by `;`.
 * */
val Node.caption get() = getCaptionWords().joinToString(";")
/**
 * Returns the value of the attribute [attrName], or null when the node is not an element, the
 * attribute is not present, or its value is blank.
 * */
fun Node.attrOrNull(attrName: String): String? {
    val element = this as? Element ?: return null
    return element.attr(attrName)?.takeIf { it.isNotBlank() }
}
/**
 * Returns the feature value stored at index [key] of the node's feature vector.
 * */
fun Node.getFeature(key: Int): Double {
    return extension.features[key]
}
/**
 * Returns the feature value whose index is resolved from [name] by `NodeFeature.getKey`.
 * */
fun Node.getFeature(name: String): Double {
    val key = NodeFeature.getKey(name)
    return extension.features[key]
}
/**
 * Returns the feature at index [key] as a [FeatureEntry] pairing the key with its value.
 * */
fun Node.getFeatureEntry(key: Int): FeatureEntry {
    val value = getFeature(key)
    return FeatureEntry(key, value)
}
/**
 * Stores [value] at index [key] of the node's feature vector.
 * */
fun Node.setFeature(key: Int, value: Double) {
    val features = extension.features
    features[key] = value
}
/**
 * Stores [value], converted to a double, at index [key] of the node's feature vector.
 * */
fun Node.setFeature(key: Int, value: Int) {
    val features = extension.features
    features[key] = value.toDouble()
}
/**
 * Removes the feature at index [key]. The removal is done by setting the value to 0.0.
 *
 * @return this node, for chaining
 * */
fun Node.removeFeature(key: Int): Node = apply {
    extension.features[key] = 0.0
}
/**
 * Clears all the features of the node by replacing the feature vector with an empty vector.
 *
 * @return this node, for chaining
 * */
fun Node.clearFeatures(): Node = apply {
    extension.features = ArrayRealVector()
}

/**
 * Returns the temporary node variable stored under [name], or null when no value is stored or the
 * stored value is not an instance of [T].
 * */
inline fun <reified T> Node.getVariable(name: String): T? {
    val v = extension.variables[name]
    return if (v is T) v else null
}

/**
 * Returns the temporary node variable stored under [name], or [defaultValue] when no value is
 * stored or the stored value is not an instance of [T]. The default value is not stored.
 * */
inline fun <reified T> Node.getVariable(name: String, defaultValue: T): T {
    val v = extension.variables[name]
    return if (v is T) v else defaultValue
}

/**
 * Returns the temporary node variable stored under [name] when it is an instance of [T]. Otherwise
 * computes a value with [mappingFunction], stores it under [name] (replacing any value of another
 * type) and returns it.
 * */
inline fun <reified T> Node.computeVariableIfAbsent(name: String, mappingFunction: (String) -> T): T {
    val existing = extension.variables[name]
    if (existing is T) {
        return existing
    }

    val computed = mappingFunction(name)
    extension.variables[name] = computed
    return computed
}

/**
 * Stores [value] as the temporary node variable [name], replacing any previous value.
 * */
fun Node.setVariable(name: String, value: Any) {
    val variables = extension.variables
    variables[name] = value
}

/**
 * Stores [value] as the temporary node variable [name] unless [value] is null, in which case
 * nothing is changed.
 * */
fun Node.setVariableIfNotNull(name: String, value: Any?) {
    value?.let { extension.variables[name] = it }
}

/**
 * Whether a temporary node variable is stored under [name].
 * */
fun Node.hasVariable(name: String): Boolean = extension.variables.containsKey(name)

/**
 * Removes the temporary node variable [name] and returns the removed value, if any.
 * */
fun Node.removeVariable(name: String): Any? = extension.variables.remove(name)

/**
 * Sets attribute [attributeKey] to the string form of [attributeValue] and returns this node.
 * */
fun Node.anyAttr(attributeKey: String, attributeValue: Any): Node = apply {
    attr(attributeKey, attributeValue.toString())
}

/**
 * Sets attribute [attributeKey] to [attributeValue] and returns [attributeValue].
 * */
fun Node.rAttr(attributeKey: String, attributeValue: String): String =
    attributeValue.also { attr(attributeKey, it) }

/**
 * Sets attribute [attributeKey] to the string form of [attributeValue] and returns
 * [attributeValue] itself (not its string form).
 * */
fun Node.rAnyAttr(attributeKey: String, attributeValue: Any): Any =
    attributeValue.also { attr(attributeKey, it.toString()) }

/**
 * Appends [attributeValue] to the current value of attribute [attributeKey]. When the current
 * value is not empty, [separator] is inserted between the two. There is no guarantee of
 * uniqueness: appending the same value twice stores it twice.
 * */
fun Node.appendAttr(attributeKey: String, attributeValue: String, separator: String = StringUtils.SPACE) {
    val current = attr(attributeKey)
    val combined = if (current.isEmpty()) attributeValue else current + separator + attributeValue
    attr(attributeKey, combined)
}

/**
 * Adds [item] to the tuple named [tupleName], creating the tuple when it does not exist yet.
 *
 * @return the result of adding the item to the backing list
 * */
fun Node.addTupleItem(tupleName: String, item: Any): Boolean {
    return extension.tuples.computeIfAbsent(tupleName) { mutableListOf() }.add(item)
}

/**
 * Removes [item] from the tuple named [tupleName].
 *
 * @return true if the item was removed; false if the tuple does not exist or did not contain it
 * */
fun Node.removeTupleItem(tupleName: String, item: Any): Boolean {
    return extension.tuples[tupleName]?.remove(item) ?: false
}

/**
 * Returns the items of the tuple named [tupleName], or an empty list when there is no such tuple.
 * */
fun Node.getTuple(tupleName: String): List<Any> {
    return extension.tuples[tupleName] ?: listOf()
}

/**
 * Whether the tuple named [tupleName] exists and contains [item].
 * */
fun Node.hasTupleItem(tupleName: String, item: String): Boolean {
    return extension.tuples[tupleName]?.contains(item) ?: false
}

/**
 * Whether a tuple named [tupleName] exists, even if it is empty.
 * */
fun Node.hasTuple(tupleName: String): Boolean {
    return extension.tuples.containsKey(tupleName)
}

/**
 * Removes all items from the tuple named [tupleName]; the (now empty) tuple itself is kept.
 * */
fun Node.clearTuple(tupleName: String) {
    extension.tuples[tupleName]?.clear()
}

/**
 * Removes the tuple named [tupleName] together with all its items.
 * */
fun Node.removeTuple(tupleName: String) {
    extension.tuples.remove(tupleName)
}

/**
 * Adds the trimmed [label] to the node's labels.
 *
 * NOTE(review): the original comment states that labels are unique, but the item is simply added
 * to the backing tuple, which is created with `mutableListOf()`; adding the same label twice
 * therefore looks like it stores it twice — confirm the intended contract.
 * */
fun Node.addLabel(label: String) {
    addTupleItem(A_LABELS, label.trim())
}

/**
 * Removes [label] from the node's labels. The label is matched as given, without trimming.
 *
 * @return true if the label was removed
 * */
fun Node.removeLabel(label: String): Boolean {
    return removeTupleItem(A_LABELS, label)
}

/**
 * Returns the node's labels as strings.
 * */
fun Node.getLabels(): List<String> {
    return getTuple(A_LABELS).map { it.toString() }
}

/**
 * Whether the node has the given [label]. The label is matched as given, without trimming.
 * */
fun Node.hasLabel(label: String): Boolean {
    return hasTupleItem(A_LABELS, label)
}

/**
 * Removes all labels from the node.
 * */
fun Node.clearLabels() {
    removeTuple(A_LABELS)
}

/**
 * Adds the trimmed [label] to the node's ML labels.
 *
 * NOTE(review): the original comment states that ML labels are unique, but the item is simply
 * added to the backing tuple, which is created with `mutableListOf()`; adding the same label twice
 * therefore looks like it stores it twice — confirm the intended contract.
 * */
fun Node.addMlLabel(label: String) {
    addTupleItem(A_ML_LABELS, label.trim())
}

/**
 * Removes [label] from the node's ML labels. The label is matched as given, without trimming.
 *
 * @return true if the label was removed
 * */
fun Node.removeMlLabel(label: String): Boolean {
    return removeTupleItem(A_ML_LABELS, label)
}

/**
 * Returns the node's ML labels as strings.
 * */
fun Node.getMlLabels(): List<String> {
    return getTuple(A_ML_LABELS).map { it.toString() }
}

/**
 * Returns the first ML label of the node, or an empty string when it has none.
 * */
fun Node.getMlLabel(): String {
    return getMlLabels().firstOrNull() ?: ""
}

/**
 * Whether the node has the given ML [label]. The label is matched as given, without trimming.
 * */
fun Node.hasMlLabel(label: String): Boolean {
    return hasTupleItem(A_ML_LABELS, label)
}

/**
 * Removes all ML labels from the node.
 * */
fun Node.clearMlLabels() {
    removeTuple(A_ML_LABELS)
}

/**
 * Adds [word] to the node's caption words after passing it through `Strings.removeNonCJKChar`.
 * */
fun Node.addCaptionWord(word: String) {
    addTupleItem(A_CAPTION, Strings.removeNonCJKChar(word))
}

/**
 * Removes [word] from the node's caption words. The word is matched as given, i.e. it is not
 * normalized the way [addCaptionWord] normalizes its argument.
 *
 * @return true if the word was removed
 * */
fun Node.removeCaptionWord(word: String): Boolean {
    return removeTupleItem(A_CAPTION, word)
}

/**
 * Returns the node's caption words as strings.
 * */
fun Node.getCaptionWords(): List<String> {
    return getTuple(A_CAPTION).map { it.toString() }
}

/**
 * Whether the node's caption words contain [word], matched as given.
 * */
fun Node.hasCaptionWord(word: String): Boolean {
    return hasTupleItem(A_CAPTION, word)
}

/**
 * Whether the node has a caption tuple, even if it holds no words.
 * */
fun Node.hasCaption(): Boolean {
    return hasTuple(A_CAPTION)
}

/**
 * Removes the caption of the node together with all its words.
 * */
fun Node.clearCaption() {
    removeTuple(A_CAPTION)
}

/**
 * Removes the given attributes from the node. The special key `*` clears all attributes.
 *
 * @param attributeKeys the keys of the attributes to remove, or `*`
 * @return this node, for chaining
 * */
fun Node.removeAttrs(vararg attributeKeys: String): Node {
    for (key in attributeKeys) {
        if (key == "*") {
            clearAttributes()
        } else {
            removeAttr(key)
        }
    }
    return this
}

/**
 * Removes the given attributes from the node. The special key `*` clears all attributes.
 *
 * @param attributeKeys the keys of the attributes to remove, or `*`
 * @return this node, for chaining
 * */
fun Node.removeAttrs(attributeKeys: Iterable<String>): Node {
    for (key in attributeKeys) {
        if (key == "*") {
            clearAttributes()
        } else {
            removeAttr(key)
        }
    }
    return this
}

/**
 * Removes every attribute of the node that is accepted by [filter].
 *
 * The matching keys are collected first and removed afterwards, so the attributes are not
 * modified while they are being iterated.
 *
 * @return this node, for chaining
 * */
fun Node.removeAttrsIf(filter: (Attribute) -> Boolean): Node {
    val matchingKeys = attributes().filter { filter(it) }.mapNotNull { it.key }
    removeAttrs(matchingKeys)
    return this
}

/**
 * Formats the selected features of this node and of every node traversed below it, one node per
 * line.
 *
 * @param featureKeys the indices of the features to format
 * @return the formatted features, each node's output followed by a newline
 * */
fun Node.formatEachFeatures(vararg featureKeys: Int): String {
    val sb = StringBuilder()
    NodeTraversor.traverse({ node: Node, _ ->
        FeatureFormatter.format(node.extension.features, featureKeys.asIterable(), sb = sb)
        sb.append('\n')
    }, this)
    return sb.toString()
}

/**
 * Formats the selected features of this node only.
 *
 * @param featureKeys the indices of the features to format
 * */
fun Node.formatFeatures(vararg featureKeys: Int): String {
    return FeatureFormatter.format(extension.features, featureKeys.asIterable()).toString()
}

/**
 * Formats the variables of this node and of every node traversed below it, one node per line.
 *
 * @return the formatted variables, each node's output followed by a newline
 * */
fun Node.formatNamedFeatures(): String {
    val sb = StringBuilder()

    NodeTraversor.traverse({ node, _ ->
        FeatureFormatter.format(node.extension.variables, sb)
        sb.append('\n')
    }, this)

    return sb.toString()
}

/**
 * Returns the ancestor elements of this node, nearest first.
 *
 * The walk goes up through the parents and stops at the first parent that is missing or is not an
 * [Element].
 * */
fun Node.ancestors(): List<Element> {
    val ancestors = mutableListOf<Element>()
    var p = this.parent()
    while (p is Element) {
        ancestors.add(p)
        p = p.parent()
    }

    return ancestors
}

/**
 * Whether `findFirstAncestor` finds an ancestor element of this node that satisfies [predicate].
 * */
fun Node.hasAncestor(predicate: (Element) -> Boolean): Boolean = findFirstAncestor(predicate) != null

/**
 * Whether `findFirstAncestor` finds an ancestor element of this node that satisfies [predicate],
 * with the search bounded by [stop].
 *
 * The [stop] condition is forwarded to `findFirstAncestor`; previously it was silently ignored and
 * the search was unbounded.
 *
 * @param stop the stop condition passed to `findFirstAncestor`
 * @param predicate the condition an ancestor must satisfy
 * */
fun Node.hasAncestor(stop: (Node) -> Boolean, predicate: (Element) -> Boolean): Boolean {
    return findFirstAncestor(stop, predicate) != null
}

/**
 * Whether this node is found among the ancestors of [other].
 * */
fun Node.isAncestorOf(other: Node): Boolean {
    val self = this
    val match = other.findFirstAncestor { it == self }
    return match != null
}

/**
 * Whether this node is found among the ancestors of [other], with the search bounded by [stop].
 * */
fun Node.isAncestorOf(other: Node, stop: (Node) -> Boolean): Boolean {
    val self = this
    val match = other.findFirstAncestor(stop) { it == self }
    return match != null
}

/**
 * Accumulates the texts of all non-blank text nodes under [root] into one string.
 *
 * Whenever a block element or a `br` is entered and some text has already been collected,
 * [seperator] is appended, unless the collected text already ends with it.
 *
 * @param root the element whose subtree is traversed
 * @param seperator the string inserted at block and `br` boundaries
 * @return the accumulated, untrimmed text
 * */
private fun accumulateText(root: Element, seperator: String = " "): String {
    val collected = StringBuilder()
    NodeTraversor.traverse({ node, _ ->
        val text = node.extension.immutableText
        when (node) {
            is TextNode -> if (text.isNotBlank()) {
                collected.append(text)
            }

            is Element -> {
                val startsNewLine = node.isBlock || node.tagName() == "br"
                if (collected.isNotEmpty() && startsNewLine && !collected.endsWith(seperator)) {
                    collected.append(seperator)
                }
            }
        }
    }, root)

    return collected.toString()
}

/**
 * Filters [classNames] in place, removing names that carry little meaning.
 *
 * The empty name is always removed. Then the layout-only names `clearfix`, `left`, `right`, `l`
 * and `r` are removed one by one; if removing one of them would leave the set empty, that name is
 * put back and the filtering stops, so a non-empty input never becomes empty.
 *
 * @param classNames the class names to filter; the set is modified
 * @return the same set instance, after filtering
 * */
private fun getValuableClassNames(classNames: MutableSet<String>): MutableSet<String> {
    classNames.remove("")
    if (classNames.isEmpty()) return classNames

    for (noise in arrayOf("clearfix", "left", "right", "l", "r")) {
        classNames.remove(noise)
        if (classNames.isEmpty()) {
            // Keep at least one name, then stop: no remaining name can match this one.
            classNames.add(noise)
            return classNames
        }
    }
    return classNames
}

/**
 * Calculates the view port from the `view-port` attribute of the owner body, which is expected in
 * the form `<width>x<height>`.
 *
 * The default view port is returned when there is no owner body or the attribute does not consist
 * of exactly two `x`-separated parts. A part that is not an integer falls back to the
 * corresponding default dimension.
 * */
private fun Node.calculateViewPort(): Dimension {
    val fallback = AppConstants.DEFAULT_VIEW_PORT
    val body = extension.ownerBody ?: return fallback

    val parts = body.attr("view-port").split("x")
    if (parts.size != 2) {
        return fallback
    }

    val width = parts[0].toIntOrNull() ?: fallback.width
    val height = parts[1].toIntOrNull() ?: fallback.height
    return Dimension(width, height)
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy