it.skrape.selects.DocElement.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of skrapeit-html-parser Show documentation
Show all versions of skrapeit-html-parser Show documentation
A Kotlin-based testing/scraping/parsing library providing the ability to analyze and extract data from HTML (server & client-side rendered). It places particular emphasis on ease of use and a high level of readability by providing an intuitive DSL. First and foremost it aims to be a testing lib, but it can also be used to scrape websites in a convenient fashion.
package it.skrape.selects
import it.skrape.SkrapeItDsl
import org.jsoup.nodes.Element
/**
 * Read-only view of a single parsed HTML element, wrapping a jsoup [Element].
 *
 * Most properties are declared `by lazy`: they are computed on first access and the result
 * is cached for the lifetime of this instance.
 *
 * @property element the wrapped jsoup element.
 * @property relaxed flag required by [DomTreeElement]; the public constructor sets it to `false`.
 */
@Suppress("TooManyFunctions")
@SkrapeItDsl
public class DocElement internal constructor(
    override val element: Element,
    override val relaxed: Boolean
) : DomTreeElement() {

    /** Creates a non-relaxed [DocElement] for the given jsoup [element]. */
    public constructor(element: Element) : this(element, false)

    /**
     * Get the name of the tag for this element. E.g. `div`.
     *
     * @return String of the tag's name
     */
    public val tagName: String by lazy { element.tagName().orEmpty() }

    /**
     * Gets the text owned by this element only; does not get the combined text of all children.
     * For example, given HTML `<p>Hello <b>there</b> now!</p>`, `p.ownText` returns `"Hello now!"`,
     * whereas `text` returns `"Hello there now!"`.
     * Note that the text within the `b` element is not returned, as it is not a direct child of the `p` element.
     *
     * @return unencoded text, or empty string if none.
     * @see text
     */
    public val ownText: String by lazy { element.ownText().orEmpty() }

    /**
     * Get all of the element's attributes.
     * @return Map<String, String> of attribute key value pairs
     */
    public val attributes: Map<String, String> by lazy {
        element.attributes().associate { it.key to it.value }
    }

    /**
     * Get all attribute keys of the element.
     * @return List<String>
     */
    public val attributeKeys: List<String> by lazy { attributes.keys.toList() }

    /**
     * Get all attribute values of the element.
     * @return List<String>
     */
    public val attributeValues: List<String> by lazy { attributes.values.toList() }

    /**
     * Get the element's attribute value of a given attribute key.
     * @return String of attribute value or empty if non existing.
     */
    public infix fun attribute(attributeKey: String): String = attributes[attributeKey].orEmpty()

    /**
     * Check whether the element carries a non-blank value for the given attribute key.
     *
     * Note: an attribute that is present but whose value is blank is reported as absent,
     * because the check is made on the value returned by [attribute].
     *
     * @return Boolean
     */
    public fun hasAttribute(attributeKey: String): Boolean = attribute(attributeKey).isNotBlank()

    /**
     * Get all data-attributes of the element.
     * @return Map<String, String> of data-attributes as key value pairs
     */
    public val dataAttributes: Map<String, String> by lazy {
        attributes.filterKeys { it.startsWith("data-") }
    }

    /**
     * Gets the literal value of this element's "class" attribute, which may include multiple class names,
     * space separated. (E.g. on `<div class="header gray">` returns, "header gray")
     * @return String of the literal class attribute, or empty string if no class attribute set.
     */
    public val className: String by lazy { attribute("class").trim() }

    /**
     * Get all of the element's class names. E.g. on element `<div class="header gray">`,
     * returns a set of two elements "header", "gray".
     * @return Set<String> distinct classnames, empty if no class attribute
     */
    public val classNames: Set<String> by lazy {
        // Class tokens may be separated by any ASCII whitespace, not just U+0020.
        className
            .split(' ', '\t', '\n', '\r', '\u000C')
            .filter { it.isNotBlank() }
            .toSet()
    }

    /**
     * Case insensitive check if this element has a class.
     * @return Boolean
     */
    public fun hasClass(className: String): Boolean =
        // ignoreCase comparison does not depend on the JVM default locale.
        classNames.any { it.equals(className, ignoreCase = true) }

    /**
     * Gets the literal value of this element's "id" attribute.
     * (E.g. on `<div id="main">` returns, "main")
     * @return String of the literal id attribute value, or empty string if no id attribute set.
     */
    public val id: String by lazy { attribute("id").trim() }

    /**
     * Get this element's parent and ancestors, up to the document root.
     * @return List<DocElement> of parents, closest first.
     */
    public val parents: List<DocElement> by lazy { element.parents().map { DocElement(it) } }

    /**
     * Get this element's parent and ancestors, up to the document root as lambda.
     * @return T
     */
    public fun <T> parents(init: List<DocElement>.() -> T): T = parents.init()

    /**
     * Get this element's parent element.
     * @return DocElement
     * @throws ElementNotFoundException if the element has no parent.
     */
    public val parent: DocElement by lazy {
        parents.firstOrNull() ?: throw ElementNotFoundException("parent")
    }

    /**
     * Get this element's parent element as lambda.
     * @return T
     */
    public fun <T> parent(init: DocElement.() -> T): T = parent.init()

    /**
     * Get all elements that are siblings of this element.
     * @return List<DocElement> of all siblings.
     */
    public val siblings: List<DocElement> by lazy { element.siblingElements().map { DocElement(it) } }

    /**
     * Get all elements that are siblings of this element as lambda.
     * @return T
     */
    public fun <T> siblings(init: List<DocElement>.() -> T): T = siblings.init()

    /**
     * Check if the element is present thereby it will return true if the given node can be found otherwise false.
     * @return Boolean
     */
    public val isPresent: Boolean by lazy { allElements.isNotEmpty() }

    /**
     * Check if the element is NOT present thereby it will return true if the given node can not be found
     * otherwise false.
     * @return Boolean
     */
    public val isNotPresent: Boolean by lazy { !isPresent }

    /**
     * Get a CSS selector that will uniquely select this element.
     * If the element has an ID, returns #id; otherwise returns the parent (if any) CSS selector, followed by '>',
     * followed by a unique selector for the element (tag.class.class:nth-child(n)).
     * @return String representing the CSS Path that can be used to retrieve the element in a selector.
     */
    override val toCssSelector: String
        get() = element.cssSelector()

    /**
     * Tag names of all ancestors joined with `" > "`, outermost ancestor first.
     * @return String, empty if the element has no parents.
     */
    public val parentsCssSelector: String by lazy {
        // [parents] is closest-first, so reverse it; joining an empty list yields "".
        parents.reversed().joinToString(separator = " > ") { it.tagName }
    }

    /**
     * CSS selector built only from this element's own tag name, classes, id and attributes.
     * Attributes with a non-blank value are passed as key/value pairs, attributes with a blank
     * value are passed as keys only; `id` and `class` are handled separately.
     * @return String representation of the resulting [CssSelector].
     */
    public val ownCssSelector: String by lazy {
        val valuedAttributes = attributes
            .filter { (key, value) -> key != "id" && key != "class" && value.isNotBlank() }
            .toList()
        val valuelessAttributeKeys = attributes
            .filterValues { it.isBlank() }
            .map { it.key }

        CssSelector(
            rawCssSelector = tagName,
            withClass = classNames.joinToString(separator = ".").takeIf { it.isNotBlank() },
            withId = id.takeIf { it.isNotBlank() },
            withAttributes = valuedAttributes,
            withAttributeKeys = valuelessAttributeKeys.takeIf { it.isNotEmpty() }
        ).toString()
    }
}
/** The `text` of every element in the list, joined with a single space. */
public val List<DocElement>.text: String
    get() = joinToString(separator = " ") { it.text }
/** The `outerHtml` of every element in the list, joined with a line break. */
public val List<DocElement>.html: String
    get() = joinToString(separator = "\n") { it.outerHtml }
/** `true` if the list contains at least one element. */
public val List<DocElement>.isPresent: Boolean
    get() = isNotEmpty()
/** `true` if the list contains no elements. */
public val List<DocElement>.isNotPresent: Boolean
    get() = isEmpty()
/** The `text` of each element, in list order. */
public val List<DocElement>.eachText: List<String>
    get() = map { it.text }
/** The tag name of each element, in list order. */
public val List<DocElement>.eachTagName: List<String>
    get() = map { it.tagName }
/**
 * All attributes of all elements merged into one map.
 * When several elements share an attribute key, the value of the last element wins.
 */
public val List<DocElement>.eachAttribute: Map<String, String>
    get() = flatMap { it.attributes.toList() }.toMap()
/**
 * All `data-` attributes of all elements merged into one map.
 * When several elements share an attribute key, the value of the last element wins.
 */
public val List<DocElement>.eachDataAttribute: Map<String, String>
    get() = flatMap { it.dataAttributes.toList() }.toMap()
/**
 * Values of the given attribute from every element that has it, joined with `joinToString`'s
 * default separator (`", "`). Elements for which `hasAttribute` is false are skipped.
 *
 * @return the joined values, or an empty string if no element has the attribute.
 */
public infix fun List<DocElement>.attribute(attributeKey: String): String =
    filter { it.hasAttribute(attributeKey) }
        .joinToString { it.attribute(attributeKey) }
/**
 * Value of the given attribute for each element, in list order.
 * Elements yielding an empty value (attribute missing or empty) are left out.
 */
public infix fun List<DocElement>.eachAttribute(attributeKey: String): List<String> =
    map { it attribute attributeKey }
        .filter { it.isNotEmpty() }
/** Distinct class names found across all elements, in order of first occurrence. */
public val List<DocElement>.eachClassName: List<String>
    get() = flatMap { it.classNames }.distinct()
/** The non-empty `href` attribute values of the elements, in list order. */
public val List<DocElement>.eachHref: List<String>
    get() = eachAttribute("href")
/** The non-empty `src` attribute values of the elements, in list order. */
public val List<DocElement>.eachSrc: List<String>
    get() = eachAttribute("src")
/**
 * Map of element text to `href` value for every element that has an `href` attribute.
 * When several elements share the same text, the last one wins.
 */
public val List<DocElement>.eachLink: Map<String, String>
    get() = filter { it.hasAttribute("href") }
        .associate { it.text to it.attribute("href") }
/**
 * Map of `alt` text to `src` value for every `img` element that has a `src` attribute.
 * When several images share the same `alt` text (including a missing one), the last one wins.
 */
public val List<DocElement>.eachImage: Map<String, String>
    get() = filter { it.tagName == "img" && it.hasAttribute("src") }
        .associate { it.attribute("alt") to it.attribute("src") }
/**
 * Invokes [init] with the text and url of every entry of `eachLink`.
 * The value returned by [init] is ignored.
 */
public fun <T> List<DocElement>.forEachLink(init: (text: String, url: String) -> T) {
    eachLink.forEach { (text, url) -> init(text, url) }
}
/**
 * Invokes [init] with the alt text and url of every entry of `eachImage`.
 * The value returned by [init] is ignored.
 */
public fun <T> List<DocElement>.forEachImage(init: (altText: String, url: String) -> T) {
    eachImage.forEach { (altText, url) -> init(altText, url) }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy