All downloads are free. Search and download functionality uses the official Maven repository.

it.skrape.selects.DocElement.kt Maven / Gradle / Ivy

Go to download

A Kotlin-based testing/scraping/parsing library providing the ability to analyze and extract data from HTML (server & client-side rendered). It places particular emphasis on ease of use and a high level of readability by providing an intuitive DSL. First and foremost it aims to be a testing lib, but it can also be used to scrape websites in a convenient fashion.

There is a newer version: 1.3.0-alpha.2
Show newest version
package it.skrape.selects

import it.skrape.SkrapeItDsl
import org.jsoup.nodes.Element

/**
 * Wrapper around a jsoup [Element] that exposes its tag name, text, attributes,
 * classes, relatives and CSS selectors as Kotlin properties.
 *
 * Most properties are computed on first access and then cached via `by lazy`.
 *
 * @property element the wrapped jsoup element
 * @property relaxed flag handed to [DomTreeElement]; its effect is defined there
 */
@Suppress("TooManyFunctions")
@SkrapeItDsl
public class DocElement internal constructor(
    override val element: Element,
    override val relaxed: Boolean
) : DomTreeElement() {
    /** Creates a [DocElement] for [element] with `relaxed` set to `false`. */
    public constructor(element: Element) : this(element, false)

    /**
     * Get the name of the tag for this element. E.g. `div`.
     *
     * @return String of the tag's name
     */
    public val tagName: String by lazy { element.tagName().orEmpty() }

    /**
     * Gets the text owned by this element only; does not get the combined text of all children.
     * For example, given HTML `<p>Hello <b>there</b> now!</p>`, `p.ownText` returns `"Hello now!"`,
     * whereas `text` returns `"Hello there now!"`.
     * Note that the text within the `b` element is not returned, as it is not a direct child of the `p` element.
     *
     * @return unencoded text, or empty string if none.
     * @see text
     */
    public val ownText: String by lazy { element.ownText().orEmpty() }

    /**
     * Get all of the element's attributes.
     * @return Map<String, String> of attribute key value pairs
     */
    public val attributes: Map<String, String> by lazy {
        element.attributes().associate { it.key to it.value }
    }

    /**
     * Get all attribute keys of the element.
     * @return List<String>
     */
    public val attributeKeys: List<String> by lazy { attributes.keys.toList() }

    /**
     * Get all attribute values of the element.
     * @return List<String>
     */
    public val attributeValues: List<String> by lazy { attributes.values.toList() }

    /**
     * Get the element's attribute value of a given attribute key.
     * @return String of attribute value or empty if non existing.
     */
    public infix fun attribute(attributeKey: String): String = attributes[attributeKey].orEmpty()

    /**
     * Check whether the element carries a non-blank value for the given attribute key.
     * An attribute that is present but has a blank value is reported as `false`.
     * @return Boolean
     */
    public fun hasAttribute(attributeKey: String): Boolean = attribute(attributeKey).isNotBlank()

    /**
     * Get all data-attributes of the element.
     * @return Map<String, String> of data-attributes as key value pairs
     */
    public val dataAttributes: Map<String, String> by lazy {
        attributes.filterKeys { it.startsWith("data-") }
    }

    /**
     * Gets the literal value of this element's "class" attribute, which may include multiple class names,
     * space separated. (E.g. on `<div class="header gray">` returns, "header gray")
     * @return String of the literal class attribute, or empty string if no class attribute set.
     */
    public val className: String by lazy { attribute("class").trim() }

    /**
     * Get all of the element's class names. E.g. on element `<div class="header gray">`,
     * returns a set of two elements "header", "gray".
     * @return Set<String> distinct classnames, empty if no class attribute
     */
    public val classNames: Set<String> by lazy {
        className.split(" ").filter { it.isNotBlank() }.toSet()
    }

    /**
     * Case insensitive check if this element has a class.
     * @return Boolean
     */
    public fun hasClass(className: String): Boolean =
        // Compare with ignoreCase instead of lower-casing both sides, so the result
        // does not depend on the JVM's default locale.
        classNames.any { it.equals(className, ignoreCase = true) }

    /**
     * Gets the literal value of this element's "id" attribute.
     * (E.g. on `<div id="main">` returns, "main")
     * @return String of the literal id attribute value, or empty string if no id attribute set.
     */
    public val id: String by lazy { attribute("id").trim() }

    /**
     * Get this element's parent and ancestors, up to the document root.
     * @return List<DocElement> of parents, closest first.
     */
    public val parents: List<DocElement> by lazy { element.parents().map { DocElement(it) } }

    /**
     * Get this element's parent and ancestors, up to the document root as lambda.
     * @return T
     */
    public fun <T> parents(init: List<DocElement>.() -> T): T = parents.init()

    /**
     * Get this element's parent element.
     * @return DocElement
     * @throws ElementNotFoundException if the element has no parent
     */
    public val parent: DocElement by lazy {
        parents.firstOrNull() ?: throw ElementNotFoundException("parent")
    }

    /**
     * Get this element's parent element as lambda.
     * @return T
     */
    public fun <T> parent(init: DocElement.() -> T): T = parent.init()

    /**
     * Get all elements that are siblings of this element.
     * @return List<DocElement> of all siblings.
     */
    public val siblings: List<DocElement> by lazy { element.siblingElements().map { DocElement(it) } }

    /**
     * Get all elements that are siblings of this element as lambda.
     * @return T
     */
    public fun <T> siblings(init: List<DocElement>.() -> T): T = siblings.init()

    /**
     * Check if the element is present thereby it will return true if the given node can be found otherwise false.
     * @return Boolean
     */
    public val isPresent: Boolean by lazy { allElements.isNotEmpty() }

    /**
     * Check if the element is NOT present thereby it will return true if the given node can not be found
     * otherwise false.
     * @return Boolean
     */
    public val isNotPresent: Boolean by lazy { !isPresent }

    /**
     * Get a CSS selector that will uniquely select this element.
     * If the element has an ID, returns #id; otherwise returns the parent (if any) CSS selector, followed by '>',
     * followed by a unique selector for the element (tag.class.class:nth-child(n)).
     * @return String representing the CSS Path that can be used to retrieve the element in a selector.
     */
    override val toCssSelector: String
        get() = element.cssSelector()

    /**
     * Tag names of all ancestors, outermost first, joined by " > ".
     * @return String, empty if the element has no parents.
     */
    public val parentsCssSelector: String by lazy {
        // `parents` is closest-first; reverse it so the selector reads root -> parent.
        parents.reversed().joinToString(separator = " > ") { it.tagName }
    }

    /**
     * CSS selector built from this element's own tag name, classes, id and attributes
     * (ancestors are not taken into account).
     * @return String representation of the resulting [CssSelector].
     */
    public val ownCssSelector: String by lazy {
        CssSelector(
            rawCssSelector = tagName,
            withClass = classNames.joinToString(separator = ".").takeIf { it.isNotBlank() },
            withId = id.takeIf { it.isNotBlank() },
            // id and class are already passed separately above; blank-valued
            // attributes are passed by key only via withAttributeKeys.
            withAttributes = attributes
                .filter { (key, value) -> key != "id" && key != "class" && value.isNotBlank() }
                .toList(),
            withAttributeKeys = attributes
                .filterValues { it.isBlank() }
                .keys
                .toList()
                .takeIf { it.isNotEmpty() }
        ).toString()
    }
}

/** Texts of all elements joined by a single space. */
public val List<DocElement>.text: String
    get() = joinToString(separator = " ") { it.text }

/** Outer HTML of all elements joined by a line break. */
public val List<DocElement>.html: String
    get() = joinToString(separator = "\n") { it.outerHtml }

/** `true` if the list contains at least one element. */
public val List<DocElement>.isPresent: Boolean
    get() = isNotEmpty()

/** `true` if the list contains no elements. */
public val List<DocElement>.isNotPresent: Boolean
    get() = !isPresent

/** Text of each element, in list order. */
public val List<DocElement>.eachText: List<String>
    get() = map { it.text }

/** Tag name of each element, in list order. */
public val List<DocElement>.eachTagName: List<String>
    get() = map { it.tagName }

/**
 * Attributes of all elements merged into one map.
 * If several elements share an attribute key, the value of the last such element is kept.
 */
public val List<DocElement>.eachAttribute: Map<String, String>
    get() = flatMap { it.attributes.toList() }.toMap()

/**
 * Data-attributes of all elements merged into one map.
 * If several elements share a key, the value of the last such element is kept.
 */
public val List<DocElement>.eachDataAttribute: Map<String, String>
    get() = flatMap { it.dataAttributes.toList() }.toMap()

/**
 * Values of the given attribute for all elements that have it, joined by ", ".
 * @return String, empty if no element has the attribute.
 */
public infix fun List<DocElement>.attribute(attributeKey: String): String =
    filter { it.hasAttribute(attributeKey) }
        .joinToString { it.attribute(attributeKey) }

/**
 * Values of the given attribute for each element, skipping elements where the value is empty.
 * @return List<String>
 */
public infix fun List<DocElement>.eachAttribute(attributeKey: String): List<String> =
    map { it attribute attributeKey }
        .filter { it.isNotEmpty() }

/** Distinct class names found on any of the elements. */
public val List<DocElement>.eachClassName: List<String>
    get() = flatMap { it.classNames }.distinct()

/** All non-empty `href` attribute values. */
public val List<DocElement>.eachHref: List<String>
    get() = eachAttribute("href")

/** All non-empty `src` attribute values. */
public val List<DocElement>.eachSrc: List<String>
    get() = eachAttribute("src")

/**
 * Map of element text to `href` value for every element that has an `href` attribute.
 * Elements with identical text collapse into one entry (the last one wins).
 */
public val List<DocElement>.eachLink: Map<String, String>
    get() = filter { it.hasAttribute("href") }
        .associate { it.text to it.attribute("href") }

/**
 * Map of `alt` text to `src` value for every `img` element that has a `src` attribute.
 * Images with identical `alt` text collapse into one entry (the last one wins).
 */
public val List<DocElement>.eachImage: Map<String, String>
    get() = filter { it.tagName == "img" && it.hasAttribute("src") }
        .associate { it.attribute("alt") to it.attribute("src") }

/** Invokes [init] with text and url for every entry of [eachLink]. */
public fun <T> List<DocElement>.forEachLink(init: (text: String, url: String) -> T) {
    eachLink.forEach { (text, url) -> init(text, url) }
}

/** Invokes [init] with alt text and url for every entry of [eachImage]. */
public fun <T> List<DocElement>.forEachImage(init: (altText: String, url: String) -> T) {
    eachImage.forEach { (altText, url) -> init(altText, url) }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy