it.skrape.selects.DomTreeElement.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of skrapeit-html-parser Show documentation
A Kotlin-based testing/scraping/parsing library providing the ability to analyze and extract data from HTML (server & client-side rendered). It places particular emphasis on ease of use and a high level of readability by providing an intuitive DSL. First and foremost it aims to be a testing lib, but it can also be used to scrape websites in a convenient fashion.
package it.skrape.selects
import org.jsoup.nodes.Element
/**
 * Base class for nodes of a parsed HTML tree that wrap a jsoup [Element].
 *
 * It exposes the wrapped element's text and markup, convenience collections over the
 * elements below it, and the CSS-selector hooks ([applySelector], [makeDefault]) required
 * by [CssSelectable].
 */
public abstract class DomTreeElement : CssSelectable() {
    /** The underlying jsoup element every property and query of this class delegates to. */
    public abstract val element: Element

    /**
     * Whether lookups are lenient: when `true`, a selector without matches yields an empty
     * list / a default element instead of throwing [ElementNotFoundException].
     */
    public abstract val relaxed: Boolean

    /**
     * The combined text of this element and all its children. Whitespace is normalized and trimmed.
     *
     * For example, given HTML `<p>Hello <b>there</b> now! </p>`, `text` is `"Hello there now!"`.
     *
     * The value is unencoded, normalized text, or an empty string if there is none.
     * It is computed on first access and cached.
     *
     * @see Element.wholeText if you don't want the text to be normalized.
     * @see Element.ownText
     * @see Element.textNodes
     */
    public val text: String by lazy { element.text().orEmpty() }

    /**
     * The element's inner HTML. E.g. on a `<div>` with one empty `<p>`, this is `<p></p>`
     * (whereas [outerHtml] would be `<div><p></p></div>`).
     *
     * Computed on first access and cached.
     *
     * @see outerHtml
     */
    public val html: String by lazy { element.html().orEmpty() }

    /**
     * The outer HTML of this node. For example, on a `p` element, this may be `<p>Para</p>`.
     *
     * Computed on first access and cached.
     *
     * @see html
     * @see text
     */
    public val outerHtml: String by lazy { element.outerHtml().orEmpty() }

    /**
     * Every element reported by jsoup's `Element.getAllElements()` for [element], each wrapped
     * in a [DocElement]. Computed on first access and cached.
     */
    // NOTE(review): these wrappers are created with DocElement(it) while applySelector uses
    // DocElement(it, relaxed) - confirm whether `relaxed` is intentionally not propagated here.
    public val allElements: List<DocElement> by lazy { element.allElements.map { DocElement(it) } }

    /**
     * This element's child elements, each wrapped in a [DocElement].
     * If this element has no children, the list is empty. Computed on first access and cached.
     */
    // NOTE(review): same as allElements - `relaxed` is not passed to the wrappers; confirm intent.
    public val children: List<DocElement> by lazy {
        element.children().map { DocElement(it) }
    }

    /**
     * Runs [init] with this element's [children] as receiver.
     *
     * @param init block evaluated against the list of child elements.
     * @return whatever [init] returns.
     */
    public fun <T> children(init: List<DocElement>.() -> T): T = children.init()

    /**
     * Collects the value of the attribute named [attributeKey] from every element in
     * [allElements], dropping empty values.
     *
     * @param attributeKey name of the attribute to read.
     * @return the non-empty attribute values, in the order of [allElements].
     */
    public fun eachAttribute(attributeKey: String): List<String> =
        allElements
            .map { it attribute attributeKey }
            .filter { it.isNotEmpty() }

    /** All non-empty `href` attribute values found in [allElements]. Computed once and cached. */
    // eachAttribute already removes empty values, so no additional filtering is needed.
    public val eachHref: List<String> by lazy { eachAttribute("href") }

    /** All non-empty `src` attribute values found in [allElements]. Computed once and cached. */
    public val eachSrc: List<String> by lazy { eachAttribute("src") }

    /**
     * Maps the text of every element that has an `href` attribute to that attribute's value.
     *
     * Recomputed on each access. When several elements share the same text, the last one
     * wins (standard `associate` behavior).
     */
    public val eachLink: Map<String, String>
        get() = allElements
            .filter { it.hasAttribute("href") }
            .associate { it.text to it.attribute("href") }

    /**
     * Maps the `alt` attribute of every `img` element that has a `src` attribute to that
     * `src` value.
     *
     * Recomputed on each access. When several images share the same `alt` value, the last
     * one wins (standard `associate` behavior).
     */
    public val eachImage: Map<String, String>
        get() = allElements
            .filter { it.tagName == "img" && it.hasAttribute("src") }
            .associate { it.attribute("alt") to it.attribute("src") }

    /**
     * Produces the fallback element used in relaxed mode by delegating to the
     * [CssSelectable] implementation of `makeDefault`.
     *
     * @param cssSelector the selector that had no match.
     * @return the default element supplied by the superclass.
     */
    public open fun makeDefaultElement(cssSelector: String): DocElement =
        super.makeDefault(cssSelector)

    /**
     * Returns a fallback element for [cssSelector] when [relaxed] is `true`.
     *
     * @param cssSelector the selector that had no match.
     * @return the element created by [makeDefaultElement].
     * @throws ElementNotFoundException if [relaxed] is `false`.
     */
    override fun makeDefault(cssSelector: String): DocElement =
        if (relaxed) makeDefaultElement(cssSelector) else throw ElementNotFoundException(cssSelector)

    /**
     * Resolves [rawCssSelector] against the children of [element].
     *
     * @param rawCssSelector CSS query; an empty string selects [allElements].
     * @return the matching elements wrapped as [DocElement]s carrying this element's [relaxed]
     * flag; an empty list if nothing matched and [relaxed] is `true`.
     * @throws ElementNotFoundException if nothing matched and [relaxed] is `false`.
     */
    override fun applySelector(rawCssSelector: String): List<DocElement> {
        if (rawCssSelector.isEmpty()) return allElements

        val matches = element.children().select(rawCssSelector).map { DocElement(it, relaxed) }
        // Strict mode treats "no match" as an error; relaxed mode hands back the empty list.
        if (matches.isEmpty() && !relaxed) throw ElementNotFoundException(rawCssSelector)
        return matches
    }

    /** Returns the string representation of the wrapped jsoup [element]. */
    override fun toString(): String = element.toString()
}
/**
 * Signals that a CSS selector did not match any element.
 *
 * The exception message has the form `Could not find element "<tag><selector>"`.
 *
 * @param selector the CSS selector that produced no match.
 * @param tag text placed directly in front of [selector] in the message; empty by default.
 */
public open class ElementNotFoundException(
    selector: String,
    tag: String = ""
) : Exception("Could not find element \"$tag$selector\"")
© 2015 - 2025 Weber Informatics LLC | Privacy Policy