com.tfowl.jsoup.ktx.Microdata.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jsoup-ktx Show documentation
Show all versions of jsoup-ktx Show documentation
Kotlin extensions and compatibility with other HTTP libraries for Jsoup.
The newest version!
package com.tfowl.jsoup.ktx
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import java.time.format.DateTimeFormatter
import java.time.format.DateTimeParseException
import java.time.temporal.TemporalAccessor
data class MicrodataItem(
val properties: Map,
val type: Set? = null,
val id: String? = null,
)
sealed class MicrodataValue {
data class Text(val text: String) : MicrodataValue()
data class Url(val url: String) : MicrodataValue()
data class Item(val item: MicrodataItem) : MicrodataValue()
data class Datetime(val temporal: TemporalAccessor) : MicrodataValue()
data class Collection(val values: List) : MicrodataValue()
companion object {
val EmptyText = Text("")
}
}
private fun Element.findMicrodataProperties(): List {
val root = this
val results = mutableListOf()
val memory = mutableListOf()
val pending = mutableListOf()
memory += root
pending += root.children()
if (root.hasAttr("itemref")) {
for (id in root.attr("itemref").split(" ")) {
ownerDocument()?.getElementById(id)?.let { ref -> pending += ref }
}
}
while (pending.isNotEmpty()) {
val current = pending.removeFirst()
if (current in memory) {
// Microdata error - just continue
continue
}
memory += current
if (!current.hasAttr("itemscope")) {
pending += current.children()
}
if (current.hasAttr("itemprop") && current.attr("itemprop").isNotEmpty()) {
results += current
}
// TODO: sort results in tree order
}
return results
}
fun Element.getMicrodataItem(): MicrodataItem {
val tagsWithSrcAttr = listOf("audio", "embed", "iframe", "img", "source", "track", "video")
fun Element.textAttr(name: String): MicrodataValue = MicrodataValue.Text(attr(name))
fun Element.textContent(): MicrodataValue = MicrodataValue.Text(text())
fun Element.urlAttr(name: String): MicrodataValue =
if (hasAttr(name) && attr(name).isNotEmpty()) MicrodataValue.Url(attr("abs:$name")) else MicrodataValue.EmptyText
fun Element.datetime(): MicrodataValue {
val content = if (hasAttr("datetime")) attr("datetime") else text().trim()
val formatters = listOf(
DateTimeFormatter.ofPattern("uuuu-MM"),
DateTimeFormatter.ISO_LOCAL_DATE,
DateTimeFormatter.ofPattern("[--]MM-dd"),
DateTimeFormatter.ISO_LOCAL_TIME,
DateTimeFormatter.ISO_LOCAL_DATE_TIME,
DateTimeFormatter.ofPattern("X"),
DateTimeFormatter.ISO_OFFSET_DATE_TIME,
DateTimeFormatter.ofPattern("uuuu-'W'ww"),
DateTimeFormatter.ofPattern("uu"),
DateTimeFormatter.ofPattern("uuu"),
DateTimeFormatter.ofPattern("uuuu"),
DateTimeFormatter.ofPattern("uuuuu"),
DateTimeFormatter.ofPattern("uuuuuu"),
)
for (formatter in formatters) {
try {
return MicrodataValue.Datetime(formatter.parse(content))
} catch (ignored: DateTimeParseException) {
}
}
// TODO: Duration
return MicrodataValue.EmptyText
}
val properties = mutableMapOf()
findMicrodataProperties().forEach { element ->
val keys = element.attr("itemprop").split(' ').toSet()
val value = if (element.hasAttr("itemscope")) {
MicrodataValue.Item(element.getMicrodataItem())
} else when (element.tag().normalName()) {
"meta" -> element.textAttr("content")
in tagsWithSrcAttr -> element.urlAttr("src")
"a", "area", "link" -> element.urlAttr("href")
"object" -> element.urlAttr("data")
"data", "meter" -> element.textAttr("value")
"time" -> element.datetime()
else -> element.textContent()
}
fun MicrodataValue.merge(rhs: MicrodataValue): MicrodataValue = when (this) {
is MicrodataValue.Collection -> when (rhs) {
is MicrodataValue.Collection -> MicrodataValue.Collection(values + rhs.values)
else -> MicrodataValue.Collection(values + rhs)
}
else -> when (rhs) {
is MicrodataValue.Collection -> MicrodataValue.Collection(listOf(this) + rhs.values)
else -> MicrodataValue.Collection(listOf(this, rhs))
}
}
keys.forEach { key ->
properties.merge(key, value) { a, b -> a.merge(b) }
}
}
val type = if (hasAttr("itemtype")) attr("itemtype").split(' ').toSet() else null
val id = if (hasAttr("itemid")) attr("itemid").trim() else null
return MicrodataItem(properties, type, id)
}
fun Document.getMicrodataItems(): List {
// Finds all top-level items (items that are not themselves properties of other items)
return select("[itemscope]:not([itemprop])").map { it.getMicrodataItem() }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy