commonMain.decoder.internal.HtmlTreeDecoder.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of kspoon-korlibs Show documentation
Show all versions of kspoon-korlibs Show documentation
Annotation-based HTML-to-Kotlin class parser with KMP support; the successor to jspoon
The newest version!
package dev.burnoo.kspoon.decoder.internal
import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.nodes.Element
import com.fleeksoft.ksoup.select.Elements
import dev.burnoo.kspoon.HtmlTextMode
import dev.burnoo.kspoon.annotation.Selector
import dev.burnoo.kspoon.configuration.KspoonConfiguration
import dev.burnoo.kspoon.decoder.KspoonDecoder
import dev.burnoo.kspoon.exception.KspoonParseException
import dev.burnoo.kspoon.exception.kspoonError
import dev.burnoo.kspoon.serializer.DocumentSerializer
import dev.burnoo.kspoon.serializer.ElementSerializer
import dev.burnoo.kspoon.serializer.ElementsSerializer
import kotlinx.serialization.ExperimentalSerializationApi
import kotlinx.serialization.descriptors.SerialDescriptor
import kotlinx.serialization.descriptors.StructureKind
import kotlinx.serialization.descriptors.elementNames
import kotlinx.serialization.encoding.CompositeDecoder
import kotlinx.serialization.modules.EmptySerializersModule
import kotlinx.serialization.modules.SerializersModule
import kotlinx.serialization.modules.contextual
import kotlinx.serialization.modules.overwriteWith
import kotlin.contracts.ExperimentalContracts
import kotlin.contracts.contract
/**
 * Decoder that maps a set of selected HTML [elements] onto serializable Kotlin structures.
 *
 * Every decoded element is identified by an [HtmlTag]: a selector tag built from a [Selector] annotation,
 * or an index tag when the element name parses as an integer. Nested structures are decoded by child
 * [HtmlTreeDecoder] instances created in [beginStructure].
 *
 * @param elements elements that the selectors and indices of this decoder are resolved against.
 * @param configuration supplies the default text mode and the `coerceInputValues` switch.
 * @param extraSerializersModule additional serializers; on conflict they replace the contextual serializers
 * registered by this class ([ElementSerializer], [ElementsSerializer], [DocumentSerializer]).
 * @param tagHierarchy tags of the enclosing structures; used only to build paths for error messages.
 */
@OptIn(ExperimentalSerializationApi::class)
internal class HtmlTreeDecoder(
    private val elements: Elements,
    private val configuration: KspoonConfiguration,
    extraSerializersModule: SerializersModule = EmptySerializersModule(),
    private val tagHierarchy: List<HtmlTag> = emptyList(),
) : TaggedDecoder<HtmlTag>(), KspoonDecoder {

    private val textMode = configuration.defaultTextMode

    private val coerceInputValues: Boolean = configuration.coerceInputValues

    override val serializersModule = SerializersModule {
        contextual(ElementSerializer)
        contextual(ElementsSerializer)
        contextual(DocumentSerializer)
    } overwriteWith extraSerializersModule

    /** Index of the next element to be reported by [decodeElementIndex]. */
    private var elementIndex = 0

    /**
     * Builds the tag for the element at [index].
     *
     * A [Selector] annotation wins; otherwise the element name is interpreted as a positional index.
     * Fails with a kspoon error when neither is available.
     */
    override fun SerialDescriptor.getTag(index: Int): HtmlTag {
        val selectorAnnotation = getElementSelectorAnnotation(index)
        val newIndex = if (selectorAnnotation == null) {
            getElementName(index).toIntOrNull()
        } else {
            null
        }
        return when {
            selectorAnnotation != null -> selectorAnnotation.toHtmlTag()
            newIndex != null -> HtmlTag.Index(newIndex)
            else -> kspoonError(
                "Selector annotation not found for ${getElementDescriptor(index).serialName}," +
                    " parent selector: ${getSelectorFullPath(tag = null)}",
            )
        }
    }

    /**
     * Looks for a [Selector] annotation first on the property itself and then on the property's type
     * descriptor, returning the first one found or `null`.
     */
    private fun SerialDescriptor.getElementSelectorAnnotation(index: Int): Selector? {
        val annotations = getElementAnnotations(index) + getElementDescriptor(index).annotations
        return annotations.filterIsInstance<Selector>().firstOrNull()
    }

    /**
     * Returns the index of the next element to decode, or [CompositeDecoder.DECODE_DONE] when the structure
     * is exhausted. Elements for which [shouldCoerceInputValue] holds are skipped so that their Kotlin
     * default values are used.
     */
    override fun decodeElementIndex(descriptor: SerialDescriptor): Int {
        val isList = descriptor.kind == StructureKind.LIST
        val maxCount = if (isList) elements.size else descriptor.elementsCount
        while (shouldCoerceInputValue(maxCount, descriptor)) {
            elementIndex++
        }
        if (elementIndex == maxCount) return CompositeDecoder.DECODE_DONE
        return elementIndex++
    }

    /**
     * Tells whether the element at the current [elementIndex] should be skipped in favour of its Kotlin
     * default value because nothing in the HTML matches its selector.
     */
    private fun shouldCoerceInputValue(maxCount: Int, descriptor: SerialDescriptor): Boolean {
        // ensure coerceInputValues from config is enabled
        if (!coerceInputValues) return false
        // ensure current structure hasn't ended
        if (elementIndex == maxCount) return false
        // ensure current structure is a class (and not a list)
        if (descriptor.kind != StructureKind.CLASS) return false
        // check if the *current* element is optional - if the instance can be created with Kotlin default value
        if (!descriptor.isElementOptional(elementIndex)) return false
        val tag = descriptor.getTag(elementIndex)
        if (tag !is HtmlTag.Selector) return false
        // check if default value was set in selector - then we skip selecting element here, as default value
        // will be returned instead
        if (tag.defaultValue != null) return false
        // finally checking if the selected element is empty
        return selectElement(tag) == null
    }

    /**
     * Returns this decoder for the root structure (no current tag). For a nested structure it returns a
     * child decoder scoped to the elements selected by the current tag.
     */
    override fun beginStructure(descriptor: SerialDescriptor): CompositeDecoder {
        val tag = currentTagOrNull ?: return this
        val selectedElements = selectElements(tag)
        // Hand the combined module down so serializers supplied by the caller stay available in nested structures.
        return HtmlTreeDecoder(selectedElements, configuration, serializersModule, tagHierarchy = tagHierarchy + tag)
    }

    override fun decodeCollectionSize(descriptor: SerialDescriptor): Int = elements.size

    /**
     * A value counts as present for index tags, for selectors that declare a default value, and for
     * selectors that match an element.
     */
    override fun decodeTaggedNotNullMark(tag: HtmlTag): Boolean {
        if (tag is HtmlTag.Index) return true
        if (tag is HtmlTag.Selector && tag.defaultValue != null) return true
        return selectElement(tag) != null
    }

    // Sequential decoding bypasses decodeElementIndex, which is where coercion skips elements.
    override fun decodeSequentially() = !configuration.coerceInputValues

    override fun decodeTaggedLong(tag: HtmlTag) = getTextAndMap(tag) { toLong() }

    override fun decodeTaggedShort(tag: HtmlTag) = getTextAndMap(tag) { toShort() }

    override fun decodeTaggedByte(tag: HtmlTag) = getTextAndMap(tag) { toByte() }

    override fun decodeTaggedFloat(tag: HtmlTag) = getTextAndMap(tag) { toFloat() }

    override fun decodeTaggedDouble(tag: HtmlTag) = getTextAndMap(tag) { toDouble() }

    override fun decodeTaggedBoolean(tag: HtmlTag) = getTextAndMap(tag) { toBoolean() }

    // NOTE(review): `first()` throws NoSuchElementException on empty text; unlike the numeric decoders this
    // failure is not wrapped in a kspoon error. Confirm whether that is intended.
    override fun decodeTaggedChar(tag: HtmlTag) = getText(tag).first()

    override fun decodeTaggedInt(tag: HtmlTag) = getTextAndMap(tag) { toInt() }

    override fun decodeTaggedString(tag: HtmlTag) = getText(tag)

    /**
     * Resolves the selected text to the index of the enum entry with exactly that name; fails with a
     * kspoon error when no entry matches.
     */
    override fun decodeTaggedEnum(tag: HtmlTag, enumDescriptor: SerialDescriptor): Int {
        val text = getText(tag)
        val index = enumDescriptor.elementNames.indexOfFirst { it == text }
        return if (index == -1) {
            kspoonError(
                "Can't parse value '$text' for enum '${enumDescriptor.serialName}' at selector: ${getSelectorFullPath(tag)}",
            )
        } else index
    }

    /**
     * Reads the text for [tag] and converts it with [map]. Any failure of [map] is reported as a kspoon
     * error that names the offending text, the target type and the selector path.
     */
    private inline fun <reified T> getTextAndMap(tag: HtmlTag, map: String.() -> T): T {
        val text = getText(tag)
        return try {
            text.map()
        } catch (e: Throwable) {
            kspoonError("Error while converting '$text' to '${T::class}' for selector ${getSelectorFullPath(tag)}", e)
        }
    }

    /** Selects all elements for [tag]; an out-of-range index tag yields an empty [Elements]. */
    private fun selectElements(tag: HtmlTag): Elements {
        return when (tag) {
            is HtmlTag.Selector -> elements.select(tag.selector)
            is HtmlTag.Index -> elements.getAtAsElements(tag.index)
        }
    }

    /**
     * Selects the single element for [tag].
     *
     * Only selector tags can produce `null` (no match at the requested index); the contract exposes that
     * fact to the compiler. Index tags use plain indexing into [elements].
     */
    @OptIn(ExperimentalContracts::class)
    private fun selectElement(tag: HtmlTag): Element? {
        contract { returns(null) implies (tag is HtmlTag.Selector) }
        return when (tag) {
            is HtmlTag.Selector -> elements.select(tag.selector).getOrNull(tag.index)
            is HtmlTag.Index -> elements[tag.index]
        }
    }

    /**
     * Extracts the string value for [tag]: the attribute value when the selector names an attribute,
     * otherwise the element content according to the selector's text mode or [currentTextMode].
     *
     * If anything fails and the selector declares a default value, that default is returned instead.
     * Otherwise the failure is rethrown as (or wrapped into) a kspoon error.
     */
    private fun getText(tag: HtmlTag, currentTextMode: HtmlTextMode = textMode): String {
        return try {
            val element = selectElementOrThrow(tag)
            val attribute = (tag as? HtmlTag.Selector)?.attribute
            val textModeFromSelector = (tag as? HtmlTag.Selector)?.textMode
            // NOTE(review): `.withRegexIfPresent` binds to the `when` expression only, so the regex is not
            // applied to attribute values. Confirm whether attributes are meant to be filtered as well.
            if (attribute != null) {
                element.attr(attribute)
            } else when (textModeFromSelector ?: currentTextMode) {
                HtmlTextMode.Text -> element.text()
                HtmlTextMode.InnerHtml -> element.html()
                HtmlTextMode.OuterHtml -> element.outerHtml()
                HtmlTextMode.Data -> element.data()
            }.withRegexIfPresent(tag)
        } catch (e: Exception) {
            if (tag is HtmlTag.Selector && tag.defaultValue != null) return tag.defaultValue
            if (e is KspoonParseException) {
                throw e
            } else {
                kspoonError("Error getting text for selector: ${getSelectorFullPath(tag)}", e)
            }
        }
    }

    /** Same as [selectElement], but a missing element is reported as a kspoon error. */
    private fun selectElementOrThrow(tag: HtmlTag): Element {
        return when (val element = selectElement(tag)) {
            null -> kspoonError("Element not found for selector: ${getSelectorFullPath(tag)}")
            else -> element
        }
    }

    /**
     * Applies the selector's regex, if any, to the receiver. Returns the first capture group when the
     * pattern defines one, otherwise the whole match; fails with a kspoon error when nothing matches.
     */
    private fun String.withRegexIfPresent(tag: HtmlTag): String {
        if (tag !is HtmlTag.Selector) return this
        if (tag.regex == null) return this
        val matchResult = tag.regex.find(this)
            ?: kspoonError("Regex '${tag.regex}' not found for current selector: ${getSelectorFullPath(tag)}")
        return if (matchResult.groupValues.size > 1) matchResult.groupValues[1] else matchResult.value
    }

    /** Wraps the element at [index] into [Elements]; returns an empty [Elements] when the index is out of range. */
    private fun Elements.getAtAsElements(index: Int) = getOrNull(index)?.let(::Elements) ?: Elements()

    /** Renders the enclosing tags followed by [tag] (when not null) as `[a -> b -> c]` for error messages. */
    private fun getSelectorFullPath(tag: HtmlTag?) = (tagHierarchy + tag)
        .filterNotNull()
        .joinToString(" -> ", prefix = "[", postfix = "]")

    // KspoonDecoder implementation
    override fun decodeElement(): Element? = selectElement(tag = currentTag)

    override fun decodeElementOrThrow(): Element = selectElementOrThrow(tag = currentTag)

    override fun decodeElements(): Elements = selectElements(tag = currentTag)

    override fun decodeDocument() = elements.firstOrNull() as? Document
        ?: kspoonError("Current Element is not a Document. Document type works only on root")

    override fun getSelectorFullPath(): String = getSelectorFullPath(tag = currentTagOrNull)
}