// com.gitlab.mvysny.konsumexml.Konsumer.kt Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of konsume-xml Show documentation
// Show all versions of konsume-xml Show documentation
// Konsume-XML: A simple functional XML parser with no annotations
package com.gitlab.mvysny.konsumexml
import com.gitlab.mvysny.konsumexml.stax.*
import java.io.Closeable
import javax.xml.namespace.QName
import javax.xml.stream.events.XMLEvent
/**
 * DSL marker annotation for the Konsumer parsing DSL. May be placed on classes, types and functions.
 */
@DslMarker
@Target(AnnotationTarget.CLASS, AnnotationTarget.TYPE, AnnotationTarget.FUNCTION)
annotation class KonsumerDsl
/**
 * Thrown when consuming the XML fails. The exception message is prefixed with [location] and [elementName].
 *
 * @property location the place in the XML document where the problem was detected.
 * @property elementName the name of the element that was being consumed, or `null` if there was none.
 * @param msg the description of the problem; appended to the location/element prefix.
 * @param cause the original exception, if any.
 */
class KonsumerException(
    @Suppress("MemberVisibilityCanBePrivate") val location: Location,
    @Suppress("CanBeParameter") val elementName: QName?,
    msg: String,
    cause: Throwable? = null
) : RuntimeException("$location, in element <$elementName>: $msg", cause)
/**
 * The settings for [Konsumer]. Mutable, can be changed during XML parsing.
 * @property failOnUnconsumedAttributes defaults to false. If true, any unconsumed attributes (attributes not queried
 * via [AttributeKonsumer.getValue], with the exception of `xml:` and `xmlns:` attributes) will fail the parsing.
 * Enabling this allows stricter validation since unknown attributes will be discovered and reported.
 */
data class KonsumerSettings(var failOnUnconsumedAttributes: Boolean = false) {
    /**
     * Creates an [AttributeKonsumer] on top of given [stax] parser. When [failOnUnconsumedAttributes]
     * is `true` at the time of the call, the konsumer is wrapped in an [AttributeKonsumerWatchdog].
     */
    fun newAttributeKonsumer(stax: StaxParser): AttributeKonsumer {
        val plain: AttributeKonsumer = StaxAttributeKonsumer(stax)
        return if (failOnUnconsumedAttributes) AttributeKonsumerWatchdog(plain, stax) else plain
    }
}
/**
* Upon calling a method, say [child], konsumer reads (consumes) next event from [reader] and matches it to the expected content.
*
* All methods throw [KonsumerException] on any parsing error, [javax.xml.stream.XMLStreamException] on any I/O errors and XML parsing errors.
*
* It is not necessary to [close] child konsumers - in fact you must not do so. See [close] for more info.
* @property name the name of the current element we're in (the contents of which we're consuming). If we haven't consumed root element yet then this is null.
*/
@KonsumerDsl
class Konsumer(private val reader: StaxReader, val name: QName?, val settings: KonsumerSettings) : Closeable {
/**
 * `true` once this konsumer has finished reading the contents of its element. A finished konsumer
 * can no longer be used; the parent konsumer is then allowed to continue consuming.
 *
 * Only this class can change the value (the setter is private).
 */
var isFinished = false
private set
/**
 * The [QName.localPart] of [name].
 * @throws KonsumerException if [name] is `null`, i.e. this konsumer is not inside any element.
 */
val localName: String
    get() {
        val current: QName = name
            ?: throw KonsumerException(location, this.name, "Expected to learn localName but I'm not currently in an element")
        return current.localPart
    }
// The child konsumer most recently created by nextElement() or skipContents(); null if none was created yet.
private var currentChildKonsumer: Konsumer? = null

// Created lazily, so that the settings can still be changed before the attributes are first touched.
private val attributesLazy: AttributeKonsumer by lazy(LazyThreadSafetyMode.NONE) {
    when (name) {
        null -> NullElementKonsumer(reader.stax)
        else -> settings.newAttributeKonsumer(reader.stax)
    }
}
/**
 * Allows access to the attributes of the element [name]. Only accessible in the beginning, before any of the element
 * contents are enumerated. Accessing attributes later on will cause [IllegalStateException] to be thrown.
 *
 * By default unconsumed attributes (attributes not queried via [AttributeKonsumer.getValue], with the exception of
 * `xml:` and `xmlns:` attributes) will not fail the parsing; however this can be changed via the
 * [KonsumerSettings.failOnUnconsumedAttributes] setting.
 *
 * @throws IllegalStateException if this konsumer is already finished, or if a child konsumer is still ongoing.
 */
val attributes: AttributeKonsumer
get() {
// both guards run on every access, before the lazily-created attribute konsumer is handed out
checkNotFinished()
checkNoChildKonsumer()
return attributesLazy
}
/**
 * The current location in the XML file, as reported by the underlying StAX parser.
 */
val location: Location get() = reader.stax.location
/**
 * Guards against using a konsumer which is already finished.
 * @throws IllegalStateException if [isFinished] is `true`.
 */
private fun checkNotFinished() {
    if (isFinished) {
        throw IllegalStateException("finished - cannot be used anymore")
    }
}
/**
 * Guards against using this konsumer while a child konsumer is still reading its element.
 * Passes if no child konsumer was created yet, or if the last one has finished.
 * @throws IllegalStateException if the last child konsumer is not finished yet.
 */
private fun checkNoChildKonsumer() {
    val childStillOngoing: Boolean = currentChildKonsumer?.isFinished == false
    if (childStillOngoing) {
        throw IllegalStateException("A child konsumer of '${currentChildKonsumer!!.name}' is ongoing, cannot use this consumer of '$name' until the child konsumer finishes")
    }
}
/**
 * Verifies that the local part of the current element name equals [name].
 * @param name the expected local name of the current element.
 * @throws KonsumerException if this konsumer is not in any element, or if the current element is named differently.
 */
@KonsumerDsl
fun checkCurrent(name: String) {
    val current: QName = this.name
        ?: throw KonsumerException(location, this.name, "Expected current element '$name' but I'm not currently in an element")
    if (current.localPart != name) {
        throw KonsumerException(location, this.name, "Expected current element '$name' but I'm in '${this.name}'")
    }
}
/**
 * Expects the next element is of given [name], and consumes it. Runs a child consumer which
 * is responsible for consuming of the contents of this element. This consumer cannot be used while the child consumer
 * hasn't finished consuming of the tokens.
 *
 * The function fails if any non-whitespace text is encountered. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * @param name the expected name of the next element; converted via [Names.of].
 * @param block run with the child konsumer as the receiver.
 * @return the value returned by [block].
 * @throws KonsumerException if the next element is of different name, or there is no next element (the current element ends),
 * or on end-of-stream.
 */
@KonsumerDsl
fun <T> child(name: String, block: Konsumer.() -> T): T =
    child(Names.of(name), block)
/**
 * Expects the next element is of given [names], and consumes it. Runs a child consumer which
 * is responsible for consuming of the contents of this element. This consumer cannot be used while the child consumer
 * hasn't finished consuming of the tokens.
 *
 * The function fails if any non-whitespace text is encountered. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * @param names the accepted names of the next element.
 * @param block run with the child konsumer as the receiver. Any non-[KonsumerException] exception it throws
 * is wrapped in a [KonsumerException].
 * @return the value returned by [block].
 * @throws KonsumerException if the next element is of different name, or there is no next element (the current element ends),
 * or on end-of-stream.
 */
@KonsumerDsl
fun <T> child(names: Names, block: Konsumer.() -> T): T {
    // requireElement = true: nextElement() throws rather than returning null, so `!!` cannot fail here.
    val childKonsumer: Konsumer = nextElement(names, true)!!
    val result: T = childKonsumer.runProtected(block)
    // consume the child's END_ELEMENT so that this konsumer may continue
    childKonsumer.finish()
    return result
}
/**
 * Runs [block] with this konsumer as the receiver. Any [Exception] thrown by [block] is wrapped in a
 * [KonsumerException] carrying the current [location] and element [name]; a [KonsumerException]
 * is rethrown as-is since it already carries that information.
 *
 * @return the value returned by [block].
 */
private fun <T> runProtected(block: Konsumer.() -> T): T = try {
    block()
} catch (e: KonsumerException) {
    throw e
} catch (e: Exception) {
    throw KonsumerException(location, name, e.message ?: "", e)
}
/**
 * Checks whether the next element is of given [name] and if so, consumes it; does nothing and returns `null`
 * if the next element has a different name or if the current element ends.
 * If there is such an element, runs a child consumer which
 * is responsible for consuming of the contents of this element. This consumer cannot be used while the child consumer
 * hasn't finished consuming of the tokens.
 *
 * The function fails if any non-whitespace text is encountered. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * @param name the name of the optional element; converted via [Names.of].
 * @param block run with the child konsumer as the receiver. Any non-[KonsumerException] exception it throws
 * is wrapped in a [KonsumerException].
 * @return the value returned by [block], or `null` if there was no matching element.
 * @throws KonsumerException if non-whitespace text is encountered before the next element,
 * or on end-of-document/end-of-stream.
 */
@KonsumerDsl
fun <T> childOpt(name: String, block: Konsumer.() -> T): T? {
    val childKonsumer: Konsumer = nextElement(Names.of(name)) ?: return null
    val result: T = childKonsumer.runProtected(block)
    // consume the child's END_ELEMENT so that this konsumer may continue
    childKonsumer.finish()
    return result
}
/**
 * Consumes all follow-up [XMLEvent.START_ELEMENT]s up to [maxCount] of an element with given [name]. Runs a child consumer which
 * is responsible for consuming of the contents of this element. This consumer cannot be used while the child consumer
 * hasn't finished consuming of the tokens. The consumption stops when another element is encountered or when the
 * current element ends.
 *
 * The function fails if any non-whitespace text is encountered. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * Beware - this function will load all [T] items into memory and return that as a list. If the number of items is
 * potentially huge and they're processed in a stream fashion, try using [childrenSequence] to save a huge chunk of memory.
 *
 * @param name the name of the elements to consume; converted via [Names.of].
 * @param minCount minimum elements to match, 0 or greater. If fewer elements are matched, [KonsumerException] is thrown. Defaults to 0.
 * @param maxCount maximum elements to match, 0 or greater. The function returns as soon as this amount is reached, therefore
 * it could leave some elements unconsumed. Defaults to [Int.MAX_VALUE].
 * @param block run once per matched element, with the child konsumer as the receiver.
 * @throws KonsumerException on end-of-stream, or if fewer elements than [minCount] have been encountered.
 * @return a list of values as returned by [block], one per every element encountered. Contains at most [maxCount] items.
 */
@KonsumerDsl
fun <T> children(name: String,
                 minCount: Int = 0,
                 maxCount: Int = Int.MAX_VALUE,
                 block: Konsumer.() -> T): List<T> =
    children(Names.of(name), minCount, maxCount, block)
/**
 * Consumes all follow-up [XMLEvent.START_ELEMENT]s up to [maxCount] of an element with any of given [names]. Runs a child consumer which
 * is responsible for consuming of the contents of this element. This consumer cannot be used while the child consumer
 * hasn't finished consuming of the tokens. The consumption stops when another element is encountered or when the
 * current element ends.
 *
 * The function fails if any non-whitespace text is encountered. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * Beware - this function will load all [T] items into memory and return that as a list. If the number of items is
 * potentially huge and they're processed in a stream fashion, try using [childrenSequence] to save a huge chunk of memory.
 *
 * @param names the accepted names of the elements to consume.
 * @param minCount minimum elements to match, 0 or greater. If fewer elements are matched, [KonsumerException] is thrown. Defaults to 0.
 * @param maxCount maximum elements to match, [minCount] or greater. The function returns as soon as this amount is reached, therefore
 * it could leave some elements unconsumed. If 0, nothing is read and an empty list is returned. Defaults to [Int.MAX_VALUE].
 * @param block run once per matched element, with the child konsumer as the receiver. Any non-[KonsumerException]
 * exception it throws is wrapped in a [KonsumerException].
 * @throws IllegalArgumentException if [minCount] is negative or [maxCount] is less than [minCount].
 * @throws KonsumerException on end-of-stream, or if fewer elements than [minCount] have been encountered.
 * @return a list of values as returned by [block], one per every element encountered. Contains at most [maxCount] items.
 */
@KonsumerDsl
fun <T> children(names: Names,
                 minCount: Int = 0,
                 maxCount: Int = Int.MAX_VALUE,
                 block: Konsumer.() -> T): List<T> {
    // Validate the bounds first, so that contradictory arguments (e.g. minCount = 1, maxCount = 0)
    // are rejected rather than silently producing an empty list.
    require(minCount >= 0) { "minCount must be 0 or greater but was $minCount" }
    require(maxCount >= minCount) { "maxCount must be $minCount or greater but was $maxCount" }

    val resultList: MutableList<T> = mutableListOf()
    // With maxCount == 0 the loop body never runs, hence the reader is left untouched.
    while (resultList.size < maxCount) {
        val childKonsumer: Konsumer = nextElement(names) ?: break
        val result: T = childKonsumer.runProtected(block)
        childKonsumer.finish()
        resultList.add(result)
    }
    // Can only trigger when the loop ended because of a missing element: maxCount >= minCount holds.
    if (resultList.size < minCount) {
        throw KonsumerException(location, name, "At least $minCount of element $names was expected, but only ${resultList.size} encountered")
    }
    return resultList
}
/**
 * Returns a sequence that lazily polls elements with given [name] from this konsumer, runs [block] and returns the
 * converted items. Intended to be used for stream-processing of XML contents. Uses way less memory than [children]
 * since it only holds one [T] item at any time.
 *
 * The sequence ends when there are no more elements with given [name]. To upper-limit the number of elements returned,
 * use [Sequence.take]. The sequence can only be iterated once. The sequence does not need to be iterated entirely.
 * Iterating the sequence does not close this konsumer.
 *
 * The function fails if any non-whitespace text is encountered. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * @param name the name of the elements to consume; converted via [Names.of].
 * @param minCount minimum elements to match, 0 or greater. If fewer elements are matched, [KonsumerException] is thrown. Defaults to 0.
 * @param block run once per matched element, with the child konsumer as the receiver. Must return a non-null
 * value, since a `null` item would end the sequence.
 * @throws KonsumerException on end-of-stream, or if fewer elements than [minCount] have been encountered.
 */
@KonsumerDsl
fun <T : Any> childrenSequence(name: String, minCount: Int = 0, block: Konsumer.() -> T): Sequence<T> =
    childrenSequence(Names.of(name), minCount, block)
/**
 * Returns a sequence that lazily polls elements with given [names] from this konsumer, runs [block] and returns the
 * converted items. Intended to be used for stream-processing of XML contents. Uses way less memory than [children]
 * since it only holds one [T] item at any time.
 *
 * The sequence ends when there are no more elements with one of given [names]. To upper-limit the number of elements returned,
 * use [Sequence.take]. The sequence can only be iterated once. The sequence does not need to be iterated entirely.
 * Iterating the sequence fully does not [finish] this konsumer.
 *
 * The function fails if any non-whitespace text is encountered. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * @param names the accepted names of the elements to consume.
 * @param minCount minimum elements to match, 0 or greater. If fewer elements are matched, [KonsumerException] is thrown
 * when the end of the sequence is reached. Defaults to 0.
 * @param block run once per matched element, with the child konsumer as the receiver. Must return a non-null
 * value, since a `null` item would end the sequence. Any non-[KonsumerException] exception it throws is wrapped
 * in a [KonsumerException], the same way [children] does.
 * @throws KonsumerException on end-of-stream, or if fewer elements than [minCount] have been encountered.
 */
@KonsumerDsl
fun <T : Any> childrenSequence(names: Names, minCount: Int = 0, block: Konsumer.() -> T): Sequence<T> {
    var elementsEncountered = 0
    return generateSequence {
        val childKonsumer: Konsumer? = nextElement(names)
        if (childKonsumer == null) {
            // no more matching elements: verify the lower bound, then end the sequence by returning null
            if (elementsEncountered < minCount) {
                throw KonsumerException(location, name, "At least $minCount of element $names was expected, but only $elementsEncountered encountered")
            }
            null
        } else {
            // runProtected() attaches the XML location to failures of block, consistently with children()
            val result: T = childKonsumer.runProtected(block)
            childKonsumer.finish()
            elementsEncountered++
            result
        }
    }.constrainOnce()
}
/**
 * A lower-level API intended for the "streaming" approach. Reads events until the next element starts,
 * then checks whether that element has one of given [names]:
 * * If yes, a child konsumer is returned, which consumes the contents of that element.
 * Note: you must call [finish] on the child konsumer after you're done with it.
 * * If not, the event is pushed back to the reader and `null` is returned. The same happens
 * when the current element ends before any child element is encountered.
 *
 * You should generally never use this method directly. To stream-process the XML it's safer to use
 * [childrenSequence]; to read a particular element, [child] is a better choice.
 *
 * Whitespace-only text is skipped; any non-whitespace text fails the function. Use [text] to
 * read any text nodes for elements with mixed contents.
 *
 * @param names the accepted element names.
 * @param requireElement defaults to `false`. If `true`, the function fails if there is no next element or if the
 * next element has a different name than one of [names]; in this mode the function never returns `null`.
 * @return a child konsumer if an element with matching name has been encountered, otherwise `null`.
 * @throws KonsumerException on non-whitespace text, end-of-document or end-of-stream; with [requireElement]
 * also when a matching element is not found.
 * @throws IllegalStateException if this konsumer is finished, or a child konsumer is still ongoing.
 */
@KonsumerDsl
fun nextElement(names: Names,
                requireElement: Boolean = false): Konsumer? {
    checkNotFinished()
    checkNoChildKonsumer()
    attributes.finalize()
    while (reader.hasNext()) {
        val eventType: StaxEventType = reader.next()
        when (eventType) {
            StaxEventType.StartElement -> {
                if (names.accepts(reader.stax.elementName)) {
                    // a match: hand over the reader to a new child konsumer and remember it
                    return Konsumer(reader, reader.stax.elementName, settings).also { currentChildKonsumer = it }
                }
                if (requireElement) {
                    throw KonsumerException(location, this.name, "Expected element $names but got '${reader.stax.elementName}'")
                }
                // not ours: un-read the event so that the caller may try other names
                reader.pushBack()
                return null
            }
            StaxEventType.EndElement -> {
                if (requireElement) {
                    throw KonsumerException(location, this.name, "Expected child element $names but got END_ELEMENT")
                }
                // leave the END_ELEMENT in place, to be consumed later on
                reader.pushBack()
                return null
            }
            StaxEventType.CData, StaxEventType.Characters -> {
                // blank text between elements is ignored; anything else is unexpected content
                if (!reader.stax.text.isNullOrBlank()) {
                    throw KonsumerException(location, name, "Expected element $names but there is unconsumed text: '${reader.stax.text?.trim()}'")
                }
            }
            StaxEventType.EndDocument -> throw KonsumerException(location, name, "Expected element $names but got END_DOCUMENT")
            else -> throw AssertionError("unexpected $eventType")
        }
    }
    throw KonsumerException(location, this.name, "Expected element $names but got end-of-stream")
}
/**
 * Convenience for reading a mandatory element with given [name] which contains text only.
 * Returns the text contents of the element, processed according to [whitespace].
 * @param name the name of the expected element.
 * @param whitespace the way to process whitespaces, defaults to [Whitespace.collapse]
 * @throws KonsumerException if the element is absent, or the element contains child elements.
 */
@KonsumerDsl
fun childText(name: String,
              whitespace: Whitespace = Whitespace.collapse): String {
    return childText(name, whitespace) { contents -> contents }
}
/**
 * Convenience for reading an optional element with given [name] which contains text only.
 * Returns the text contents of the element, processed according to [whitespace], or `null` if there is
 * no such next element.
 * @param name the name of the optional element.
 * @param whitespace the way to process whitespaces, defaults to [Whitespace.collapse]
 * @throws KonsumerException if the element is present and contains child elements.
 */
@KonsumerDsl
fun childTextOpt(name: String,
                 whitespace: Whitespace = Whitespace.collapse): String? {
    return childTextOpt(name, whitespace) { contents -> contents }
}
/**
 * Utility method for expecting [minCount]..[maxCount] follow-up elements with given [name] and text contents only;
 * the text contents of every such element are returned. Whitespaces are processed using the default of the
 * converter-taking overload of this function.
 * @param name the name of the elements to consume.
 * @param minCount minimum elements to match. Defaults to 0.
 * @param maxCount maximum elements to match. Defaults to [Int.MAX_VALUE].
 * @return the text contents, one item per element encountered.
 * @throws KonsumerException if the [minCount] condition can not be satisfied, or if the elements contain child elements.
 */
@KonsumerDsl
fun childrenText(name: String, minCount: Int = 0, maxCount: Int = Int.MAX_VALUE): List<String> =
    childrenText(name, minCount, maxCount) { it }
/**
 * Utility method for expecting one element with given [name] and text contents only; the text contents are
 * processed according to [whitespace], run through given [converter] and returned. If the converter throws,
 * the exception is wrapped in [KonsumerException] with exact location and rethrown.
 * This is perfect for performing conversion of the value to e.g. Int.
 * @param name the name of the expected element.
 * @param whitespace the way to process whitespaces, defaults to [Whitespace.collapse]
 * @param converter converts the text contents to the resulting value.
 * @return the value returned by [converter].
 * @throws KonsumerException if the element is absent, or the conversion fails, or the element contains child elements.
 */
@KonsumerDsl
fun <T> childText(name: String,
                  whitespace: Whitespace = Whitespace.collapse,
                  converter: (String) -> T): T =
    child(name) { text(whitespace, converter) }
/**
 * Utility method for expecting 0..1 elements with given [name] and text contents only; the text contents are
 * processed according to [whitespace], run through given [converter] and returned. If the converter throws,
 * the exception is wrapped in [KonsumerException] with exact location and rethrown.
 * This is perfect for performing conversion of the value to e.g. Int.
 *
 * The function does nothing and returns `null` if there is no such next element.
 * @param name the name of the optional element.
 * @param whitespace the way to process whitespaces, defaults to [Whitespace.collapse]
 * @param converter converts the text contents to the resulting value.
 * @return the value returned by [converter], or `null` if the element is absent.
 * @throws KonsumerException if the element is present, and either the conversion fails, or the element contains child elements.
 */
@KonsumerDsl
fun <T> childTextOpt(name: String,
                     whitespace: Whitespace = Whitespace.collapse,
                     converter: (String) -> T): T? =
    childOpt(name) { text(whitespace, converter) }
/**
 * Utility method for expecting [minCount]..[maxCount] follow-up elements with given [name] and text contents only;
 * the text contents are processed according to [whitespace], run through given [converter] and returned.
 * If the converter throws, the exception is wrapped in [KonsumerException] with exact location and rethrown.
 * This is perfect for performing conversion of the value to e.g. Int.
 * @param name the name of the elements to consume.
 * @param minCount minimum elements to match. Defaults to 0.
 * @param maxCount maximum elements to match. Defaults to [Int.MAX_VALUE].
 * @param whitespace the way to process whitespaces, defaults to [Whitespace.collapse]
 * @param converter converts the text contents of one element to the resulting value.
 * @return the converted values, one item per element encountered.
 * @throws KonsumerException if the [minCount] condition can not be satisfied, or if the elements contain child elements,
 * or if the conversion fails for any of the children elements.
 */
@KonsumerDsl
fun <T> childrenText(name: String,
                     minCount: Int = 0,
                     maxCount: Int = Int.MAX_VALUE,
                     whitespace: Whitespace = Whitespace.collapse,
                     converter: (String) -> T): List<T> =
    children(name, minCount, maxCount) { text(whitespace, converter) }
/**
 * Skims through all further text nodes until [XMLEvent.END_ELEMENT] is encountered,
 * then returns all of the text encountered in this element. Reaching the end of the element
 * marks this konsumer as finished.
 *
 * By default the function fails if an element is encountered. However, that
 * would prevent us from reading mixed contents. To support mixed contents,
 * set [failOnElement] to false. In this mode, the function will read all text nodes
 * up to the nearest element, or up until [XMLEvent.END_ELEMENT] is encountered.
 * If there are no text nodes left, the function will return an empty string.
 *
 * The [whitespace] processing is applied once, to the whole text gathered by this call. The text may
 * arrive from the reader in several chunks (several character/CDATA events); processing every chunk
 * separately could drop whitespace which sits on a chunk boundary.
 * NOTE(review): this assumes [Whitespace.process] is meant to operate on the complete text - confirm.
 *
 * @param whitespace the way to process whitespaces, defaults to [Whitespace.collapse]
 * @param failOnElement defaults to true. If true, the function fails if an element is started.
 * If false, the function will instead return the text encountered so far and ends, allowing you
 * to consume the element encountered. Perfect for mixed contents such as `a <b>beautiful</b> day`.
 * @throws KonsumerException if an element is started (and [failOnElement] is true) or end-of-stream is encountered.
 * @throws IllegalStateException if this konsumer is finished, or a child konsumer is still ongoing.
 */
@KonsumerDsl
fun text(whitespace: Whitespace = Whitespace.collapse,
         failOnElement: Boolean = true): String {
    checkNotFinished()
    checkNoChildKonsumer()
    attributes.finalize()
    // raw, unprocessed text; whitespace handling is deferred until all chunks are known
    val rawText = StringBuilder()
    while (reader.hasNext()) {
        val eventType: StaxEventType = reader.next()
        when (eventType) {
            StaxEventType.CData, StaxEventType.Characters -> rawText.append(reader.stax.text!!)
            StaxEventType.EndDocument -> throw KonsumerException(location, name, "Expected text but got END_DOCUMENT")
            StaxEventType.StartElement -> {
                if (failOnElement) {
                    throw KonsumerException(location, name, "Expected text but got START_ELEMENT: ${reader.stax.elementName}")
                }
                // mixed contents: un-read the element so that the caller can consume it next
                reader.pushBack()
                return whitespace.process(rawText.toString())
            }
            StaxEventType.EndElement -> {
                // the END_ELEMENT of this element has been consumed, there's nothing more to read
                isFinished = true
                return whitespace.process(rawText.toString())
            }
            else -> throw AssertionError("unexpected $eventType")
        }
    }
    throw KonsumerException(location, name, "Expected text but got end of stream")
}
/**
 * Exactly as [text] but runs the text through given [converter]. If the converter fails, the exception is automatically
 * wrapped in [KonsumerException] with appropriate location. Handy for converting values to e.g. integers:
 * ```kotlin
 * text(Whitespace.collapse) { it.toInt() }
 * ```
 * @param whitespace the way to process whitespaces.
 * @param converter converts the text to the resulting value.
 * @return the value returned by [converter].
 * @throws KonsumerException if the conversion fails, an element is started or end-of-stream is encountered.
 */
@KonsumerDsl
fun <T> text(whitespace: Whitespace, converter: (String) -> T): T {
    val t: String = text(whitespace)
    return try {
        converter(t)
    } catch (e: Exception) {
        // include the offending text, to make the conversion failure easy to track down
        throw KonsumerException(location, name, "Failed to convert '$t': ${e.message}", e)
    }
}
/**
 * Exactly as [text] but runs the text through given [converter]. If the converter fails, the exception is automatically
 * wrapped in [KonsumerException] with appropriate location. Handy for converting values to e.g. integers:
 * ```kotlin
 * text { it.toInt() }
 * ```
 * Whitespaces are [Whitespace.collapse]d.
 * @param converter converts the text to the resulting value.
 * @return the value returned by [converter].
 * @throws KonsumerException if the conversion fails, an element is started or end-of-stream is encountered.
 */
@KonsumerDsl
fun <T> text(converter: (String) -> T): T = text(Whitespace.collapse, converter)
/**
 * Reads up to the nearest [XMLEvent.END_ELEMENT] and marks this konsumer as finished, which allows
 * the parent konsumer to continue. Whitespace-only text is skipped on the way; any other unconsumed
 * content - non-blank [XMLEvent.CDATA]/[XMLEvent.CHARACTERS] or a [XMLEvent.START_ELEMENT] - fails the function.
 *
 * A low-level API, only to be used with the [nextElement] function.
 *
 * Do not call for the root Konsumer - it will fail with `Expected END_ELEMENT but got END_DOCUMENT`.
 *
 * Calling this function on an already finished konsumer does nothing.
 *
 * @throws KonsumerException if unconsumed content, end-of-document or end-of-stream is encountered.
 */
@KonsumerDsl
fun finish() {
    if (isFinished) {
        return
    }
    attributes.finalize()
    while (reader.hasNext()) {
        val eventType: StaxEventType = reader.next()
        when (eventType) {
            StaxEventType.EndElement -> {
                // this is the event we were looking for
                isFinished = true
                return
            }
            StaxEventType.StartElement -> throw KonsumerException(location, name, "Expected END_ELEMENT but got START_ELEMENT: '${reader.stax.elementName}'")
            StaxEventType.EndDocument -> throw KonsumerException(location, name, "Expected END_ELEMENT but got END_DOCUMENT")
            StaxEventType.CData, StaxEventType.Characters -> {
                // trailing blank text is fine; anything else has been left unconsumed by the caller
                if (!reader.stax.text.isNullOrBlank()) {
                    throw KonsumerException(location, name, "Expected END_ELEMENT but there is unconsumed text: '${reader.stax.text?.trim()}'")
                }
            }
            else -> throw AssertionError("unexpected $eventType")
        }
    }
    throw KonsumerException(location, name, "Expected END_ELEMENT but got end of stream")
}
/**
 * Closes the underlying reader and makes it impossible to continue with XML parsing.
 *
 * Closing this class closes all underlying resources including the input stream. Konsumer only implements [Closeable]
 * for convenience, so that you can parse the file contents easily, simply by writing:
 * ```kotlin
 * File("in.xml").konsumeXml().use { k ->
 * k.child("root") {}
 * }
 * ```
 *
 * You must not close child konsumers, since that would close the underlying [reader] (child konsumers
 * share the reader with their parent), making any further attempts to read the XML fail.
 */
override fun close() {
reader.close()
}
/**
 * Skips over the rest of the contents of this element: all text nodes and all child elements, including
 * everything nested in them. The terminating event is pushed back to the reader, hence [finish] still needs
 * to be called on this konsumer.
 *
 * WARNING: This effectively disables validation for the contents of this element, since no child elements are
 * expressed as expected. Please see the README on how to skip contents while having support for validation.
 *
 * See the `README.md` file for examples on how to use this function most effectively.
 *
 * @throws KonsumerException on end-of-stream.
 * @throws IllegalStateException if this konsumer is finished, or a child konsumer is still ongoing.
 */
@KonsumerDsl
fun skipContents() {
    checkNotFinished()
    checkNoChildKonsumer()
    attributes.finalize()
    while (reader.hasNext()) {
        val eventType = reader.next()
        when (eventType) {
            StaxEventType.StartElement -> {
                // skip the nested element recursively, using a throw-away child konsumer
                val nested = Konsumer(reader, reader.stax.elementName, settings)
                currentChildKonsumer = nested
                nested.skipContents()
                nested.finish()
            }
            StaxEventType.EndDocument, StaxEventType.EndElement -> {
                reader.pushBack() // so that finish() can be called
                return
            }
            StaxEventType.CData, StaxEventType.Characters -> {
                // text is simply dropped
            }
            else -> throw AssertionError("unexpected $eventType")
        }
    }
    throw KonsumerException(location, this.name, "Expected END_ELEMENT but got end-of-stream")
}
}