
scales.xml.parser.pull.PullIterator.scala (scales-xml_2.11, an alternate Scala XML processing library)
package scales.xml.parser.pull
import javax.xml.stream._
import scales.utils._
import scales.xml.parser._
import strategies.{MemoryOptimisationStrategy, OptimisationToken}
import scales.xml.impl.{FromParser, IsFromParser}
import scales.xml.{AttributeQName, Attributes, CData, Comment, DTD, Declaration, DocLike, EndElem, EndMisc, Misc, PI, Prolog, PullType, QName, ScalesXml, Text, Xml10, Xml11, emptyAttributes}
import scala.annotation.tailrec
/**
* Basis for XML pull parsing: an Iterator[PullType] that is also a DocLike.
* A usage sketch follows the trait definition below.
*/
trait XmlPull extends Iterator[PullType] with DocLike {
type Token <: OptimisationToken
implicit val weAreInAParser : FromParser = IsFromParser
import ScalesXml.defaultVersion
protected[xml] val parser: XMLStreamReader
protected[xml] val resourceCloser: () => Unit
protected[xml] def internalClose {}
/**
* Why is this on a pull parser? The simple answer is that by default it costs little,
* and by choosing a strategy the user can optimise whole streams at a single, early and central place.
* Optimising the stream later, in onQNames, another iteratee or user code, would work against a clean design.
* A hedged example of supplying a strategy follows the token declaration below.
*/
protected[xml] val strategy : MemoryOptimisationStrategy[Token]
protected[xml] val token : Token
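// A hedged sketch of that selection: assuming a pullXml-style factory elsewhere in
// scales-xml accepts a strategy parameter, a caller could cache QNames across the
// whole stream with something like
//   pullXml(source, strategy = someQNameCachingStrategy)
// where `someQNameCachingStrategy` is a placeholder for one of the library's
// MemoryOptimisationStrategy implementations.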
protected[xml] val iStrictPath: List[QName] = Nil
private[xml] var path: List[QName] = Nil
private[xml] var current: PullType = null
/*
* DocLike parts follow
*/
private[xml] var vprolog: Prolog = Prolog()
private[xml] var emisc: EndMisc = EndMisc()
def prolog = vprolog
def end = emisc
/**
* If the depth is -1 then we haven't hit an element yet, and
* start should keep pumping until that happens.
*
* If the depth is -1 again after processing the root's end element (haveRoot),
* then we keep pumping the remaining events into the endMisc.
*/
protected[xml] var depth = -1
protected[xml] var haveRoot = false
/**
* Pumps until the first elem, always collecting the prolog
*/
protected[xml] def start = {
while (depth == -1) {
current = pumpEvent
if (current.isLeft && (current.left.get eq PullUtils.dtdDummy)) {
vprolog = vprolog.copy(dtd = Some(
DTD("", "", "") // DTD has funnyness TODO find out what it looks like
))
}
if (depth == -1) {
vprolog = vprolog.copy(misc = vprolog.misc :+ PullUtils.getMisc(current, "prolog"))
}
}
}
final val it = this: Iterator[PullType]
def hasNext = current ne null
def next: PullType = {
val c = current // cache current
if (current eq null) throw new NoSuchElementException("The end of the document has been reached")
current = pumpEvent // pump for the next
if ((current ne null) && current.isRight && depth == -1) {
// the root's end element has just been pulled; everything after it is collected into the document end misc rather than returned
var ends = pumpEvent
while (ends ne null) {
emisc = emisc.copy(misc = emisc.misc :+ PullUtils.getMisc(ends, "document end Misc"))
ends = pumpEvent
}
}
c // return cached
}
protected[xml] def pumpEvent: PullType = {
if (!parser.hasNext) return null
var nextEvent = XMLStreamConstants.END_DOCUMENT
try {
val (event, num, odepth, oprolog, opath) = PullUtils.pumpEvent(parser, strategy, token, vprolog, depth, iStrictPath, path) { _ => pumpEvent }
nextEvent = num
depth = odepth
vprolog = oprolog
path = opath
event
} finally {
// should we close it?
if (nextEvent == XMLStreamConstants.END_DOCUMENT) {
internalClose
}
}
}
}
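/*
 * A minimal usage sketch, assuming a pullXml-style factory elsewhere in scales-xml
 * produces an XmlPull; only types imported above are matched on.  Because XmlPull
 * is an Iterator[PullType] and a DocLike, a caller can fold over the events and
 * still read the prolog and the trailing misc afterwards:
 *
 *   val pull = pullXml(new java.io.StringReader("<root>hi<!--bye--></root>"))
 *   pull.foreach {
 *     case Left(t: Text)    => println("text: " + t)
 *     case Left(c: Comment) => println("comment: " + c)
 *     case Left(other)      => println("start elem or other item: " + other)
 *     case Right(end)       => println("end elem: " + end)
 *   }
 *   // the prolog is collected up front, the document-end misc as the root closes
 *   println(pull.prolog.decl.version + " misc items after root: " + pull.end.misc.size)
 */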
object PullUtils {
private[xml] final val StartDepth = -1
private[xml] val dtdDummy = PI("onlyforme", "init")
implicit val weAreInAParser : FromParser = IsFromParser
// misc items in the prolog / epilog must be a Left holding a Comment or a PI
def getMisc(c: PullType, in: String): Misc =
c.fold[Misc](e => e match {
case ev: Comment => Left(ev)
case ev: PI => Right(ev)
case _ => error("Got an event (" + e + ") that should not be in the " + in)
}, f => error("End element found in " + in + " " + c))
def getAttributes[Token <: OptimisationToken]( parser: XMLStreamReader, strategy : MemoryOptimisationStrategy[Token], token : Token ): Attributes = {
import ScalesXml.toQName
val count = parser.getAttributeCount()
var i = 0
var map = emptyAttributes
while (i < count) {
val jqname = parser.getAttributeName(i)
val pre = jqname.getPrefix
val local = jqname.getLocalPart
// attr qnames must either be prefixed or have no namespace
val aqname: AttributeQName =
if ((pre eq null) || (pre.length == 0))
// no namespace
strategy.noNamespaceQName(local, token) // the Right of AttributeQName
else
strategy.prefixedQName(local, jqname.getNamespaceURI, pre, token) // the Left of AttributeQName
map = map unsafePlus
strategy.attribute(aqname,
parser.getAttributeValue(i), token)
i += 1
}
map
}
def getNamespaces[Token <: OptimisationToken]( parser: XMLStreamReader, strategy : MemoryOptimisationStrategy[Token], token : Token ): Map[String, String] = {
val count = parser.getNamespaceCount()
var i = 0
var map = Map[String, String]()
while (i < count) {
val pre = parser.getNamespacePrefix(i)
if (pre ne null) {
map += (pre -> parser.getNamespaceURI(i))
} // else skip it: the element itself declares the default namespace; arguably it could also be recorded here since it was declared on this element
i += 1
}
map
}
def getElemQName[Token <: OptimisationToken]( parser: XMLStreamReader, strategy : MemoryOptimisationStrategy[Token], token : Token ) = {
// elems can have all three, prefixed, ns and none
/* val jqname = parser.getName()
val ns = jqname.getNamespaceURI
val pre = jqname.getPrefix
val local = jqname.getLocalPart
*/
val ns = parser.getNamespaceURI
val pre = parser.getPrefix
val local = parser.getLocalName
if ((pre eq null) || (pre.length == 0)) {
// ns only or none
if ((ns eq null) || (ns.length == 0))
strategy.noNamespaceQName(local, token)
else
strategy.unprefixedQName(local, ns, token)
} else
strategy.prefixedQName(local, ns, pre, token)
}
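// Worked example of the three branches above: given
//   <pre:a xmlns:pre="uri:p" xmlns="uri:d"><b/><c xmlns=""/></pre:a>
// pre:a yields a prefixed QName (local "a", prefix "pre", namespace "uri:p"),
// b yields an unprefixed QName in the default namespace "uri:d", and
// c yields a no-namespace QName, matching the three strategy calls above.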
def pumpEvent[Token <: OptimisationToken](parser: XMLStreamReader,
strategy: MemoryOptimisationStrategy[Token],
token: Token,
prolog: Prolog,
idepth: Int,
strictPath: List[QName] = Nil,
ipath: List[QName]= Nil)(otherEventHandler: Either[Int, Throwable] => PullType) : (PullType, Int, Int, Prolog, List[QName]) = {
var depth = idepth
var vprolog = prolog
var vpath = ipath
@tailrec
def dropWhile(): PullType = {
def doStep(event: Int): Option[PullType] = {
event match {
case XMLStreamConstants.START_ELEMENT =>
depth += 1
val elemQName: QName = PullUtils.getElemQName(parser, strategy, token)
vpath = vpath :+ elemQName
if (vpath.equals(strictPath)) {
val attributes = PullUtils.getAttributes(parser, strategy, token)
val namespaces = PullUtils.getNamespaces(parser, strategy, token)
Some(strategy.elem(elemQName, attributes, namespaces, token))
} else {
None
}
case XMLStreamConstants.END_ELEMENT =>
depth -= 1
val isMissingStrictPath = vpath.equals(strictPath.take(strictPath.size - 1))
if (isMissingStrictPath) {
val elemQName = PullUtils.getElemQName(parser, strategy, token)
val namespaces = PullUtils.getNamespaces(parser, strategy, token)
vpath = ipath.take(ipath.size - 1)
Some(EndElem(elemQName, namespaces))
} else {
vpath = vpath.take(vpath.size - 1)
None
}
case XMLStreamConstants.END_DOCUMENT =>
Some(null)
case _ =>
None
}
}
if (parser.hasNext) {
doStep(parser.next()) match {
case Some(res) => res
case None => dropWhile()
}
} else {
error("this should never happen")
}
}
var nextEvent = XMLStreamConstants.END_DOCUMENT // default to END_DOCUMENT so that, if next() throws, callers still see END_DOCUMENT and can shut the parser down while the error is handled
val event: PullType =
try {
nextEvent = parser.next
nextEvent match {
case XMLStreamConstants.START_ELEMENT => //1
depth += 1
if (strictPath.isEmpty)
strategy.elem(getElemQName(parser, strategy, token), getAttributes(parser, strategy, token), getNamespaces(parser, strategy, token), token)
else {
val elemQName = PullUtils.getElemQName(parser, strategy, token)
vpath = ipath :+ elemQName
val validSubtree = vpath.take(strictPath.size).equals(strictPath)
if (idepth == StartDepth || validSubtree) {
val attributes = PullUtils.getAttributes(parser, strategy, token)
val namespaces = PullUtils.getNamespaces(parser, strategy, token)
strategy.elem(elemQName, attributes, namespaces, token)
} else {
dropWhile()
}
}
case XMLStreamConstants.END_ELEMENT => //2
depth -= 1
if (strictPath.isEmpty)
EndElem(getElemQName(parser, strategy, token), getNamespaces(parser, strategy, token))
else {
val elemQName = PullUtils.getElemQName(parser, strategy, token)
val namespaces = PullUtils.getNamespaces(parser, strategy, token)
vpath = ipath.take(ipath.size - 1)
EndElem(elemQName, namespaces)
}
case XMLStreamConstants.CHARACTERS => Text(parser.getText)
case XMLStreamConstants.CDATA => CData(parser.getText)
case XMLStreamConstants.COMMENT => Comment(parser.getText)
case XMLStreamConstants.PROCESSING_INSTRUCTION => PI(parser.getPITarget(), parser.getPIData())
case XMLStreamConstants.SPACE => Text(parser.getText) // the JDK implementation never reports SPACE, but grab it to be safe
case XMLStreamConstants.START_DOCUMENT => {
// get the encoding etc
// NB the asynchronous variety can also reach this; if no more events are available it returns the waiting object
val ec = parser.getCharacterEncodingScheme()
vprolog = vprolog.copy(decl = Declaration(
version = if (parser.getVersion() == "1.1")
Xml11 else Xml10,
encoding = if (ec eq null) defaultCharset else java.nio.charset.Charset.forName(ec), // TODO handle unsupported encodings: throwing is probably fine but irritating, since reaching here means the parser supports it even if we cannot write with it
standalone = parser.isStandalone()))
val (nev, nex, nde, vvp, _) = pumpEvent(parser, strategy, token, vprolog, depth)(otherEventHandler) // START_DOCUMENT itself is not surfaced, recurse for the next real event
// reset to keep the correct values
nextEvent = nex
depth = nde
vprolog = vvp
nev
}
case XMLStreamConstants.DTD => dtdDummy // pushed through so that start() can record the DTD in the prolog
// we don't really want to handle other types?
case _ => otherEventHandler(Left(nextEvent))
}
} catch {
case t: Throwable => otherEventHandler(Right(t))
}
(event, nextEvent, depth, vprolog, vpath)
}
}
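/*
 * A minimal sketch of driving PullUtils.pumpEvent directly over a raw StAX reader,
 * outside of XmlPull.  The object, the drain helper and the someStrategy / someToken
 * placeholders are illustrative assumptions only, written against the pumpEvent
 * signature above.
 */
object PumpEventSketch {
  @tailrec
  def drain[T <: OptimisationToken](reader: XMLStreamReader,
                                    strategy: MemoryOptimisationStrategy[T],
                                    token: T,
                                    prolog: Prolog,
                                    depth: Int,
                                    path: List[QName]): Unit = {
    // delegate a single step to pumpEvent; unexpected StAX events abort the sketch
    val (event, code, newDepth, newProlog, newPath) =
      PullUtils.pumpEvent(reader, strategy, token, prolog, depth, Nil, path) {
        case Left(XMLStreamConstants.END_DOCUMENT) => null // nothing left to emit
        case Left(other) => throw new IllegalStateException("unexpected StAX event " + other)
        case Right(t) => throw t
      }
    if (code != XMLStreamConstants.END_DOCUMENT) {
      println(event) // a real consumer would fold or collect here instead of printing
      drain(reader, strategy, token, newProlog, newDepth, newPath)
    }
  }

  // usage, with placeholder strategy and token values:
  //   val reader = XMLInputFactory.newInstance.createXMLStreamReader(
  //     new java.io.StringReader("<root>hi</root>"))
  //   drain(reader, someStrategy, someToken, Prolog(), -1, Nil)
}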