All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fs2.data.xml.internals.EventParser.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2019 Lucas Satabin
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package fs2
package data
package xml
package internals

import scala.collection.immutable.VectorBuilder

private[xml] object EventParser {

  // ==== utils ====

  // Characters that stop the plain-text scan in `readAttributeValue`:
  // space, tab, carriage return, line feed, '<' and '&'.
  // `readAttributeValue` appends the attribute's own quote character when one is given.
  val valueDelimiters = " \t\r\n<&"

  /** Passes `chunkAcc` to `emitChunk`, then raises an `XmlException` built from
    * the syntax production identifier `prod` and the message `msg`.
    */
  private def fail[F[_], R](prod: String, msg: String, chunkAcc: List[XmlEvent])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, R] =
    emitChunk(chunkAcc).flatMap { _ =>
      Pull.raiseError[F](new XmlException(XmlSyntax(prod), msg))
    }

  /** Looks at the character under the cursor without consuming it.
    *
    * When the current chunk is exhausted, the accumulated events are passed to
    * `emitChunk` and the next chunk is pulled. Yields `None` once the input
    * stream has no more chunks.
    */
  private def peekChar[F[_]](ctx: Context[F]): Pull[F, XmlEvent, Option[Result[F, Char]]] =
    if (!ctx.isEndOfChunk) {
      val current = ctx.chunk(ctx.idx)
      Pull.pure(Some((ctx, current)))
    } else {
      emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
        case None           => Pull.pure(None)
        case Some((hd, tl)) => peekChar(Context(hd, tl))
      }
    }

  /** Consumes and returns the character under the cursor, together with the
    * context advanced by one position.
    *
    * When the current chunk is exhausted, the next one is pulled; if there is
    * none, the pull fails with production "1" ("unexpected end of input").
    */
  private def nextChar[F[_]](ctx: Context[F])(implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, Char]] =
    if (!ctx.isEndOfChunk) {
      val current = ctx.chunk(ctx.idx)
      Pull.pure((ctx.nextIdx, current))
    } else {
      emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
        case None           => fail[F, Result[F, Char]]("1", "unexpected end of input", Nil)
        case Some((hd, tl)) => nextChar(Context(hd, tl))
      }
    }

  /** Tells whether code point `c` is an allowed character.
    *
    * With `is11` set the accepted set is
    * `[#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`, otherwise it is
    * `#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`.
    */
  private def isValid(is11: Boolean, c: Int): Boolean = {
    // the two upper ranges are accepted regardless of `is11`
    val inUpperRanges = (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff)
    // only the part below the surrogate block depends on `is11`
    val inLowerRange =
      if (is11) c >= 0x1 && c <= 0xd7ff
      else c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
    inLowerRange || inUpperRanges
  }

  /** Tells whether `c` may start an NCName: an underscore, or a character whose
    * Unicode general category is Ll, Lu, Lo, Lt or Nl.
    */
  private def isNCNameStart(c: Char): Boolean = {
    import java.lang.Character._
    val category = getType(c).toByte
    val isLetterLike =
      category == LOWERCASE_LETTER ||
        category == UPPERCASE_LETTER ||
        category == OTHER_LETTER ||
        category == TITLECASE_LETTER ||
        category == LETTER_NUMBER
    isLetterLike || c == '_'
  }

  /** Tells whether `c` may appear inside an NCName: anything accepted by
    * `isNCNameStart`, a character in Unicode category Mc, Me, Mn, Lm or Nd,
    * or one of '.', '-' and '·'.
    */
  private def isNCNameChar(c: Char): Boolean =
    if (isNCNameStart(c)) {
      true
    } else {
      import java.lang.Character._
      getType(c).toByte match {
        case COMBINING_SPACING_MARK | ENCLOSING_MARK | NON_SPACING_MARK | MODIFIER_LETTER | DECIMAL_DIGIT_NUMBER =>
          true
        case _ =>
          ".-·".contains(c)
      }
    }

  /** Tells whether `c` is a space, tab, carriage return or line feed. */
  private def isXmlWhitespace(c: Char): Boolean =
    c match {
      case ' ' | '\t' | '\r' | '\n' => true
      case _                        => false
    }

  /** Consumes the next character when it equals `c` and returns the advanced
    * context; otherwise fails with production `error` and message `msg`.
    *
    * When the current chunk is exhausted the next one is pulled; running out of
    * input is reported with the same `error`/`msg`.
    */
  private def accept[F[_]](ctx: Context[F], c: Char, error: String, msg: String)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Context[F]] =
    if (!ctx.isEndOfChunk) {
      val matches = ctx.chunk(ctx.idx) == c
      if (matches) Pull.pure(ctx.nextIdx)
      else fail[F, Context[F]](error, msg, ctx.chunkAcc)
    } else {
      emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
        case None           => fail[F, Context[F]](error, msg, Nil)
        case Some((hd, tl)) => accept(Context(hd, tl), c, error, msg)
      }
    }

  /** Consumes as many leading characters of `s` as the input provides.
    *
    * Returns the context positioned after the last matching character together
    * with the number of characters of `s` that matched (`s.length` on a full
    * match). On a mismatch the offending character is left unconsumed. When the
    * input ends before `s` is fully matched, `Context.eos` is returned with the
    * count matched so far.
    */
  private def accept[F[_]](ctx: Context[F], s: String)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, Int]] = {
    def loop(ctx: Context[F], sidx: Int): Pull[F, XmlEvent, Result[F, Int]] =
      if (sidx >= s.length) {
        Pull.pure((ctx, s.length))
      } else if (ctx.isEndOfChunk) {
        emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
          // Continue at the current position in `s`: restarting from index 0 here
          // would break any string that straddles a chunk boundary.
          case Some((hd, tl)) => loop(Context(hd, tl), sidx)
          case None           => Pull.pure((Context.eos, sidx))
        }
      } else {
        if (ctx.chunk(ctx.idx) == s.charAt(sidx))
          loop(ctx.nextIdx, sidx + 1)
        else
          Pull.pure((ctx, sidx))
      }
    loop(ctx, 0)
  }

  /** Consumes the whole string `s` from the input and returns the context
    * positioned after it; fails with production `error` and message `msg` when
    * the input does not start with `s`.
    */
  private def accept[F[_]](ctx: Context[F], s: String, error: String, msg: String)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Context[F]] =
    accept(ctx, s).flatMap {
      case (ctx, n) if n == s.length => Pull.pure(ctx)
      // Hand the events accumulated so far to `fail` so they are not lost,
      // as the single-character `accept` does.
      case (ctx, _) => fail[F, Context[F]](error, msg, ctx.chunkAcc)
    }

  /** Consumes the next character when it satisfies `p` and returns it with the
    * advanced context; fails with production `error` and message `msg` when the
    * character does not satisfy `p` or the input is exhausted.
    */
  private def assert[F[_]](ctx: Context[F], p: Char => Boolean, error: String, msg: String)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, Char]] =
    peekChar(ctx).flatMap {
      case Some((ctx, c)) if p(c) => Pull.pure((ctx.nextIdx, c))
      // Predicate mismatch: hand the accumulated events to `fail` so they are not lost.
      case Some((ctx, _)) => fail[F, Result[F, Char]](error, msg, ctx.chunkAcc)
      // End of input: `peekChar` has already passed the accumulator to `emitChunk`.
      case None => fail[F, Result[F, Char]](error, msg, Nil)
    }

  /** Appends input characters to `sb` until one satisfies `p`.
    *
    * The returned context is positioned on the character that satisfied `p`
    * (which is not consumed), or is `Context.eos` when the input ended first.
    * `sb` is mutated in place.
    */
  private def untilChar[F[_]](ctx: Context[F], p: Char => Boolean, sb: StringBuilder): Pull[F, XmlEvent, Context[F]] =
    if (!ctx.isEndOfChunk) {
      val current = ctx.chunk(ctx.idx)
      if (p(current)) {
        Pull.pure(ctx)
      } else {
        sb.append(current)
        untilChar(ctx.nextIdx, p, sb)
      }
    } else {
      emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
        case None           => Pull.pure(Context.eos)
        case Some((hd, tl)) => untilChar(Context(hd, tl), p, sb)
      }
    }

  // ==== low-level internals ====

  /** Reads an NCName: one character accepted by `isNCNameStart` followed by
    * every subsequent character accepted by `isNCNameChar`.
    *
    * Fails with production "5" when the first character cannot start a name,
    * and with production "1" when the input is exhausted before any character
    * is read.
    */
  private def readNCName[F[_]](ctx: Context[F])(implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, String]] =
    if (!ctx.isEndOfChunk) {
      val first = ctx.chunk(ctx.idx)
      if (!isNCNameStart(first)) {
        fail[F, Result[F, String]]("5", s"character '$first' cannot start a NCName", ctx.chunkAcc)
      } else {
        val name = new StringBuilder
        name.append(first)
        untilChar(ctx.nextIdx, ch => !isNCNameChar(ch), name).map { end =>
          (end, name.result())
        }
      }
    } else {
      emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
        case None           => fail[F, Result[F, String]]("1", "unexpected end of input", Nil)
        case Some((hd, tl)) => readNCName(Context(hd, tl))
      }
    }

  /** Reads a qualified name: an NCName optionally followed by ':' and a second
    * NCName. With a colon the first part becomes the prefix and the second the
    * local name; without one the single NCName is the local name and there is
    * no prefix.
    */
  private def readQName[F[_]](ctx: Context[F])(implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, QName]] = {
    // decides, after the first NCName, whether a ":local" part follows
    def afterFirstPart(ctx: Context[F], first: String): Pull[F, XmlEvent, Result[F, QName]] =
      if (!ctx.isEndOfChunk) {
        if (ctx.chunk(ctx.idx) == ':')
          readNCName(ctx.nextIdx).map {
            case (next, local) => (next, QName(Some(first), local))
          }
        else
          Pull.pure((ctx, QName(None, first)))
      } else {
        emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
          case None           => Pull.pure((Context.eos, QName(None, first)))
          case Some((hd, tl)) => afterFirstPart(Context(hd, tl), first)
        }
      }

    readNCName(ctx).flatMap {
      case (next, first) => afterFirstPart(next, first)
    }
  }

  /** Skips any run of XML whitespace and returns the context positioned on the
    * first non-whitespace character, or `Context.eos` when the input ends first.
    */
  private def space[F[_]](ctx: Context[F]): Pull[F, XmlEvent, Context[F]] =
    if (!ctx.isEndOfChunk) {
      val atWhitespace = isXmlWhitespace(ctx.chunk(ctx.idx))
      if (atWhitespace) space(ctx.nextIdx)
      else Pull.pure(ctx)
    } else {
      emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
        case None           => Pull.pure(Context.eos)
        case Some((hd, tl)) => space(Context(hd, tl))
      }
    }
  /** Reads the beginning of a markup construct and classifies it.
    *
    * Expects '<' as the next character (fails with production "43" otherwise),
    * then dispatches on the character that follows:
    *  - '/' reads a qualified name, optional whitespace and '>' and yields an `EndToken`;
    *  - '?' reads the target name and yields a `PIToken` (the body is not read here);
    *  - '!' delegates to `skipComment` on '-', to `readCDATA` on '[', and otherwise
    *    reads a name and yields a `DeclToken`;
    *  - anything else reads a qualified name and yields a `StartToken`
    *    (attributes and the closing '>' are not read here).
    *
    * Fails with production "1" when the input ends right after '<' or '<!'.
    */
  private def readMarkupToken[F[_]](ctx: Context[F])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, MarkupToken]] =
    accept(ctx, '<', "43", "expected token start").flatMap { ctx =>
      def read(ctx: Context[F]): Pull[F, XmlEvent, Result[F, MarkupToken]] =
        if (ctx.isEndOfChunk) {
          // current chunk exhausted: flush accumulated events and continue on the next chunk
          emitChunk(ctx.chunkAcc) >> ctx.rest.pull.uncons.flatMap {
            case Some((hd, tl)) => read(Context(hd, tl))
            case None           => fail[F, Result[F, MarkupToken]]("1", "unexpected end of input", Nil)
          }
        } else {
          ctx.chunk(ctx.idx) match {
            case '/' =>
              // closing tag: name, optional whitespace, '>'
              for {
                (ctx, qname) <- readQName(ctx.nextIdx)
                ctx <- space(ctx)
                ctx <- accept(ctx, '>', "42", "missing '>' at the end of closing tag")
              } yield (ctx, MarkupToken.EndToken(qname))
            case '?' =>
              // processing instruction: only the target is read here
              readNCName(ctx.nextIdx).map {
                case (ctx, name) => (ctx, MarkupToken.PIToken(name))
              }
            case '!' =>
              // comment, CDATA section or declaration, depending on the next character
              peekChar(ctx.nextIdx).flatMap {
                case Some((ctx, '-')) =>
                  skipComment(ctx.nextIdx)
                case Some((ctx, '[')) =>
                  readCDATA(ctx.nextIdx)
                case Some((ctx, _)) =>
                  readNCName(ctx).map {
                    case (ctx, name) =>
                      (ctx, MarkupToken.DeclToken(name))
                  }
                case None =>
                  fail[F, Result[F, MarkupToken]]("1", "unexpected end of input", Nil)
              }
            case _ =>
              // opening tag: the current character is the first character of the name
              readQName(ctx).map {
                case (ctx, name) => (ctx, MarkupToken.StartToken(name))
              }
          }
        }
      read(ctx)
    }

  /** We have read '
      def loop(ctx: Context[F]): Pull[F, XmlEvent, Context[F]] =
        nextChar(ctx).flatMap {
          case (ctx, '-') =>
            nextChar(ctx).flatMap {
              case (ctx, '-') =>
                accept(ctx, '>', "15", "'--' is not inside comments")
              case (ctx, _) =>
                loop(ctx)
            }
          case (ctx, _) =>
            loop(ctx)
        }
      loop(ctx).map { ctx =>
        (ctx, MarkupToken.CommentToken)
      }
    }

  /** We have read '
      (ctx, MarkupToken.CDataToken)
    }

  /** Reads the body of a processing instruction, the target having just been read.
    *
    * Leading whitespace is skipped, then everything up to (and excluding) the
    * terminating "?>" is returned. A '?' that is not followed by '>' is kept as
    * part of the body. Fails with production "16" when the input ends before "?>".
    */
  private def readPIBody[F[_]](ctx: Context[F])(implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, String]] = {
    // a '?' has just been consumed: decide whether it closes the instruction
    def afterQuestionMark(ctx: Context[F], body: StringBuilder): Pull[F, XmlEvent, Result[F, String]] =
      peekChar(ctx).flatMap {
        case None =>
          fail[F, Result[F, String]]("16", "unexpected end of input", Nil)
        case Some((next, '>')) =>
          Pull.pure((next.nextIdx, body.result()))
        case Some((next, _)) =>
          scan(next, body.append('?'))
      }

    // copies characters into `body` up to the next '?'
    def scan(ctx: Context[F], body: StringBuilder): Pull[F, XmlEvent, Result[F, String]] =
      untilChar(ctx, _ == '?', body).flatMap { atMark =>
        accept(atMark, '?', "16", "unexpected end of input").flatMap { afterMark =>
          afterQuestionMark(afterMark, body)
        }
      }

    space(ctx).flatMap { start =>
      scan(start, new StringBuilder)
    }
  }

  /** Skips the internal DTD subset, whose opening '[' has already been read.
    *
    * Characters are discarded until a ']' followed (after optional whitespace)
    * by '>' is found; the returned context is positioned right after that '>'.
    * Fails with production "1" when the input ends first.
    */
  private def skipInternalDTD[F[_]](ctx: Context[F])(implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Context[F]] =
    nextChar(ctx).flatMap {
      case (ctx, ']') =>
        // the doctype grammar allows whitespace between the closing ']' and '>'
        space(ctx).flatMap { ctx =>
          peekChar(ctx).flatMap {
            case Some((ctx, '>')) =>
              Pull.pure(ctx.nextIdx)
            case Some((ctx, _)) =>
              // Do not consume this character: it may itself be the ']' that
              // starts the real terminator (e.g. in "]]>").
              skipInternalDTD(ctx)
            case None =>
              fail[F, Context[F]]("1", "unexpected end of input", Nil)
          }
        }
      case (ctx, _) => skipInternalDTD(ctx)
    }

  /** Reads an external identifier introduced by the keyword SYSTEM or PUBLIC.
    *
    * For SYSTEM a single quoted literal is read and returned. For PUBLIC a
    * public-id literal, mandatory whitespace and a second quoted literal are
    * read; only the second literal is returned. Any other keyword fails with
    * production "75".
    */
  private def readExternalID[F[_]](ctx: Context[F])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, String]] = {
    // reads the literal(s) that follow the keyword and its mandatory whitespace
    def literals(ctx: Context[F], keyword: String): Pull[F, XmlEvent, Result[F, String]] =
      if (keyword == "SYSTEM") {
        readQuoted(ctx, false, "11")
      } else if (keyword == "PUBLIC") {
        readQuoted(ctx, true, "12").flatMap {
          case (afterPubid, _) =>
            assert(afterPubid, isXmlWhitespace(_), "12", "space required after PubidLiteral").flatMap {
              case (afterSpace, _) => readQuoted(afterSpace, false, "12")
            }
        }
      } else {
        fail[F, Result[F, String]]("75", "SYSTEM or PUBLIC expected", ctx.chunkAcc)
      }

    readNCName(ctx).flatMap {
      case (afterKeyword, keyword) =>
        assert(afterKeyword, isXmlWhitespace(_), "75", "space required after SYSTEM or PUBLIC").flatMap {
          case (afterSpace, _) => literals(afterSpace, keyword)
        }
    }
  }

  /** Reads a literal enclosed in single or double quotes, after skipping
    * leading whitespace, and returns its content without the quotes.
    *
    * @param pub   when true the content is restricted to public-identifier
    *              characters (PubidChar); otherwise any character other than
    *              the delimiter is accepted
    * @param error production identifier reported when the opening or closing
    *              quote is missing
    */
  private def readQuoted[F[_]](ctx: Context[F], pub: Boolean, error: String)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, String]] =
    space(ctx).flatMap { ctx =>
      assert(ctx, c => c == '"' || c == '\'', error, "single or double quote expected")
        .flatMap {
          case (ctx, delimiter) =>
            // PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
            def isPubidChar(c: Char): Boolean =
              c == 0x20 || c == 0xd || c == 0xa ||
                ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') ||
                "-'()+,./:=?;!*#@$_%".contains(c)

            // The scan always stops on the delimiter. This matters for a
            // single-quoted public id, where the quote is itself a PubidChar
            // and must still terminate the literal.
            val stop: Char => Boolean =
              if (pub)
                c => c == delimiter || !isPubidChar(c)
              else
                c => c == delimiter

            val sb = new StringBuilder
            untilChar(ctx, stop, sb).flatMap { ctx =>
              // Verify the closing quote instead of blindly skipping one character:
              // the scan may have stopped on an illegal character or on end of input.
              accept(ctx, delimiter, error, "missing closing quote").map { ctx =>
                (ctx, sb.result())
              }
            }
        }
    }

  /** Skips whitespace and comments, then returns the next markup token.
    *
    * Processing-instruction, declaration and start tokens are returned as-is
    * (wrapped in `Some`); any other token, or a character other than '<',
    * fails with production "22". Yields `None` when the input is exhausted.
    */
  private def scanMisc[F[_]](ctx: Context[F])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Option[Result[F, MarkupToken]]] = {
    def onMarkup(ctx: Context[F]): Pull[F, XmlEvent, Option[Result[F, MarkupToken]]] =
      readMarkupToken(ctx).flatMap {
        case (next, MarkupToken.CommentToken) =>
          scanMisc(next)
        case res @ (_, MarkupToken.PIToken(_) | MarkupToken.DeclToken(_) | MarkupToken.StartToken(_)) =>
          Pull.pure(Some(res))
        case (next, t) =>
          fail[F, Option[Result[F, MarkupToken]]]("22", s"unexpected token '$t'", next.chunkAcc)
      }

    space(ctx).flatMap { afterSpace =>
      peekChar(afterSpace).flatMap {
        case None =>
          Pull.pure(None)
        case Some((next, '<')) =>
          onMarkup(next)
        case Some((next, c)) =>
          fail[F, Option[Result[F, MarkupToken]]]("22", s"unexpected character '$c'", next.chunkAcc)
      }
    }
  }

  /** Reads the remainder of a character reference, "&#" having been read so far.
    *
    * A leading 'x' selects hexadecimal digits, otherwise decimal digits are
    * read. The reference must end with ';' (production "66" otherwise) and the
    * resulting code point must satisfy `isValid` (production "2" otherwise).
    */
  def readCharRef[F[_]](ctx: Context[F], is11: Boolean)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, Int]] = {
    // checks the terminating ';' and the validity of the code point
    def finish(ctx: Context[F], codePoint: Int): Pull[F, XmlEvent, Result[F, Int]] =
      nextChar(ctx).flatMap {
        case (next, ';') =>
          if (!isValid(is11, codePoint))
            fail[F, Result[F, Int]]("2", "invalid character", next.chunkAcc)
          else
            Pull.pure((next, codePoint))
        case _ =>
          fail[F, Result[F, Int]]("66", "character reference must end with a semicolon", ctx.chunkAcc)
      }

    peekChar(ctx).flatMap {
      case None =>
        fail[F, Result[F, Int]]("66", "unexpected end of input", Nil)
      case Some((next, first)) =>
        val digits =
          if (first == 'x') readNum(next.nextIdx, 16)
          else readNum(next, 10)
        digits.flatMap {
          case (afterDigits, n) => finish(afterDigits, n)
        }
    }
  }

  /** Reads a non-empty run of digits in the given `base` (10 or 16) and returns
    * its value.
    *
    * Fails with production "66" when the first character is not a digit. The
    * value saturates once it exceeds `Character.MAX_CODE_POINT`: further digits
    * are consumed but no longer accumulated, so an over-long number can never
    * wrap around `Int` and come out as a small, valid-looking code point.
    * NOTE(review): this assumes the result is only used as a code point, as in
    * `readCharRef` — confirm no other caller needs larger values.
    */
  private def readNum[F[_]](ctx: Context[F], base: Int)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, Int]] = {
    // extractor yielding the numeric value of a digit valid in `base`
    object Digit {
      def unapply(c: Char): Option[Int] =
        if ((base == 10 || base == 16) && '0' <= c && c <= '9')
          Some(c - '0')
        else if (base == 16 && 'a' <= c && c <= 'f')
          Some(c - 'a' + 10)
        else if (base == 16 && 'A' <= c && c <= 'F')
          Some(c - 'A' + 10)
        else
          None
    }

    def restNum(ctx: Context[F], acc: Int): Pull[F, XmlEvent, Result[F, Int]] =
      peekChar(ctx).flatMap {
        case Some((ctx, Digit(d))) =>
          // acc <= MAX_CODE_POINT guarantees acc * base + d fits in an Int
          val next = if (acc > Character.MAX_CODE_POINT) acc else acc * base + d
          restNum(ctx.nextIdx, next)
        case Some((ctx, _)) =>
          Pull.pure((ctx, acc))
        case None =>
          Pull.pure((Context.eos, acc))
      }

    nextChar(ctx).flatMap {
      case (ctx, Digit(d)) => restNum(ctx, d)
      case (ctx, _)        => fail[F, Result[F, Int]]("66", "bad first character reference digit", ctx.chunkAcc)
    }
  }

  // ==== middle-level internals ====

  /** Reads the attributes of a start tag, in document order.
    *
    * Each attribute is a qualified name, '=' (optionally surrounded by
    * whitespace) and a quoted value read by `readAttributeValue`. Reading stops
    * at the first character that cannot start a name; that character is left
    * unconsumed. Fails with production "1" when the input ends inside the tag.
    *
    * `tname` is not used by this implementation.
    */
  private def readAttributes[F[_]](ctx: Context[F], is11: Boolean, tname: QName)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, List[Attr]]] = {
    def loop(ctx: Context[F], attributes: VectorBuilder[Attr]): Pull[F, XmlEvent, Result[F, List[Attr]]] =
      space(ctx).flatMap { ctx =>
        peekChar(ctx).flatMap {
          case Some((ctx, c)) if isNCNameStart(c) =>
            for {
              (ctx, name) <- readQName(ctx)
              ctx <- space(ctx)
              ctx <- accept(ctx, '=', "25", "'=' character expected")
              ctx <- space(ctx)
              // the opening quote decides which character closes the value
              (ctx, delimiter) <- assert(ctx,
                                         c => c == '"' || c == '\'',
                                         "10",
                                         "single or double quote expected around attribute value")
              (ctx, value) <- readAttributeValue(ctx, is11, Some(delimiter), new StringBuilder, new VectorBuilder)
              res <- loop(ctx, attributes += Attr(name, value))
            } yield res
          // anything that cannot start a name ends the attribute list
          case Some((ctx, _)) => Pull.pure((ctx, attributes.result().toList))
          case None           => fail[F, Result[F, List[Attr]]]("1", "unexpected end of input", Nil)
        }
      }
    loop(ctx, new VectorBuilder)
  }

  /** Reads an attribute value up to its closing delimiter, the opening quote
    * having already been consumed.
    *
    * The value is returned as a list of text fragments, character references
    * and entity references, in document order.
    *
    * @param delim   the quote character that closes the value, if any
    * @param current buffer holding the text fragment being accumulated
    * @param builder fragments completed so far
    */
  private def readAttributeValue[F[_]](ctx: Context[F],
                                       is11: Boolean,
                                       delim: Option[Char],
                                       current: StringBuilder,
                                       builder: VectorBuilder[XmlEvent.XmlTexty])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, List[XmlEvent.XmlTexty]]] = {
    val delimiters = delim.fold(valueDelimiters)(valueDelimiters + _)
    untilChar(ctx, delimiters.contains(_), current).flatMap { ctx =>
      nextChar(ctx).flatMap {
        case (ctx, c) if Some(c) == delim =>
          // closing quote: flush the pending text, if any, and finish
          if (!current.isEmpty)
            builder += XmlEvent.XmlString(current.toString, false)
          Pull.pure((ctx, builder.result().toList))
        case (ctx, '\r') =>
          // Peek rather than consume: after a lone '\r' the following character
          // (possibly the closing quote) still has to be processed normally.
          // NOTE(review): "\r\n" yields '\n' while a bare '\n' yields ' ' below;
          // kept as in the original — confirm the intended normalization.
          peekChar(ctx).flatMap {
            case Some((ctx, '\n')) =>
              readAttributeValue(ctx.nextIdx, is11, delim, current.append('\n'), builder)
            case Some((ctx, _)) =>
              readAttributeValue(ctx, is11, delim, current.append(' '), builder)
            case None =>
              fail[F, Result[F, List[XmlEvent.XmlTexty]]]("1", "unexpected end of input", Nil)
          }
        case ((ctx, c)) if isXmlWhitespace(c) =>
          readAttributeValue(ctx, is11, delim, current.append(' '), builder)
        case (ctx, '&') =>
          // a reference ends the current text fragment
          builder += XmlEvent.XmlString(current.toString, false)
          peekChar(ctx).flatMap {
            case Some((ctx, '#')) =>
              readCharRef(ctx.nextIdx, is11).flatMap {
                case (ctx, n) =>
                  builder += XmlEvent.XmlCharRef(n)
                  readAttributeValue(ctx, is11, delim, new StringBuilder, builder)
              }
            case Some((ctx, _)) =>
              readNamedEntity(ctx).flatMap {
                case (ctx, s) =>
                  builder += XmlEvent.XmlEntityRef(s)
                  readAttributeValue(ctx, is11, delim, new StringBuilder, builder)
              }
            case None =>
              fail[F, Result[F, List[XmlEvent.XmlTexty]]]("1", "unexpected end of input", Nil)
          }
        case (ctx, c) =>
          // only '<' (or the wrong quote when `delim` is empty) can reach this point
          fail[F, Result[F, List[XmlEvent.XmlTexty]]]("10", s"unexpected character '$c'", ctx.chunkAcc)
      }
    }
  }

  /** Reads the name of an entity reference and its terminating ';'.
    * Fails with production "68" when the semicolon is missing.
    */
  private def readNamedEntity[F[_]](ctx: Context[F])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, String]] =
    readNCName(ctx).flatMap {
      case (afterName, entity) =>
        accept(afterName, ';', "68", "named entity must end with a semicolon").map { afterSemicolon =>
          (afterSemicolon, entity)
        }
    }

  /** Finishes reading a start tag whose name has already been read: reads the
    * attributes, an optional '/' marking an empty element, and the closing '>'.
    * Fails with production "44" when the tag is not properly closed.
    */
  private def completeStartTag[F[_]](ctx: Context[F], is11: Boolean, name: QName)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, XmlEvent.StartTag]] = {
    // consumes the '/' of an empty-element tag when present
    def selfClosing(ctx: Context[F]): Pull[F, XmlEvent, Result[F, Boolean]] =
      peekChar(ctx).flatMap {
        case None              => fail[F, Result[F, Boolean]]("44", "unexpected end of input", Nil)
        case Some((next, '/')) => Pull.pure((next.nextIdx, true))
        case Some((next, _))   => Pull.pure((next, false))
      }

    readAttributes(ctx, is11, name).flatMap {
      case (afterAttrs, attributes) =>
        space(afterAttrs).flatMap { afterSpace =>
          selfClosing(afterSpace).flatMap {
            case (beforeClose, isEmpty) =>
              accept(beforeClose, '>', "44", "missing closing '>'").map { afterClose =>
                (afterClose, XmlEvent.StartTag(name, attributes, isEmpty))
              }
          }
        }
    }
  }

  /** Reads the content of a CDATA section up to the closing "]]>", the opening
    * "<![CDATA[" having been read so far.
    *
    * Line ends are normalized: both "\r\n" and a lone '\r' become '\n'.
    * The sequence "&gt;" is replaced by '>'; any other '&' is kept verbatim.
    * Fails with production "1" when the input ends inside the section.
    */
  private def readCDATABody[F[_]](ctx: Context[F], sb: StringBuilder)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, String]] =
    untilChar(ctx, c => c == '\n' || c == '\r' || c == ']' || c == '&', sb).flatMap { ctx =>
      nextChar(ctx).flatMap {
        case (ctx, '\n') =>
          readCDATABody(ctx, sb.append('\n'))
        case (ctx, ']') =>
          peekChar(ctx).flatMap {
            case Some((ctx, ']')) =>
              // "]]" read: the section ends only if '>' follows
              checkCDATAEnd(ctx.nextIdx, sb).flatMap {
                case (ctx, true)  => Pull.pure((ctx, sb.result()))
                case (ctx, false) => readCDATABody(ctx, sb)
              }
            case Some((ctx, _)) =>
              readCDATABody(ctx, sb.append(']'))
            case None =>
              fail[F, Result[F, String]]("1", "unexpected end of input", Nil)
          }
        case (ctx, '&') =>
          accept(ctx, "gt;").flatMap {
            case (ctx, n) =>
              if (n == 3) {
                sb.append('>')
              } else {
                // not "&gt;": restore the '&' and the characters consumed so far
                sb.append('&')
                for (i <- 0 until n)
                  sb.append("gt;".charAt(i))
              }
              readCDATABody(ctx, sb)
          }
        case (ctx, _) =>
          // must be '\r'
          peekChar(ctx).flatMap {
            case Some((ctx, c)) =>
              if (c == '\n')
                readCDATABody(ctx.nextIdx, sb.append('\n'))
              else
                // a lone '\r' is a line end and normalizes to '\n', as in `slowPath`
                readCDATABody(ctx, sb.append('\n'))
            case None =>
              fail[F, Result[F, String]]("1", "unexpected end of input", Nil)
          }
      }
    }

  /** Decides whether a CDATA section ends here, "]]" having just been read.
    *
    * Returns `true` (with the '>' consumed) when the next character is '>'.
    * An extra ']' is appended to `sb` and the check is repeated on the
    * following character. For any other character "]]" is appended to `sb`
    * and `false` is returned with that character left unconsumed.
    * Fails with production "1" when the input is exhausted.
    */
  def checkCDATAEnd[F[_]](ctx: Context[F], sb: StringBuilder)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, Boolean]] =
    peekChar(ctx).flatMap {
      case None =>
        fail[F, Result[F, Boolean]]("1", "unexpected end of input", Nil)
      case Some((next, ']')) =>
        // one more ']' than needed: the oldest one belongs to the content
        sb.append(']')
        checkCDATAEnd(next.nextIdx, sb)
      case Some((next, '>')) =>
        Pull.pure((next.nextIdx, true))
      case Some((next, _)) =>
        // false alarm: the two brackets were plain content
        sb.append("]]")
        Pull.pure((next, false))
    }

  /** Reads the next event inside element content.
    *
    * '<' introduces markup (comments are skipped; CDATA, end tags, start tags
    * and non-"xml" processing instructions produce their event; anything else
    * fails), '&' introduces a character or entity reference, and any other
    * character starts a text run read by `slowPath`. At end of input
    * `XmlEvent.EndDocument` is returned.
    */
  private def readCharData[F[_]](ctx: Context[F], is11: Boolean)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, XmlEvent]] = {
    // the cursor is on '<'
    def markup(ctx: Context[F]): Pull[F, XmlEvent, Result[F, XmlEvent]] =
      readMarkupToken(ctx).flatMap {
        case (next, MarkupToken.CommentToken) =>
          readCharData(next, is11)
        case (next, MarkupToken.DeclToken(n)) =>
          fail[F, Result[F, XmlEvent]]("14", s"unexpected declaration '$n'", next.chunkAcc)
        case (next, MarkupToken.CDataToken) =>
          readCDATABody(next, new StringBuilder).map {
            case (end, body) => (end, XmlEvent.XmlString(body, true))
          }
        case (next, MarkupToken.EndToken(name)) =>
          Pull.pure((next, XmlEvent.EndTag(name)))
        case (next, MarkupToken.StartToken(name)) =>
          completeStartTag(next, is11, name)
        case (next, MarkupToken.PIToken(target)) if !target.equalsIgnoreCase("xml") =>
          readPIBody(next).map {
            case (end, body) => (end, XmlEvent.XmlPI(target, body))
          }
        case (next, t) =>
          fail[F, Result[F, XmlEvent]]("43", s"unexpected token ${t.render}", next.chunkAcc)
      }

    // the '&' has been skipped by the caller
    def reference(ctx: Context[F]): Pull[F, XmlEvent, Result[F, XmlEvent]] =
      peekChar(ctx).flatMap {
        case None =>
          fail[F, Result[F, XmlEvent]]("1", "unexpected end of input", Nil)
        case Some((next, '#')) =>
          readCharRef(next.nextIdx, is11).map {
            case (end, n) => (end, XmlEvent.XmlCharRef(n))
          }
        case Some((next, _)) =>
          readNamedEntity(next).map {
            case (end, v) => (end, XmlEvent.XmlEntityRef(v))
          }
      }

    peekChar(ctx).flatMap {
      case None =>
        Pull.pure((Context.eos, XmlEvent.EndDocument))
      case Some((next, '<')) =>
        markup(next)
      case Some((next, '&')) =>
        reference(next.nextIdx)
      case Some((next, _)) =>
        slowPath(next, new StringBuilder)
    }
  }

  /** Reads a run of character data into `sb` and returns it as a non-CDATA
    * `XmlString`.
    *
    * The run ends before '<' or '&', or at end of input. A '\r', whether or
    * not followed by '\n', is recorded as a single '\n'. Fails with production
    * "14" when the input ends right after a '\r'.
    */
  private def slowPath[F[_]](ctx: Context[F], sb: StringBuilder)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, XmlEvent.XmlString]] = {
    // the cursor is just after a '\r'
    def afterCarriageReturn(ctx: Context[F]): Pull[F, XmlEvent, Result[F, XmlEvent.XmlString]] =
      peekChar(ctx).flatMap {
        case None =>
          fail[F, Result[F, XmlEvent.XmlString]]("14", "unexpected end of input", Nil)
        case Some((next, '\n')) =>
          sb.append('\n')
          slowPath(next.nextIdx, sb)
        case Some((next, _)) =>
          sb.append('\n')
          slowPath(next, sb)
      }

    untilChar(ctx, c => c == '<' || c == '&' || c == '\r', sb).flatMap { stopped =>
      peekChar(stopped).flatMap {
        case Some((next, '<' | '&')) =>
          Pull.pure((next, XmlEvent.XmlString(sb.toString, false)))
        case None =>
          Pull.pure((stopped, XmlEvent.XmlString(sb.toString, false)))
        case Some((next, _)) =>
          // the only remaining stop character is '\r'
          afterCarriageReturn(next.nextIdx)
      }
    }
  }

  // ==== high-level internals

  /** Entry state of the prolog: the only place where an XML declaration
    * ("<?xml ...?>") is accepted.
    *
    * After the declaration, another processing instruction, or a comment,
    * scanning continues with `scanPrologToken1`; after a declaration token with
    * `scanPrologToken2`; a start token begins the root element. Yields `None`
    * when the input is empty.
    */
  private def scanPrologToken0[F[_]](ctx: Context[F])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Option[Context[F]]] = {
    // the cursor is on '<'
    def onMarkup(ctx: Context[F]): Pull[F, XmlEvent, Option[Context[F]]] =
      readMarkupToken(ctx).flatMap {
        case (next, MarkupToken.PIToken(target)) if target.equalsIgnoreCase("xml") =>
          handleXmlDecl(next).flatMap {
            case (afterDecl, (is11, decl)) => scanPrologToken1(afterDecl.accumulate(decl), is11)
          }
        case (next, MarkupToken.PIToken(target)) =>
          readPIBody(next).flatMap {
            case (afterPI, body) => scanPrologToken1(afterPI.accumulate(XmlEvent.XmlPI(target, body)), false)
          }
        case (next, MarkupToken.DeclToken(declName)) =>
          handleDecl(next, declName).flatMap(scanPrologToken2(_, false))
        case (next, MarkupToken.StartToken(tag)) =>
          readElement(next, false, tag).map(Some(_))
        case (next, MarkupToken.CommentToken) =>
          scanPrologToken1(next, false)
        case (next, t) =>
          fail[F, Option[Context[F]]]("22", s"unexpected markup $t", next.chunkAcc)
      }

    peekChar(ctx).flatMap {
      case None              => Pull.pure(None)
      case Some((next, '<')) => onMarkup(next)
      case Some((next, _))   => scanPrologToken1(next, false)
    }
  }

  /** Prolog state reached once an XML declaration can no longer appear.
    *
    * Processing instructions other than "xml" are accumulated and scanning
    * stays in this state; a declaration token moves on to `scanPrologToken2`;
    * a start token begins the root element. Any other token fails with
    * production "22". Yields `None` when the input is exhausted.
    */
  private def scanPrologToken1[F[_]](ctx: Context[F], is11: Boolean)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Option[Context[F]]] =
    scanMisc(ctx).flatMap {
      case None =>
        Pull.pure(None)
      case Some((next, MarkupToken.PIToken(target))) if !target.equalsIgnoreCase("xml") =>
        readPIBody(next).flatMap {
          case (afterPI, body) =>
            val pi = XmlEvent.XmlPI(target, body)
            scanPrologToken1(afterPI.accumulate(pi), is11)
        }
      case Some((next, MarkupToken.DeclToken(declName))) =>
        handleDecl(next, declName).flatMap(scanPrologToken2(_, is11))
      case Some((next, MarkupToken.StartToken(tag))) =>
        readElement(next, is11, tag).map(Some(_))
      case Some((next, t)) =>
        fail[F, Option[Context[F]]]("22", s"unexpected markup $t", next.chunkAcc)
    }

  /** Reads the remainder of an XML declaration, "<?xml" having been read.
    *
    * Parses the mandatory `version` pseudo-attribute (which must be "1."
    * followed by at least one digit), then the optional `encoding` and
    * `standalone` pseudo-attributes, and finally the closing "?>".
    *
    * Returns a pair whose first component is true exactly when the version is
    * "1.1", and whose second component is the resulting `XmlDecl` event.
    */
  private def handleXmlDecl[F[_]](ctx: Context[F])(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, (Boolean, XmlEvent.XmlDecl)]] =
    for {
      (ctx, _) <- assert(ctx, isXmlWhitespace(_), "24", "space is expected after xml")
      ctx <- space(ctx)
      ctx <- accept(ctx, "version", "24", "expected 'version' attribute")
      ctx <- space(ctx)
      ctx <- accept(ctx, '=', "24", "expected '=' after version")
      ctx <- space(ctx)
      (ctx, delimiter) <- assert(ctx, c => c == '"' || c == '\'', "24", "simple or double quote expected")
      ctx <- accept(ctx, '1', "26", "expected major version 1")
      ctx <- accept(ctx, '.', "26", "expected dot")
      // the buffer is seeded with the "1." that has just been consumed
      sb = new StringBuilder("1.")
      ctx <- untilChar(ctx, !_.isDigit, sb)
      version = sb.result()
      // a length of 2 means no digit followed "1."
      res <- if (version.length == 2) {
        fail[F, Result[F, (Boolean, XmlEvent.XmlDecl)]]("26", "expected non empty minor version", ctx.chunkAcc)
      } else {
        for {
          ctx <- accept(ctx, delimiter, "24", "expected delimiter to close version attribute value")
          // `hasSpace` records whether whitespace was seen, which `standalone` requires before it
          (ctx, (hasSpace, encoding)) <- readEncoding(ctx, false)
          (ctx, standalone) <- readStandalone(ctx, hasSpace)
          ctx <- space(ctx)
          ctx <- accept(ctx, "?>", "23", "expected end of PI")
        } yield (ctx, (version == "1.1", XmlEvent.XmlDecl(version, encoding, standalone)))
      }
    } yield res

  /** Reads the optional `encoding` pseudo-attribute of an XML declaration.
    *
    * @param hasSpace whether whitespace has been seen since the previous
    *                 pseudo-attribute; `encoding` is rejected (production "80")
    *                 when it is not preceded by whitespace
    * @return the context, a flag telling whether whitespace is still pending
    *         for the next pseudo-attribute, and the encoding name if present
    */
  private def readEncoding[F[_]](ctx: Context[F], hasSpace: Boolean)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, (Boolean, Option[String])]] =
    peekChar(ctx).flatMap {
      case Some((ctx, c)) if isXmlWhitespace(c) =>
        // skip the whitespace and remember that it was seen
        space(ctx).flatMap(readEncoding(_, true))
      case Some((ctx, 'e')) =>
        if (hasSpace) {
          for {
            ctx <- accept(ctx, "encoding", "80", "expected 'encoding' attribute")
            ctx <- space(ctx)
            ctx <- accept(ctx, '=', "80", "expected '='")
            ctx <- space(ctx)
            (ctx, delimiter) <- assert(ctx, c => c == '"' || c == '\'', "80", "simple or double quote expected")
            // the name must start with an ASCII letter...
            (ctx, fst) <- assert(ctx,
                                 c => (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'),
                                 "81",
                                 "wrong encoding name character")
            sb = new StringBuilder().append(fst)
            // ...and continues with ASCII letters, digits, '.', '_' or '-'
            ctx <- untilChar(
              ctx,
              c =>
                !((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '.' || c == '_' || c == '-'),
              sb)
            ctx <- accept(ctx, delimiter, "80", "'encoding' attribute value must end with proper delimiter")
          } yield (ctx, (false, Some(sb.result())))
        } else {
          fail[F, Result[F, (Boolean, Option[String])]]("80", "expected space before 'encoding' attribute", ctx.chunkAcc)
        }
      case Some((ctx, _)) =>
        // no encoding: pass the whitespace flag on unchanged
        Pull.pure((ctx, (hasSpace, None)))
      case None =>
        Pull.pure((Context.eos, (hasSpace, None)))
    }

  /** Reads the optional `standalone` pseudo-attribute of an XML declaration.
    *
    * @param hasSpace whether whitespace has been seen since the previous
    *                 pseudo-attribute; `standalone` is rejected (production
    *                 "32") when it is not preceded by whitespace
    * @return `Some(true)` for "yes", `Some(false)` for "no", `None` when the
    *         pseudo-attribute is absent
    */
  private def readStandalone[F[_]](ctx: Context[F], hasSpace: Boolean)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Result[F, Option[Boolean]]] =
    peekChar(ctx).flatMap {
      case Some((ctx, c)) if isXmlWhitespace(c) =>
        // skip the whitespace and remember that it was seen
        space(ctx).flatMap(readStandalone(_, true))
      case Some((ctx, 's')) =>
        if (hasSpace) {
          for {
            ctx <- accept(ctx, "standalone", "32", "expected 'standalone' attribute")
            ctx <- space(ctx)
            ctx <- accept(ctx, '=', "32", "expected '='")
            ctx <- space(ctx)
            (ctx, delimiter) <- assert(ctx, c => c == '"' || c == '\'', "32", "simple or double quote expected")
            // the first letter selects which literal must follow
            (ctx, sa) <- nextChar(ctx).flatMap {
              case (ctx, 'y') =>
                accept(ctx, "es", "32", "expected 'yes' or 'no'").map { ctx =>
                  (ctx, true)
                }
              case (ctx, 'n') =>
                accept(ctx, 'o', "32", "expected 'yes' or 'no'").map { ctx =>
                  (ctx, false)
                }
              case (ctx, _) => fail[F, Result[F, Boolean]]("32", "expected 'yes' or 'no'", ctx.chunkAcc)
            }
            ctx <- accept(ctx, delimiter, "32", "'standalone' attribute value must end with proper delimiter")
          } yield (ctx, Some(sa))
        } else {
          fail[F, Result[F, Option[Boolean]]]("32", "expected space before 'standalone' attribute", ctx.chunkAcc)
        }
      case Some((ctx, _)) => Pull.pure((ctx, None))
      case None           => Pull.pure((Context.eos, None))
    }

  /** Handles a markup declaration whose keyword `name` has already been read.
    *
    * Only `DOCTYPE` is accepted; any other keyword fails with production "22".
    * For a doctype the expected sequence is: at least one whitespace character, the document
    * name, an optional external identifier (attempted only when the next character can start
    * an NCName), and then either `>` or `[`. In the `[` case `skipInternalDTD` is run before
    * the event is emitted. In both cases an `XmlEvent.XmlDoctype` is accumulated in the
    * returned context.
    *
    * @param ctx  the current parsing context, positioned right after the keyword
    * @param name the declaration keyword that was read
    * @return the context with the doctype event accumulated
    */
  private def handleDecl[F[_]](ctx: Context[F], name: String)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Context[F]] =
    if (name != "DOCTYPE") {
      fail[F, Context[F]]("22", "expected DOCTYPE declaration", ctx.chunkAcc)
    } else {
      for {
        (afterSep, _) <- assert(ctx, isXmlWhitespace(_), "28", "space is expected after DOCTYPE")
        beforeName <- space(afterSep)
        (afterName, docname) <- readNCName(beforeName)
        beforeId <- space(afterName)
        (afterId, systemid) <- peekChar(beforeId).flatMap {
          case Some((current, c)) if isNCNameStart(c) =>
            readExternalID(current).map { case (next, id) => (next, Some(id)) }
          case Some((current, _)) =>
            // no external identifier present
            Pull.pure((current, None))
          case None =>
            Pull.pure((Context.eos[F], None))
        }
        beforeEnd <- space(afterId)
        res <- nextChar(beforeEnd).flatMap {
          case (current, '>') =>
            // plain doctype without internal subset
            Pull.pure(current.accumulate(XmlEvent.XmlDoctype(name, docname, systemid)))
          case (current, '[') =>
            // internal subset: skip it, then emit the doctype event
            skipInternalDTD(current).map { next =>
              next.accumulate(XmlEvent.XmlDoctype(name, docname, systemid))
            }
          case (current, c) =>
            fail[F, Context[F]]("28", s"end of doctype or internal DTD expected but got $c", current.chunkAcc)
        }
      } yield res
    }

  /** Scans the markup that `scanMisc` finds at the current position, up to the root element.
    *
    *  - a processing instruction is read, accumulated as an `XmlEvent.XmlPI`, and scanning continues;
    *  - a start tag triggers reading of the whole element via `readElement`;
    *  - any other token fails with production "22";
    *  - when `scanMisc` yields nothing, the result is `None`.
    *
    * @param ctx  the current parsing context
    * @param is11 flag forwarded to `readElement` (presumably whether the document is XML 1.1 — confirm against callers)
    * @return the context after the element, or `None` when `scanMisc` found no token
    */
  private def scanPrologToken2[F[_]](ctx: Context[F], is11: Boolean)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Option[Context[F]]] =
    scanMisc(ctx).flatMap {
      case Some((current, MarkupToken.PIToken(target))) =>
        readPIBody(current).flatMap {
          case (next, body) =>
            val withPI = next.accumulate(XmlEvent.XmlPI(target, body))
            scanPrologToken2(withPI, is11)
        }
      case Some((current, MarkupToken.StartToken(root))) =>
        readElement(current, is11, root).map(Some(_))
      case Some((current, t)) =>
        fail[F, Option[Context[F]]]("22", s"unexpected markup $t", current.chunkAcc)
      case None =>
        Pull.pure(None)
    }

  /** Reads a complete element whose opening `<name` has already been consumed.
    *
    * The start tag is completed first. When it is flagged as empty, the start tag and a matching
    * `XmlEvent.EndTag` are accumulated immediately; otherwise the start tag is accumulated
    * and the element content is read up to its closing tag.
    *
    * @param ctx  the current parsing context
    * @param is11 flag forwarded to the start-tag and content readers
    * @param name the qualified name of the element being read
    * @return the context after the element has been fully read
    */
  private def readElement[F[_]](ctx: Context[F], is11: Boolean, name: QName)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Context[F]] =
    completeStartTag(ctx, is11, name).flatMap {
      case (afterTag, startTag) =>
        if (startTag.isEmpty) {
          // self-closing element: emit both events right away
          Pull.pure(afterTag.accumulate(startTag, XmlEvent.EndTag(name)))
        } else {
          readContent(afterTag.accumulate(startTag), is11, name)
        }
    }

  /** Reads the content of the element `name` until its closing tag is found.
    *
    * Character data is read repeatedly via `readCharData`, and the event that ends each run
    * decides what happens next:
    *  - the closing tag of `name` is accumulated and reading stops;
    *  - a closing tag for any other name fails with "GIMatch", reporting both the tag that
    *    was found and the one that was expected;
    *  - a non-empty start tag opens a child element, whose content is read recursively
    *    before continuing with the current element;
    *  - an empty start tag is accumulated together with its matching end tag;
    *  - every other event is accumulated as is.
    *
    * @param ctx  the current parsing context
    * @param is11 flag forwarded to `readCharData`
    * @param name the qualified name of the element whose content is being read
    * @return the context after the closing tag of `name` has been accumulated
    */
  private def readContent[F[_]](ctx: Context[F], is11: Boolean, name: QName)(
      implicit F: RaiseThrowable[F]): Pull[F, XmlEvent, Context[F]] =
    for {
      (ctx, last) <- readCharData(ctx, is11)
      ctx <- last match {
        case XmlEvent.EndTag(n) if n == name =>
          // we are done reading that content
          Pull.pure(ctx.accumulate(last))
        case XmlEvent.EndTag(n) =>
          // mismatched closing tag: name both the offending and the expected tag in the error
          fail[F, Context[F]]("GIMatch", s"unexpected closing tag '</$n>' (expected '</$name>')", ctx.chunkAcc)
        case XmlEvent.StartTag(name1, _, false) =>
          // parse child element, and continue
          readContent(ctx.accumulate(last), is11, name1).flatMap(ctx => readContent(ctx, is11, name))
        case XmlEvent.StartTag(name1, _, true) =>
          // empty child element: emit its end tag right away, and continue
          readContent(ctx.accumulate(last, XmlEvent.EndTag(name1)), is11, name)
        case _ =>
          // just emit and continue
          readContent(ctx.accumulate(last), is11, name)
      }
    } yield ctx

  /** Builds the pipe that turns a stream of characters into a stream of XML events.
    *
    * Parsing starts from a context made of an empty chunk at index 0, the whole input stream
    * as the remainder, and no accumulated events. `scanPrologToken0` is then run repeatedly
    * on the context it returns, until it returns `None`.
    *
    * @return a pipe from characters to `XmlEvent`s
    */
  def pipe[F[_]](implicit F: RaiseThrowable[F]): Pipe[F, Char, XmlEvent] = { input =>
    def loop(current: Context[F]): Pull[F, XmlEvent, Unit] =
      scanPrologToken0(current).flatMap {
        case None       => Pull.done
        case Some(next) => loop(next)
      }

    val initial = Context(Chunk.empty, 0, input, Nil)
    loop(initial).stream
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy