All downloads are free. Search and download functionality uses the official Maven repository.

playground.smithyql.parser.v2.scanner.scala Maven / Gradle / Ivy

The newest version!
package playground.smithyql.parser.v2.scanner

import cats.kernel.Eq
import cats.parse.Numbers

import scala.annotation.nowarn

/** A single lexical token: a classification plus the exact source text it covers.
  *
  * @param kind
  *   the category this piece of text was recognized as
  * @param text
  *   the verbatim characters of the input that make up the token
  */
case class Token(kind: TokenKind, text: String) {

  /** Number of characters of source text covered by this token. */
  def width: Int = text.length

}

object Token {

  /** Cats `Eq` instance that delegates to the case class's structural `==`. */
  implicit val eq: Eq[Token] = Eq.fromUniversalEquals[Token]

}

/** The category of a [[Token]]. All members are declared in the companion object. */
sealed trait TokenKind extends Product with Serializable { self =>

  /** Builds a [[Token]] of this kind covering the given text, e.g. `TokenKind.IDENT("foo")`.
    *
    * @param text
    *   the source text the new token covers
    */
  def apply(text: String): Token = Token(kind = self, text = text)

}

/** All token kinds the scanner can emit, plus the `Eq` instance for comparing them. */
object TokenKind {
  // Keywords and literals. Keywords are recognized by Scanner.scan from identifier-shaped text.
  /** The `use` keyword. */
  case object KW_USE extends TokenKind
  /** The `service` keyword. */
  case object KW_SERVICE extends TokenKind
  /** A boolean keyword: `true` or `false`. */
  case object KW_BOOLEAN extends TokenKind
  /** A number literal, as recognized by cats-parse's `Numbers.jsonNumber`. */
  case object LIT_NUMBER extends TokenKind
  /** A double-quoted string literal; the closing quote is absent if the input ended first. */
  case object LIT_STRING extends TokenKind
  /** The `null` keyword. */
  case object KW_NULL extends TokenKind

  // Punctuation: each kind corresponds to exactly one fixed piece of text.
  /** `.` */
  case object DOT extends TokenKind
  /** `,` */
  case object COMMA extends TokenKind
  /** `#` */
  case object HASH extends TokenKind
  /** `[` */
  case object LB extends TokenKind
  /** `]` */
  case object RB extends TokenKind
  /** `{` */
  case object LBR extends TokenKind
  /** `}` */
  case object RBR extends TokenKind
  /** `:` */
  case object COLON extends TokenKind
  /** `=` */
  case object EQ extends TokenKind
  /** A run of whitespace characters that contains no `\n`. */
  case object SPACE extends TokenKind
  /** A run of one or more consecutive `\n` characters. */
  case object NEWLINE extends TokenKind
  /** An identifier: a letter followed by letters, digits or `_`, that is not a keyword. */
  case object IDENT extends TokenKind
  /** A line comment: from `//` up to (not including) the next `\n`, or to the end of input. */
  case object COMMENT extends TokenKind
  /** A run of characters that none of the scanner's readers recognized. */
  case object Error extends TokenKind

  /** Cats `Eq` instance based on universal equality (`==`). */
  implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals
}

/** Hand-written scanner (lexer) that turns text into a flat list of [[Token]]s. */
object Scanner {

  /** Entrypoint to scanning text into tokens.
    *
    * Always produces an output that can be rendered back to the original text. Input that none
    * of the readers recognizes is not dropped: it is collected into [[TokenKind.Error]] tokens.
    *
    * The implementation keeps a mutable cursor (`remaining`) and a set of "readers", each a
    * `PartialFunction[Unit, Unit]`: being defined at `()` means "I can read something at the
    * current cursor position" (the lookahead), and applying it consumes input and emits tokens.
    *
    * @param s
    *   the text to scan
    * @return
    *   the tokens, in the order they appear in `s`
    * @throws java.lang.RuntimeException
    *   if an iteration of the main loop consumes no input (internal sanity check)
    */
  def scan(
    s: String
  ): List[Token] = {
    // The not-yet-consumed suffix of the input. Every reader consumes from its front.
    var remaining = s
    // Tokens emitted so far, most recent first; reversed once at the very end.
    var tokens = List.empty[Token]

    def add(
      tok: Token
    ): Unit = tokens ::= tok

    // Reader for one fixed piece of text (e.g. a punctuation character).
    def readSimple(
      token: String,
      tok: TokenKind,
    ): PartialFunction[Unit, Unit] = {
      case _ if remaining.startsWith(token) =>
        add(tok(token))
        remaining = remaining.drop(token.length)
    }

    // Combines fixed-text readers; earlier pairings take precedence.
    // `reduce` requires at least one pairing to be passed.
    def simpleTokens(
      pairings: (
        String,
        TokenKind,
      )*
    ): PartialFunction[Unit, Unit] = pairings.map(readSimple.tupled).reduce(_.orElse(_))

    // Identifier-shaped words that get their own token kind instead of IDENT.
    val keywords: Map[String, TokenKind] = Map(
      "use" -> TokenKind.KW_USE,
      "service" -> TokenKind.KW_SERVICE,
      "null" -> TokenKind.KW_NULL,
      "true" -> TokenKind.KW_BOOLEAN,
      "false" -> TokenKind.KW_BOOLEAN,
    )

    // Identifiers and keywords: a letter, followed by letters, digits or underscores.
    // `remaining.head` is safe because readers only run while `remaining` is non-empty.
    val readIdent: PartialFunction[Unit, Unit] = {
      case _ if remaining.head.isLetter =>
        val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')

        // The whole word must equal a keyword to count as one; anything else is a normal ident.
        val kind: TokenKind = keywords.getOrElse(letters, TokenKind.IDENT)
        add(kind(letters))

        remaining = rest
    }

    val readPunctuation: PartialFunction[Unit, Unit] = simpleTokens(
      "." -> TokenKind.DOT,
      "," -> TokenKind.COMMA,
      "#" -> TokenKind.HASH,
      "[" -> TokenKind.LB,
      "]" -> TokenKind.RB,
      "{" -> TokenKind.LBR,
      "}" -> TokenKind.RBR,
      ":" -> TokenKind.COLON,
      "=" -> TokenKind.EQ,
    )

    // String literals: from an opening quote up to and including the next quote character.
    // Escape sequences are not interpreted here - the first `"` after the opening one ends
    // the literal.
    val readStringLiteral: PartialFunction[Unit, Unit] = {
      case _ if remaining.startsWith("\"") =>
        val (str, rest) = remaining.tail.span(_ != '\"')
        if (rest.isEmpty) { // hit EOF: unterminated literal, emitted without a closing quote
          add(TokenKind.LIT_STRING("\"" + str))
          remaining = rest
        } else {
          add(TokenKind.LIT_STRING("\"" + str + "\""))
          // skip the closing quote, which is already part of the token text
          remaining = rest.tail
        }
    }

    val readNumberLiteral: PartialFunction[Unit, Unit] = {
      // Extractor that tries to parse a number at the cursor; yields (rest of input, number text).
      object jsonNumber {
        def unapply(
          @nowarn("cat=unused")
          unused: Unit
        ): Option[
          (
            String,
            String,
          )
        ] =
          // For now, we're using the cats-parse implementation simply because it's consistent with the current implementation
          // and we can rewrite this later on when we drop support for the other parser
          // and no longer need cats-parse.
          Numbers.jsonNumber.parse(remaining).toOption
      }

      { case jsonNumber(rest, num) =>
        add(TokenKind.LIT_NUMBER(num))
        remaining = rest
      }
    }

    // readOne and friends are all partial functions: this is the current implementation of lookahead.
    // it's not great, but it kinda works.
    val readOne: PartialFunction[Unit, Unit] = readIdent
      .orElse(readPunctuation)
      .orElse(readStringLiteral)
      .orElse(readNumberLiteral)

    // Split a "whitespace" string into chains of contiguous newlines OR other whitespace
    // characters, in source order. Only '\n' counts as a newline here.
    def whitespaceChains(
      whitespace: String
    ): List[Token] = {
      val isNewline = (ch: Char) => ch == '\n'

      // Tail-recursive so that a long run of alternating spaces and newlines cannot
      // overflow the stack; `acc` holds the chains found so far, most recent first.
      @scala.annotation.tailrec
      def go(
        pending: String,
        acc: List[Token],
      ): List[Token] =
        if (pending.isEmpty)
          acc.reverse
        else if (isNewline(pending.head)) {
          val (nl, rest) = pending.span(isNewline)
          go(rest, TokenKind.NEWLINE(nl) :: acc)
        } else {
          val (wsp, rest) = pending.span(!isNewline(_))
          go(rest, TokenKind.SPACE(wsp) :: acc)
        }

      go(whitespace, Nil)
    }

    val eatWhitespace: PartialFunction[Unit, Unit] = {
      // Extractor that succeeds if the cursor is at whitespace;
      // yields (whitespace prefix, rest of input).
      object matches {
        def unapply(
          @nowarn("cat=unused") u: Unit
        ): Option[
          (
            String,
            String,
          )
        ] = {
          val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
          if (wsp.isEmpty)
            None
          else
            Some((wsp, rest))
        }
      }

      { case matches(wsp, rest) =>
        whitespaceChains(wsp).foreach(add)
        remaining = rest
      }
    }

    // Line comments: from "//" up to, but not including, the next newline (or to EOF).
    // What is left afterwards is either empty or starts with '\n', so a single step is enough;
    // a comment on the following line is picked up by a later iteration of the main loop.
    val eatComments: PartialFunction[Unit, Unit] = {
      case _ if remaining.startsWith("//") =>
        val (comment, rest) = remaining.span(_ != '\n')
        add(TokenKind.COMMENT(comment))
        remaining = rest
    }

    val readAny = readOne.orElse(eatWhitespace).orElse(eatComments)

    // Error recovery: skip characters one at a time until some reader can continue (or the
    // input ends), and emit everything skipped as a single Error token.
    // Returns true if at least one character was consumed.
    def eatErrors(
    ): Boolean = {
      val start = remaining

      // Checking for emptiness first keeps the readers from ever seeing an empty cursor.
      while (remaining.nonEmpty && !readAny.isDefinedAt(()))
        remaining = remaining.tail

      // Whatever the cursor moved over is exactly the unrecognized prefix.
      val failures = start.take(start.length - remaining.length)

      if (failures.nonEmpty) {
        add(TokenKind.Error(failures))
        true
      } else
        false
    }

    while (remaining.nonEmpty) {
      val last = remaining

      // Try the normal readers first; fall back to error recovery if none applies.
      readAny
        .lift(())
        .isDefined ||
        eatErrors(): Unit

      // last-effort sanity check
      if (remaining == last)
        sys.error(s"no progress in the last run! remaining string: $remaining")
    }

    tokens.reverse
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy