/* Source listing: dotty.tools.dotc.parsing.JavaScanners.scala (Maven / Gradle / Ivy)
 *
 * Artifact: scala3-compiler_3 (scala3-compiler-bootstrapped).
 * Note: there is a newer version of this artifact: 3.6.4-RC1-bin-20241220-0bfa1af-NIGHTLY
 */
package dotty.tools
package dotc
package parsing

import core.Contexts.*
import core.Names.SimpleName
import Scanners.*
import util.SourceFile
import JavaTokens.*
import scala.annotation.{switch, tailrec}
import util.Chars.*
import PartialFunction.cond
import core.Decorators.em

object JavaScanners {

  class JavaScanner(source: SourceFile, override val startFrom: Offset = 0)(using Context) extends ScannerCommon(source) {

    // Overrides the inherited `decodeUni` switch to `true`.
    // NOTE(review): presumably this makes the underlying reader decode `\uXXXX` escapes — confirm.
    override def decodeUni: Boolean = true

    /** Classify `name` as a keyword token or a plain identifier.
     *
     *  A name whose `start` index lies in `[0, lastKeywordStart]` is looked up in `kwArray`;
     *  every other name yields `IDENTIFIER`.
     */
    def toToken(name: SimpleName): Token = {
      val start = name.start
      val inKeywordRange = 0 <= start && start <= lastKeywordStart
      if (inKeywordRange) kwArray(start) else IDENTIFIER
    }

    // A `TokenData` with no additional members, used only as storage for saved token state.
    private class JavaTokenData0 extends TokenData

    /** Storage for one token of lookahead.
      *
      * `next` holds a token that was scanned ahead and pushed back by `reset()`;
      * `next.token == EMPTY` means nothing is buffered (see `nextToken()`).
      * `prev` holds the scanner state saved by `lookAhead()` so that `reset()` can restore it.
      */
    val next : TokenData = new JavaTokenData0
    val prev : TokenData = new JavaTokenData0

    // Get next token ------------------------------------------------------------

    /** Advance the scanner to the next token.
     *
     *  If a token is buffered in `next`, it is copied into this scanner and the buffer is
     *  marked empty; otherwise `lastOffset` is updated from `lastCharOffset` and a fresh
     *  token is read from the input.
     */
    def nextToken(): Unit =
      if next.token != EMPTY then
        // Replay the token that was pushed back by `reset()`.
        this.copyFrom(next)
        next.token = EMPTY
      else
        lastOffset = lastCharOffset
        fetchToken()

    /** The token following the current one.
     *
     *  Scans one token ahead via `lookAhead()`, remembers its kind, and then calls `reset()`,
     *  which restores the previous scanner state and leaves the scanned token buffered in `next`.
     */
    def lookaheadToken: Int =
      lookAhead()
      val peeked = token
      reset()
      peeked

    /** Save the current token state into `prev`, then advance to the next token.
     *  Pair with `reset()` to return to the saved state.
     */
    def lookAhead() =
      prev.copyFrom(this)
      nextToken()

    /** Undo a preceding `lookAhead()`: the token scanned ahead is stored in `next` (so the
     *  following `nextToken()` replays it) and the state saved in `prev` is restored.
     */
    def reset() =
      // Order matters: buffer the looked-ahead token before overwriting this scanner's state.
      next.copyFrom(this)
      this.copyFrom(prev)

    /** A second scanner over the same `source`, started at `charOffset - 1`
     *  (presumably the offset of the enclosing scanner's current character `ch` — confirm).
     *  Used by `getTextBlock()` to scan ahead without disturbing this scanner's position.
     */
    class LookaheadScanner extends JavaScanner(source, startFrom = charOffset - 1):
      // Only read the first character; unlike the default `initialize()`, no token is fetched.
      override protected def initialize(): Unit = nextChar()

    /** Read the next token from the input, starting at the current character `ch`.
      *
      * Whitespace and comments are skipped by re-entering `fetchToken()`. For every other
      * character the method sets `offset` to the token's first character and then either
      * sets `token` directly (operators and separators) or delegates to the identifier,
      * number, string, character or text-block readers, which set `token` themselves.
      * Unknown characters are reported with `error` and skipped.
      */
    private def fetchToken(): Unit = {
      offset = charOffset - 1
      ch match {
        case ' ' | '\t' | CR | LF | FF =>
          // Skip whitespace and try again from the next character.
          nextChar()
          fetchToken()
        case _ =>
          (ch: @switch) match {
            case 'A' | 'B' | 'C' | 'D' | 'E' |
                 'F' | 'G' | 'H' | 'I' | 'J' |
                 'K' | 'L' | 'M' | 'N' | 'O' |
                 'P' | 'Q' | 'R' | 'S' | 'T' |
                 'U' | 'V' | 'W' | 'X' | 'Y' |
                 'Z' | '$' | '_' |
                 'a' | 'b' | 'c' | 'd' | 'e' |
                 'f' | 'g' | 'h' | 'i' | 'j' |
                 'k' | 'l' | 'm' | 'n' | 'o' |
                 'p' | 'q' | 'r' | 's' | 't' |
                 'u' | 'v' | 'w' | 'x' | 'y' |
                 'z' =>
              putChar(ch)
              nextChar()
              getIdentRest()

            case '0' =>
              // A leading zero selects hexadecimal (`0x` / `0X`) or octal; the zero itself
              // is kept in the literal buffer.
              putChar(ch)
              nextChar()
              if (ch == 'x' || ch == 'X') {
                nextChar()
                base = 16
              }
              else
                base = 8
              getNumber()

            case '1' | '2' | '3' | '4' |
                 '5' | '6' | '7' | '8' | '9' =>
              base = 10
              getNumber()

            case '\"' =>
              nextChar()
              if ch != '\"' then // "..." non-empty string literal
                while ch != '\"' && (isUnicodeEscape || ch != CR && ch != LF && ch != SU) do
                  getlitch()
                if ch == '\"' then
                  token = STRINGLIT
                  setStrVal()
                  nextChar()
                else
                  error(em"unclosed string literal")
              else
                nextChar()
                if ch != '\"' then // "" empty string literal
                  token = STRINGLIT
                  setStrVal()
                else
                  // Third consecutive quote: this opens a text block.
                  nextChar()
                  getTextBlock()

            case '\'' =>
              nextChar()
              getlitch()
              if (ch == '\'') {
                nextChar()
                token = CHARLIT
                setStrVal()
              }
              else
                error(em"unclosed character literal")

            case '=' =>
              token = EQUALS
              nextChar()
              if (ch == '=') {
                token = EQEQ
                nextChar()
              }

            case '>' =>
              token = GT
              nextChar()
              if (ch == '=') {
                token = GTEQ
                nextChar()
              }
              else if (ch == '>') {
                token = GTGT
                nextChar()
                if (ch == '=') {
                  token = GTGTEQ
                  nextChar()
                }
                else if (ch == '>') {
                  token = GTGTGT
                  nextChar()
                  if (ch == '=') {
                    token = GTGTGTEQ
                    nextChar()
                  }
                }
              }

            case '<' =>
              token = LT
              nextChar()
              if (ch == '=') {
                token = LTEQ
                nextChar()
              }
              else if (ch == '<') {
                token = LTLT
                nextChar()
                if (ch == '=') {
                  token = LTLTEQ
                  nextChar()
                }
              }

            case '!' =>
              token = BANG
              nextChar()
              if (ch == '=') {
                token = BANGEQ
                nextChar()
              }

            case '~' =>
              token = TILDE
              nextChar()

            case '?' =>
              token = QMARK
              nextChar()

            case ':' =>
              token = COLONop
              nextChar()

            case '@' =>
              token = AT
              nextChar()

            case '&' =>
              token = AMP
              nextChar()
              if (ch == '&') {
                token = AMPAMP
                nextChar()
              }
              else if (ch == '=') {
                token = AMPEQ
                nextChar()
              }

            case '|' =>
              token = BAR
              nextChar()
              if (ch == '|') {
                token = BARBAR
                nextChar()
              }
              else if (ch == '=') {
                token = BAREQ
                nextChar()
              }

            case '+' =>
              token = PLUS
              nextChar()
              if (ch == '+') {
                token = PLUSPLUS
                nextChar()
              }
              else if (ch == '=') {
                token = PLUSEQ
                nextChar()
              }

            case '-' =>
              token = MINUS
              nextChar()
              if (ch == '-') {
                token = MINUSMINUS
                nextChar()
              }
              else if (ch == '=') {
                token = MINUSEQ
                nextChar()
              }

            case '*' =>
              token = ASTERISK
              nextChar()
              if (ch == '=') {
                token = ASTERISKEQ
                nextChar()
              }

            case '/' =>
              nextChar()
              if (!skipComment()) {
                // `ch` is already the character following '/'. It must not be consumed
                // here: advancing again would drop that character (e.g. the `b` in `a/b`)
                // and `/=` could never be recognised.
                token = SLASH
                if (ch == '=') {
                  token = SLASHEQ
                  nextChar()
                }
              }
              else fetchToken() // a comment was skipped: scan the token that follows it

            case '^' =>
              token = HAT
              nextChar()
              if (ch == '=') {
                token = HATEQ
                nextChar()
              }

            case '%' =>
              token = PERCENT
              nextChar()
              if (ch == '=') {
                token = PERCENTEQ
                nextChar()
              }

            case '.' =>
              token = DOT
              nextChar()
              if ('0' <= ch && ch <= '9') {
                // `.5`-style floating point literal
                putChar('.');
                getFraction()
              }
              else if (ch == '.') {
                nextChar()
                if (ch == '.') {
                  nextChar()
                  token = DOTDOTDOT
                }
                else error(em"`.` character expected")
              }

            case ';' =>
              token = SEMI
              nextChar()

            case ',' =>
              token = COMMA
              nextChar()

            case '(' =>
              token = LPAREN
              nextChar()

            case '{' =>
              token = LBRACE
              nextChar()

            case ')' =>
              token = RPAREN
              nextChar()

            case '}' =>
              token = RBRACE
              nextChar()

            case '[' =>
              token = LBRACKET
              nextChar()

            case ']' =>
              token = RBRACKET
              nextChar()

            case SU =>
              // SU marks the end of input, but may also occur literally inside the file.
              if (isAtEnd) token = EOF
              else {
                error(em"illegal character")
                nextChar()
              }

            case _ =>
              if (Character.isUnicodeIdentifierStart(ch)) {
                putChar(ch)
                nextChar()
                getIdentRest()
              }
              else {
                error(em"illegal character: ${ch.toInt}")
                nextChar()
              }
          }
      }
    }

    /** Skip a comment whose leading `/` has already been consumed.
     *
     *  If `ch` is `/` the rest of the line is skipped (the line terminator is left in place);
     *  if `ch` is `*` input is skipped up to and including the closing `*` `/` pair, and an
     *  incomplete-input error is reported if the input ends first.
     *
     *  @return `true` if a comment was skipped, `false` if `ch` does not start a comment
     *          (in which case nothing is consumed)
     */
    protected def skipComment(): Boolean = {
      // Consume everything up to, but not including, the end of the line or input.
      def skipToLineEnd(): Unit = {
        while (ch != CR && ch != LF && ch != SU) {
          nextChar()
        }
      }

      // Consume everything up to and including the terminating `*` `/`.
      def skipToBlockEnd(): Unit = {
        var done = false
        while (!done) {
          if (ch == SU) {
            incompleteInputError(em"unclosed comment")
            done = true
          }
          else if (ch == '*') {
            nextChar()
            // A `*` not followed by `/` is re-examined from the character after it,
            // so that `**/` still closes the comment.
            if (ch == '/') {
              nextChar()
              done = true
            }
          }
          else {
            nextChar()
          }
        }
      }

      if (ch == '/') {
        nextChar()
        skipToLineEnd()
        true
      }
      else if (ch == '*') {
        nextChar()
        skipToBlockEnd()
        true
      }
      else false
    }

    // Identifiers ---------------------------------------------------------------

    /** Read the remaining characters of an identifier whose first character has already
     *  been buffered, then call `finishNamed()` exactly once.
     *
     *  ASCII letters, digits, `$` and `_` are accepted directly; any other character is
     *  accepted if `Character.isUnicodeIdentifierPart` holds for it. Scanning is purely
     *  iterative, so the stack depth does not depend on the identifier's contents.
     */
    private def getIdentRest(): Unit =
      var scanning = true
      while scanning do
        (ch: @switch) match {
          case 'A' | 'B' | 'C' | 'D' | 'E' |
               'F' | 'G' | 'H' | 'I' | 'J' |
               'K' | 'L' | 'M' | 'N' | 'O' |
               'P' | 'Q' | 'R' | 'S' | 'T' |
               'U' | 'V' | 'W' | 'X' | 'Y' |
               'Z' | '$' | '_' |
               'a' | 'b' | 'c' | 'd' | 'e' |
               'f' | 'g' | 'h' | 'i' | 'j' |
               'k' | 'l' | 'm' | 'n' | 'o' |
               'p' | 'q' | 'r' | 's' | 't' |
               'u' | 'v' | 'w' | 'x' | 'y' |
               'z' |
               '0' | '1' | '2' | '3' | '4' |
               '5' | '6' | '7' | '8' | '9' =>
            putChar(ch)
            nextChar()
          case SU =>
            // Checked before the generic case so that SU always terminates the identifier,
            // whatever `Character.isUnicodeIdentifierPart` reports for it.
            scanning = false
          case _ =>
            if (Character.isUnicodeIdentifierPart(ch)) {
              putChar(ch)
              nextChar()
            }
            else scanning = false
        }
      finishNamed()

    // Literals -----------------------------------------------------------------

    /** Read next character in character or string literal.
      *
      * Shorthand for `getlitch(scanOnly = false, inTextBlock = false)`.
      */
    protected def getlitch(): Unit = getlitch(scanOnly = false, inTextBlock = false)

    /** Read next character in character or string literal, decoding a backslash escape
     *  if one is present, and append the result to the literal buffer.
     *
     *  Recognised escapes are octal escapes, `\b`, `\s`, `\t`, `\n`, `\f`, `\r`, `\"`, `\'`
     *  and `\\`. Inside a text block a backslash directly followed by a line terminator
     *  produces no character at all. Any other escape is reported as an error and the
     *  escaped character itself is used.
     *
     *  @param scanOnly skip emitting errors or adding to the literal buffer; an escaped line
     *                  terminator in a text block is also left unconsumed in this mode
     *  @param inTextBlock is this for a text block?
     */
    def getlitch(scanOnly: Boolean, inTextBlock: Boolean): Unit =
      // Decode an octal escape starting at the current digit: up to three digits when the
      // first digit is at most '3', otherwise up to two.
      def octal: Char =
        val leadch: Char = ch
        var oct: Int = digit2int(ch, 8)
        nextChar()
        if ('0' <= ch && ch <= '7') {
          oct = oct * 8 + digit2int(ch, 8)
          nextChar()
          if (leadch <= '3' && '0' <= ch && ch <= '7') {
            oct = oct * 8 + digit2int(ch, 8)
            nextChar()
          }
        }
        oct.toChar
      end octal
      // Set when the escape yields no character (escaped line terminator in a text block).
      var skip = false
      // Decode the escape sequence that starts at the current backslash.
      def greatEscape: Char =
        nextChar()
        if '0' <= ch && ch <= '7' then octal
        else
          val x = ch match
            case 'b'  => '\b'
            case 's'  => ' '
            case 't'  => '\t'
            case 'n'  => '\n'
            case 'f'  => '\f'
            case 'r'  => '\r'
            case '\"' => '\"'
            case '\'' => '\''
            case '\\' => '\\'
            case CR | LF if inTextBlock =>
              if !scanOnly then nextChar()
              skip = true
              0
            case _    =>
              if !scanOnly then error(em"invalid escape character", charOffset - 1)
              ch
          // In the skip case the position has already been handled above.
          if !skip then nextChar()
          x
      end greatEscape

      // begin getlitch
      val c: Char =
        if ch == '\\' then greatEscape
        else
          val res = ch
          nextChar()
          res
      if !skip && !scanOnly then putChar(c)
    end getlitch

    /** Read a triple-quote delimited text block, starting after the first three double quotes.
      *
      * The block is processed in two passes. A `LookaheadScanner` first walks the block to
      * find the closing delimiter and the whitespace prefix common to all relevant lines.
      * This scanner then re-reads the block up to that point, dropping the common prefix and
      * trailing whitespace of every line, normalizing line terminators to `\n` and decoding
      * escapes. On success `token` is `STRINGLIT` and the closing delimiter is consumed.
      *
      * Errors are reported when the opening delimiter is not followed by a line terminator
      * and when the input ends before a closing delimiter is found.
      */
    private def getTextBlock(): Unit = {
      // Open delimiter is followed by optional space, then a newline
      while (ch == ' ' || ch == '\t' || ch == FF) {
        nextChar()
      }
      if (ch != LF && ch != CR) { // CR-LF is already normalized into LF by `JavaCharArrayReader`
        error(em"illegal text block open delimiter sequence, missing line terminator")
        return
      }
      nextChar()

      /* Do a lookahead scan over the full text block to:
       *   - compute common white space prefix
       *   - find the offset where the text block ends
       */
      var commonWhiteSpacePrefix = Int.MaxValue
      var blockEndOffset = 0
      var blockClosed = false
      var lineWhiteSpacePrefix = 0
      var lineIsOnlyWhitespace = true
      val in = LookaheadScanner()
      // The end-of-input test has to look at the lookahead scanner: this scanner's own
      // `ch` does not change during the first pass, so testing it would never terminate
      // the loop for a text block that is not closed.
      while (!blockClosed && (in.isUnicodeEscape || in.ch != SU)) {
        if (in.ch == '\"') { // Potential end of the block
          in.nextChar()
          if (in.ch == '\"') {
            in.nextChar()
            if (in.ch == '\"') {
              blockClosed = true
              commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
              blockEndOffset = in.charOffset - 2
            }
          }

          // Not the end of the block - just a single or double " character
          if (!blockClosed) {
            lineIsOnlyWhitespace = false
          }
        } else if (in.ch == CR || in.ch == LF) { // new line in the block
          in.nextChar()
          // Lines consisting only of whitespace do not contribute to the common prefix
          if (!lineIsOnlyWhitespace) {
            commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
          }
          lineWhiteSpacePrefix = 0
          lineIsOnlyWhitespace = true
        } else if (lineIsOnlyWhitespace && Character.isWhitespace(in.ch)) { // extend white space prefix
          in.nextChar()
          lineWhiteSpacePrefix += 1
        } else {
          lineIsOnlyWhitespace = false
          in.getlitch(scanOnly = true, inTextBlock = true)
        }
      }

      // Bail out if the block never did have an end
      if (!blockClosed) {
        error(em"unclosed text block")
        return
      }

      // Remove the last N characters from the buffer
      def popNChars(n: Int): Unit =
        if n > 0 then
          val text = litBuf.toString
          litBuf.clear()
          val trimmed = text.substring(0, text.length - (n min text.length))
          trimmed.nn.foreach(litBuf.append)

      // Second pass: construct the literal string value this time
      while (charOffset < blockEndOffset) {
        // Drop the line's leading whitespace
        var remainingPrefix = commonWhiteSpacePrefix
        while (remainingPrefix > 0 && ch != CR && ch != LF && charOffset < blockEndOffset) {
          nextChar()
          remainingPrefix -= 1
        }

        var trailingWhitespaceLength = 0
        var escapedNewline = false         // Does the line end with `\`?
        while (ch != CR && ch != LF && charOffset < blockEndOffset && !escapedNewline) {
          if (Character.isWhitespace(ch)) {
            trailingWhitespaceLength += 1
          } else {
            trailingWhitespaceLength = 0
          }

          // Detect if the line is about to end with `\`
          if ch == '\\' && cond(lookaheadChar()) { case CR | LF => true } then
            escapedNewline = true

          getlitch(scanOnly = false, inTextBlock = true)
        }

        // Drop the line's trailing whitespace
        popNChars(trailingWhitespaceLength)

        // Normalize line terminators
        if ((ch == CR || ch == LF) && !escapedNewline) {
          nextChar()
          putChar('\n')
        }
      }

      token = STRINGLIT
      setStrVal()

      // Trailing """
      nextChar()
      nextChar()
      nextChar()
    }
    end getTextBlock

    /** Read the fractional part, exponent and type suffix of a floating point literal,
      * as far as they are present, and finish the literal with `setStrVal()`.
      *
      * The token is `FLOATLIT` when the literal ends in `f` / `F` and `DOUBLELIT` otherwise.
      * An `e` / `E` is only consumed as an exponent when it is followed by an optionally
      * signed digit.
      */
    protected def getFraction(): Unit = {
      def isDigit(c: Char): Boolean = '0' <= c && c <= '9'
      def isSign(c: Char): Boolean = c == '+' || c == '-'
      // Buffer the current character and move on to the next one.
      def accept(): Unit = {
        putChar(ch)
        nextChar()
      }
      def acceptDigits(): Unit = {
        while (isDigit(ch)) accept()
      }

      token = DOUBLELIT
      acceptDigits()
      if (ch == 'e' || ch == 'E') {
        // Peek past the `e` (and an optional sign) to check that an exponent really follows.
        val peek = lookaheadReader()
        peek.nextChar()
        if (isSign(peek.ch))
          peek.nextChar()
        if (isDigit(peek.ch)) {
          accept()
          if (isSign(ch)) accept()
          acceptDigits()
        }
        token = DOUBLELIT
      }
      ch match {
        case 'd' | 'D' =>
          accept()
          token = DOUBLELIT
        case 'f' | 'F' =>
          accept()
          token = FLOATLIT
        case _ =>
      }
      setStrVal()
    }

    /** Convert the text of the current literal (`strVal`) to a `Long`.
     *
     *  For a non-negated character literal the result is the code of the first character of
     *  `strVal`, or 0 if `strVal` is empty. Otherwise the characters of `strVal` are
     *  accumulated as digits in the current `base`. An error is reported and 0 is returned
     *  if a character is not a digit in that base or if the value is too large.
     *
     *  @param negated if true the result is negated, and a magnitude of exactly `limit + 1`
     *                 is accepted (presumably the literal follows a unary minus — confirm with callers)
     *  @return the literal's value, or 0 after an error has been reported
     */
    def intVal(negated: Boolean): Long =
      if (token == CHARLIT && !negated)
        if (strVal.length > 0) strVal.charAt(0).toLong else 0
      else {
        var value: Long = 0
        // NOTE(review): halving the base for non-decimal literals loosens the overflow check
        // below; presumably this admits hex/octal literals that use the sign bit — confirm.
        val divider = if (base == 10) 1 else 2
        // Upper bound depends on the literal kind: `Long` range for LONGLIT, `Int` range otherwise.
        val limit: Long =
          if (token == LONGLIT) Long.MaxValue else Int.MaxValue
        var i = 0
        val len = strVal.length
        while (i < len) {
          val d = digit2int(strVal.charAt(i), base)
          if (d < 0) {
            error(em"malformed integer number")
            return 0
          }
          // Reject before accumulating the digit if the value has already wrapped around or
          // the next step would exceed `limit`; the last clause lets a negated literal reach
          // exactly `limit + 1` (value * base + d == limit + 1).
          if (value < 0 ||
              limit / (base / divider) < value ||
              limit - (d / divider) < value * (base / divider) &&
              !(negated && limit == value * base - 1 + d)) {
                error(em"integer number too large")
                return 0
              }
          value = value * base + d
          i += 1
        }
        if (negated) -value else value
      }

    /** Convert the text of the current literal (`strVal`) to a `Double`.
     *
     *  An error is reported if the parsed value exceeds `Double.MaxValue` for a `DOUBLELIT`
     *  or `Float.MaxValue` for any other token; the value is still returned in that case.
     *  If the text cannot be parsed, an error is reported and 0.0 is returned.
     *
     *  @param negated if true the parsed value is negated before being returned
     */
    def floatVal(negated: Boolean): Double = {
      val isDouble = token == DOUBLELIT
      val limit: Double = if (isDouble) Double.MaxValue else Float.MaxValue
      try {
        val parsed: Double = java.lang.Double.valueOf(strVal.toString).nn.doubleValue()
        if (parsed > limit) {
          error(em"floating point number too large")
        }
        if (!negated) parsed else -parsed
      } catch {
        case _: NumberFormatException =>
          error(em"malformed floating point number")
          0.0
      }
    }

    /** Read the digits of a numeric literal in the current `base` into the literal buffer
      * and set `token`.
      *
      * For bases up to 10 the literal is handed over to `getFraction()` when the digits are
      * followed by a `.` that begins a fraction or by one of `e`, `E`, `f`, `F`, `d`, `D`.
      * Otherwise the token is `INTLIT`, or `LONGLIT` when an `l` / `L` suffix follows (the
      * suffix is consumed but is not part of `strVal`).
      */
    protected def getNumber(): Unit = {
      // Bases below 10 still accept all decimal digits here.
      val digitBase = if (base < 10) 10 else base
      while (digit2int(ch, digitBase) >= 0) {
        putChar(ch)
        nextChar()
      }
      token = INTLIT

      val decimalLike = base <= 10
      // A `.` belongs to the number unless it is followed by the start of an identifier;
      // the letters that can continue a floating point literal are the exception to that.
      val dotStartsFraction =
        decimalLike && ch == '.' && {
          val peek = lookaheadReader()
          peek.nextChar()
          peek.ch match {
            case 'd' | 'D' | 'e' | 'E' | 'f' | 'F' => true
            case c => ('0' <= c && c <= '9') || !isIdentifierStart(c)
          }
        }

      if (dotStartsFraction) {
        putChar(ch)
        nextChar()
        getFraction()
      }
      else if (decimalLike &&
        (ch == 'e' || ch == 'E' ||
          ch == 'f' || ch == 'F' ||
          ch == 'd' || ch == 'D'))
        getFraction()
      else {
        setStrVal()
        if (ch == 'l' || ch == 'L') {
          nextChar()
          token = LONGLIT
        }
      }
    }

    // Debugging and initialization ---------------------------------------------

    /** A readable rendering of the current token: identifiers and literals show their
     *  kind and text (integer literals also their base), `;` and `,` are shown as
     *  themselves, and every other token is rendered by `tokenString`.
     */
    override def toString(): String = {
      def literal(kind: String): String = s"$kind($strVal)"
      def integer(kind: String): String = s"$kind($strVal, $base)"
      token match {
        case SEMI => ";"
        case COMMA => ","
        case IDENTIFIER => s"id($name)"
        case CHARLIT => literal("char")
        case STRINGLIT => literal("string")
        case FLOATLIT => literal("float")
        case DOUBLELIT => literal("double")
        case INTLIT => integer("int")
        case LONGLIT => integer("long")
        case _ => tokenString(token)
      }
    }

    /* Initialization: read first char, then first token.
     * Subclasses may override this to change how much is read up front
     * (`LookaheadScanner` only reads the first character).
     */
    protected def initialize(): Unit =
      nextChar()
      nextToken()
    // Called from the class body, i.e. while the scanner is being constructed; an
    // overriding `initialize()` in a subclass is the one that runs here.
    initialize()
  }

  // Keyword lookup table built once from `keywords`: `toToken` maps a name whose `start`
  // lies in `[0, lastKeywordStart]` to `kwArray(start)`.
  private val (lastKeywordStart, kwArray) = buildKeywordArray(keywords)
}