scala.meta.internal.tokenizers.LegacyScanner.scala Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of tokenizers_2.11 Show documentation
Scalameta APIs for tokenization and their baseline implementation
There is a newer version: 4.1.6
package scala.meta
package internal
package tokenizers

import scala.annotation.{ switch, tailrec }
import scala.collection.mutable
import Chars._
import LegacyToken._
import scala.meta.inputs._
import scala.meta.tokenizers.TokenizeException

class LegacyScanner(input: Input, dialect: Dialect) {
  val reporter: Reporter      = Reporter(input)
  val curr: LegacyTokenData   = new LegacyTokenData {}
  val next: LegacyTokenData   = new LegacyTokenData {}
  val prev: LegacyTokenData   = new LegacyTokenData {}
  val reader: CharArrayReader = new CharArrayReader(input, dialect, reporter)

  import curr._, reader._, reporter._
  curr.input = this.input
  next.input = this.input
  prev.input = this.input

  private def isDigit(c: Char) = java.lang.Character isDigit c
  private var openComments = 0
  protected def putCommentChar(): Unit = nextChar()

  @tailrec private def skipLineComment(): Unit = ch match {
    case SU | CR | LF        =>
    case '$' if !getDollar() => syntaxError("can't unquote into single-line comments", at = charOffset - 1)
    case _                   => nextChar() ; skipLineComment()
  }
  private def maybeOpen() {
    putCommentChar()
    if (ch == '*') {
      putCommentChar()
      openComments += 1
    }
  }
  private def maybeClose(): Boolean = {
    putCommentChar()
    (ch == '/') && {
      putCommentChar()
      openComments -= 1
      openComments == 0
    }
  }
  @tailrec final def skipNestedComments(): Unit = ch match {
    case '/'                 => maybeOpen() ; skipNestedComments()
    case '*'                 => if (!maybeClose()) skipNestedComments()
    case SU                  => incompleteInputError("unclosed comment", at = offset)
    case '$' if !getDollar() => syntaxError("can't unquote into multi-line comments", at = charOffset - 1)
    case _                   => putCommentChar() ; skipNestedComments()
  }
  def skipDocComment(): Unit = skipNestedComments()
  def skipBlockComment(): Unit = skipNestedComments()

  private def skipToCommentEnd(isLineComment: Boolean) {
    nextChar()
    if (isLineComment) skipLineComment()
    else {
      openComments = 1
      val isDocComment = (ch == '*') && { nextChar(); true }
      if (isDocComment) {
        // Check for the amazing corner case of /**/
        if (ch == '/')
          nextChar()
        else
          skipDocComment()
      }
      else skipBlockComment()
    }
  }

  /** Precondition: ch == '/'
   *  Returns true if a comment was skipped.
   */
  def skipComment(): Boolean = ch match {
    case '/' | '*' => skipToCommentEnd(isLineComment = ch == '/') ; true
    case _         => false
  }
  def flushDoc(): Unit = ()

  /** To prevent doc comments attached to expressions from leaking out of scope
   *  onto the next documentable entity, they are discarded upon passing a right
   *  brace, bracket, or parenthesis.
   */
  def discardDocBuffer(): Unit = ()

  def isAtEnd = charOffset >= buf.length

  def resume(lastCode: LegacyToken) = {
    token = lastCode
    if (next.token != EMPTY)
      syntaxError("unexpected end of input: possible missing '}' in XML block", at = offset)

    nextToken()
  }

  /** A character buffer for literals
   */
  val cbuf = new StringBuilder

  /** append Unicode character to "cbuf" buffer
   */
  protected def putChar(c: Char) {
//      assert(cbuf.size < 10000, cbuf)
    cbuf.append(c)
  }

  /** Determines whether this scanner should emit identifier deprecation warnings,
   *  e.g. when seeing `macro' or `then', which are planned to become keywords in future versions of Scala.
   */
  protected def emitIdentifierDeprecationWarnings = true

  /** Clear buffer and set name and token */
  private def finishNamed(idtoken: LegacyToken = IDENTIFIER) {
    name = cbuf.toString
    cbuf.clear()
    token = idtoken
    if (idtoken == IDENTIFIER) {
      if (kw2legacytoken contains name) {
        token = kw2legacytoken(name)

        if (token == IDENTIFIER && emitIdentifierDeprecationWarnings)
          deprecationWarning(s"$name is now a reserved word; usage as an identifier is deprecated", at = token)

        if (token == ENUM && !dialect.allowEnums)
          token = IDENTIFIER
      }
    }
  }

  /* much like endOffset, end is inclusive */
  private def finishComposite(token: LegacyToken, end: Offset) {
    val start = offset
    curr.token = token
    curr.strVal = new String(input.chars, start, end - start + 1)
    curr.endOffset = end
    reader.charOffset = end + 1
    reader.nextChar()
  }

  /** Clear buffer and set string */
  private def setStrVal() {
    strVal = cbuf.toString
    cbuf.clear()
  }

  /** a stack of tokens which indicates whether line-ends can be statement separators
   *  also used for keeping track of nesting levels.
   *  We keep track of the closing symbol of a region. This can be
   *  RPAREN    if region starts with '('
   *  RBRACKET  if region starts with '['
   *  RBRACE    if region starts with '{'
   *  ARROW     if region starts with `case'
   *  STRINGLIT if region is a string interpolation expression starting with '${'
   *            (the STRINGLIT appears twice in succession on the stack iff the
   *             expression is a multiline string literal).
   */
  var sepRegions: List[LegacyToken] = List()

  /**
    * A map of upcoming xml literal parts that are left to be returned in nextToken().
    *
    * The keys are offset start positions of an xml literal and the values are
    * the respective offset end positions and a boolean indicating if the part
    * is the last part.
    */
  val upcomingXmlLiteralParts = mutable.Map.empty[Offset, (Offset, Boolean)]

// Get next token ------------------------------------------------------------

  /** Are we directly in a string interpolation expression?
   */
  private def inStringInterpolation =
    sepRegions.nonEmpty && sepRegions.head == STRINGLIT

  private def inXmlLiteral: Boolean = {
    upcomingXmlLiteralParts.contains(offset)
  }

  /** Are we directly in a multiline string interpolation expression?
   *  @pre inStringInterpolation
   */
  private def inMultiLineInterpolation =
    inStringInterpolation && sepRegions.tail.nonEmpty && sepRegions.tail.head == STRINGPART

  /** read next token and return last offset
   */
  def skipToken(): Offset = {
    val off = offset
    nextToken()
    off
  }

  /** Produce next token, filling curr TokenData fields of Scanner.
   */
  def nextToken(): Unit = {
    val lastToken = token
    // Adapt sepRegions according to last token
    (lastToken: @switch) match {
      case LPAREN =>
        sepRegions = RPAREN :: sepRegions
      case LBRACKET =>
        sepRegions = RBRACKET :: sepRegions
      case LBRACE =>
        sepRegions = RBRACE :: sepRegions
      case CASE =>
        sepRegions = ARROW :: sepRegions
      case RBRACE =>
        while (!sepRegions.isEmpty && sepRegions.head != RBRACE)
          sepRegions = sepRegions.tail
        if (!sepRegions.isEmpty)
          sepRegions = sepRegions.tail

        discardDocBuffer()
      case RBRACKET | RPAREN =>
        if (!sepRegions.isEmpty && sepRegions.head == lastToken)
          sepRegions = sepRegions.tail

        discardDocBuffer()
      case ARROW =>
        if (!sepRegions.isEmpty && sepRegions.head == lastToken)
          sepRegions = sepRegions.tail
      case STRINGLIT =>
        if (inMultiLineInterpolation)
          sepRegions = sepRegions.tail.tail
        else if (inStringInterpolation)
          sepRegions = sepRegions.tail
      case _ =>
    }

    // Read a token or copy it from `next` tokenData
    if (next.token == EMPTY) {
      lastOffset = charOffset - 1
      if (lastOffset > 0 && buf(lastOffset) == '\n' && buf(lastOffset - 1) == '\r') {
        lastOffset -= 1
      }

      fetchToken()
      if(token == ERROR) {
        if (inMultiLineInterpolation)
          sepRegions = sepRegions.tail.tail
        else if (inStringInterpolation)
          sepRegions = sepRegions.tail
      }
    } else {
      curr copyFrom next
      next.token = EMPTY
    }

    // NOTE: endOffset is used to determine range positions for certain tokens.
    // Most tokens (e.g. `(' or `;') have constant widths, so their range positions can be calculated trivially from their offsets,
    // however some tokens have variable widths,
    // and for them we need to remember where their parsing ended in order to calculate their positions.
    // That's what endOffset does (indirectly): each token's position should be [curr.offset, curr.endOffset]
    //
    // Now how do we calculate endOffset?
    // 1) What we have at hand is `charOffset`, which is the position right after the position of the character that's just been read.
    // 2) This means that `charOffset - 1` is the position of the character that's just been read.
    // 3) Since reading that character terminated fetchToken, this means that that character is the first character of the next token.
    // 4) This means that `charOffset - 2` is where the last character of the our current token lies.
    //
    // The only corner case here is EOF. In that case the virtual position of the character that's just been read (or, more precisely,
    // that's been attempted to be read) seems to be `buf.length`, but some other logic in the scanner suggests that sometimes it can even
    // be `buf.length + 1` or more. Therefore, we don't bother ourselves with doing decrements and just assign endOffset to be `buf.length - 1`.
    //
    // upd. Speaking of corner cases, positions of tokens emitted by string interpolation tokenizers are simply insane,
    // and need to be reverse engineered having some context (previous tokens, number of quotes in the interpolation) in mind.
    // Therefore I don't even attempt to handle them here, and instead apply fixups elsewhere when converting legacy TOKENS into new LegacyToken instances.
    if (curr.token != STRINGPART) { // endOffset of STRINGPART tokens is set elsewhere
      curr.endOffset = charOffset - 2
      if (charOffset >= buf.length && ch == SU) curr.endOffset = buf.length - 1
    }
  }

  /** Is current token first one after a newline? */
  private def afterLineEnd(): Boolean =
    lastOffset < lineStartOffset &&
    (lineStartOffset <= offset ||
     lastOffset < lastLineStartOffset && lastLineStartOffset <= offset)

  /** Is there a blank line between the current token and the last one?
   *  @pre  afterLineEnd().
   */
  private def pastBlankLine(): Boolean = {
    var idx = lastOffset
    var ch = buf(idx)
    val end = offset
    while (idx < end) {
      if (ch == LF || ch == FF) {
        do {
          idx += 1; ch = buf(idx)
          if (ch == LF || ch == FF) {
            return true
          }
          if (idx == end) return false
        } while (ch <= ' ')
      }
      idx += 1; ch = buf(idx)
    }
    false
  }

  /** read next token, filling TokenData fields of Scanner.
   */
  protected final def fetchToken() {
    offset = charOffset - 1

    if (inStringInterpolation) return fetchStringPart()
    else if (inXmlLiteral) return fetchXmlPart()

    (ch: @switch) match {
      case ' ' | '\t' | CR | LF | FF =>
        token = WHITESPACE
        strVal = ch.toString
        nextChar()
        //nextToken()
      case 'A' | 'B' | 'C' | 'D' | 'E' |
           'F' | 'G' | 'H' | 'I' | 'J' |
           'K' | 'L' | 'M' | 'N' | 'O' |
           'P' | 'Q' | 'R' | 'S' | 'T' |
           'U' | 'V' | 'W' | 'X' | 'Y' |
           'Z' | '$' | '_' |
           'a' | 'b' | 'c' | 'd' | 'e' |
           'f' | 'g' | 'h' | 'i' | 'j' |
           'k' | 'l' | 'm' | 'n' | 'o' |
           'p' | 'q' | 'r' | 's' | 't' |
           'u' | 'v' | 'w' | 'x' | 'y' |  // scala-mode: need to understand multi-line case patterns
           'z' =>
        if (ch == '$' && !getDollar()) {
          getUnquote()
        } else {
          putChar(ch)
          nextChar()
          getIdentRest()
          if (ch == '"' && token == IDENTIFIER)
            token = INTERPOLATIONID
        }
      case '<' => // is XMLSTART?
        def fetchLT() = {
          val last = if (charOffset >= 2) buf(charOffset - 2) else ' '
          nextChar()
          last match {
            case ' ' | '\t' | '\n' | '{' | '(' | '>' if isNameStart(ch) || ch == '!' || ch == '?' =>
              if (dialect.allowXmlLiterals) getXml()
              else syntaxError("xml literals are not supported", at = offset)
            case _ =>
              // Console.println("found '<', but last is '"+in.last+"'"); // DEBUG
              putChar('<')
              getOperatorRest()
          }
        }
        fetchLT()
      case '~' | '!' | '@' | '#' | '%' |
           '^' | '*' | '+' | '-' | /*'<' | */
           '>' | '?' | ':' | '=' | '&' |
           '|' | '\\' =>
        putChar(ch)
        nextChar()
        getOperatorRest()
      case '/' =>
        nextChar()
        if (skipComment()) {
          token = COMMENT
        } else {
          putChar('/')
          getOperatorRest()
        }
      case '0' =>
        def fetchZero() = {
          putChar(ch)
          nextChar()
          if (ch == 'x' || ch == 'X') {
            putChar(ch)
            nextChar()
            base = 16
          } else {
            base = 8
          }
          getNumber()
        }
        fetchZero()
      case '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' =>
        base = 10
        getNumber()
      case '`' =>
        getBackquotedIdent()
      case '\"' =>
        def fetchDoubleQuote(): Unit = {
          if (token == INTERPOLATIONID) {
            nextRawChar()
            if (ch == '\"') {
              val lookahead = lookaheadReader
              lookahead.nextChar()
              if (lookahead.ch == '\"') {
                nextRawChar()                        // now eat it
                offset += 3
                nextRawChar()
                getStringPart(multiLine = true)
                sepRegions = STRINGPART :: sepRegions // indicate string part
                sepRegions = STRINGLIT :: sepRegions // once more to indicate multi line string part
              } else {
                nextChar()
                token = STRINGLIT
                strVal = ""
              }
            } else {
              offset += 1
              getStringPart(multiLine = false)
              sepRegions = STRINGLIT :: sepRegions // indicate single line string part
            }
          } else {
            nextChar()
            if (ch == '\"') {
              nextChar()
              if (ch == '\"') {
                nextRawChar()
                getRawStringLit()
              } else {
                token = STRINGLIT
                strVal = ""
              }
            } else {
              getStringLit()
            }
          }
        }
        fetchDoubleQuote()
      case '\'' =>
        def fetchSingleQuote() = {
          nextChar()
          if (ch == '$' && !getDollar())
            syntaxError("can't unquote into character literals", at = charOffset - 1)
          else if (isIdentifierStart(ch))
            charLitOr(getIdentRest _)
          else if (isOperatorPart(ch) && (ch != '\\'))
            charLitOr(getOperatorRest _)
          else {
            getLitChar()
            if (ch == '\'') {
              nextChar()
              token = CHARLIT
              setStrVal()
            } else {
              syntaxError("unclosed character literal", at = offset)
            }
          }
        }
        fetchSingleQuote()
      case '.' =>
        nextChar()
        if ('0' <= ch && ch <= '9') {
          putChar('.'); getFraction()
        } else if (dialect.allowUnquotes && ch == '.') {
          base = 0
          while (ch == '.') {
            base += 1
            nextChar()
          }
          token = ELLIPSIS
        } else {
          token = DOT
        }
      case ';' =>
        nextChar(); token = SEMI
      case ',' =>
        nextChar(); token = COMMA
      case '(' =>
        nextChar(); token = LPAREN
      case '{' =>
        nextChar(); token = LBRACE
      case ')' =>
        nextChar(); token = RPAREN
      case '}' =>
        // save the end of the current token (exclusive) in case nextChar
        // advances the offset more than once. See UnicodeEscapeSuite for a
        // and https://github.com/scalacenter/scalafix/issues/593 for
        // an example why this this is necessary.
        val end = charOffset + 1
        nextChar(); token = RBRACE
        // restore the charOffset to the saved position
        if (end < buf.length) charOffset = end
      case '[' =>
        nextChar(); token = LBRACKET
      case ']' =>
        nextChar(); token = RBRACKET
      case SU =>
        if (isAtEnd) {
          // NOTE: sometimes EOF's offset is `input.chars.length - 1`, and that might mess things up
          offset = input.chars.length
          token = EOF
        } else {
          syntaxError("illegal character", at = offset)
          nextChar()
        }
      case _ =>
        def fetchOther() = {
          if (ch == '\u21D2') {
            nextChar(); token = ARROW
          } else if (ch == '\u2190') {
            nextChar(); token = LARROW
          } else if (Character.isUnicodeIdentifierStart(ch)) {
            putChar(ch)
            nextChar()
            getIdentRest()
          } else if (isSpecial(ch)) {
            putChar(ch)
            nextChar()
            getOperatorRest()
          } else {
            syntaxError("illegal character '" + ("" + '\\' + 'u' + "%04x".format(ch.toInt)) + "'", at = offset)
            nextChar()
          }
        }
        fetchOther()
    }
  }

// Identifiers ---------------------------------------------------------------

  private def getBackquotedIdent() {
    nextChar()
    getLitChars('`')
    if (ch == '`') {
      nextChar()
      finishNamed(BACKQUOTED_IDENT)
      if (name.length == 0) syntaxError("empty quoted identifier", at = offset)
    } else if (ch == '$') {
      syntaxError("can't unquote into quoted identifiers", at = charOffset - 1)
    } else {
      syntaxError("unclosed quoted identifier", at = offset)
    }
  }

  private def getIdentRest(): Unit = (ch: @switch) match {
    case 'A' | 'B' | 'C' | 'D' | 'E' |
         'F' | 'G' | 'H' | 'I' | 'J' |
         'K' | 'L' | 'M' | 'N' | 'O' |
         'P' | 'Q' | 'R' | 'S' | 'T' |
         'U' | 'V' | 'W' | 'X' | 'Y' |
         'Z' | '$' |
         'a' | 'b' | 'c' | 'd' | 'e' |
         'f' | 'g' | 'h' | 'i' | 'j' |
         'k' | 'l' | 'm' | 'n' | 'o' |
         'p' | 'q' | 'r' | 's' | 't' |
         'u' | 'v' | 'w' | 'x' | 'y' |
         'z' |
         '0' | '1' | '2' | '3' | '4' |
         '5' | '6' | '7' | '8' | '9' =>
      if (ch == '$' && !getDollar()) { finishNamed(); return }
      putChar(ch)
      nextChar()
      getIdentRest()
    case '_' =>
      putChar(ch)
      nextChar()
      getIdentOrOperatorRest()
    case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
      finishNamed()
    case _ =>
      if (Character.isUnicodeIdentifierPart(ch)) {
        putChar(ch)
        nextChar()
        getIdentRest()
      } else {
        finishNamed()
      }
  }

  private def getOperatorRest(): Unit = (ch: @switch) match {
    case '~' | '!' | '@' | '#' | '%' |
         '^' | '*' | '+' | '-' | '<' |
         '>' | '?' | ':' | '=' | '&' |
         '|' | '\\' =>
      putChar(ch); nextChar(); getOperatorRest()
    case '/' =>
      val lookahead = lookaheadReader
      lookahead.nextChar()
      if (lookahead.ch == '/' || lookahead.ch == '*') {
        finishNamed()
      } else {
        putChar('/')
        nextChar()
        getOperatorRest()
      }
    case _ =>
      if (isSpecial(ch)) { putChar(ch); nextChar(); getOperatorRest() }
      else finishNamed()
  }

  private def getIdentOrOperatorRest() {
    if (isIdentifierPart(ch))
      getIdentRest()
    else ch match {
      case '~' | '!' | '@' | '#' | '%' |
           '^' | '*' | '+' | '-' | '<' |
           '>' | '?' | ':' | '=' | '&' |
           '|' | '\\' | '/' =>
        getOperatorRest()
      case _ =>
        if (isSpecial(ch)) getOperatorRest()
        else finishNamed()
    }
  }

  // True means that we have successfully read '$'
  // False means that we need to switch into unquote reading mode.
  private def getDollar(): Boolean = {
    if (dialect.allowUnquotes) {
      val lookahead = lookaheadReader
      lookahead.nextChar()
      if (lookahead.ch == '$') {
        // Skip the first dollar and move on to whatever we've been doing:
        // starting or continuing tokenization of an identifier,
        // or continuing reading a string literal, or whatever.
        nextChar()
      } else {
        // Don't do anything - our caller should know what to do.
        return false
      }
    }
    return true
  }


// Literals -----------------------------------------------------------------

  private def getStringLit() = {
    getLitChars('"')
    if (ch == '"') {
      setStrVal()
      nextChar()
      token = STRINGLIT
    } else if (ch == '$') {
      syntaxError("can't unquote into string literals", at = charOffset - 1)
    } else  {
      syntaxError("unclosed string literal", at = offset)
    }
  }

  private def getRawStringLit(): Unit = {
    if (ch == '\"') {
      nextRawChar()
      if (isTripleQuote()) {
        setStrVal()
        token = STRINGLIT
      } else
        getRawStringLit()
    } else if (ch == SU) {
      incompleteInputError("unclosed multi-line string literal", at = offset)
    } else if (ch == '$' && !getDollar()) {
      syntaxError("can't unquote into string literals", at = charOffset - 1)
    } else {
      putChar(ch)
      nextRawChar()
      getRawStringLit()
    }
  }

  @scala.annotation.tailrec private def getStringPart(multiLine: Boolean): Unit = {
    def finishStringPart() = {
      setStrVal()
      token = STRINGPART
      next.lastOffset = charOffset - 1
      next.offset = charOffset - 1
    }
    if (ch == '"') {
      if (multiLine) {
        nextRawChar()
        if (isTripleQuote()) {
          setStrVal()
          token = STRINGLIT
        } else
          getStringPart(multiLine)
      } else {
        nextChar()
        setStrVal()
        token = STRINGLIT
      }
    } else if (ch == '$') {
      if (!getDollar()) {
        syntaxError("can't unquote into string interpolations", at = charOffset - 1)
      } else {
        nextRawChar()
        if (ch == '$') {
          putChar(ch)
          nextRawChar()
          getStringPart(multiLine)
        } else if (ch == '{') {
          finishStringPart()
          endOffset = charOffset - 3
          nextRawChar()
          next.token = LBRACE
        } else if (ch == '_' && dialect.allowSpliceUnderscores) {
          finishStringPart()
          endOffset = charOffset - 3
          nextRawChar()
          next.token = USCORE
        } else if (Character.isUnicodeIdentifierStart(ch)) {
          finishStringPart()
          endOffset = charOffset - 3
          do {
            putChar(ch)
            nextRawChar()
          } while (ch != SU && Character.isUnicodeIdentifierPart(ch))
          next.token = IDENTIFIER
          next.name = cbuf.toString
          cbuf.clear()
          if (kw2legacytoken contains next.name) {
            next.token = kw2legacytoken(next.name)
            if (next.token == ENUM && !dialect.allowEnums)
              next.token = IDENTIFIER
            if (next.token != IDENTIFIER && next.token != THIS)
              syntaxError("invalid unquote: `$'ident, `$'BlockExpr, `$'this or `$'_ expected", at = offset)
          }
        } else {
          var supportedCombos = List("`$$'", "`$'ident", "`$'this", "`$'BlockExpr")
          if (dialect.allowSpliceUnderscores) supportedCombos = supportedCombos :+ "`$'_"
          val s_supportedCombos = supportedCombos.init.mkString(", ") + supportedCombos.last
          syntaxError(s_supportedCombos, at = offset)
        }
      }
    } else {
      val isUnclosedLiteral = !isUnicodeEscape && (ch == SU || (!multiLine && (ch == CR || ch == LF)))
      if (isUnclosedLiteral) {
        if (multiLine)
          incompleteInputError("unclosed multi-line string interpolation", at = offset)
        else
          syntaxError("unclosed string interpolation", at = offset)
      }
      else {
        putChar(ch)
        nextRawChar()
        getStringPart(multiLine)
      }
    }
  }

  private def fetchXmlPart(): Unit = {
    require(inXmlLiteral, "must be at the start of an xml literal part")
    val (end, isLastPart) = upcomingXmlLiteralParts(offset)
    finishComposite(XMLLIT, end - 1)

    if (isLastPart) {
      next.token = XMLLITEND
    }

    // Clean up map, should be empty at EOF.
    upcomingXmlLiteralParts.remove(offset)
  }

  private def fetchStringPart(): Unit =
    getStringPart(multiLine = inMultiLineInterpolation)

  private def isTripleQuote(): Boolean =
    if (ch == '"') {
      nextRawChar()
      if (ch == '"') {
        nextChar()
        while (ch == '"') {
          putChar('"')
          nextChar()
        }
        true
      } else {
        putChar('"')
        putChar('"')
        false
      }
    } else {
      putChar('"')
      false
    }

  /** copy current character into cbuf, interpreting any escape sequences,
   *  and advance to next character.
   */
  protected def getLitChar(): Unit =
    if (ch == '\\') {
      nextChar()
      if ('0' <= ch && ch <= '7') {
        val start = charOffset - 2
        val leadch: Char = ch
        var oct: Int = digit2int(ch, 8)
        nextChar()
        if ('0' <= ch && ch <= '7') {
          oct = oct * 8 + digit2int(ch, 8)
          nextChar()
          if (leadch <= '3' && '0' <= ch && ch <= '7') {
            oct = oct * 8 + digit2int(ch, 8)
            nextChar()
          }
        }
        val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
        def msg(what: String) = s"Octal escape literals are $what, use $alt instead."
        deprecationWarning(msg("deprecated"), at = start)
        putChar(oct.toChar)
      } else {
        ch match {
          case 'b'  => putChar('\b')
          case 't'  => putChar('\t')
          case 'n'  => putChar('\n')
          case 'f'  => putChar('\f')
          case 'r'  => putChar('\r')
          case '\"' => putChar('\"')
          case '\'' => putChar('\'')
          case '\\' => putChar('\\')
          case _    => invalidEscape()
        }
        nextChar()
      }
    } else if (ch == '$' && !getDollar()) {
      // bail and let the caller handle this
    } else  {
      putChar(ch)
      nextChar()
    }

  protected def invalidEscape(): Unit = {
    syntaxError("invalid escape character", at = charOffset - 1)
    putChar(ch)
  }

  private def getLitChars(delimiter: Char) = {
    def stop = {
      def success = ch == delimiter
      def naturalBreak = (ch == SU || ch == CR || ch == LF) && !isUnicodeEscape
      def unquote = ch == '$' && !getDollar()
      success || naturalBreak || unquote
    }
    while (!isAtEnd && !stop) getLitChar()
  }

  /** read fractional part and exponent of floating point number
   *  if one is present.
   */
  protected def getFraction() {
    token = DOUBLELIT
    while ('0' <= ch && ch <= '9') {
      putChar(ch)
      nextChar()
    }
    if (ch == 'e' || ch == 'E') {
      val lookahead = lookaheadReader
      lookahead.nextChar()
      if (lookahead.ch == '+' || lookahead.ch == '-') {
        lookahead.nextChar()
      }
      if ('0' <= lookahead.ch && lookahead.ch <= '9') {
        putChar(ch)
        nextChar()
        if (ch == '+' || ch == '-') {
          putChar(ch)
          nextChar()
        }
        while ('0' <= ch && ch <= '9') {
          putChar(ch)
          nextChar()
        }
      }
      token = DOUBLELIT
    }
    if (ch == 'd' || ch == 'D') {
      putChar(ch)
      nextChar()
      token = DOUBLELIT
    } else if (ch == 'f' || ch == 'F') {
      putChar(ch)
      nextChar()
      token = FLOATLIT
    }
    checkNoLetter()
    setStrVal()
  }

  def checkNoLetter() {
    if (isIdentifierPart(ch) && ch >= ' ')
      syntaxError("Invalid literal number", at = offset)
  }

  /** Read a number into strVal and set base
  */
  protected def getNumber() {
    val base1 = if (base < 10) 10 else base
    // Read 8,9's even if format is octal, produce a malformed number error afterwards.
    // At this point, we have already read the first digit, so to tell an innocent 0 apart
    // from an octal literal 0123... (which we want to disallow), we check whether there
    // are any additional digits coming after the first one we have already read.
    var notSingleZero = false
    while (digit2int(ch, base1) >= 0) {
      putChar(ch)
      nextChar()
      notSingleZero = true
    }
    token = INTLIT

    /* When we know for certain it's a number after using a touch of lookahead */
    def restOfNumber() = {
      putChar(ch)
      nextChar()
      getFraction()
    }
    def restOfUncertainToken() = {
      def isEfd = ch match { case 'e' | 'E' | 'f' | 'F' | 'd' | 'D' => true ; case _ => false }
      def isL   = ch match { case 'l' | 'L' => true ; case _ => false }

      if (base <= 10 && isEfd)
        getFraction()
      else {
        // Checking for base == 8 is not enough, because base = 8 is set
        // as soon as a 0 is read in `case '0'` of method fetchToken.
        if (base == 8 && notSingleZero) syntaxError("Non-zero integral values may not have a leading zero.", at = offset)
        if (isL) {
          putChar(ch)
          setStrVal()
          nextChar()
          token = LONGLIT
        } else {
          setStrVal()
          checkNoLetter()
        }
      }
    }

    if (base > 10 || ch != '.')
      restOfUncertainToken()
    else {
      val lookahead = lookaheadReader
      val c = lookahead.getc()

      /* Prohibit 1. */
      if (!isDigit(c))
        return setStrVal()

      val isDefinitelyNumber = (c: @switch) match {
        /** Another digit is a giveaway. */
        case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'  =>
          true

        /* Backquoted idents like 22.`foo`. */
        case '`' =>
          return setStrVal()  /** Note the early return */

        /* These letters may be part of a literal, or a method invocation on an Int.
         */
        case 'd' | 'D' | 'f' | 'F' =>
          !isIdentifierPart(lookahead.getc())

        /* A little more special handling for e.g. 5e7 */
        case 'e' | 'E' =>
          val ch = lookahead.getc()
          !isIdentifierPart(ch) || (isDigit(ch) || ch == '+' || ch == '-')

        case x  =>
          !isIdentifierStart(x)
      }
      if (isDefinitelyNumber) restOfNumber()
      else restOfUncertainToken()
    }
  }

  /** Parse character literal if current character is followed by \',
   *  or follow with given op and return a symbol literal token
   */
  def charLitOr(op: () => Unit) {
    putChar(ch)
    nextChar()
    if (ch == '\'') {
      nextChar()
      token = CHARLIT
      setStrVal()
    } else {
      op()
      token = SYMBOLLIT
      strVal = name.toString
    }
  }

  def getXml(): Unit = {
    // 1. Collect positions of scala expressions inside this xml literal.
    import fastparse.core.Parsed
    val start = offset
    val embeddedScalaExprPositions = new ScalaExprPositionParser(dialect)
    val xmlParser = new XmlParser(embeddedScalaExprPositions)
    val result: Int = xmlParser.XmlExpr.parse(input.text, index = start) match {
      case Parsed.Success(_, endExclusive) =>
        endExclusive
      case Parsed.Failure(_, failIndex, extra) =>
        syntaxError(s"malformed xml literal, expected: ${extra.traced.expected}", at = failIndex)
    }

    // 2. Populate upcomingXmlLiteralParts with xml literal part positions.
    var lastFrom = start
    embeddedScalaExprPositions.splicePositions.foreach { pos =>
      // pos contains the start and end positions of a scala expression.
      // We want the range of the xml literal part which starts at lastFrom
      // and ends at pos.from.
      val to = pos.from - 1
      upcomingXmlLiteralParts.update(lastFrom, (to, false))
      lastFrom = pos.to + 1
    }
    // The final xml literal part is not followed by any embedded scala expr.
    upcomingXmlLiteralParts.update(lastFrom, (result, true))

    // 3. Return only the first xml part.
    fetchXmlPart()
  }

// Unquotes -----------------------------------------------------------------

  def getUnquote() {
    require(ch == '$')
    val start = charOffset
    val endInclusive = {
      val exploratoryInput = Input.Slice(input, start, input.chars.length)
      val exploratoryDialect = this.dialect.copy(allowTermUnquotes = false, allowPatUnquotes = false, allowMultilinePrograms = true)
      val exploratoryScanner = new LegacyScanner(exploratoryInput, exploratoryDialect)
      exploratoryScanner.reader.nextChar()
      exploratoryScanner.nextToken()
      exploratoryScanner.curr.token match {
        case LBRACE =>
          def loop(balance: Int): Unit = {
            exploratoryScanner.nextToken()
            exploratoryScanner.curr.token match {
              case LBRACE =>
                loop(balance + 1)
              case RBRACE =>
                if (balance == 1) () // do nothing, this is the end of the unquote
                else loop(balance - 1)
              case _ =>
                loop(balance)
            }
          }
          try {
            loop(balance = 1)
          } catch {
            case TokenizeException(pos, message) =>
              syntaxError(s"invalid unquote: $message", at = start + pos.start)
          }
        case IDENTIFIER | THIS | USCORE =>
          // do nothing, this is the end of the unquote
        case _ =>
          syntaxError("invalid unquote: `$'ident, `$'BlockExpr, `$'this or `$'_ expected", at = start)
      }
      start + exploratoryScanner.curr.endOffset
    }
    finishComposite(UNQUOTE, endInclusive)
  }

// Errors -----------------------------------------------------------------

  override def toString() = token.toString

  /** Initialize scanner; call f on each scanned token data
   */
  def foreach(f: LegacyTokenData => Unit) {
    nextChar()
    do {
      nextToken()
      f(curr)
    } while(curr.token != EOF)
  }
}