// dregex.impl.RegexParser.scala Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of dregex_2.12 Show documentation
// Deterministic Regular Expression Engine
// The newest version!
package dregex.impl

import java.util.regex.Pattern

import dregex.{InvalidRegexException, ParsedRegex}
import dregex.impl.RegexParser.DotMatch
import dregex.impl.UnicodeChar.FromCharConversion

import scala.collection.immutable.Seq
import scala.util.parsing.combinator.RegexParsers

class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean) extends RegexParsers {

  override def skipWhitespace = false

  import RegexTree._

  // Atoms (strings and regexes)

  /** The escape character, kept as a plain string and lifted to a literal parser where it is combined. */
  def backslash = "\\"

  /** Matches a single hexadecimal digit. */
  def hexDigit = "\\p{XDigit}".r

  /** Matches a single octal digit. */
  def octalDigit = """[0-7]""".r

  /** Matches a single decimal digit. */
  def decimalDigit = "\\d".r

  // Parsers that return a primitive (string, number)

  /**
    * Parses exactly `digitCount` hexadecimal digits and yields their integer value.
    */
  def hexNumber(digitCount: Int) =
    repN(digitCount, hexDigit) ^^ (hexDigits => Integer.parseInt(hexDigits.mkString, 16))

  /**
    * Parses one or more hexadecimal digits and yields their integer value.
    *
    * A digit run that does not fit in an `Int` is reported as an [[InvalidRegexException]],
    * in the same way the decimal `number` parser reports it, instead of leaking a
    * `NumberFormatException` to the caller.
    */
  def hexNumber = hexDigit.+ ^^ { digits =>
    val text = digits.mkString
    try {
      Integer.parseInt(text, 16)
    } catch {
      case _: NumberFormatException =>
        throw new InvalidRegexException("Cannot parse hexadecimal number: " + text)
    }
  }

  /**
    * Parses exactly `digitCount` octal digits and yields their integer value.
    */
  def octalNumber(digitCount: Int) =
    repN(digitCount, octalDigit) ^^ (octalDigits => Integer.parseInt(octalDigits.mkString, 8))

  /**
    * Parses one or more decimal digits and yields their integer value.
    * A digit run that `Integer.parseInt` rejects is reported as an [[InvalidRegexException]].
    */
  def number = decimalDigit.+ ^^ { digitChars =>
    val text = digitChars.mkString
    try Integer.parseInt(text)
    catch {
      case _: NumberFormatException =>
        throw new InvalidRegexException("Cannot parse number: " + text)
    }
  }

  /** Characters that `characterClassLit` refuses to take as plain literals inside a character class. */
  def charSpecialInsideClasses =
    backslash | "]" | "^" | "-"

  /** Characters that `charLit` refuses to take as plain literals outside a character class. */
  def charSpecial =
    backslash |
      "." | "|" |
      "(" | ")" | "[" | "]" |
      "+" | "*" | "?" |
      "^" | "$"

  /** Recognizes a control escape (backslash, `c`, any character) only to fail with a specific message. */
  def controlEscape =
    (backslash ~ "c" ~ ".".r) ~> failure("Unsupported feature: control escape")

  /** Recognizes a numbered back-reference only to fail with a specific message. */
  def backReference =
    (backslash ~ "[1-9][0-9]*".r) ~> failure("unsupported feature: backreferences")

  /** Recognizes the `^` and `$` anchors only to fail with a specific message. */
  def anchor =
    ("^" | "$") ~> failure("Unsupported feature: anchors")

  /**
    * Optional ignorable white space. It only consumes input when the parser was created
    * with `comments` enabled; otherwise it matches the empty string. Always yields `None`.
    */
  def sp =
    if (comments) """\s*""".r ^^^ None // ASCII white space intentionally for Java compatibility
    else "" ^^^ None

  // Parsers that return a literal Node

  /**
    * Parses a backslash followed by one character that is not the first character of
    * another escape family (the excluded characters are listed in the pattern below).
    * A few letters denote control characters; any other character stands for itself.
    */
  def specialEscape = backslash ~> "[^dwsDWSuxcpR0123456789]".r ^^ { escaped =>
    val namedEscapes = Map(
      "n" -> '\n',
      "r" -> '\r',
      "t" -> '\t',
      "f" -> '\f',
      "b" -> '\b',
      "v" -> '\u000B', // vertical tab
      "a" -> '\u0007', // bell
      "e" -> '\u001B', // escape
      "B" -> '\\'
    )
    namedEscapes.get(escaped) match {
      case Some(char) => Lit(char.u)
      case None       => Lit(UnicodeChar.fromSingletonString(escaped)) // the character stands for itself
    }
  }

  /**
    * Parses two consecutive four-digit Unicode escapes that together form a surrogate pair
    * and yields a single literal for the combined code point. The parser is not defined
    * (and therefore fails) when the two values are not a high/low surrogate pair.
    */
  def doubleUnicodeEscape = backslash ~ "u" ~ hexNumber(4) ~ backslash ~ "u" ~ hexNumber(4) ^? {
    case _ ~ _ ~ first ~ _ ~ _ ~ second if Character.isSurrogatePair(first.toChar, second.toChar) =>
      Lit(UnicodeChar(Character.toCodePoint(first.toChar, second.toChar)))
  }

  /** Parses a four-digit Unicode escape (backslash, `u`, four hex digits) into a literal. */
  def unicodeEscape =
    (backslash ~ "u" ~> hexNumber(4)) ^^ (value => Lit(UnicodeChar(value)))

  /** Parses a two-digit hexadecimal escape (backslash, `x`, two hex digits) into a literal. */
  def hexEscape =
    (backslash ~ "x" ~> hexNumber(2)) ^^ (value => Lit(UnicodeChar(value)))

  /**
    * Parses a braced hexadecimal escape (backslash, `x`, `{`, one or more hex digits, `}`)
    * into a literal.
    *
    * The value must be a valid Unicode code point; larger values are rejected with an
    * [[InvalidRegexException]] rather than being turned into a literal that no input
    * character could ever equal.
    */
  def longHexEscape = backslash ~ "x" ~ "{" ~> hexNumber <~ "}" ^^ { codePoint =>
    if (!Character.isValidCodePoint(codePoint)) {
      throw new InvalidRegexException(
        "Hexadecimal escape is not a valid code point: " + Integer.toHexString(codePoint))
    }
    Lit(UnicodeChar(codePoint))
  }

  /**
    * Parses an octal escape (backslash, `0`, then one to three octal digits) into a literal.
    *
    * Following `java.util.regex.Pattern`, the three-digit form is only taken when its first
    * digit is in the range 0-3; otherwise at most two digits belong to the escape and the
    * remaining digit is left in the input to be parsed as an ordinary character.
    */
  def octalEscape = {
    // Ordered choice, longest admissible form first. `guard` peeks at the leading digit
    // without consuming it.
    val octalValue =
      (guard("[0-3]".r) ~> octalNumber(3)) |
        octalNumber(2) |
        octalNumber(1)
    backslash ~ "0" ~> octalValue ^^ { codePoint =>
      Lit(UnicodeChar(codePoint))
    }
  }

  /**
    * Any backslash escape that denotes a single character. Alternatives are tried in the
    * order written; the relative order of the Unicode escapes is significant (the
    * surrogate-pair form is attempted before the single four-digit form).
    */
  def anyEscape =
    specialEscape | doubleUnicodeEscape | unicodeEscape | hexEscape |
      longHexEscape | octalEscape | controlEscape | backReference

  /**
    * Parses one character as a literal, provided that `parser` does not match at the
    * current position.
    */
  def anythingExcept(parser: Parser[_]) = {
    // NOTE(review): the "." pattern is compiled without DOTALL, so it presumably does not
    // match line terminators -- confirm whether literal line breaks are meant to be rejected.
    val anyChar = ".".r ^^ (text => Lit(UnicodeChar.fromSingletonString(text)))
    not(parser) ~> anyChar
  }

  /** A literal character outside a character class: a non-special character or an escape. */
  def charLit =
    anchor |
      anythingExcept(charSpecial) |
      anyEscape

  /** A literal character inside a character class: a non-special character or an escape. */
  def characterClassLit =
    anythingExcept(charSpecialInsideClasses) |
      anyEscape

  // Parsers that return a character class Node

  /** A character-class member made of one literal, wrapped as a one-element set. */
  def singleCharacterClassLit = characterClassLit ^^ { literal =>
    CharSet(Seq(literal))
  }

  /** A character-class range of the form `start-end`. */
  def charClassRange = characterClassLit ~ "-" ~ characterClassLit ^^ {
    case from ~ _ ~ to =>
      val range = CharRange(from.char, to.char)
      CharSet.fromRange(range)
  }

  /** Pattern for the value part of a Unicode property (block, script, category or class name). */
  private val unicodeSubsetName = "[0-9a-zA-Z_ -]+".r

  /**
    * Parses a Unicode property written as `name=value` inside the braces of a
    * backslash-`p` construct. Supported names are `block`/`blk`, `script`/`sc` and
    * `general_category`/`gc`; unknown names or values raise an [[InvalidRegexException]].
    */
  def specialCharSetByName = backslash ~ "p" ~ "{" ~> "[a-z_]+".r ~ "=" ~ unicodeSubsetName <~ "}" ^^ {
    case propName ~ _ ~ propValue =>
      propName match {
        case "block" | "blk" =>
          PredefinedCharSets.unicodeBlocks
            .getOrElse(propValue.toUpperCase(), throw new InvalidRegexException("Invalid Unicode block: " + propValue))
        case "script" | "sc" =>
          PredefinedCharSets.unicodeScripts
            .getOrElse(propValue.toUpperCase(), throw new InvalidRegexException("Invalid Unicode script: " + propValue))
        case "general_category" | "gc" =>
          PredefinedCharSets.unicodeGeneralCategories
            .getOrElse(propValue, throw new InvalidRegexException("Invalid Unicode general category: " + propValue))
        case _ =>
          throw new InvalidRegexException("Invalid Unicode character property name: " + propName)
      }
  }

  /**
    * Parses a property whose name is prefixed with "Is". Such a name may denote a script,
    * a general category or a binary property, so the three tables are consulted in that
    * order and the first hit wins.
    */
  def specialCharSetWithIs = backslash ~ "p" ~ "{" ~ "Is" ~> unicodeSubsetName <~ "}" ^^ { name =>
    val upperName = name.toUpperCase()
    def asScript = PredefinedCharSets.unicodeScripts.get(upperName)
    def asGeneralCategory = PredefinedCharSets.unicodeGeneralCategories.get(name)
    def asBinaryProperty = PredefinedCharSets.unicodeBinaryProperties.get(upperName)

    asScript.orElse(asGeneralCategory).orElse(asBinaryProperty).getOrElse {
      throw new InvalidRegexException("Invalid Unicode script, general category or binary property: " + name)
    }
  }

  /** Parses a property whose name is prefixed with "In", which denotes a Unicode block. */
  def specialCharSetWithIn = backslash ~ "p" ~ "{" ~ "In" ~> unicodeSubsetName <~ "}" ^^ { blockName =>
    PredefinedCharSets.unicodeBlocks.get(blockName.toUpperCase()) match {
      case Some(block) => block
      case None        => throw new InvalidRegexException("Invalid Unicode block: " + blockName)
    }
  }

  /**
    * Parses a property whose name is prefixed with "java" and looks the remainder up in the
    * table of Java character classes. An unknown class raises an [[InvalidRegexException]]
    * whose message lists the valid options.
    */
  def specialCharSetWithJava = backslash ~ "p" ~ "{" ~ "java" ~> unicodeSubsetName <~ "}" ^^ { charClass =>
    PredefinedCharSets.javaClasses.get(charClass) match {
      case Some(charSet) =>
        charSet
      case None =>
        val validOptions = PredefinedCharSets.javaClasses.keys.toSeq.sorted.mkString(",")
        throw new InvalidRegexException(
          s"invalid Java character class: $charClass " +
            s"(note: for such a class to be valid, a method java.lang.Character.is$charClass() must exist) " +
            s"(valid options: $validOptions)")
    }
  }

  /**
    * Parses a property given by a bare name. The name is looked up first among the POSIX
    * classes (the Unicode-aware table when `unicodeClasses` is set) and then among the
    * Unicode general categories.
    */
  def specialCharSetImplicit = backslash ~ "p" ~ "{" ~> unicodeSubsetName <~ "}" ^^ { name =>
    val effPosixClasses =
      if (unicodeClasses) PredefinedCharSets.unicodePosixClasses
      else PredefinedCharSets.posixClasses

    val found = effPosixClasses.get(name).orElse(PredefinedCharSets.unicodeGeneralCategories.get(name))
    found.getOrElse {
      throw new InvalidRegexException("Invalid POSIX character class: " + name)
    }
  }

  /** Any backslash-`p` property construct. The alternatives are tried in the order written. */
  def specialCharSet =
    specialCharSetByName | specialCharSetWithIs | specialCharSetWithIn |
      specialCharSetWithJava | specialCharSetImplicit

  /** One member of a character class: a range, a single literal, a shorthand or a property. */
  def charClassAtom =
    charClassRange | singleCharacterClassLit | shorthandCharSet | specialCharSet

  // A character class whose only member is the dash ("[-]" or "[^-]") is valid, but the
  // general character-class parser does not handle it easily, so it gets its own rule.
  def dashClass = "[" ~> "^".? <~ "-" ~ "]" ^^ { negation =>
    val dashOnly = CharSet.fromRange(Lit('-'.u))
    if (negation.isEmpty) dashOnly else dashOnly.complement
  }

  /** Any of the six shorthand classes (digit, space, word and their complements). */
  def shorthandCharSet =
    shorthandCharSetDigit | shorthandCharSetDigitCompl |
      shorthandCharSetSpace | shorthandCharSetSpaceCompl |
      shorthandCharSetWord | shorthandCharSetWordCompl

  // Each shorthand below picks the Unicode-aware predefined set when `unicodeClasses`
  // is enabled and the plain one otherwise.

  /** Backslash-`d`: digits. */
  def shorthandCharSetDigit = backslash ~ "d" ^^^ {
    if (unicodeClasses) PredefinedCharSets.unicodeDigit else PredefinedCharSets.digit
  }

  /** Backslash-`D`: anything but digits. */
  def shorthandCharSetDigitCompl = backslash ~ "D" ^^^ {
    if (unicodeClasses) PredefinedCharSets.unicodeDigit.complement else PredefinedCharSets.digit.complement
  }

  /** Backslash-`s`: white space. */
  def shorthandCharSetSpace = backslash ~ "s" ^^^ {
    if (unicodeClasses) PredefinedCharSets.unicodeSpace else PredefinedCharSets.space
  }

  /** Backslash-`S`: anything but white space. */
  def shorthandCharSetSpaceCompl = backslash ~ "S" ^^^ {
    if (unicodeClasses) PredefinedCharSets.unicodeSpace.complement else PredefinedCharSets.space.complement
  }

  /** Backslash-`w`: word characters. */
  def shorthandCharSetWord = backslash ~ "w" ^^^ {
    if (unicodeClasses) PredefinedCharSets.unicodeWordChar else PredefinedCharSets.wordChar
  }

  /** Backslash-`W`: anything but word characters. */
  def shorthandCharSetWordCompl = backslash ~ "W" ^^^ {
    if (unicodeClasses) PredefinedCharSets.unicodeWordChar.complement else PredefinedCharSets.wordChar.complement
  }

  /**
    * Parses a bracketed character class: an optional negation, an optional leading dash,
    * one or more members and an optional trailing dash. A leading or trailing dash adds
    * the dash character itself to the set.
    */
  def charClass = "[" ~> "^".? ~ "-".? ~ charClassAtom.+ ~ "-".? <~ "]" ^^ {
    case negation ~ leadingDash ~ atoms ~ trailingDash =>
      val hasLiteralDash = leadingDash.isDefined || trailingDash.isDefined
      val members =
        if (hasLiteralDash) atoms :+ CharSet.fromRange(Lit('-'.u))
        else atoms
      val union = CharSet.fromCharSets(members: _*)
      if (negation.isEmpty) union else union.complement
  }

  // Parsers that return a complex Node

  /**
    * Parses a quoted section (everything between backslash-`Q` and backslash-`E`) as a
    * juxtaposition of literal characters.
    */
  def quotedLiteral =
    backslash ~ "Q" ~> anythingExcept(backslash ~ "E").* <~ backslash ~ "E" ^^ (quotedChars => Juxt(quotedChars))

  /**
    * Parses backslash-`R` as a disjunction of the line-break sequences: the CR LF pair
    * first, followed by each single line-break character.
    */
  def unicodeLineBreak = backslash ~ "R" ^^^ {
    val crLf: Node = Juxt(Seq(Lit('\u000D'.u), Lit('\u000A'.u)))
    val singleBreaks: Seq[Node] =
      Seq('\u000A', '\u000B', '\u000C', '\u000D', '\u0085', '\u2028', '\u2029').map(char => Lit(char.u))
    Disj(crLf +: singleBreaks)
  }

  /**
    * Parses a parenthesized group. Without modifiers it is a positional capture group;
    * with modifiers it is either a non-capturing group (which yields the inner expression
    * itself) or a lookaround whose direction and polarity follow from the modifier characters.
    */
  def group = "(" ~> ("?" ~ "<".? ~ "[:=!]".r).? ~ sp ~ regex <~ sp ~ ")" ^^ {
    case modifiers ~ _ ~ value =>
      import Condition._
      import Direction._
      modifiers match {
        case None =>
          PositionalCaptureGroup(value) // Naked parenthesis
        case Some(_ ~ behindMarker ~ kind) =>
          (behindMarker, kind) match {
            case (None, ":")      => value // Non-capturing group
            case (None, "=")      => Lookaround(Ahead, Positive, value)
            case (None, "!")      => Lookaround(Ahead, Negative, value)
            case (Some("<"), ":") => throw new InvalidRegexException("Invalid grouping: <: ")
            case (Some("<"), "=") => Lookaround(Behind, Positive, value)
            case (Some("<"), "!") => Lookaround(Behind, Negative, value)
            case _                => throw new AssertionError
          }
      }
  }

  /** Parses a named capture group of the form `(?<name>...)`. */
  def namedGroup = "(" ~ "?" ~ "<" ~> "[a-zA-Z][a-zA-Z0-9]*".r ~ ">" ~ regex <~ ")" ^^ {
    case groupName ~ _ ~ inner => NamedCaptureGroup(groupName, inner)
  }

  /**
    * Parses the `.` wildcard. What it matches depends on the `dotMatch` mode:
    *
    *  - `All`: any character.
    *  - `JavaLines`: any character except the line terminators recognized by
    *    `java.util.regex.Pattern` (line feed, carriage return, next line, line separator
    *    and paragraph separator).
    *  - `UnixLines`: any character except line feed.
    */
  def charWildcard = "." ^^^ {
    dotMatch match {
      case DotMatch.All =>
        Wildcard
      case DotMatch.JavaLines =>
        CharSet(
          Seq(
            Lit('\n'.u),
            Lit('\r'.u),
            Lit('\u0085'.u),
            Lit('\u2028'.u),
            Lit('\u2029'.u) // paragraph separator
          )).complement
      case DotMatch.UnixLines =>
        CharSet.fromRange(Lit('\n'.u)).complement
    }
  }

  /** The smallest unit a quantifier can apply to. The alternatives are tried in the order written. */
  def regexAtom =
    quotedLiteral |
      charLit |
      charWildcard |
      charClass |
      unicodeLineBreak |
      dashClass |
      shorthandCharSet |
      specialCharSet |
      group |
      namedGroup

  /** Repetition bounds produced by a quantifier; `max` is `None` when there is no upper bound. */
  case class Quantification(min: Int, max: Option[Int])

  /** Any quantifier: one of the single-character forms or the braced form. */
  def quantifier = predefQuantifier | generalQuantifier

  /** The single-character quantifiers `+`, `*` and `?`. */
  def predefQuantifier =
    ("+" ^^^ Quantification(min = 1, max = None)) |
      ("*" ^^^ Quantification(min = 0, max = None)) |
      ("?" ^^^ Quantification(min = 0, max = Some(1)))

  /**
    * Parses a braced quantifier: `{n}`, `{min,}` or `{min,max}`. A `{min,max}` form whose
    * minimum exceeds its maximum raises an [[InvalidRegexException]].
    */
  def generalQuantifier = "{" ~> number ~ ("," ~ number.?).? <~ "}" ^^ {
    case exact ~ None =>
      // Form "{n}": the single value is both the minimum and the maximum
      Quantification(exact, Some(exact))
    case lower ~ Some(_ ~ None) =>
      // Form "{min,}": no upper bound
      Quantification(lower, None)
    case lower ~ Some(_ ~ Some(upper)) =>
      // Form "{min,max}"
      if (lower > upper)
        throw new InvalidRegexException("invalid range in quantifier")
      Quantification(lower, Some(upper))
  }

  /** Recognizes a quantified atom followed by `?` only to fail with a specific message. */
  def lazyQuantifiedBranch = {
    val reluctantForm = regexAtom ~ quantifier ~ "?"
    reluctantForm ~> failure("reluctant quantifiers are not supported")
  }

  /** Recognizes a quantified atom followed by `+` only to fail with a specific message. */
  def possesivelyQuantifiedBranch = {
    val possessiveForm = regexAtom ~ quantifier ~ "+"
    possessiveForm ~> failure("possessive quantifiers are not supported")
  }

  /** Parses an atom followed by a quantifier (with optional ignorable space in between) as a repetition. */
  def quantifiedBranch = regexAtom ~ sp ~ quantifier ^^ {
    case atom ~ _ ~ Quantification(lower, upper) => Rep(min = lower, max = upper, value = atom)
  }

  /**
    * Parses one or more consecutive (possibly quantified) atoms. A single part is returned
    * as is; several parts are wrapped in a juxtaposition.
    */
  def branch = ((lazyQuantifiedBranch | possesivelyQuantifiedBranch | quantifiedBranch | regexAtom) <~ sp).+ ^^ {
    case single :: Nil => single
    case Nil           => throw new AssertionError // cannot happen: at least one part is required
    case parts         => Juxt(parts)
  }

  /** The empty expression: matches the empty string and yields an empty juxtaposition. */
  def emptyRegex = "" ^^^ Juxt(Nil)

  /**
    * Parses a branch optionally followed by `|` and another expression. Alternatives nest
    * to the right: the right-hand side is itself a complete expression.
    */
  def nonEmptyRegex: Parser[Node] = sp ~> branch ~ (sp ~ "|" ~ sp ~> regex).? ^^ {
    case head ~ alternative =>
      alternative.map(rest => Disj(Seq(head, rest))).getOrElse(head)
  }

  /** Entry point of the grammar: a non-empty expression or, failing that, the empty one. */
  def regex = nonEmptyRegex | emptyRegex

}

object RegexParser {

  private val commentPattern = Pattern.compile("""(? 0) {
          throw new InvalidRegexException(s"embedded flag are only valid at the beginning of the pattern")
        }
        for (flag <- matcher.group(1)) {
          flag match {
            case 'x' => flags.comments = true
            case 's' => flags.dotMatch = DotMatch.All
            case 'd' => flags.dotMatch = DotMatch.UnixLines
            case 'U' => flags.unicodeClasses = true
            case 'i' => flags.caseInsensitive = true
            case 'u' => flags.unicodeCase = true
            case 'm' => flags.multiline = true
            case c   => throw new InvalidRegexException(s"invalid embedded flag: $c")
          }
          effRegex = effRegex.substring(matcher.end)
        }
      }
      if (flags.multiline) {
        throw new InvalidRegexException("multiline flag is not supported; this class always works in multiline mode")
      }

      // replace comments
      if (flags.comments) {
        effRegex = commentPattern.matcher(effRegex).replaceAll(" ")
      }
      parseRegexImpl(effRegex, flags)
    }
  }

  /**
    * Parse a quoted regex. They don't really need parsing: every character of the input
    * becomes one literal and the literals are juxtaposed, without any normalization.
    *
    * The input is walked by code point rather than by UTF-16 unit, so a supplementary
    * character (encoded as a surrogate pair) yields one literal for the whole code point,
    * consistently with how the regular parser combines a pair of Unicode escapes.
    */
  private def parseLiteralRegex(regex: String): ParsedRegex = {
    val literals: Seq[RegexTree.Lit] = regex.codePoints().toArray.toVector.map { codePoint =>
      val char =
        if (Character.isBmpCodePoint(codePoint)) UnicodeChar.fromChar(codePoint.toChar)
        else UnicodeChar(codePoint)
      RegexTree.Lit(char)
    }
    new ParsedRegex(regex, RegexTree.Juxt(literals), Normalization.NoNormalization)
  }

  /**
    * Parse an actual regex that is not a literal.
    *
    * The normalizer is derived from the flags (case folding, optionally combined with
    * canonical decomposition), the normalized text is parsed, and the result keeps the
    * original text together with the tree and the normalizer. A parse failure is reported
    * as an [[InvalidRegexException]] carrying the parser's message.
    */
  private def parseRegexImpl(regex: String, flags: Flags): ParsedRegex = {
    // case normalization
    val caseNormalizer: Normalization =
      if (!flags.caseInsensitive) Normalization.NoNormalization
      else if (flags.unicodeClasses || flags.unicodeCase) Normalization.UnicodeLowerCase
      else Normalization.LowerCase

    val normalizer: Normalization =
      if (flags.canonicalEq) Normalization.combine(Normalization.CanonicalDecomposition, caseNormalizer)
      else caseNormalizer

    // parsing proper
    val parser = new RegexParser(flags.comments, flags.dotMatch, flags.unicodeClasses)
    val normalizedRegex = normalizer.normalize(regex)

    val tree: RegexTree.Node = parser.parseAll(parser.regex, normalizedRegex) match {
      case parser.Success(ast, _)     => ast
      case parser.NoSuccess((msg, _)) => throw new InvalidRegexException(msg)
    }

    new ParsedRegex(regex, tree, normalizer)
  }

}