// dregex.impl.RegexParser.scala (artifact: dregex_2.11)
// dregex: Deterministic Regular Expression Engine.
// Note: a newer version of this library (0.7.0) is available.
package dregex.impl

import scala.util.parsing.combinator.JavaTokenParsers
import com.typesafe.scalalogging.slf4j.StrictLogging
import dregex.InvalidRegexException
import dregex.InvalidRegexException

/**
 * Parser-combinator grammar that turns the textual form of a regular expression into a tree of
 * `RegexTree` nodes.
 *
 * Whitespace is significant inside a regular expression, so the automatic whitespace skipping of
 * the parser-combinator base class is switched off.
 *
 * Constructs that are recognized but not supported (anchors, control escapes) produce a parse
 * failure with a descriptive message. Semantically invalid input (unparseable number, inverted
 * quantifier range, out-of-range numeric escape, "(?<:" grouping) raises `InvalidRegexException`
 * directly from the semantic action.
 */
class RegexParser extends JavaTokenParsers {

  // Every character of the input is meaningful, including blanks.
  override def skipWhitespace = false

  import RegexTree._

  // A single backslash character (triple quotes avoid escaping it).
  val backslash = """\"""

  /**
   * A non-negative decimal integer, as used inside "{min,max}" quantifiers.
   *
   * Throws `InvalidRegexException` when the digit run does not fit in an `Int`.
   */
  def number = """\d""".r.+ ^^ { digits =>
    val text = digits.mkString
    try {
      text.toInt
    } catch {
      // Report the offending digits themselves, not the string form of the parsed list.
      case _: NumberFormatException => throw new InvalidRegexException("Cannot parse number: " + text)
    }
  }

  /**
   * Builds a literal from the digits of a numeric (hex or octal) escape.
   *
   * The value is parsed with arbitrary precision so that an over-long digit run is reported as an
   * invalid regex instead of leaking a `NumberFormatException`, and values that do not fit in a
   * single `Char` are rejected instead of being silently truncated by `toChar`.
   */
  private def escapedLit(digits: Seq[String], radix: Int) = {
    val text = digits.mkString
    val value = BigInt(text, radix)
    if (value > BigInt(Char.MaxValue.toInt))
      throw new InvalidRegexException("Escaped character out of range: " + text)
    Lit(value.toInt.toChar)
  }

  // Characters that must be escaped to be taken literally inside / outside a character class.
  def charSpecialInsideClasses = backslash | "]" | "^" | "-"
  def charSpecial = backslash | "." | "|" | "(" | ")" | "[" | "]" | "+" | "*" | "?" | "^" | "$"

  /**
   * A backslash followed by any character that does not introduce one of the other escape
   * families (shorthand classes, unicode/hex/octal/control escapes).
   */
  def specialEscape = backslash ~ "[^dwsDWSuUxc01234567]".r ^^ {
    case _ ~ char =>
      char match {
        case "n" => Lit('\n')
        case "r" => Lit('\r')
        case "t" => Lit('\t')
        case "f" => Lit('\f')
        case "b" => Lit('\b')
        case "v" => Lit('\u000B') // vertical tab
        case "a" => Lit('\u0007') // bell
        case "e" => Lit('\u001B') // escape
        // NOTE(review): an escaped "B" is mapped to a literal backslash; confirm this is intended.
        case "B" => Lit('\\')
        case c => Lit(c) // remaining escaped characters stand for themselves
      }
  }

  def hexDigit = "[0-9A-Fa-f]".r
  def octalDigit = "[0-7]".r

  /** Backslash, lowercase "u" and exactly four hex digits. */
  def unicodeEscape = backslash ~ "u" ~ repN(4, hexDigit) ^^ {
    case _ ~ _ ~ digits => escapedLit(digits, 16)
  }

  /**
   * Backslash, uppercase "U" and exactly eight hex digits.
   *
   * Values above 0xFFFF cannot be represented by a single-`Char` literal and are rejected with
   * `InvalidRegexException`.
   */
  def longUnicodeEscape = backslash ~ "U" ~ repN(8, hexDigit) ^^ {
    case _ ~ _ ~ digits => escapedLit(digits, 16)
  }

  /**
   * Backslash, "x" and one or more hex digits. The whole digit run is consumed; values above
   * 0xFFFF are rejected with `InvalidRegexException`.
   */
  // NOTE(review): java.util.regex takes exactly two digits after "x"; confirm the unbounded run is intended.
  def hexEscape = backslash ~ "x" ~ hexDigit.+ ^^ {
    case _ ~ _ ~ digits => escapedLit(digits, 16)
  }

  /** Backslash followed by two or three octal digits; the longest alternative wins. */
  def octalEscape = backslash ~ (repN(2, octalDigit) ||| repN(3, octalDigit)) ^^ {
    case _ ~ digits => escapedLit(digits, 8)
  }

  // Recognized only to give a precise error message.
  def controlEscape = (backslash ~ "c" ~ ".".r) ~> failure("Unsupported feature: control escape")

  // Recognized only to give a precise error message.
  def anchor = ("^" | "$") ~> failure("Unsupported feature: anchors")

  def anyEscape = specialEscape | unicodeEscape | hexEscape | longUnicodeEscape | octalEscape | controlEscape

  /** Any single character (as matched by the "." pattern) that `parser` does not accept at this position. */
  def anythingExcept(parser: Parser[_]) = not(parser) ~> (".".r ^^ (x => Lit(x)))

  // A literal character outside of a character class.
  def charLit = anchor | anythingExcept(charSpecial) | anyEscape

  // A literal character inside a character class.
  def characterClassLit = anythingExcept(charSpecialInsideClasses) | anyEscape

  def singleCharacterClassLit = characterClassLit ^^ (lit => ExtensionCharSet(lit.char))

  /*
  * A character class range such as "a-z". The range is not expanded here: it is represented by
  * a RangeCharSet built from the two end-point characters.
  */
  def charClassRange = characterClassLit ~ "-" ~ characterClassLit ^^ {
    case start ~ _ ~ end => RangeCharSet(start.char, end.char)
  }

  def charClassAtom = charClassRange | singleCharacterClassLit | shorthandCharSet

  /**
   * A bracketed character class, optionally negated with "^". A dash right after the opening
   * (or the negation) or right before the closing bracket is taken as a literal dash.
   */
  def charClass = "[" ~ "^".? ~ "-".? ~ charClassAtom.+ ~ "-".? ~ "]" ^^ {
    case _ ~ negated ~ leftDash ~ charClass ~ rightDash ~ _ =>
      val chars = if (leftDash.isDefined || rightDash.isDefined)
        charClass :+ ExtensionCharSet('-')
      else
        charClass
      negated.fold[Node](CharClass(chars: _*))(_ => NegatedCharClass(chars: _*))
  }

  // There is the special case of a character class with only one character: the dash. This is valid, but
  // not easily parsed by the general constructs.
  def dashClass = "[" ~ "^".? ~ "-" ~ "]" ^^ {
    case _ ~ negated ~ _ ~ _ =>
      negated.fold[Node](CharClass(ExtensionCharSet('-')))(_ => NegatedCharClass(ExtensionCharSet('-')))
  }

  // Character sets behind the shorthand classes (digits, whitespace, word characters).
  val numberSet = RangeCharSet('0', '9')
  val spaceSet = ExtensionCharSet('\n', '\t', '\r', '\f', ' ')
  val wordSet = MultiRangeCharSet(numberSet, RangeCharSet('a', 'z'), RangeCharSet('A', 'Z'), ExtensionCharSet('_'))

  /** A shorthand class escape; the upper-case letter denotes the complement of the lower-case one. */
  def shorthandCharSet = backslash ~ "[DWSdws]".r ^^ {
    case _ ~ "d" => numberSet
    case _ ~ "D" => CompCharSet(numberSet)
    case _ ~ "s" => spaceSet
    case _ ~ "S" => CompCharSet(spaceSet)
    case _ ~ "w" => wordSet
    case _ ~ "W" => CompCharSet(wordSet)
  }

  // A shorthand class used on its own, outside of brackets.
  def shorthandCharClass = shorthandCharSet ^^ (set => CharClass(set))

  /**
   * A parenthesized sub-expression: plain or non-capturing groups evaluate to their content;
   * "?=", "?!", "?<=" and "?<!" produce the corresponding lookaround node.
   *
   * Throws `InvalidRegexException` for the "(?<:" combination.
   */
  def group = "(" ~ ("?" ~ "<".? ~ "[:=!]".r).? ~ regex ~ ")" ^^ {
    case _ ~ modifiers ~ value ~ _ =>
      import Direction._
      import Condition._
      modifiers match {
        case None => value // Naked parenthesis
        case Some(_ ~ None ~ ":") => value // Non-capturing group
        case Some(_ ~ None ~ "=") => Lookaround(Ahead, Positive, value)
        case Some(_ ~ None ~ "!") => Lookaround(Ahead, Negative, value)
        case Some(_ ~ Some("<") ~ ":") => throw new InvalidRegexException("Invalid grouping: <: ")
        case Some(_ ~ Some("<") ~ "=") => Lookaround(Behind, Positive, value)
        case Some(_ ~ Some("<") ~ "!") => Lookaround(Behind, Negative, value)
        case _ => throw new AssertionError // the grammar above admits no other combination
      }
  }

  def charWildcard = "." ^^^ Wildcard

  def regexAtom =
    charLit | charWildcard | charClass | dashClass | shorthandCharClass | group

  // Lazy quantifiers (by definition) don't change whether the text matches or not, so can be ignored for our purposes

  /** An atom followed by "+", "*" or "?" (and an optional, ignored, lazy marker). A max of -1 means unbounded. */
  def quantifiedBranch = regexAtom ~ ("+" | "*" | "?") ~ "?".? ^^ {
    case atom ~ "+" ~ _ => Rep(min = 1, max = -1, value = atom)
    case atom ~ "*" ~ _ => Rep(min = 0, max = -1, value = atom)
    case atom ~ "?" ~ _ => Rep(min = 0, max = 1, value = atom)
  }

  /**
   * A brace quantifier, yielding a (min, max) pair where a max of -1 means unbounded.
   *
   * Throws `InvalidRegexException` when min is greater than max.
   */
  def generalQuantifier = "{" ~ number ~ ("," ~ number.?).? ~ "}" ~ "?".? ^^ {
    case _ ~ minVal ~ Some(_ ~ Some(maxVal)) ~ _ ~ _ =>
      // Quantifiers of the form {min,max}
      if (minVal <= maxVal)
        (minVal, maxVal)
      else
        throw new InvalidRegexException("invalid range in quantifier")
    case _ ~ minVal ~ Some(_ ~ None) ~ _ ~ _ =>
      // Quantifiers of the form {min,}
      (minVal, -1)
    case _ ~ minVal ~ None ~ _ ~ _ =>
      // Quantifiers of the form "{n}", the value is captured as "min", despite being also the max
      (minVal, minVal)
  }

  def generallyQuantifiedBranch = regexAtom ~ generalQuantifier ^^ {
    case atom ~ ((min, max)) => Rep(min, max, atom)
  }

  /** One or more (possibly quantified) atoms in sequence; a single atom is returned unwrapped. */
  def branch = (quantifiedBranch | generallyQuantifiedBranch | regexAtom).+ ^^ {
    case Seq() => throw new AssertionError // ".+" never yields an empty sequence
    case Seq(first) => first
    case parts => Juxt(parts)
  }

  def emptyRegex = "" ^^^ Epsilon

  /** A branch optionally followed by "|" and the rest of the alternatives (right-nested). */
  def nonEmptyRegex: Parser[Node] = branch ~ ("|" ~ regex).? ^^ {
    case left ~ Some(_ ~ right) => Disj(Seq(left, right))
    case left ~ None => left
  }

  /** Top-level rule: a non-empty expression, or the empty expression. */
  def regex = nonEmptyRegex | emptyRegex

}

/** Entry point for turning the textual form of a regular expression into its syntax tree. */
object RegexParser extends StrictLogging {

  /**
   * Parses the whole of `regex` with a freshly created parser instance.
   *
   * Returns the tree produced by the grammar's top-level `regex` rule; throws
   * `InvalidRegexException`, carrying the parser's message, when the input is not fully accepted.
   */
  def parse(regex: String) = {
    val grammar = new RegexParser()
    val outcome = grammar.parseAll(grammar.regex, regex)
    outcome match {
      case grammar.Success(tree, _) => tree
      case grammar.NoSuccess((message, _)) => throw new InvalidRegexException(message)
    }
  }

}