rrec-parser_2. Maven / Gradle / Ivy
The newest version!
package ceedubs.irrec
package parse
import ceedubs.irrec.regex._, Match.MatchSet
import ceedubs.irrec.regex.RegexC
import ceedubs.irrec.regex.RegexPrettyPrinter.{
import combinator._
import fastparse._, NoWhitespace._
import fastparse.Parsed.{Failure, Success}
import cats.collections.{Diet, Range}
import cats.implicits._
object Parser {
// Characters that may follow a backslash, mapped to the literal character they stand for:
// everything in `specialNonCharClassCharToLit` plus `-`, which maps to itself.
private val escapableCharToLit: Map[Char, Char] =
  specialNonCharClassCharToLit.updated('-', '-')
/**
 * Matches on special characters that should be escaped like `*` and `{`.
 *
 * Consumes exactly one character that is a key of `escapableCharToLit` and yields the literal
 * character it maps to. Failures are reported with a single human-readable description via
 * `opaque`.
 */
def specialChar[_: P]: P[Char] =
  CharPred(escapableCharToLit.contains(_)).!.map(s => escapableCharToLit(s.head))
    .opaque(
      "special regular expression character that should be escaped such as '(', '}', '*', etc")
// Parses four hexadecimal digits into the `Char` with that code point value.
def unicodeCodePoint[_: P]: P[Char] =
// NOTE(review): the beginning of this expression (the opening parenthesis, the per-character
// parser that is repeated, and the capture that produces `hexChars`) is not visible in this
// view -- confirm against the original source.
.rep(exactly = 4)
// Interprets the captured text as a base-16 integer and narrows it to a Char.
.map(hexChars => Integer.parseInt(hexChars, 16).toChar)
).opaque("A valid unicode code point in 4-digit hex form (ex: '006F')")
/**
 * A shorthand class such as `\d` or `\w`. This parser itself doesn't look for the `\`; it starts
 * with the character after it.
 *
 * Each lowercase letter yields `MatchSet.allow` of a character class and the matching uppercase
 * letter yields `MatchSet.forbid` of that same class.
 */
def shorthandClass[_: P]: P[Match.MatchSet[Char]] =
  (
    P("d").map(_ => MatchSet.allow(CharacterClasses.digit)) |
      P("D").map(_ => MatchSet.forbid(CharacterClasses.digit)) |
      P("w").map(_ => MatchSet.allow(CharacterClasses.wordChar)) |
      P("W").map(_ => MatchSet.forbid(CharacterClasses.wordChar)) |
      P("h").map(_ => MatchSet.allow(CharacterClasses.horizontalWhitespaceChar)) |
      P("H").map(_ => MatchSet.forbid(CharacterClasses.horizontalWhitespaceChar)) |
      P("s").map(_ => MatchSet.allow(CharacterClasses.whitespaceChar)) |
      P("S").map(_ => MatchSet.forbid(CharacterClasses.whitespaceChar))
  ).opaque("""character class such as \w, \d, \s, \S, etc""")
/**
 * Standard characters to match like `a` or `%`.
 *
 * Accepts any single character that is not a member of `nonCharClassCharsToEscape` and yields
 * that character.
 */
def standardMatchChar[_: P]: P[Char] =
  CharPred(c => !nonCharClassCharsToEscape.contains(c)).!.map(s => s.head)
    .opaque("""standard character to match like `a` or `%`""")
/**
 * Standard characters to match like `a` or `%` but also characters that aren't special within
 * character classes such as `*` (ex: `[*+]` matches on literal `*` and `+`).
 *
 * Accepts any single character that is not a member of `charClassCharsToEscape`.
 */
def charClassStandardMatchChar[_: P]: P[Char] =
  P(
    CharPred(candidate => !charClassCharsToEscape.contains(candidate)).!
      // The capture always holds exactly the one character accepted by CharPred.
      .map(captured => captured.charAt(0))
  )
/**
 * Matches the wildcard character `.`.
 *
 * On success the parser yields `combinator.wildcard`; the qualifier is required because the
 * unqualified name would refer to this parser itself.
 */
def wildcard[_: P]: P[RegexC[Char]] =
  P(".").map { _ =>
    combinator.wildcard
  }
/** Non-negative integers (0 up to `Int.MaxValue`), i.e. within the max range of Scala's `Int`. */
// Parses an integer literal into an `Int`.
def posInt[_: P]: P[Int] =
// NOTE(review): the beginning of this expression (the opening parenthesis and the parser that
// captures the text bound to `s`) is not visible in this view -- confirm against the original
// source.
// If `s.toInt` throws (presumably when the value does not fit in an Int -- confirm), the parse
// fails via `Fail`; otherwise the converted value is produced via `Pass`.
.flatMap(s => Either.catchNonFatal(s.toInt).fold(_ => Fail, Pass(_))))
.opaque(s"integer between 0 and ${Int.MaxValue}")
/**
 * A single literal character inside a character class.
 *
 * Alternatives are tried in this order:
 *  1. a backslash followed by `u` and a four-digit hex code point (see [[unicodeCodePoint]]),
 *  1. a backslash followed by an escapable special character (see [[specialChar]]),
 *  1. any character accepted by [[charClassStandardMatchChar]].
 */
def singleLitCharClassChar[_: P]: P[Char] = {
  // Declared as `def`s so each alternative only runs when the alternation reaches it.
  def unicodeEscape: P[Char] = "\\u" ~ unicodeCodePoint
  def escapedOrPlain: P[Char] = ("\\" ~ specialChar) | charClassStandardMatchChar

  P(unicodeEscape | escapedOrPlain)
}
// Parser whose result type is `Match.Literal[Char]`.
// NOTE(review): no body is visible for this definition in this view; presumably it wraps the
// character produced by `singleLitCharClassChar` -- confirm against the original source.
def matchLitCharClassChar[_: P]: P[Match.Literal[Char]] =
/**
 * Character range like `a-z`.
 *
 * Parses two literal character-class characters separated by `-` and yields a
 * `cats.collections.Range` built from them as `Range(lower, upper)`, in the order written.
 */
def matchCharRange[_: P]: P[Range[Char]] =
  (singleLitCharClassChar ~ "-" ~ singleLitCharClassChar).map { case (l, h) =>
    Range(l, h)
  }
/**
 * Matches repeat counts like `{3}` or `{1,4}`.
 *
 * After the opening `{` (followed by a cut) two forms are tried in order:
 *  - a range `{min,}` or `{min,max}`, optionally followed by `?` to select
 *    `Greediness.NonGreedy` (otherwise `Greediness.Greedy`), producing `Quantifier.Range`;
 *  - an exact count `{n}`, producing `Quantifier.Exact`.
 *
 * No whitespace is accepted inside the braces (the file imports `NoWhitespace._`), so the
 * failure description only lists whitespace-free examples.
 */
def quantifier[_: P]: P[Quantifier] =
  "{" ~/ (
    (posInt ~ "," ~/ posInt.? ~/ "}" ~/ (P("?").map(_ => Greediness.NonGreedy) | Pass(
      Greediness.Greedy))).map { case (l, h, g) =>
      Quantifier.Range(l, h, g)
    } |
      (posInt.map(Quantifier.Exact(_)) ~ "}")
  ).opaque("repeat count such as '{3}', '{1,4}', '{1,4}?', '{3,}', or '{3,}?'")
/**
 * Either a character range (such as `a-z`) or a single literal character, as an allow-set.
 *
 * The range form is attempted first so that `a-z` is not read as the lone character `a`.
 */
def charOrRange[_: P]: P[Match.MatchSet[Char]] = {
  // Declared as `def`s so the second alternative only runs if the first one fails.
  def rangeSet: P[Match.MatchSet[Char]] =
    matchCharRange.map(range => MatchSet.allow(Diet.fromRange(range)))
  def singleCharSet: P[Match.MatchSet[Char]] =
    singleLitCharClassChar.map(ch => MatchSet.allow(Diet.one(ch)))

  rangeSet | singleCharSet
}
/**
 * Parses the name of a POSIX-style character class (for example `alnum` or `digit`) and maps it
 * to a `MatchSet` that allows the corresponding `CharacterClasses` set.
 *
 * Only the bare name is consumed here; the surrounding `[:` and `:]` delimiters are matched by
 * the caller.
 */
def positivePOSIXCharClass[_: P]: P[MatchSet[Char]] =
P("alnum").map(_ => MatchSet.allow(CharacterClasses.alphaNumeric)) |
P("alpha").map(_ => MatchSet.allow(CharacterClasses.alpha)) |
P("ascii").map(_ => MatchSet.allow(CharacterClasses.ascii)) |
// `blank` reuses the same set as the `h` shorthand class.
P("blank").map(_ => MatchSet.allow(CharacterClasses.horizontalWhitespaceChar)) |
P("cntrl").map(_ => MatchSet.allow(CharacterClasses.controlChar)) |
P("digit").map(_ => MatchSet.allow(CharacterClasses.digit)) |
P("graph").map(_ => MatchSet.allow(CharacterClasses.graphChar)) |
P("lower").map(_ => MatchSet.allow(CharacterClasses.lowerAlpha)) |
P("print").map(_ => MatchSet.allow(CharacterClasses.printableChar)) |
P("punct").map(_ => MatchSet.allow(CharacterClasses.punctuationChar)) |
// `space` reuses the same set as the `s` shorthand class.
P("space").map(_ => MatchSet.allow(CharacterClasses.whitespaceChar)) |
P("upper").map(_ => MatchSet.allow(CharacterClasses.upperAlpha)) |
// `word` reuses the same set as the `w` shorthand class.
P("word").map(_ => MatchSet.allow(CharacterClasses.wordChar)) |
P("xdigit").map(_ => MatchSet.allow(CharacterClasses.hexDigit))
/**
 * One or more consecutive character-class items (a backslash shorthand class, a range, or a
 * single literal character) combined into one set with `union`.
 *
 * The negative lookahead on `&&` stops the repetition before an intersection operator so that
 * the operator is left for the caller to consume.
 */
def positiveCharClassContent[_: P]: P[MatchSet[Char]] =
  (!"&&" ~ (("\\" ~ shorthandClass) | charOrRange))
    // At least one item is required, so `reduce` never sees an empty sequence.
    .rep(1)
    .map(_.reduce(_ union _))
/**
 * One building block of a character class body: either plain class content or a POSIX class
 * name wrapped in `[:` and `:]`.
 */
def charClassBase[_: P]: P[MatchSet[Char]] =
positiveCharClassContent |
("[:" ~ positivePOSIXCharClass ~ ":]") |
// NOTE(review): the trailing `|` above has no right-hand alternative in this view; at least one
// further alternative appears to be missing -- confirm against the original source.
/**
 * One or more [[charClassBase]] pieces in sequence, merged left to right with `union`.
 */
def charClassUnion[_: P]: P[MatchSet[Char]] =
  P(
    charClassBase
      .rep(1)
      .map(pieces => pieces.reduce((merged, next) => merged.union(next)))
  )
/**
 * A union of character-class pieces, optionally intersected with a following term via `&&`.
 *
 * After parsing the left-hand union, `&&` (followed by a cut) commits to parsing another term
 * and intersecting it with the left-hand side; when no `&&` follows, the left-hand union is
 * returned unchanged.
 */
def charClassTerm[_: P]: P[MatchSet[Char]] =
  charClassUnion.flatMap { c1 =>
    ("&&" ~/ charClassTerm).map(c2 => c1 intersect c2) |
      // No intersection operator: the union parsed so far is the whole term.
      Pass(c1)
  }
/**
 * Character classes like `[acz]` or `[^a-cHP-W]`.
 *
 * Four bracketed forms are tried in order:
 *  1. `[^content&&term]` -- the negated content intersected with the term,
 *  1. `[^content term]`  -- the negated content unioned with the term that follows it,
 *  1. `[^term]`          -- the negated term,
 *  1. `[term]`           -- the term as written.
 */
def charClass[_: P]: P[MatchSet[Char]] =
  ("[^" ~ (positiveCharClassContent.map(set => set.negate) ~ "&&" ~ charClassTerm).map {
    case (negated, rest) => negated.intersect(rest)
  } ~ "]") |
    ("[^" ~ (positiveCharClassContent.map(set => set.negate) ~ charClassTerm).map {
      case (negated, rest) => negated.union(rest)
    } ~ "]") |
    ("[^" ~ charClassTerm.map(set => set.negate) ~ "]") |
    ("[" ~ charClassTerm ~ "]")
/**
 * Parses a single atomic regex element and discards its match result (`void`).
 *
 * Alternatives are tried in order: an ordinary literal character, a backslash escape, the
 * wildcard, a bracketed character class, and finally a parenthesised sub-expression.
 */
def base[_: P]: P[RegexC[Unit]] =
standardMatchChar.map(lit(_).void) |
// Backslash escapes: the cut (`~/`) after the backslash commits to this branch. What follows is
// either `u` plus a four-digit hex code point, an escapable special character, or a shorthand
// class letter.
("\\" ~/ (("u" ~ unicodeCodePoint | specialChar).map(lit(_).void) | shorthandClass.map(
matching(_).void))) |
wildcard.map(_.void) |
charClass.map(matching(_).void) |
// TODO distinguish between capturing and not?
// Both group forms currently yield the inner regex unchanged; `(?:` is listed first so that it
// is not consumed as a plain `(` group.
("(?:" ~ regex ~ ")") |
("(" ~ regex ~ ")")
/**
 * A [[base]] element followed by an optional repetition suffix.
 *
 * Supported suffixes are `*`, `+` and `?` (greedy), their non-greedy forms `*?`, `+?` and `??`,
 * and a brace quantifier (see [[quantifier]]). Each two-character non-greedy form is listed
 * before its one-character greedy counterpart so that the longer token wins. When no suffix
 * follows, the base element is returned unchanged.
 */
def factor[_: P]: P[RegexC[Unit]] =
  P {
    base.flatMap { r =>
      P("*?").map(_ => r.star(Greediness.NonGreedy).void) |
        P("*").map(_ => r.star(Greediness.Greedy).void) |
        P("+?").map(_ => r.oneOrMore(Greediness.NonGreedy).void) |
        P("+").map(_ => r.oneOrMore(Greediness.Greedy).void) |
        P("??").map(_ => r.optional(Greediness.NonGreedy).void) |
        P("?").map(_ => r.optional(Greediness.Greedy).void) |
        quantifier.map(q => r.quantifyFold(q, ())((_, _) => ())) |
        // No repetition suffix: the element stands on its own.
        Pass(r)
    }
  }
/**
 * Zero or more [[factor]]s in a row, sequenced into a single regex whose result is discarded.
 */
// TODO can probably do better than toList call. Do we care?
def term[_: P]: P[RegexC[Unit]] =
  P(
    factor
      .rep(0)
      .map(factors => factors.toList.sequence_)
  )
/**
 * A parser for a regular expression. You probably want to use [[regexExpr]] instead, as this
 * parser will succeed even if there are trailing characters after a valid regular expression.
 *
 * Parses a [[term]] and, if a `|` follows, commits (via the cut) to parsing another regular
 * expression and combines the two as alternatives. When no `|` follows, the term is returned
 * unchanged.
 */
def regex[_: P]: P[RegexC[Unit]] =
  term.flatMap { r1 =>
    ("|" ~/ regex).map(r2 => r1 | r2) |
      // No alternation operator: the term parsed so far is the whole expression.
      Pass(r1)
  }
/**
 * A parser for strings that are complete regular expressions, up until the end of the string.
 *
 * Requires [[regex]] to be followed by the end of input, then converts the result with
 * `matched` and joins what was matched into a `String` using an empty separator.
 */
def regexExpr[_: P]: P[RegexC[String]] =
  P(regex ~ End).map { parsed =>
    parsed.matched.map(chars => chars.mkString_(""))
  }
/**
 * Parses `regex` as a complete regular expression using [[regexExpr]], with verbose failure
 * reporting enabled.
 *
 * @param regex the regular expression source text; note that inside this method the name
 *              refers to this parameter rather than to the `regex` parser
 * @return `Right` with the parsed regex on success, or `Left` with the parser's failure message
 */
def parseRegex(regex: String): Either[String, RegexC[String]] =
parse(regex, regexExpr(_), verboseFailures = true) match {
case f @ Failure(_, _, _) => Left(f.msg)
case Success(value, _) => Right(value)
// NOTE(review): the closing brace of this `match` is not visible in this view.