// irrec-parser_2.11 0.4.0 - source code of Parser.scala (Maven / Gradle / Ivy artifact)
// The newest version!
package ceedubs.irrec
package parse
import ceedubs.irrec.regex._, Match.MatchSet
import ceedubs.irrec.regex.RegexC
import ceedubs.irrec.regex.RegexPrettyPrinter.{
charClassCharsToEscape,
nonCharClassCharsToEscape,
specialNonCharClassCharToLit
}
import combinator._
import fastparse._, NoWhitespace._
import fastparse.Parsed.{Failure, Success}
import cats.collections.{Diet, Range}
import cats.implicits._
object Parser {
/**
 * Characters that may follow a backslash, keyed to the literal character they are parsed as.
 * Extends `specialNonCharClassCharToLit` with an entry mapping `-` to itself.
 */
private val escapableCharToLit: Map[Char, Char] =
  specialNonCharClassCharToLit.updated('-', '-')
/**
 * Parses one escapable special character (such as `*` or `{`) and yields the literal character
 * that `escapableCharToLit` associates with it. The preceding backslash is not matched here.
 */
def specialChar[_: P]: P[Char] =
  CharPred(c => escapableCharToLit.contains(c))
    .!
    .map(matched => escapableCharToLit(matched.charAt(0)))
    .opaque(
      "special regular expression character that should be escaped such as '(', '}', '*', etc")
/**
 * Parses exactly four hexadecimal digits and converts them to the `Char` with that numeric
 * value. The escape prefix (a backslash followed by `u`) is not matched by this parser.
 */
def unicodeCodePoint[_: P]: P[Char] =
  P(
    CharPred(c => CharacterClasses.hexDigit.contains(c))
      .rep(exactly = 4)
      .!
      .map(digits => Integer.parseInt(digits, 16).toChar)
  ).opaque("A valid unicode code point in 4-digit hex form (ex: '006F')")
/**
 * Parses the letter of a shorthand character class (`d`, `w`, `h`, `s` and their upper-case
 * counterparts). A lower-case letter yields a match set allowing the class; the upper-case
 * letter yields a match set forbidding it. The leading `\` is expected to have been consumed
 * already; this parser begins at the character that follows it.
 */
def shorthandClass[_: P]: P[Match.MatchSet[Char]] =
  (
    P("d").map(_ => MatchSet.allow(CharacterClasses.digit)) |
      P("D").map(_ => MatchSet.forbid(CharacterClasses.digit)) |
      P("w").map(_ => MatchSet.allow(CharacterClasses.wordChar)) |
      P("W").map(_ => MatchSet.forbid(CharacterClasses.wordChar)) |
      P("h").map(_ => MatchSet.allow(CharacterClasses.horizontalWhitespaceChar)) |
      P("H").map(_ => MatchSet.forbid(CharacterClasses.horizontalWhitespaceChar)) |
      P("s").map(_ => MatchSet.allow(CharacterClasses.whitespaceChar)) |
      P("S").map(_ => MatchSet.forbid(CharacterClasses.whitespaceChar))
  ).opaque("""character class such as \w, \d, \s, \S, etc""")
/**
 * Standard characters to match like `a` or `%`: any single character that is not contained in
 * `nonCharClassCharsToEscape`.
 *
 * The failure label previously read "standard charact to match"; the typo is corrected so that
 * error messages shown to users are readable.
 */
def standardMatchChar[_: P]: P[Char] =
  CharPred(c => !nonCharClassCharsToEscape.contains(c))
    .!
    .map(_.head)
    .opaque("""standard character to match like `a` or `%`""")
/**
 * A character that can appear unescaped inside a character class, i.e. any single character not
 * contained in `charClassCharsToEscape`. This includes characters such as `*` that are special
 * elsewhere but literal within a class (ex: `[*+]` matches a literal `*` or `+`).
 */
def charClassStandardMatchChar[_: P]: P[Char] =
  P(
    CharPred(c => !charClassCharsToEscape.contains(c))
      .!
      .map(matched => matched.charAt(0))
  )
/**
 * Parses the wildcard character `.` and yields `combinator.wildcard`.
 */
def wildcard[_: P]: P[RegexC[Char]] =
  P(".").map { _ =>
    combinator.wildcard
  }
/**
 * Non-negative integers (zero is accepted) that fit within Scala's `Int`. One or more decimal
 * digits are captured; if the captured text cannot be converted with `toInt` the parser fails.
 */
def posInt[_: P]: P[Int] =
  P(
    CharIn("0-9")
      .rep(1)
      .!
      .flatMap { digits =>
        Either.catchNonFatal(digits.toInt) match {
          case Right(n) => Pass(n)
          case Left(_) => Fail
        }
      }
  ).opaque(s"integer between 0 and ${Int.MaxValue}")
/**
 * A single literal character inside a character class. Tried in order: a unicode escape
 * (backslash, `u`, four hex digits), a backslash-escaped special character, and finally a
 * character that needs no escaping within a class.
 */
def singleLitCharClassChar[_: P]: P[Char] =
  P(
    ("\\u" ~ unicodeCodePoint) |
      (("\\" ~ specialChar) | charClassStandardMatchChar)
  )
/**
 * Same input as `singleLitCharClassChar`, with the parsed character wrapped in a `Match.Literal`.
 */
def matchLitCharClassChar[_: P]: P[Match.Literal[Char]] =
  P(singleLitCharClassChar.map(c => Match.Literal(c)))
/**
 * Character range like `a-z`: two class-literal characters separated by `-`. The two bounds are
 * passed to `Range` as parsed; their relative order is not checked here.
 */
def matchCharRange[_: P]: P[Range[Char]] =
  P(
    (singleLitCharClassChar ~ "-" ~ singleLitCharClassChar).map {
      case (lower, upper) => Range(lower, upper)
    }
  )
/**
 * Matches repeat counts like `{3}`, `{1,4}` or `{3,}`.
 *
 * After the opening `{` a cut is applied. The range forms (`{n,m}` and `{n,}`) may be followed by
 * `?` to make them non-greedy; otherwise they are greedy. The exact form `{n}` yields a
 * `Quantifier.Exact`. No whitespace is accepted inside the braces.
 *
 * The failure label used to advertise `{1, 4}?` (with a space, which this parser rejects) and
 * ended with an unbalanced backtick; the examples now only show input the parser accepts.
 */
def quantifier[_: P]: P[Quantifier] =
  P(
    "{" ~/ (
      (posInt ~ "," ~/ posInt.? ~/ "}" ~/ (P("?").map(_ => Greediness.NonGreedy) | Pass(
        Greediness.Greedy))).map { case (min, max, greediness) =>
        Quantifier.Range(min, max, greediness)
      } |
        (posInt.map(Quantifier.Exact(_)) ~ "}")
    )
  ).opaque("repeat count such as '{3}', '{1,4}', '{1,4}?', '{3,}', or '{3,}?'")
/**
 * Either a character range or a single literal character, as an allowing match set. The range
 * form is attempted first; a lone character is the fallback.
 */
def charOrRange[_: P]: P[Match.MatchSet[Char]] =
  matchCharRange.map(range => MatchSet.allow(Diet.fromRange(range))) |
    singleLitCharClassChar.map(single => MatchSet.allow(Diet.one(single)))
/**
 * Parses the bare name of a POSIX character class (`alnum`, `alpha`, `ascii`, `blank`, `cntrl`,
 * `digit`, `graph`, `lower`, `print`, `punct`, `space`, `upper`, `word`, `xdigit`) and yields a
 * match set allowing the corresponding `CharacterClasses` set. Surrounding delimiters are not
 * matched by this parser.
 */
def positivePOSIXCharClass[_: P]: P[MatchSet[Char]] =
  P("alnum").map(_ => MatchSet.allow(CharacterClasses.alphaNumeric)) |
    P("alpha").map(_ => MatchSet.allow(CharacterClasses.alpha)) |
    P("ascii").map(_ => MatchSet.allow(CharacterClasses.ascii)) |
    P("blank").map(_ => MatchSet.allow(CharacterClasses.horizontalWhitespaceChar)) |
    P("cntrl").map(_ => MatchSet.allow(CharacterClasses.controlChar)) |
    P("digit").map(_ => MatchSet.allow(CharacterClasses.digit)) |
    P("graph").map(_ => MatchSet.allow(CharacterClasses.graphChar)) |
    P("lower").map(_ => MatchSet.allow(CharacterClasses.lowerAlpha)) |
    P("print").map(_ => MatchSet.allow(CharacterClasses.printableChar)) |
    P("punct").map(_ => MatchSet.allow(CharacterClasses.punctuationChar)) |
    P("space").map(_ => MatchSet.allow(CharacterClasses.whitespaceChar)) |
    P("upper").map(_ => MatchSet.allow(CharacterClasses.upperAlpha)) |
    P("word").map(_ => MatchSet.allow(CharacterClasses.wordChar)) |
    P("xdigit").map(_ => MatchSet.allow(CharacterClasses.hexDigit))
/**
 * One or more character-class items (a backslash shorthand class, a range, or a single
 * character), combined by union. Parsing of items stops in front of `&&`.
 */
def positiveCharClassContent[_: P]: P[MatchSet[Char]] =
  (!"&&" ~ (("\\" ~ shorthandClass) | charOrRange))
    .rep(1)
    .map(items => items.reduceLeft(_ union _))
/**
 * One building block of a character class body. Tried in order: plain class content, a POSIX
 * class name wrapped in `[:` and `:]`, or a nested bracketed character class.
 */
def charClassBase[_: P]: P[MatchSet[Char]] =
  P(
    positiveCharClassContent |
      ("[:" ~ positivePOSIXCharClass ~ ":]") |
      charClass
  )
/**
 * One or more `charClassBase` blocks placed next to each other, combined by union.
 */
def charClassUnion[_: P]: P[MatchSet[Char]] =
  P(
    charClassBase
      .rep(1)
      .map(bases => bases.reduceLeft(_ union _))
  )
/**
 * A union of class blocks, optionally followed by `&&` and another term, in which case the two
 * sides are intersected. Once `&&` has been matched a cut is applied.
 */
def charClassTerm[_: P]: P[MatchSet[Char]] =
  charClassUnion.flatMap { left =>
    ("&&" ~/ charClassTerm).map(right => left intersect right) |
      Pass(left)
  }
/**
 * Character classes like `[acz]` or `[^a-cHP-W]`.
 *
 * Four bracketed forms are tried in the order written; the three `[^` forms come before the
 * plain `[` form. Each form ends with a closing `]`.
 */
def charClass[_: P]: P[MatchSet[Char]] =
P(
// `[^` content `&&` term `]`: the leading content is negated, then intersected with the term.
("[^" ~ (positiveCharClassContent.map(_.negate) ~ "&&" ~ charClassTerm).map { case (c1, c2) =>
c1 intersect c2
} ~ "]") |
// `[^` content term `]`: the leading content is negated, then unioned with the term that follows.
("[^" ~ (positiveCharClassContent.map(_.negate) ~ charClassTerm).map { case (c1, c2) =>
c1 union c2
} ~ "]") |
// `[^` term `]`: the whole term is negated.
("[^" ~ charClassTerm.map(_.negate) ~ "]") |
// `[` term `]`: the term is used as is.
("[" ~ charClassTerm ~ "]")
)
/**
 * The smallest unit of a regular expression. Tried in order: a standard literal character, a
 * backslash escape (unicode escape, escaped special character, or shorthand class; a cut is
 * applied after the backslash), the wildcard, a bracketed character class, a `(?:`...`)` group,
 * or a `(`...`)` group. Both group forms currently produce the inner regex unchanged.
 */
def base[_: P]: P[RegexC[Unit]] =
  P(
    standardMatchChar.map(c => lit(c).void) |
      ("\\" ~/ ((("u" ~ unicodeCodePoint) | specialChar).map(c => lit(c).void) |
        shorthandClass.map(cls => matching(cls).void))) |
      wildcard.map(w => w.void) |
      charClass.map(cls => matching(cls).void) |
      // TODO distinguish between capturing and not?
      ("(?:" ~ regex ~ ")") |
      ("(" ~ regex ~ ")")
  )
/**
 * A `base` expression followed by an optional repetition suffix: `*`, `+`, `?` (each with a
 * non-greedy variant formed by appending `?`) or a brace quantifier such as `{1,4}`. Without a
 * suffix the base expression is returned unchanged.
 *
 * The non-greedy `+?` suffix was previously missing even though `*?` and `??` were supported;
 * it is now handled for consistency.
 */
def factor[_: P]: P[RegexC[Unit]] =
  P {
    base.flatMap { r =>
      // Each two-character non-greedy suffix is listed before its one-character greedy form so
      // that the longer token is attempted first.
      P("*?").map(_ => r.star(Greediness.NonGreedy).void) |
        P("*").map(_ => r.star(Greediness.Greedy).void) |
        P("+?").map(_ => r.oneOrMore(Greediness.NonGreedy).void) |
        P("+").map(_ => r.oneOrMore(Greediness.Greedy).void) |
        P("??").map(_ => r.optional(Greediness.NonGreedy).void) |
        P("?").map(_ => r.optional(Greediness.Greedy).void) |
        quantifier.map(q => r.quantifyFold(q, ())((_, _) => ())) |
        Pass(r)
    }
  }
// TODO can probably do better than toList call. Do we care?
/**
 * Zero or more factors in a row, combined into a single regex with `sequence_`.
 */
def term[_: P]: P[RegexC[Unit]] =
  P(
    factor
      .rep(0)
      .map(factors => factors.toList.sequence_)
  )
/**
 * A parser for a regular expression: a term, optionally followed by `|` and another regular
 * expression (a cut is applied after the `|`). You probably want to use [[regexExpr]] instead,
 * as this parser will succeed even if there are trailing characters after a valid regular
 * expression.
 */
def regex[_: P]: P[RegexC[Unit]] =
  P(
    term.flatMap { left =>
      ("|" ~/ regex).map(right => left | right) |
        Pass(left)
    }
  )
/**
 * A parser for strings that are complete regular expressions, up until the end of the string.
 * The parsed regex is transformed with `matched`, and its result is joined into a `String` via
 * `mkString_("")`.
 */
def regexExpr[_: P]: P[RegexC[String]] =
  P(regex ~ End).map { parsed =>
    parsed.matched.map(chars => chars.mkString_(""))
  }
/**
 * Parses `regex` as a complete regular expression using [[regexExpr]], with verbose failures
 * enabled.
 *
 * @param regex the regular expression source text
 * @return `Right` with the parsed regex on success, or `Left` with the parser's failure message
 */
def parseRegex(regex: String): Either[String, RegexC[String]] =
  parse(regex, regexExpr(_), verboseFailures = true) match {
    case Success(parsed, _) => Right(parsed)
    case failure: Failure => Left(failure.msg)
  }
}