dregex.impl.Normalizer.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of dregex_2.11 Show documentation
Show all versions of dregex_2.11 Show documentation
Deterministic Regular Expression Engine
package dregex.impl
import dregex.UnsupportedException
/**
* Regular expressions can have character classes and wildcards. In order to produce a NFA, they should be expanded
* to disjunctions. In the case of wildcards or negated characted classes, the complete alphabet must also be known
* to produce the expansion:
*
* Example transformations with alphabet: abcdefgh
*
* [abc] -> a|b|c
* [^abc] -> d|e|f|g|h
* def[^abc] -> def(d|e|f|g|h)
* . -> a|b|c|d|e|f|g|h
* abc. -> abc(a|b|c|d|e|f|g|h)
*
* As the alphabet can be potentially huge (such as unicode is) something must be done to reduce the number of
* disjunctions:
*
* [abc] -> a|b|c
* [^abc] ->
* def[^abc] -> def(d|e|f|)
* . ->
* abc. -> abc(a|b|c|)
*
* Where is a special metacharacter that matches any of the characters of the alphabet not present in
* the regex. Note that with this technique knowing the whole alphabet explicitly is not needed.
*
* Care must be taken when the regex is meant to be used for an operation with another regex (such as intersection
* or difference). In this case, must match only the characters present in neither regex. Example:
*
* Regex space: [abc] and [^cd]
* Characters present in any regex: abcd
* [abc] -> a|b|c
* [^cd] -> a|b|
*/
object Normalizer {
import RegexTree._
def alphabet(ast: Node): Set[Char] = ast match {
case complex: ComplexPart => complex.values.map(alphabet _).reduce(_ union _)
case atom: AtomPart => atom.atoms.toSet
}
/**
* Expand the wildcards (\".\") and character classes, transforming them into disjunctions over the supplied alphabet
*/
def normalize(tree: Node, alphabet: Set[NormTree.SglChar]): NormTree.Node = tree match {
// lookarounds should be expanded by now
case d: Lookaround => throw new IllegalArgumentException("lookarounds should be already expanded")
// expand wildcards
case Wildcard => NormTree.Disj(alphabet.toSeq)
// expand character classes
case CharClass(sets @ _*) => NormTree.Disj(sets.map(_.resolve(alphabet)).flatten)
case NegatedCharClass(sets @ _*) => NormTree.Disj((alphabet diff sets.map(_.resolve(alphabet)).flatten.toSet).toSeq)
// recurse over the rest
case Disj(values) => NormTree.Disj(values.map(normalize(_, alphabet)))
case Rep(min, max, value) => NormTree.Rep(min, max, normalize(value, alphabet))
case Juxt(values) => NormTree.Juxt(values.map(normalize(_, alphabet)))
case Lit(char) => NormTree.Lit(char)
case Epsilon => NormTree.Epsilon
}
}