All Downloads are FREE. Search and download functionalities are using the official Maven repository.

dregex.Regex.scala Maven / Gradle / Ivy

The newest version!
package dregex

import java.util.regex.Pattern

import dregex.impl.RegexParser
import dregex.impl.Util
import dregex.impl.SimpleState
import dregex.impl.DfaAlgorithms
import dregex.impl.Dfa
import dregex.impl.RegexParser.DotMatch
import org.slf4j.LoggerFactory

import scala.collection.immutable.Seq
import scala.collection.JavaConverters._

import scala.collection.compat._

/**
  * A regular expression, ready to be tested against strings, or to take part in an operation against another.
  * Internally, instances of this type have a DFA (Deterministic Finite Automaton).
  */
trait Regex {

  private[this] val logger = LoggerFactory.getLogger(classOf[Regex])

  private[dregex] def dfa: Dfa[SimpleState]

  /**
    * Return this regex's [[Universe]]. Only regexes of the same universe can be operated together.
    */
  def universe: Universe

  /**
    * Verify that `other` belongs to the same [[Universe]] as this regex, which is a precondition of every
    * operation that takes two regexes.
    *
    * @throws IllegalArgumentException if the universes of both regexes differ
    */
  private def checkUniverse(other: Regex): Unit = {
    if (other.universe != universe)
      throw new IllegalArgumentException("cannot make operations between regex from different universes")
  }

  /**
    * Return whether a string is matched by the regular expression (i.e. whether the string is included in the language
    * generated by the expression).
    * As the match is done using a DFA, its complexity is O(n), where n is the length of the string. It is constant
    * with respect to the length of the expression.
    */
  def matches(string: CharSequence): Boolean = {
    val (result, _) = matchAndReport(string)
    result
  }

  /**
    * Similar to method [[matches]], except that it also returns how many characters were successfully matched in
    * case of failure.
    *
    * @return a pair with the result of the match and the count of characters reported by the matcher
    */
  def matchAndReport(string: CharSequence): (Boolean, Int) = {
    // The input is normalized with the universe's normalization before being fed to the DFA.
    DfaAlgorithms.matchString(dfa, universe.normalization.normalize(string))
  }

  /**
    * Intersect this regular expression with another. The resulting expression will match the strings that are
    * matched by the operands, and only those. Intersections take O(n⋅m) time, where n and m are the number of states of
    * the DFA of the operands.
    *
    * @throws IllegalArgumentException if `other` belongs to a different [[Universe]]
    */
  def intersect(other: Regex): Regex = {
    val (res, time) = Util.time {
      checkUniverse(other)
      new SynteticRegex(DfaAlgorithms.rewriteWithSimpleStates(DfaAlgorithms.intersect(this.dfa, other.dfa)), universe)
    }
    logger.trace("{} and {} intersected in {}", this, other, time)
    res
  }

  /**
    * Subtract other regular expression from this one. The resulting expression will match the strings that are
    * matched by this expression and are not matched by the other, and only those. Differences take O(n⋅m) time,
    * where n and m are the number of states of the DFA of the operands.
    *
    * @throws IllegalArgumentException if `other` belongs to a different [[Universe]]
    */
  def diff(other: Regex): Regex = {
    val (res, time) = Util.time {
      checkUniverse(other)
      new SynteticRegex(DfaAlgorithms.rewriteWithSimpleStates(DfaAlgorithms.diff(this.dfa, other.dfa)), universe)
    }
    logger.trace("{} and {} diffed in {}", this, other, time)
    res
  }

  /**
    * Unite this regular expression with another. The resulting expression will match the strings that are matched by
    * either of the operands, and only those. Unions take O(n⋅m) time, where n and m are the number of states of the DFA
    * of the operands.
    *
    * @throws IllegalArgumentException if `other` belongs to a different [[Universe]]
    */
  def union(other: Regex): Regex = {
    val (res, time) = Util.time {
      checkUniverse(other)
      new SynteticRegex(DfaAlgorithms.rewriteWithSimpleStates(DfaAlgorithms.union(this.dfa, other.dfa)), universe)
    }
    logger.trace("{} and {} unioned in {}", this, other, time)
    res
  }

  /**
    * Return whether this expression matches at least one string in common with another. Intersections take O(n⋅m) time,
    * where n and m are the number of states of the DFA of the operands.
    *
    * @throws IllegalArgumentException if `other` belongs to a different [[Universe]]
    */
  def doIntersect(other: Regex): Boolean = {
    checkUniverse(other)
    DfaAlgorithms.isIntersectionNotEmpty(this.dfa, other.dfa)
  }

  /**
    * Return whether every string matched by this expression is also matched by `other`, i.e. whether the language of
    * this expression is a subset of the language of `other`. A [[diff]] between the two operands is done internally.
    *
    * @throws IllegalArgumentException if `other` belongs to a different [[Universe]]
    */
  def isSubsetOf(other: Regex): Boolean = {
    checkUniverse(other)
    DfaAlgorithms.isSubsetOf(this.dfa, other.dfa)
  }

  /**
    * Return whether every string matched by this expression is also matched by `other`, while the two expressions are
    * not equivalent. Two [[diff]]s between the two operands are done internally.
    *
    * @throws IllegalArgumentException if `other` belongs to a different [[Universe]]
    */
  def isProperSubsetOf(other: Regex): Boolean = {
    checkUniverse(other)
    DfaAlgorithms.isProperSubset(this.dfa, other.dfa)
  }

  /**
    * Return whether this regular expression is equivalent to other. Two regular expressions are equivalent if they
    * match exactly the same set of strings. This operation takes O(n⋅m) time, where n and m are the number of states of
    * the DFA of the operands.
    *
    * @throws IllegalArgumentException if `other` belongs to a different [[Universe]]
    */
  def equiv(other: Regex): Boolean = {
    checkUniverse(other)
    DfaAlgorithms.equivalent(this.dfa, other.dfa)
  }

  /**
    * Return whether this regular expression matches anything. Note that the empty string is a valid match.
    */
  def matchesAtLeastOne(): Boolean = DfaAlgorithms.matchesAtLeastOne(dfa)

}

/**
  * Factory methods to parse and compile regular expressions.
  *
  * @define flagsDesc match flags, a bit mask of `java.util.regex.Pattern` constants. The bits that are read are
  *                   `LITERAL`, `COMMENTS`, `DOTALL`, `UNIX_LINES`, `UNICODE_CHARACTER_CLASS`, `CASE_INSENSITIVE`,
  *                   `UNICODE_CASE` and `CANON_EQ`; any other bit is ignored.
  *
  * @define parseDesc In general, this method is not necessary; a call to one of the `compile` methods is simpler
  *                   and more direct. However, there are cases in which the intermediate [[ParsedRegex]]s are needed.
  *                   Most notably, when caching [[CompiledRegex]] instances (which are in general more expensive to
  *                   create).
  */
object Regex {

  private[this] val logger = LoggerFactory.getLogger(Regex.getClass)

  /**
    * Translate a `java.util.regex.Pattern`-style bit mask into the parser's flag structure.
    */
  private[this] def flagsFromBits(bits: Int): RegexParser.Flags = {
    RegexParser.Flags(
      dotMatch = dotMatcherFromFlags(bits),
      literal = (bits & Pattern.LITERAL) != 0,
      comments = (bits & Pattern.COMMENTS) != 0,
      unicodeClasses = (bits & Pattern.UNICODE_CHARACTER_CLASS) != 0,
      caseInsensitive = (bits & Pattern.CASE_INSENSITIVE) != 0,
      unicodeCase = (bits & Pattern.UNICODE_CASE) != 0,
      canonicalEq = (bits & Pattern.CANON_EQ) != 0
    )
  }

  /**
    * Parse a regular expression from a string.
    *
    * $parseDesc
    */
  def parse(regex: String): ParsedRegex = {
    parse(regex, 0)
  }

  /**
    * Parse a regular expression from a string, with the given flags.
    *
    * $parseDesc
    *
    * @param flags $flagsDesc
    */
  def parse(regex: String, flags: Int): ParsedRegex = {
    RegexParser.parse(regex, flagsFromBits(flags))
  }

  /**
    * Parse a set of regular expressions from strings, with the given flags. Scala version.
    *
    * $parseDesc
    *
    * @param flags $flagsDesc
    */
  def parse(regexes: Seq[String], flags: Int = 0): Seq[ParsedRegex] = {
    regexes.map { r =>
      parse(r, flags)
    }
  }

  /**
    * Parse a set of regular expressions from strings, with the given flags. Java version.
    *
    * $parseDesc
    *
    * @param flags $flagsDesc
    */
  def parse(regexes: java.util.List[String], flags: Int): java.util.List[ParsedRegex] = {
    parse(regexes.asScala.to(Seq), flags).asJava
  }

  /**
    * Compile a regex parsed using one of the `parse` methods.
    *
    * $parseDesc
    */
  def compileParsed(parsedRegex: ParsedRegex, universe: Universe): CompiledRegex = {
    val (res, time) = Util.time {
      new CompiledRegex(parsedRegex.literal, parsedRegex.tree, universe)
    }
    // The `: Any` ascription selects the two-argument overload of `trace`.
    logger.trace("{} compiled in {}", parsedRegex.literal, time: Any)
    res
  }

  /**
    * Compile a regex from a string, using its own [[Universe]], with the given flags.
    *
    * @param flags $flagsDesc
    */
  def compile(regex: String, flags: Int): CompiledRegex = {
    val parsedRegex = parse(regex, flags)
    val (compiled, time) = Util.time {
      new CompiledRegex(regex, parsedRegex.tree, new Universe(Seq(parsedRegex.tree), parsedRegex.norm))
    }
    logger.trace("{} compiled in {}", compiled, time: Any)
    compiled
  }

  /**
    * Select how the dot metacharacter behaves: `DOTALL` takes precedence over `UNIX_LINES`, and Java line
    * terminators are used when neither bit is set.
    */
  private def dotMatcherFromFlags(flags: Int): DotMatch = {
    if ((flags & Pattern.DOTALL) != 0)
      DotMatch.All
    else if ((flags & Pattern.UNIX_LINES) != 0)
      DotMatch.UnixLines
    else
      DotMatch.JavaLines
  }

  /**
    * Compile a regex from a string, using its own [[Universe]], with no flags set.
    */
  def compile(regex: String): CompiledRegex = compile(regex, 0)

  /**
    * Compiles a set of regular expressions in the same [[Universe]], with the given flags. Java version.
    * An empty list yields an empty list.
    *
    * @param flags $flagsDesc
    */
  def compile(regexes: java.util.List[String], flags: Int): java.util.List[CompiledRegex] = {
    compile(regexes.asScala.to(Seq), flags).asJava
  }

  /**
    * Compiles a set of regular expressions in the same [[Universe]]. Java version.
    * An empty list yields an empty list.
    */
  def compile(regexes: java.util.List[String]): java.util.List[CompiledRegex] = compile(regexes, 0)

  /**
    * Compiles a set of regular expressions in the same [[Universe]], with the given flags. Scala version.
    * An empty sequence yields an empty sequence.
    *
    * @param flags $flagsDesc
    */
  def compile(regexes: Seq[String], flags: Int = 0): Seq[CompiledRegex] = {
    val parsedRegexes = parse(regexes, flags)
    parsedRegexes.headOption match {
      case None =>
        // Nothing to compile: without this case, taking the first element below would throw.
        Seq.empty
      case Some(first) =>
        // The shared universe takes its normalization from the first parsed regex.
        val universe = new Universe(parsedRegexes.map(_.tree), first.norm)
        parsedRegexes.map { parsedRegex =>
          compileParsed(parsedRegex, universe)
        }
    }
  }

  /**
    * Create a regular expression that does not match anything. Note that this is different from matching the empty
    * string. Despite the theoretical equivalence of automata and regular expressions, in practice there is no regular
    * expression that does not match anything.
    */
  def nullRegex(u: Universe): Regex = new SynteticRegex(Dfa.NothingDfa, u)

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy