All downloads are free. Search and download functionality uses the official Maven repository.

com.johnsnowlabs.nlp.util.regex.RuleFactory.scala Maven / Gradle / Ivy

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.util.regex

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.RuleSymbols

import scala.util.matching.Regex

/** Regular-expression rule manager. Holds an ordered collection of [[RegexRule]]s and applies
  * them to text, either to find matches or to rewrite the text, based on a matching strategy and
  * a transformation strategy that are both fixed at construction time.
  *
  * Rules are stored in mutable fields; the `add*`/`set*`/`clear*` methods mutate this instance
  * and return it so calls can be chained.
  *
  * @param matchStrategy
  *   How to decide on regex search
  * @param transformStrategy
  *   How to decide when replacing or transforming content with Regex
  */
class RuleFactory(
    matchStrategy: MatchStrategy.MatchStrategy,
    transformStrategy: TransformStrategy.TransformStrategy = TransformStrategy.NO_TRANSFORM)
    extends RuleSymbols
    with Serializable {

  import MatchStrategy._
  import RuleFactory.RuleMatch
  import TransformStrategy._

  /** Helper functions to identify context in a word for debugging: clamp a window of up to 10
    * characters before `start` / after `end` to the bounds of the source.
    *
    * NOTE(review): neither helper appears to be referenced in this file - candidates for removal.
    */
  private def logSubStartHelper(start: Int): Int = if (start > 10) start - 10 else 0
  private def logSubEndHelper(sourceLength: Int, end: Int): Int =
    if (sourceLength - end > 10) end + 10 else sourceLength

  /** Plain rules, applied in insertion order by matching and by most transform strategies. */
  private var rules: Seq[RegexRule] = Seq()

  /** (symbol, rule) pairs used by the REPLACE_EACH_* transform strategies. */
  private var symbolRules: Seq[(String, RegexRule)] = Seq()

  /** Adds a rule to this factory */
  def addRule(rule: RegexRule): this.type = {
    rules = rules :+ rule
    this
  }

  /** Adds a rule to this factory with native types */
  def addRule(rule: Regex, description: String): this.type = {
    rules = rules :+ new RegexRule(rule, description)
    this
  }

  /** Removes every plain rule; symbolic rules are left untouched */
  def clearRules(): this.type = {
    rules = Seq.empty[RegexRule]
    this
  }

  /** Match dispatch, kept as a function value. The strategy never changes during the lifetime of
    * the factory, while `rules` is read on every call so rules added later are honoured.
    *
    *   - MATCH_ALL: every match of every rule
    *   - MATCH_FIRST: the first match (if any) of every rule
    *   - MATCH_COMPLETE: the first match of every rule, kept only when it equals the whole text
    */
  private val findMatchFunc = (text: String) =>
    matchStrategy match {
      case MATCH_ALL =>
        rules.flatMap(rule =>
          rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
      case MATCH_FIRST =>
        rules.flatMap(rule =>
          rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
      case MATCH_COMPLETE =>
        rules.flatMap(rule =>
          rule.regex
            .findFirstMatchIn(text)
            .filter(_.matched == text)
            .map(m => RuleMatch(m, rule.identifier)))
    }

  /** Replacement dispatch for a single regex. The string produced by `transform` is handed to the
    * regex engine as a replacement template, so `$n` group references are expanded and a
    * backslash escapes the character that follows it; literal text must therefore be quoted by
    * the caller (see `Regex.quoteReplacement`).
    */
  private val transformMatchFunc =
    (text: String, regex: Regex, transform: Regex.Match => String) =>
      matchStrategy match {
        case MATCH_ALL => regex.replaceAllIn(text, transform)
        case MATCH_FIRST =>
          regex
            .findFirstMatchIn(text)
            .map(m => regex.replaceFirstIn(text, transform(m)))
            .getOrElse(text)
        case MATCH_COMPLETE =>
          // Only rewrite when the first match covers the entire input.
          regex
            .findFirstMatchIn(text)
            .filter(_.matched == text)
            .map(m => regex.replaceFirstIn(text, transform(m)))
            .getOrElse(text)
        case _ => throw new IllegalArgumentException("Invalid match strategy")
      }

  /** Single-symbol transformations: every plain rule is applied in order, each one working on the
    * output of the previous one. `"$0"` in a template refers to the whole match.
    */
  private val transformWithSymbolFunc = (symbol: String, text: String) => {
    // Pick the replacement template once; an unsupported strategy fails before any rule runs.
    val replacement: Regex.Match => String = transformStrategy match {
      case APPEND_WITH_SYMBOL => (_: Regex.Match) => "$0" + symbol
      case PREPEND_WITH_SYMBOL => (_: Regex.Match) => symbol + "$0"
      case REPLACE_ALL_WITH_SYMBOL => (_: Regex.Match) => symbol
      case REPLACE_WITH_SYMBOL_AND_BREAK => (_: Regex.Match) => symbol + BREAK_INDICATOR
      case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
    }
    rules.foldLeft(text)((target, rule) => transformMatch(target, rule.regex)(replacement))
  }

  /** Transformations that do not take a caller-supplied symbol. The REPLACE_EACH_* strategies
    * iterate over `symbolRules`; the *PROTECT_FROM_BREAK strategies iterate over the plain
    * `rules`.
    */
  private val transformWithSymbolicRulesFunc = (text: String) =>
    transformStrategy match {
      case REPLACE_EACH_WITH_SYMBOL =>
        symbolRules.foldLeft(text) { case (target, (symbol, rule)) =>
          transformMatch(target, rule.regex)(_ => symbol)
        }
      case REPLACE_EACH_WITH_SYMBOL_AND_BREAK =>
        // NOTE: this strategy always replaces every occurrence, regardless of matchStrategy.
        symbolRules.foldLeft(text) { case (target, (symbol, rule)) =>
          rule.regex.replaceAllIn(target, (_: Regex.Match) => symbol + BREAK_INDICATOR)
        }
      case PROTECT_FROM_BREAK =>
        rules.foldLeft(text)((target, rule) =>
          transformMatch(target, rule.regex)(m =>
            // The matched text is re-inserted literally, so both `$` and `\` must be escaped;
            // escaping `$` alone would silently drop backslashes from the protected text.
            PROTECTION_MARKER_OPEN + Regex.quoteReplacement(
              m.matched) + PROTECTION_MARKER_CLOSE))
      case BREAK_AND_PROTECT_FROM_BREAK =>
        rules.foldLeft(text)((target, rule) =>
          transformMatch(target, rule.regex)(m =>
            // Same literal quoting as PROTECT_FROM_BREAK, prefixed with a break indicator.
            BREAK_INDICATOR + PROTECTION_MARKER_OPEN + Regex.quoteReplacement(
              m.matched) + PROTECTION_MARKER_CLOSE))
      case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
    }

  /** Adds a rule and its associated symbol to apply some transformation using such symbol
    * @param symbol
    *   symbol is a character to be used in a transformation application, where many rules can
    *   apply different transformations
    * @param rule
    *   rule to be used when replacing a match with a symbol
    * @return
    *   this factory, to allow chaining
    */
  def addSymbolicRule(symbol: String, rule: RegexRule): this.type = {
    symbolRules = symbolRules :+ (symbol, rule)
    this
  }

  /** Adds multiple rules all together; existing rules stay first, new rules are appended */
  def addRules(newRules: Seq[RegexRule]): this.type = {
    rules = rules ++ newRules
    this
  }

  /** Overrides rules with a new set of rules */
  def setRules(newRules: Seq[RegexRule]): this.type = {
    rules = newRules
    this
  }

  /** Applies factory match strategy to find matches and returns any number of Matches
    * @param text
    *   text to search
    * @return
    *   matches grouped by rule, in rule insertion order; empty when nothing matches
    */
  def findMatch(text: String): Seq[RuleMatch] = findMatchFunc(text)

  /** Specifically finds a first match within a group of matches, i.e. the first element of
    * [[findMatch]] (which follows rule order, not position in the text)
    */
  def findMatchFirstOnly(text: String): Option[RuleMatch] = findMatch(text).headOption

  /** Applies rule transform strategy and utilizing matching strategies. Arguments are curried so
    * transformation can be partially applied in some cases
    * @param text
    *   text to rewrite
    * @param regex
    *   pattern whose matches are replaced
    * @param transform
    *   produces the replacement template for a match (`$n` and `\` are interpreted)
    * @return
    *   Resulting transformation
    */
  private def transformMatch(text: String, regex: Regex)(
      transform: Regex.Match => String): String =
    transformMatchFunc(text, regex, transform)

  /** Applies factory transform of all ordered rules utilizing transform and match strategies with
    * provided symbol
    * @param symbol
    *   a symbol to use for all transformations altogether
    * @param text
    *   target text to transform
    * @return
    *   the transformed text
    * @throws java.lang.IllegalArgumentException
    *   if the transform strategy is not one of APPEND_WITH_SYMBOL, PREPEND_WITH_SYMBOL,
    *   REPLACE_ALL_WITH_SYMBOL or REPLACE_WITH_SYMBOL_AND_BREAK
    */
  def transformWithSymbol(symbol: String, text: String): String =
    transformWithSymbolFunc(symbol, text)

  /** Applies factory transform of all ordered rules utilizing transform and match strategies
    * corresponding each rule with its symbol
    * @param text
    *   target text to transform
    * @return
    *   Returns a transformed text
    * @throws java.lang.IllegalArgumentException
    *   if the transform strategy is not one of REPLACE_EACH_WITH_SYMBOL,
    *   REPLACE_EACH_WITH_SYMBOL_AND_BREAK, PROTECT_FROM_BREAK or BREAK_AND_PROTECT_FROM_BREAK
    */
  def transformWithSymbolicRules(text: String): String =
    transformWithSymbolicRulesFunc(text)
}
object RuleFactory {

  /** Curried constructor for [[RuleFactory]]: the transform strategy is supplied first and the
    * match strategy second, so the first argument list can be fixed up front and the match
    * strategy chosen later (e.g. at runtime).
    *
    * @param transformStrategy
    *   how matches are replaced or transformed
    * @param matchStrategy
    *   how matches are searched for
    * @return
    *   a new [[RuleFactory]] configured with both strategies
    */
  def lateMatching(transformStrategy: TransformStrategy.TransformStrategy)(
      matchStrategy: MatchStrategy.MatchStrategy): RuleFactory =
    new RuleFactory(matchStrategy = matchStrategy, transformStrategy = transformStrategy)

  /** A regex match paired with the identifier of the rule that produced it
    * @param content
    *   the underlying [[Regex.Match]], carrying the matched text and its position information
    * @param identifier
    *   user provided identification of a rule
    */
  case class RuleMatch(content: Regex.Match, identifier: String)
}

/** Replacement strategies accepted by [[RuleFactory]].
  *
  * Values are declared one per line; the declaration order determines each value's id, so new
  * values should only ever be appended.
  */
object TransformStrategy extends Enumeration {
  type TransformStrategy = Value

  // Default of RuleFactory: no transformation configured.
  val NO_TRANSFORM: Value = Value

  // Strategies driven by a caller-supplied symbol (RuleFactory.transformWithSymbol).
  val APPEND_WITH_SYMBOL: Value = Value
  val PREPEND_WITH_SYMBOL: Value = Value
  val REPLACE_ALL_WITH_SYMBOL: Value = Value
  val REPLACE_WITH_SYMBOL_AND_BREAK: Value = Value

  // Strategies that take no external symbol (RuleFactory.transformWithSymbolicRules).
  val PROTECT_FROM_BREAK: Value = Value
  val BREAK_AND_PROTECT_FROM_BREAK: Value = Value
  val REPLACE_EACH_WITH_SYMBOL: Value = Value
  val REPLACE_EACH_WITH_SYMBOL_AND_BREAK: Value = Value
}

/** Matching strategies accepted by [[RuleFactory]].
  *
  * Values are declared one per line; the declaration order determines each value's id.
  */
object MatchStrategy extends Enumeration {
  type MatchStrategy = Value

  // Every match of every rule.
  val MATCH_ALL: Value = Value
  // Only the first match of each rule.
  val MATCH_FIRST: Value = Value
  // The first match of each rule, accepted only when it equals the whole input text.
  val MATCH_COMPLETE: Value = Value
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy