All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.johnsnowlabs.nlp.util.regex.RuleFactory.scala Maven / Gradle / Ivy

package com.johnsnowlabs.nlp.util.regex

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.RuleSymbols
import org.slf4j.LoggerFactory

import scala.util.matching.Regex

/**
  * Created by Saif Addin on 5/8/2017.
  */

/**
  * Regular Expressions rule manager. Applies rules based on Matching and Replacement strategies.
  *
  * Rules are kept in insertion order and are applied in that order by the transform methods,
  * each rule operating on the output of the previous one.
  *
  * @param matchStrategy How to decide on regex search
  * @param transformStrategy How to decide when replacing or transforming content with Regex
  */
class RuleFactory(matchStrategy: MatchStrategy.MatchStrategy,
                  transformStrategy: TransformStrategy.TransformStrategy = TransformStrategy.NO_TRANSFORM)
  extends RuleSymbols with Serializable {

  /**
    * Internal representation of a regex match
    * @param content the matching component, which holds [[Regex.Match]] information
    * @param identifier user provided identification of the rule that produced the match
    */
  protected case class RuleMatch(content: Regex.Match, identifier: String)

  import TransformStrategy._
  import MatchStrategy._

  /** Helper functions to identify context in a word for debugging */
  private val logger = LoggerFactory.getLogger("RuleFactory")

  /** Start index of the logged context: up to 10 characters before the match, clamped to 0 */
  private def logSubStartHelper(start: Int): Int = if (start > 10) start - 10 else 0

  /** End index of the logged context: up to 10 characters after the match, clamped to the source length */
  private def logSubEndHelper(sourceLength: Int, end: Int): Int =
    if (sourceLength - end > 10) end + 10 else sourceLength

  /**
    * Emits the debug trace for a single match: the matched text, its surrounding context,
    * the rule identifier and the strategy being applied.
    * Guarded so the context subsequence is only built when debug logging is enabled.
    */
  private def logMatch(m: Regex.Match, identifier: String, strategy: TransformStrategy.Value): Unit =
    if (logger.isDebugEnabled) {
      logger.debug("Matched: {} from: {} using rule {} with strategy {}",
        m.matched,
        m.source.subSequence(logSubStartHelper(m.start), logSubEndHelper(m.source.length, m.end)),
        identifier,
        strategy)
    }

  /** Rules and SymbolRules are key pieces of regex transformation */
  private var rules: Seq[RegexRule] = Seq()
  private var symbolRules: Seq[(String, RegexRule)] = Seq()

  /** Adds a rule to this factory */
  def addRule(rule: RegexRule): this.type = {
    rules = rules :+ rule
    this
  }

  /** Adds a rule to this factory with native types */
  def addRule(rule: Regex, description: String): this.type = {
    rules = rules :+ new RegexRule(rule, description)
    this
  }

  /** Removes every plain rule from this factory; symbolic rules are left untouched */
  def clearRules(): this.type = {
    rules = Seq.empty[RegexRule]
    this
  }

  /**
    * Shortcut functions. The strategy is fixed for the lifetime of the factory, but it is still
    * inspected on every call: `rules` is mutable, and an unsupported transform strategy must only
    * fail when a transform is actually requested (NO_TRANSFORM is the constructor default).
    */
  private val findMatchFunc = (text: String) => matchStrategy match {
    case MATCH_ALL => rules.flatMap(rule => rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
    case MATCH_FIRST => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
    // a "complete" match is a first match that spans the whole text
    case MATCH_COMPLETE => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).filter(_.matched == text).map(m => RuleMatch(m, rule.identifier)))
  }

  /**
    * Replaces matches of `regex` in `text` according to the match strategy.
    * The string produced by `transform` is a regex replacement string, so `$n` group references
    * inside it are expanded; literal text must be quoted with [[Regex.quoteReplacement]].
    */
  private val transformMatchFunc = (text: String, regex: Regex, transform: Regex.Match => String) => matchStrategy match {
    case MATCH_ALL => regex.replaceAllIn(text, transform)
    case MATCH_FIRST => regex.findFirstMatchIn(text).map(m => regex.replaceFirstIn(text, transform(m))).getOrElse(text)
    case MATCH_COMPLETE => regex.findFirstMatchIn(text).filter(_.matched == text).map(m =>
      regex.replaceFirstIn(text, transform(m))).getOrElse(text)
    case _ => throw new IllegalArgumentException("Invalid match strategy")
  }

  /**
    * Folds every plain rule over `text`, in insertion order, logging each match under `strategy`
    * and substituting it with the replacement string computed by `replacement`.
    */
  private def transformAllRules(text: String, strategy: TransformStrategy.Value)
                               (replacement: Regex.Match => String): String =
    rules.foldLeft(text)((target, rule) => transformMatch(target, rule.regex) { m =>
      logMatch(m, rule.identifier, strategy)
      replacement(m)
    })

  /** Single-symbol transformations; "$0" is the regex back-reference to the whole match */
  private val transformWithSymbolFunc = (symbol: String, text: String) => transformStrategy match {
    case APPEND_WITH_SYMBOL =>
      transformAllRules(text, APPEND_WITH_SYMBOL)(_ => "$0" + symbol)
    case PREPEND_WITH_SYMBOL =>
      transformAllRules(text, PREPEND_WITH_SYMBOL)(_ => symbol + "$0")
    case REPLACE_ALL_WITH_SYMBOL =>
      transformAllRules(text, REPLACE_ALL_WITH_SYMBOL)(_ => symbol)
    case REPLACE_WITH_SYMBOL_AND_BREAK =>
      transformAllRules(text, REPLACE_WITH_SYMBOL_AND_BREAK)(_ => symbol + BREAK_INDICATOR)
    case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
  }

  /** Per-rule-symbol transformations and break protection */
  private val transformWithSymbolicRulesFunc = (text: String) => transformStrategy match {
    case REPLACE_EACH_WITH_SYMBOL =>
      symbolRules.foldLeft(text) { case (target, (symbol, rule)) =>
        transformMatch(target, rule.regex) { m =>
          logMatch(m, rule.identifier, REPLACE_EACH_WITH_SYMBOL)
          symbol
        }
      }
    case REPLACE_EACH_WITH_SYMBOL_AND_BREAK =>
      // NOTE(review): this branch always replaces every match and does not consult matchStrategy,
      // unlike its siblings. Behaviour kept as-is; confirm whether that is intentional.
      symbolRules.foldLeft(text) { case (target, (symbol, rule)) =>
        rule.regex.replaceAllIn(target, (m: Regex.Match) => {
          logMatch(m, rule.identifier, REPLACE_EACH_WITH_SYMBOL_AND_BREAK)
          symbol + BREAK_INDICATOR
        })
      }
    // The protection strategies work on the plain rules, not on the symbolic ones.
    // The matched text is quoted so that a '$' or '\' inside it is inserted literally instead of
    // being interpreted as a group reference / escape by the regex replacement engine.
    case PROTECT_FROM_BREAK =>
      transformAllRules(text, PROTECT_FROM_BREAK)(m =>
        PROTECTION_MARKER_OPEN + Regex.quoteReplacement(m.matched) + PROTECTION_MARKER_CLOSE)
    case BREAK_AND_PROTECT_FROM_BREAK =>
      transformAllRules(text, BREAK_AND_PROTECT_FROM_BREAK)(m =>
        BREAK_INDICATOR + PROTECTION_MARKER_OPEN + Regex.quoteReplacement(m.matched) + PROTECTION_MARKER_CLOSE)
    case _ => throw new IllegalArgumentException("Invalid strategy for rule factory")
  }

  /**
    * Adds a rule and its associated symbol to apply some transformation using such symbol
    * @param symbol symbol to be used in a transformation application, where many rules can apply different transformations
    * @param rule rule to be used when replacing a match with a symbol
    * @return this factory, for chaining
    */
  def addSymbolicRule(symbol: String, rule: RegexRule): this.type = {
    symbolRules = symbolRules :+ (symbol, rule)
    this
  }

  /** Adds multiple rules at once, appended after the existing ones in the given order */
  def addRules(newRules: Seq[RegexRule]): this.type = {
    rules = rules ++ newRules
    this
  }

  /** Overrides rules with a new set of rules */
  def setRules(newRules: Seq[RegexRule]): this.type = {
    rules = newRules
    this
  }

  /** Applies factory match strategy to find matches and returns any number of Matches */
  def findMatch(text: String): Seq[RuleMatch] = {
    findMatchFunc(text)
  }

  /** Specifically finds a first match within a group of matches */
  def findMatchFirstOnly(text: String): Option[RuleMatch] = {
    findMatch(text).headOption
  }

  /**
    * Applies rule transform strategy and utilizing matching strategies
    * Arguments are curried so transformation can be partially applied in some cases
    * @param text target text to transform
    * @param regex pattern whose matches are to be replaced
    * @param transform computes the regex replacement string for a match
    * @return Resulting transformation
    */
  private def transformMatch(text: String, regex: Regex)(transform: Regex.Match => String): String = {
    transformMatchFunc(text, regex, transform)
  }

  /**
    * Applies factory transform of all ordered rules utilizing transform and match strategies with provided symbol
    * @param symbol a symbol to use for all transformations altogether
    * @param text target text to transform
    * @return the transformed text
    * @throws IllegalArgumentException if the transform strategy is not one of the single-symbol strategies
    */
  def transformWithSymbol(symbol: String, text: String): String = {
    transformWithSymbolFunc(symbol, text)
  }

  /**
    * Applies factory transform of all ordered rules utilizing transform and match strategies corresponding each rule with its symbol
    * @param text target text to transform
    * @return Returns a transformed text
    * @throws IllegalArgumentException if the transform strategy is not a symbolic-rule or protection strategy
    */
  def transformWithSymbolicRules(text: String): String = {
    transformWithSymbolicRulesFunc(text)
  }
}
object RuleFactory {
  /**
    * Curried constructor for [[RuleFactory]]: the transform strategy is fixed first and the
    * match strategy is supplied in a second step, for cases where it is only known at runtime.
    *
    * @param transformStrategy replacement strategy of the resulting factory
    * @param matchStrategy matching strategy of the resulting factory
    * @return a new factory configured with both strategies
    */
  def lateMatching(transformStrategy: TransformStrategy.TransformStrategy)
                  (matchStrategy: MatchStrategy.MatchStrategy): RuleFactory = {
    new RuleFactory(matchStrategy = matchStrategy, transformStrategy = transformStrategy)
  }
}

/**
  * Replacement strategies accepted by [[RuleFactory]].
  * Values are declared one per line; their order determines each value's id, so keep it stable.
  */
object TransformStrategy extends Enumeration {
  type TransformStrategy = Value

  /** Factory default; both transform methods reject it with an IllegalArgumentException */
  val NO_TRANSFORM: Value = Value
  /** Keeps the match and adds the symbol after it */
  val APPEND_WITH_SYMBOL: Value = Value
  /** Keeps the match and adds the symbol before it */
  val PREPEND_WITH_SYMBOL: Value = Value
  /** Substitutes the match with the symbol */
  val REPLACE_ALL_WITH_SYMBOL: Value = Value
  /** Substitutes the match with the symbol followed by the break indicator */
  val REPLACE_WITH_SYMBOL_AND_BREAK: Value = Value
  /** Wraps the match between the protection markers */
  val PROTECT_FROM_BREAK: Value = Value
  /** Emits the break indicator, then the match wrapped between the protection markers */
  val BREAK_AND_PROTECT_FROM_BREAK: Value = Value
  /** Substitutes the match of each symbolic rule with that rule's own symbol */
  val REPLACE_EACH_WITH_SYMBOL: Value = Value
  /** Substitutes the match of each symbolic rule with that rule's own symbol plus the break indicator */
  val REPLACE_EACH_WITH_SYMBOL_AND_BREAK: Value = Value
}

/**
  * Matching strategies accepted by [[RuleFactory]].
  * Values are declared one per line; their order determines each value's id, so keep it stable.
  */
object MatchStrategy extends Enumeration {
  type MatchStrategy = Value

  /** Every match of each rule is considered */
  val MATCH_ALL: Value = Value
  /** Only the first match of each rule is considered */
  val MATCH_FIRST: Value = Value
  /** The first match of a rule is considered only when it equals the whole text */
  val MATCH_COMPLETE: Value = Value
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy