All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticMethod.scala Maven / Gradle / Ivy

package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

import com.johnsnowlabs.nlp.annotators.common.Sentence
import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import com.johnsnowlabs.nlp.util.regex.TransformStrategy.REPLACE_ALL_WITH_SYMBOL

/**
  * Created by Saif Addin on 5/5/2017.
  */

/**
  * Common contract for the sentence-bound extraction strategies declared in this file.
  */
protected trait PragmaticMethod {
  /**
    * Extracts sentences from the given text.
    *
    * @param content text to process
    * @return the [[Sentence]] values produced for `content`
    */
  def extractBounds(content: String): Array[Sentence]
}

/**
  * Inspired by Kevin Dias' Ruby implementation: https://github.com/diasks2/pragmatic_segmenter
  * This approach extracts sentence bounds by first formatting the data with [[RuleSymbols]] and then extracting bounds
  * with a strong RegexBased rule application.
  *
  * This variant applies only the caller-supplied bound rules; none of the built-in formatting steps are run.
  *
  * @param customBounds regular-expression patterns; each one is registered as a rule described as
  *                     "split bound: <pattern>"
  */
class CustomPragmaticMethod(customBounds: Array[String]) extends PragmaticMethod with Serializable {

  /**
    * Rule factory holding one rule per entry of `customBounds`.
    *
    * Built once per instance instead of on every [[extractBounds]] call, so the patterns are compiled a single
    * time. It is lazy so that an invalid pattern still surfaces on the first [[extractBounds]] call rather than
    * at construction, and transient so that it is rebuilt from `customBounds` after deserialization instead of
    * being serialized with the instance.
    */
  @transient private lazy val customBoundsFactory: RuleFactory = {
    val factory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
    customBounds.foreach(bound => factory.addRule(bound.r, s"split bound: $bound"))
    factory
  }

  /**
    * Formats `content` with the custom bound rules and pulls the resulting sentences.
    *
    * @param content text to process
    * @return the sentences pulled by [[PragmaticSentenceExtractor]] from the formatted text
    */
  override def extractBounds(content: String): Array[Sentence] = {
    val symbolyzedData = new PragmaticContentFormatter(content)
      .formatCustomBounds(customBoundsFactory)
      .finish

    new PragmaticSentenceExtractor(symbolyzedData, content).pull
  }
}

/**
  * Sentence-bound extraction using the built-in formatting rules only.
  *
  * @param useAbbreviations flag forwarded unchanged to `formatAbbreviations`
  */
class DefaultPragmaticMethod(useAbbreviations: Boolean = false) extends PragmaticMethod with Serializable {
  /**
    * Runs the formatting steps in a hardcoded order, intended to go from the most specific,
    * non-ambiguous cases down to the more general ones that can easily be ambiguous, and then
    * pulls the sentences from the formatted text.
    *
    * @param content text to process
    * @return the sentences pulled by [[PragmaticSentenceExtractor]] from the formatted text
    */
  override def extractBounds(content: String): Array[Sentence] = {
    // NOTE(review): numbers are formatted before abbreviations here, whereas MixedPragmaticMethod
    // formats abbreviations first -- confirm whether the difference is intentional.
    val itemsAndTokens = new PragmaticContentFormatter(content)
      .formatLists
      .formatNumbers
      .formatAbbreviations(useAbbreviations)

    val punctuationHandled = itemsAndTokens
      .formatPunctuations
      .formatMultiplePeriods
      .formatGeoLocations
      .formatEllipsisRules

    val quotesAndBreakers = punctuationHandled
      .formatBetweenPunctuations
      .formatQuotationMarkInQuotation
      .formatExclamationPoint
      .formatBasicBreakers

    val formatted = quotesAndBreakers.finish
    new PragmaticSentenceExtractor(formatted, content).pull
  }
}

/**
  * Sentence-bound extraction that applies caller-supplied bound rules first and then the built-in
  * formatting rules.
  *
  * @param useAbbreviations flag forwarded unchanged to `formatAbbreviations`
  * @param customBounds     regular-expression patterns; each one is registered as a rule described as
  *                         "split bound: <pattern>"
  */
class MixedPragmaticMethod(useAbbreviations: Boolean = false, customBounds: Array[String]) extends PragmaticMethod with Serializable {
  /** Rule factory populated once, at construction, with one rule per custom bound. */
  val customBoundsFactory: RuleFactory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
  for (pattern <- customBounds) {
    customBoundsFactory.addRule(pattern.r, s"split bound: $pattern")
  }

  /**
    * Runs the formatting steps in a hardcoded order, intended to go from the most specific,
    * non-ambiguous cases down to the more general ones that can easily be ambiguous, and then
    * pulls the sentences from the formatted text.
    *
    * @param content text to process
    * @return the sentences pulled by [[PragmaticSentenceExtractor]] from the formatted text
    */
  override def extractBounds(content: String): Array[Sentence] = {
    // Custom bounds are applied before any built-in rule.
    val customApplied = new PragmaticContentFormatter(content)
      .formatCustomBounds(customBoundsFactory)

    val itemsAndTokens = customApplied
      .formatLists
      .formatAbbreviations(useAbbreviations)
      .formatNumbers

    val punctuationHandled = itemsAndTokens
      .formatPunctuations
      .formatMultiplePeriods
      .formatGeoLocations
      .formatEllipsisRules

    val quotesAndBreakers = punctuationHandled
      .formatBetweenPunctuations
      .formatQuotationMarkInQuotation
      .formatExclamationPoint
      .formatBasicBreakers

    val formatted = quotesAndBreakers.finish
    new PragmaticSentenceExtractor(formatted, content).pull
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy