All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticMethod.scala Maven / Gradle / Ivy

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

import com.johnsnowlabs.nlp.annotators.common.Sentence
import com.johnsnowlabs.nlp.util.io.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import com.johnsnowlabs.nlp.util.regex.TransformStrategy.{
  REPLACE_ALL_WITH_SYMBOL,
  TransformStrategy
}

/** Common contract shared by the sentence-boundary strategies defined in this file
  * ([[CustomPragmaticMethod]], [[DefaultPragmaticMethod]] and [[MixedPragmaticMethod]]).
  */
protected trait PragmaticMethod {
  /** Segments the given text into sentences.
    *
    * @param content
    *   the text to segment
    * @return
    *   the sentences found in `content`
    */
  def extractBounds(content: String): Array[Sentence]
}

/** Sentence-boundary strategy driven only by user-supplied regex patterns.
  *
  * Inspired by Kevin Dias' Ruby implementation: https://github.com/diasks2/pragmatic_segmenter
  * This approach extracts sentence bounds by first formatting the data with [[RuleSymbols]] and
  * then extracting bounds with a strong RegexBased rule application.
  *
  * The rule factory is assembled once, at construction time, the same way
  * [[MixedPragmaticMethod]] does it, so each pattern is compiled a single time instead of on
  * every call to `extractBounds`. As a consequence an invalid pattern is reported when the
  * instance is created rather than on the first call.
  *
  * @param customBounds
  *   regex patterns; each one is compiled with `.r` and registered as a rule
  * @param transformStrategy
  *   strategy handed to the [[RuleFactory]] together with `MATCH_ALL`
  */
class CustomPragmaticMethod(
    customBounds: Array[String],
    transformStrategy: TransformStrategy = REPLACE_ALL_WITH_SYMBOL)
    extends PragmaticMethod
    with Serializable {

  // The rules depend only on constructor arguments, so build them once and reuse them.
  // NOTE(review): this keeps a RuleFactory as a field of a Serializable class, exactly as
  // MixedPragmaticMethod already does — confirm RuleFactory is serializable.
  private val customBoundsFactory: RuleFactory = new RuleFactory(MATCH_ALL, transformStrategy)
  for (bound <- customBounds)
    customBoundsFactory.addRule(bound.r, s"split bound: $bound")

  /** Applies the custom-bound rules to `content` and pulls the resulting sentences.
    *
    * @param content
    *   the text to segment
    * @return
    *   the sentences produced by [[PragmaticSentenceExtractor]]
    */
  override def extractBounds(content: String): Array[Sentence] = {
    val symbolyzedData = new PragmaticContentFormatter(content)
      .formatCustomBounds(customBoundsFactory)
      .finish

    // The extractor receives both the formatted output and the untouched input text.
    new PragmaticSentenceExtractor(symbolyzedData, content).pull
  }
}

/** Sentence-boundary strategy that relies only on the built-in formatting passes of
  * [[PragmaticContentFormatter]].
  *
  * @param useAbbreviations
  *   forwarded to the formatter's `formatAbbreviations` pass
  * @param detectLists
  *   forwarded to the formatter's `formatLists` pass
  */
class DefaultPragmaticMethod(useAbbreviations: Boolean = false, detectLists: Boolean = true)
    extends PragmaticMethod
    with Serializable {

  /** Runs the formatting passes in a fixed, hardcoded order: the most specific, non-ambiguous
    * cases come first and the more general, easily ambiguous ones come last. The sequence must
    * not be rearranged.
    *
    * @param content
    *   the text to segment
    * @return
    *   the sentences produced by [[PragmaticSentenceExtractor]]
    */
  override def extractBounds(content: String): Array[Sentence] = {
    // Stage 1: lists, numbers and abbreviations.
    val specificPasses =
      new PragmaticContentFormatter(content)
        .formatLists(detectLists)
        .formatNumbers
        .formatAbbreviations(useAbbreviations)

    // Stage 2: punctuation-driven passes, ending with the basic breakers.
    val allPasses =
      specificPasses.formatPunctuations.formatMultiplePeriods.formatGeoLocations.formatEllipsisRules.formatBetweenPunctuations.formatQuotationMarkInQuotation.formatExclamationPoint.formatBasicBreakers

    val formatted = allPasses.finish

    // The extractor receives both the formatted output and the untouched input text.
    new PragmaticSentenceExtractor(formatted, content).pull
  }
}

/** Sentence-boundary strategy that applies user-supplied regex bounds first and then the
  * built-in formatting passes of [[PragmaticContentFormatter]].
  *
  * @param useAbbreviations
  *   forwarded to the formatter's `formatAbbreviations` pass
  * @param detectLists
  *   forwarded to the formatter's `formatLists` pass
  * @param customBounds
  *   regex patterns; each one is compiled with `.r` and registered as a rule
  * @param transformStrategy
  *   strategy handed to the [[RuleFactory]] together with `MATCH_ALL`
  */
class MixedPragmaticMethod(
    useAbbreviations: Boolean = false,
    detectLists: Boolean = true,
    customBounds: Array[String],
    transformStrategy: TransformStrategy = REPLACE_ALL_WITH_SYMBOL)
    extends PragmaticMethod
    with Serializable {

  /** Rule factory holding one rule per custom bound; populated once at construction. */
  val customBoundsFactory: RuleFactory = new RuleFactory(MATCH_ALL, transformStrategy)
  for (bound <- customBounds)
    customBoundsFactory.addRule(bound.r, s"split bound: $bound")

  /** Runs the custom bounds followed by the built-in passes in a fixed, hardcoded order: the
    * most specific, non-ambiguous cases come first and the more general, easily ambiguous ones
    * come last. The sequence must not be rearranged.
    *
    * @param content
    *   the text to segment
    * @return
    *   the sentences produced by [[PragmaticSentenceExtractor]]
    */
  override def extractBounds(content: String): Array[Sentence] = {
    // Stage 1: user-defined bounds take precedence over every built-in pass.
    val withCustomBounds =
      new PragmaticContentFormatter(content).formatCustomBounds(customBoundsFactory)

    // Stage 2: lists, abbreviations and numbers.
    // NOTE(review): abbreviations run before numbers here, whereas DefaultPragmaticMethod runs
    // numbers first — order kept as found; confirm whether the difference is intentional.
    val specificPasses =
      withCustomBounds
        .formatLists(detectLists)
        .formatAbbreviations(useAbbreviations)
        .formatNumbers

    // Stage 3: punctuation-driven passes, ending with the basic breakers.
    val allPasses =
      specificPasses.formatPunctuations.formatMultiplePeriods.formatGeoLocations.formatEllipsisRules.formatBetweenPunctuations.formatQuotationMarkInQuotation.formatExclamationPoint.formatBasicBreakers

    val formatted = allPasses.finish

    // The extractor receives both the formatted output and the untouched input text.
    new PragmaticSentenceExtractor(formatted, content).pull
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy