All downloads are free. The search and download functionality uses the official Maven repository.

kr.bydelta.koala.util.SentenceSplitter.scala Maven / Gradle / Ivy

The newest version!
package kr.bydelta.koala.util

import kr.bydelta.koala.POS
import kr.bydelta.koala.data.{Morpheme, Sentence, Word}

import scala.annotation.tailrec

/**
  * Splits a morphologically analysed paragraph into sentences.
  *
  * A sentence boundary is placed right after a word that carries a sentence-final
  * mark (`POS.SF`) while no bracket or quotation mark is left open.
  */
object SentenceSplitter {
  // NOTE: despite the "Regex" suffix, the three values below are plain character
  // sets that are only queried with `contains` / `indexOf`; they are never compiled
  // into regular expressions.
  //
  // The full-width forms are written as unicode escapes on purpose: when they were
  // stored as literal characters, a compatibility normalisation folded them into
  // their ASCII twins, which left an unterminated string literal in the quote set
  // and duplicated ASCII brackets in the bracket sets.

  /** Quotation marks that open and close with the same character: ' " U+FF07 U+FF02. */
  private val quoteRegex = "\'\"\uFF07\uFF02"

  /**
    * Opening brackets. The character at index i is closed by the character at the
    * same index in [[closeParenRegex]], so both strings must stay aligned.
    * Indices 5-7 are the full-width ( [ { (U+FF08, U+FF3B, U+FF5B).
    */
  private val openParenRegex = "([{‘“\uFF08\uFF3B\uFF5B<〔〈《「『【"

  /**
    * Closing brackets, index-aligned with [[openParenRegex]].
    * Indices 5-7 are the full-width ) ] } (U+FF09, U+FF3D, U+FF5D).
    */
  private val closeParenRegex = ")]}’”\uFF09\uFF3D\uFF5D>〕〉》」』】"

  /**
    * Splits the given analysis result into sentences.
    *
    * @param para Paragraph (sequence of words) to split.
    * @return The paragraph split into sentences.
    */
  def apply(para: Seq[Word]): Seq[Sentence] = split(para)

  /**
    * Walks through the morphemes of a single word, updating the stack of currently
    * open brackets/quotes and tracking whether the word ends a sentence.
    *
    * @param morphemes        Morphemes that are still to be inspected.
    * @param stack            Brackets/quotes opened so far, innermost first.
    * @param isPreviousSL     Whether the morpheme inspected just before was tagged `POS.SL`.
    * @param hasEndOfSentence Whether a sentence-final mark is currently in effect.
    * @return The updated stack, and a flag that is true only when a sentence-final
    *         mark is in effect AND the stack is empty (nothing is left open).
    */
  @tailrec
  private def rollupParenStack(morphemes: Seq[Morpheme],
                               stack: List[Char],
                               isPreviousSL: Boolean = false,
                               hasEndOfSentence: Boolean = false): (List[Char], Boolean) =
    morphemes match {
      case Seq() =>
        // A sentence may only end when every bracket/quote has been closed.
        (stack, stack.isEmpty && hasEndOfSentence)
      case Seq(Morpheme(morph, POS.SF), tail@_*) if morph != "." || !isPreviousSL =>
        // Sentence-final mark, unless it is a period directly following an SL
        // morpheme (i.e. a period attached to an English abbreviation).
        rollupParenStack(tail, stack, hasEndOfSentence = true)
      case Seq(Morpheme(_, POS.SN), tail@_*) =>
        // A number cancels any sentence-final mark seen so far in this word
        // (both flags fall back to their `false` defaults).
        rollupParenStack(tail, stack)
      case Seq(Morpheme(x, tag), tail@_*) =>
        // Any other morpheme (including an abbreviation period rejected above):
        // scan its surface characters for brackets and quotation marks.
        val newStack = x.foldLeft(stack) {
          case (list, ch) if openParenRegex.contains(ch) =>
            ch :: list
          case (list, ch) if closeParenRegex.contains(ch) =>
            list match {
              // Pop only when the innermost open symbol is the matching opener.
              case top :: rest
                if closeParenRegex.indexOf(ch) == openParenRegex.indexOf(top) => rest
              // Unbalanced or mismatched closer: ignore it.
              case _ => list
            }
          case (list, ch) if quoteRegex.contains(ch) =>
            list match {
              // The same quote character on top of the stack: this one closes it.
              case top :: rest if top == ch => rest
              // Otherwise this quote opens a new quotation.
              case _ => ch :: list
            }
          case (list, _) => list
        }

        rollupParenStack(tail, newStack,
          isPreviousSL = tag == POS.SL,
          hasEndOfSentence = hasEndOfSentence)
    }

  /**
    * Splits the given analysis result into sentences.
    *
    * @param para       Paragraph (remaining words) to split.
    * @param parenStack Stack of brackets/quotes that are open so far.
    * @param acc        Sentences collected so far; the sentence under construction
    *                   comes first and every sentence holds its words in reverse order.
    * @return The paragraph split into sentences.
    */
  @tailrec
  private def split(para: Seq[Word],
                    parenStack: List[Char] = List(),
                    acc: Seq[Seq[Word]] = Seq(Seq.empty)): Seq[Sentence] =
    if (para.isEmpty) {
      // Words and sentences were accumulated by prepending, so restore the
      // original order and drop the empty placeholder, if any.
      acc.filter(_.nonEmpty).map(x => Sentence(x.reverse)).reverse
    } else {
      val Seq(head, tail@_*) = para
      val Seq(accHead, accTail@_*) = acc
      val (newParenStack, isEnding) = rollupParenStack(head, parenStack)
      val newAccHead = head +: accHead

      // On a sentence boundary, start a fresh (empty) sentence in front.
      if (isEnding) split(tail, newParenStack, Seq.empty +: newAccHead +: accTail)
      else split(tail, newParenStack, newAccHead +: accTail)
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy