All Downloads are FREE. Search and download functionalities are using the official Maven repository.

kr.bydelta.koala.data.Sentence.scala Maven / Gradle / Ivy

package kr.bydelta.koala.data

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.collection.generic.CanBuildFrom
import scala.collection.mutable.ArrayBuffer
import scala.collection.{IndexedSeqLike, mutable}

/**
  * 문장 Class / Sentence Class
  *
  * @param words 문장에 포함되는 어절의 나열. / Word sequence
  */
final class Sentence private(val words: Vector[Word])
  extends IndexedSeq[Word] with IndexedSeqLike[Word, Sentence] {

  /* Initialization */
  words.zipWithIndex.par.foreach {
    case (w, wid) => w.index = wid
  }

  /**
    * 의존 구문 분석의 Root
    *
    * Root of dependency tree.
    */
  private[koala] val root = Word()

  override def canEqual(that: Any): Boolean = that.isInstanceOf[Sentence]

  /**
    * (Java)의존 구문 분석 결과, 나타난 핵심어들.
    *
    * (Java) Head words for this sentence.
    */
  def jTopLevels = topLevels.asJava

  /**
    * (Java) 주어진 품사 표기의 Sequence를 포함하는지 확인.
    *
    * 
* `POS$.Value[][]`의 형태이며, 이는 품사가 어절을 구성하고^POS$.Value[]^, * 어절이 문장을 구성한 형태^POS$.Value[][]^를 따른 것임. *
* Sequence가 *연속되지 않더라도* 확인함. 즉, "나/NP는/JKS 밥/NNG을/JKO 먹/VV고/EC,"라는 문장구조가 있다면, * `{{POSTag.NP,POSTag.JKS},{POSTag.VV}}`는 중간 어절에 대응하는 품사의 Sequence가 없지만, 순서는 포함되므로, * `true`를 반환함. * * @param tag 확인할 통합 품사 표기의 Sequence. `POS$.Value[][]` 객체. * @return True: 존재하는 경우 */ def matches(tag: Array[Array[String]]): Boolean = matches(tag.map(_.toSeq).toSeq) /** * 주어진 품사 표기의 Sequence를 포함하는지 확인. *
* `Seq[Seq[POSTag] ]`의 형태이며, 이는 품사가 어절을 구성하고^Seq[POSTag]^, * 어절이 문장을 구성한 형태^Seq[Seq[POSTag.Value] ]^를 따른 것임. *
* Sequence가 *연속되지 않더라도* 확인함. 즉, "나/NP는/JKS 밥/NNG을/JKO 먹/VV고/EC,"라는 문장구조가 있다면, * `Seq(Seq(POS.NP,POS.JKS),Seq(POS.VV))`는 중간 어절에 대응하는 품사의 Sequence가 없지만, 순서는 포함되므로, * `true`를 반환함. * * @param tag 확인할 통합 품사 표기의 Sequence. `Seq[Seq[POSTag] ]` 객체. * @return True: 존재하는 경우 */ def matches(tag: Seq[Seq[String]]): Boolean = words.foldLeft(tag) { case (list, w) => if (w.matches(list.head)) list.tail else list }.isEmpty /** * (Java) 체언^명사, 수사, 대명사^을 포함하는 어절들 * * (Java) List of words which contains Nominal morphemes (Noun, Ordinal/Cardinal, Pronoun) * * @return 체언을 포함하는 어절들의 Sequence */ def jNouns = nouns.asJava /** * 체언^명사, 수사, 대명사^을 포함하는 어절들 * * Sequence of words which contains Nominal morphemes (Noun, Ordinal/Cardinal, Pronoun) * * @return 체언을 포함하는 어절들의 Sequence */ def nouns = words.filter(_.exists(_.isNoun)) /** * (Java) 용언^동사, 형용사^을 포함하는 어절들 * * (Java) List of words which contains Predicative morphemes (Verb, Adjective) * * @return 용언을 포함하는 어절들의 Sequence */ def jVerbs = verbs.asJava /** * 용언^동사, 형용사^을 포함하는 어절들 * * Sequence of words which contains Predicative morphemes (Verb, Adjective) * * @return 용언을 포함하는 어절들의 Sequence */ def verbs = words.filter(_.exists(_.isPredicate)) /** * (Java) 수식언^관형사, 부사^을 포함하는 어절들 * * (Java) List of words which contains Modifying morphemes (Determiner, Adverb) * * @return 수식언을 포함하는 어절들의 Sequence */ def jModifiers = modifiers.asJava /** * 수식언^관형사, 부사^을 포함하는 어절들 * * Sequence of words which contains Modifying morphemes (Determiner, Adverb) * * @return 수식언을 포함하는 어절들의 Sequence */ def modifiers = words.filter(_.exists(_.isModifier)) override def apply(idx: Int): Word = words(idx) override def length: Int = words.size /** * (Java) 어절을 문장의 순서대로 순회하는 iterator. * * @return 어절 순회 Iterator. */ def jIterator = iterator.asJava override def toString: String = surfaceString() + "\n" + words.map { w => w.toString + (if (topLevels.exists(_.target == w.index)) "[ROOT]" else "") }.mkString("\n") /** * 의존 구문 분석 결과, 나타난 핵심어들. * * Head words for this sentence. */ def topLevels = root.dependents /** * 띄어쓰기 된 문장을 반환. * * @param delimiter 어절 사이의 띄어쓰기 방식. 기본값 = 공백(" ") * @return 띄어쓰기 된 문장. */ def surfaceString(delimiter: String = " "): String = words.map(_.surface).mkString(delimiter) /** * 품사분석 결과를, 1행짜리 String으로 변환. * * @return 품사분석 결과를 담은 1행짜리 String. */ def singleLineString: String = words.map(_.singleLineString).mkString(" ") /** * 의존구문분석트리를 String형태로 그립니다. * * @return 트리 String. */ def treeString: String = if (topLevels.isEmpty) singleLineString else { val word = topLevels.head treeString(word, 0, topLevels.tail.map(_ -> 0).toList) } override protected[this] def newBuilder: mutable.Builder[Word, Sentence] = Sentence.newBuilder @tailrec private def treeString(rel: Relationship, depth: Int, stack: List[(Relationship, Int)] = List(), printed: ArrayBuffer[String] = ArrayBuffer()): String = { printed += (" " * depth + s"+${rel.relation} : ${this (rel.target).singleLineString} ... ${rel.rawRel}") val nextDepth = depth + 1 val newStack = stack ++ apply(rel.target).dependents.map(_ -> nextDepth) if (newStack.nonEmpty) { val (nextHead, nextDepth) = newStack.head treeString(nextHead, nextDepth, newStack.tail, printed) } else printed.mkString("\n") } } /** * Companion object for Sentence */ object Sentence { /** * Create a sentence. * * @param words Word sequence. * @return a new Sentence. */ def apply(words: collection.Seq[Word]) = applySeq(words) /** * Extractor for the sentences. * * @note "Extractor" is for pattern matching. That is, a sentence `s` can be matched as: *
    *       s match { case Sentence(word1, word2, _*) => ... }
    *       
* or can be matched as: *
    *       s match { case Sentence(wordseq @ _*) => ... }
    *       
* @param target Sentence to be matched * @return Some(word sequence) */ def unapplySeq(target: Sentence): Option[Seq[Word]] = { Some(target.words) } /** * Builder factory for any sentence. * * @return Builder factory instance. */ implicit def canBuildFrom: CanBuildFrom[Sentence, Word, Sentence] = new CanBuildFrom[Sentence, Word, Sentence] { override def apply(from: Sentence): mutable.Builder[Word, Sentence] = newBuilder override def apply(): mutable.Builder[Word, Sentence] = newBuilder } /** * Create new builder for type Sentence. * * @return a new Builder */ private def newBuilder = new ArrayBuffer[Word] mapResult applySeq /** * Create a sentence. * * @note Access is restricted because (i) Sentence should be created within Koala package, * and (ii) most developers using this package seldom needs this operation. * @param words Word sequence. * @return a new Sentence. */ private def applySeq(words: collection.Seq[Word]) = new Sentence(words.toVector) }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy