com.twitter.penguin.korean.tokenizer.KoreanChunker.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of korean-text Show documentation
Scala library to process Korean text
There is a newer version: 4.4.4
/*
 * Twitter Korean Text - Scala library to process Korean text
 *
 * Copyright 2014 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.twitter.penguin.korean.tokenizer

import java.util.regex.Matcher

import com.twitter.Regex
import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken
import com.twitter.penguin.korean.util.KoreanPos
import com.twitter.penguin.korean.util.KoreanPos._

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer

case class KoreanChunk(text: String, offset: Int, length: Int)

/**
 * Split input text into Korean Chunks (어절)
 */
object KoreanChunker {
  private val POS_PATTERNS = Map(
    Korean -> """([가-힣]+)""".r.pattern,
    Alpha -> """(\p{Alpha}+)""".r.pattern,
    Number -> ("""(\$?\p{Digit}+"""
      + """(,\p{Digit}{3})*([/~:\.-]\p{Digit}+)?"""
      + """(천|만|억|조)*(%|원|달러|위안|옌|엔|유로|등|년|월|일|회|시간|시|분|초)?)""").r.pattern,
    KoreanParticle -> """([ㄱ-ㅣ]+)""".r.pattern,
    Punctuation -> """([\p{Punct}·…’]+)""".r.pattern,
    URL -> Regex.VALID_URL,
    Email -> """([\p{Alnum}\.\-_]+@[\p{Alnum}\.]+)""".r.pattern,
    Hashtag -> Regex.VALID_HASHTAG,
    ScreenName -> Regex.VALID_MENTION_OR_LIST,
    CashTag -> Regex.VALID_CASHTAG,
    Space -> """\s+""".r.pattern
  )

  private val CHUNKING_ORDER = Seq(URL, Email, ScreenName, Hashtag, CashTag, Number, Korean, KoreanParticle, Alpha, Punctuation)

  protected[korean] def getChunks(input: String, keepSpace: Boolean = false): Seq[String] = {
    chunk(input).map(_.text)
  }

  private[this] case class ChunkMatch(start: Int, end: Int, text: String, pos: KoreanPos) {
    def disjoint(that: ChunkMatch): Boolean = {
      (that.start < this.start && that.end <= this.start) ||
        (that.start >= this.end && that.end > this.end)
    }
  }

  def splitBySpaceKeepingSpace(s: CharSequence): Seq[String] = {
    val space = """\s+""".r.pattern
    val m = space.matcher(s)

    var tokens = new ListBuffer[String]()
    var index = 0
    while(m.find()) {
      if (index < m.start) {
        tokens += s.subSequence(index, m.start).toString
      }
      tokens += s.subSequence(m.start, m.end).toString
      index = m.end
    }

    if (index < s.length()) {
      tokens += s.subSequence(index, s.length()).toString
    }

    return tokens.toList
  }

  /**
   * Recursively call m.find() to find all the matches.
   * Use tail-recursion optimization to avoid stack overflow.
   *
   * @param m input Matcher
   * @param pos KoreanPos to attach
   * @param matches ouput list of ChunkMatch
   * @return list of ChunkMatches
   */
  @scala.annotation.tailrec
  private[this] def findAllPatterns(m: Matcher, pos: KoreanPos, matches: List[ChunkMatch] = List()): List[ChunkMatch] = {
    if (m.find()) {
      findAllPatterns(m, pos, ChunkMatch(m.start, m.end, m.group(), pos) :: matches)
    } else {
      matches
    }
  }

  private[this] def splitChunks(text: String): List[ChunkMatch] = {
    if (text.charAt(0).isSpaceChar) {
      List(ChunkMatch(0, text.length, text, Space))
    } else {
      val chunksBuf = new ListBuffer[ChunkMatch]()
      var matchedLen = 0
      CHUNKING_ORDER.foreach { pos =>
        if (matchedLen < text.length) {
          val m = POS_PATTERNS(pos).matcher(text)
          while (m.find()) {
            val cm = ChunkMatch(m.start, m.end, m.group(), pos)
            if (chunksBuf.forall(cm.disjoint)) {
              chunksBuf += cm
              matchedLen += cm.end - cm.start
            }
          }
        }
      }

      val chunks = chunksBuf.sortBy(cm => cm.start).toList
      fillInUnmatched(text, chunks, Foreign)
    }
  }

  /**
   * Fill in unmatched segments with given pos
   *
   * @param text input text
   * @param chunks matched chunks
   * @param pos KoreanPos to attach to the unmatched chunk
   * @return list of ChunkMatches
   */
  private[this] def fillInUnmatched(text: String,
                                    chunks: Seq[ChunkMatch],
                                    pos: KoreanPos.Value): List[ChunkMatch] = {

    // Add Foreign for unmatched parts
    val (chunksWithForeign, prevEnd) = chunks.foldLeft((List[ChunkMatch](), 0)) {
      case ((l: List[ChunkMatch], prevEnd: Int), cm: ChunkMatch) if cm.start == prevEnd =>
        (cm :: l, cm.end)
      case ((l: List[ChunkMatch], prevEnd: Int), cm: ChunkMatch) if cm.start > prevEnd =>
        (cm :: ChunkMatch(prevEnd, cm.start, text.slice(prevEnd, cm.start), pos) :: l, cm.end)
      case ((l: List[ChunkMatch], prevEnd: Int), cm: ChunkMatch) =>
        throw new IllegalStateException("Non-disjoint chunk matches found.")
    }

    val output = if (prevEnd < text.length) {
      ChunkMatch(prevEnd, text.length, text.slice(prevEnd, text.length), pos) :: chunksWithForeign
    } else {
      chunksWithForeign
    }

    output.reverse
  }


  /**
   * Get chunks by given pos.
   *
   * @param input input string
   * @param pos one of supported KoreanPos's: URL, Email, ScreenName, Hashtag,
   *            CashTag, Korean, KoreanParticle, Number, Alpha, Punctuation
   * @return sequence of Korean chunk strings
   */
  def getChunksByPos(input: String, pos: KoreanPos): Seq[KoreanToken] = {
    chunk(input).filter(_.pos == pos)
  }

  /**
   * Split input text into a sequnce of KoreanToken. A candidate for Korean parser
   * gets tagged with KoreanPos.Korean.
   *
   * @param input input string
   * @return sequence of KoreanTokens
   */
  def chunk(input: CharSequence): Seq[KoreanToken] = {
    val s = input.toString
    val (l: List[KoreanToken], i: Int) = splitBySpaceKeepingSpace(s).flatMap {
      s => splitChunks(s)
    }.foldLeft(List[KoreanToken](), 0) {
      case ((l: List[KoreanToken], i: Int), m: ChunkMatch) =>
        val segStart = s.indexOf(m.text, i)
        (KoreanToken(m.text, m.pos, segStart, m.text.length) :: l, segStart + m.text.length)
    }
    l.reverse
  }
}