All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.bitbucket.eunjeon.seunjeon.CharSet.scala Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2015 youngho yu, yongwoon lee
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package org.bitbucket.eunjeon.seunjeon

import scala.collection.mutable
import scala.io.Source
import scala.collection.mutable.ArrayBuffer


/**
 * A run of consecutive characters that all resolved to the same
 * (category, morpheme) pair, as produced by `CharSetDef.splitCharSet`.
 *
 * @param str      the text of the run
 * @param rlength  length associated with the run; `CharSetDef.splitCharSet`
 *                 sets it to `str.length`
 * @param category character category shared by every character of the run
 * @param morpheme morpheme paired with `category` in the lookup table
 */
case class CharSet(
  str: String,
  rlength: Int,
  category: Category,
  morpheme: Morpheme
)
/**
 * Settings of one character category, parsed from a category line of the
 * character definition resource (columns 2-4 of that line).
 *
 * NOTE(review): the field names match the MeCab `char.def` columns
 * (invoke / group / length); the exact semantics are not visible in this
 * file - confirm against the code that consumes them.
 *
 * @param invoke true when the second column of the definition line is "1"
 * @param group  true when the third column of the definition line is "1"
 * @param length integer value of the fourth column of the definition line
 */
case class Category(
  invoke: Boolean,
  group: Boolean,
  length: Int
)


/**
 * Character classification tables built from the character definition
 * resource (`DictBuilder.CHAR_DEF_FILENAME`).
 *
 * Every UTF-16 code unit is mapped, through a byte index, to a
 * (category, morpheme) pair. Lookups therefore work per `Char`; the two
 * halves of a surrogate pair are classified independently.
 */
object CharSetDef {

  // charset:           one entry per possible Char value, holding an index
  //                    into cateMorphemeIndex.
  // cateMorphemeIndex: the distinct (category, morpheme) pairs; entries whose
  //                    morpheme surface is "DEFAULT" come first, so index 0
  //                    (the value used for unlisted characters) points at a
  //                    DEFAULT entry when one is defined.
  val (
    charset: Array[Byte],
    cateMorphemeIndex: Array[(Category, Morpheme)]) = loadResource

  /**
   * Reads and parses the character definition resource.
   *
   * Lines starting with `#` and lines containing only whitespace are ignored.
   * Lines starting with `0x` are code-point range lines; every other line is
   * treated as a category definition.
   *
   * @return the per-character index table and the table of
   *         (category, morpheme) pairs it indexes into
   * @throws IllegalStateException    if the resource cannot be found
   * @throws IllegalArgumentException if more distinct pairs are defined than
   *                                  a non-negative `Byte` index can address
   * @throws NoSuchElementException   if a range line names a category that
   *                                  has no category definition line
   */
  def loadResource: (Array[Byte], Array[(Category, Morpheme)]) = {
    // getResourceAsStream returns null for a missing resource; fail with a
    // message naming the resource instead of a bare NullPointerException.
    val stream = Option(classOf[CharSet].getResourceAsStream(DictBuilder.CHAR_DEF_FILENAME)).
      getOrElse(throw new IllegalStateException(
        s"character definition resource not found: ${DictBuilder.CHAR_DEF_FILENAME}"))

    val source = Source.fromInputStream(stream, "UTF-8")
    val lines: Seq[String] =
      try {
        // Materialise strictly (toList) so nothing is read lazily after the
        // source has been closed.
        source.getLines().
          filterNot(line => line.startsWith("#") || line.replaceAll("\\s", "").isEmpty).
          toList
      } finally {
        source.close()
      }

    val (rangeLines: Seq[String], categoryLines: Seq[String]) = lines.partition(_.startsWith("0x"))

    val categories: Map[String, Category] = categoryLines.map(parseCategory).toMap

    // NOTE(review): UnkDef(name).get presumably fails when no unknown-word
    // definition exists for the category - confirm against UnkDef.
    val parsedRanges: Seq[(Char, Char, (Category, Morpheme))] =
      rangeLines.map(parseCharset).map { case (begin, finish, name) =>
        (begin, finish, (categories(name), UnkDef(name).get))
      }

    val cateMorphemes: Array[(Category, Morpheme)] = buildCategoryMorphemes(parsedRanges)
    val charsetIndex: ArrayBuffer[Byte] = buildCharset(parsedRanges, cateMorphemes)

    (charsetIndex.toArray, cateMorphemes)
  }

  /**
   * Collects the distinct (category, morpheme) pairs of all ranges, placing
   * the pairs whose morpheme surface is "DEFAULT" ahead of the others.
   */
  private def buildCategoryMorphemes(
      parsedRanges: Seq[(Char, Char, (Category, Morpheme))]): Array[(Category, Morpheme)] = {
    val (defaults, others) =
      parsedRanges.map(_._3).distinct.partition(_._2.getSurface == "DEFAULT")
    (defaults ++ others).toArray
  }

  /**
   * Builds the per-character table: position `c` holds the index (within
   * `cateMorphemes`) of the pair assigned to character `c`.
   *
   * Characters not covered by any range keep index 0. When ranges overlap,
   * the range that appears later in `parsedLines` wins.
   */
  private def buildCharset(
      parsedLines: Seq[(Char, Char, (Category, Morpheme))],
      cateMorphemes: Array[(Category, Morpheme)]): ArrayBuffer[Byte] = {
    // Indices are stored as signed bytes and used directly as array indices,
    // so anything above Byte.MaxValue would turn negative and break lookups.
    require(
      cateMorphemes.length <= Byte.MaxValue + 1,
      s"too many distinct (category, morpheme) pairs for a byte index: ${cateMorphemes.length}")

    val cateMorphemeIndex: Map[(Category, Morpheme), Byte] =
      cateMorphemes.zipWithIndex.map { case (cateMorpheme, index) =>
        cateMorpheme -> index.toByte
      }.toMap

    val charsetIndex = ArrayBuffer.fill[Byte](Char.MaxValue + 1)(0)
    parsedLines.foreach { case (begin, finish, cateMorpheme) =>
      // The index is the same for the whole range; look it up once.
      val index = cateMorphemeIndex(cateMorpheme)
      for (codeUnit <- begin.toInt to finish.toInt) {
        charsetIndex(codeUnit) = index
      }
    }
    charsetIndex
  }

  /**
   * Parses a range line such as `0x0041..0x005A NAME` or `0x0020 NAME`.
   *
   * Only the first two whitespace-separated columns are used. The first two
   * characters of each bound (the `0x` prefix) are dropped and the rest is
   * read as hexadecimal.
   *
   * @param line a range line of the character definition resource
   * @return (first character, last character, category name); for a line
   *         without `..` both characters are the same
   */
  private[seunjeon] def parseCharset(line: String): (Char, Char, String) = {
    def str2Char(hexaDecimal: String): Char = Integer.parseInt(hexaDecimal, 16).toChar

    val columns = line.split("\\s+")
    val bounds = columns(0).split("\\.\\.")
    val name = columns(1)

    (str2Char(bounds.head.substring(2)), str2Char(bounds.last.substring(2)), name)
  }

  /**
   * Parses a category definition line of the form `NAME invoke group length`,
   * where `invoke` and `group` are true exactly when their column is "1".
   *
   * @return the category name paired with its parsed settings
   */
  private def parseCategory(line: String): (String, Category) = {
    val columns = line.split("\\s+")
    val name = columns(0)
    val invoke = columns(1) == "1"
    val group = columns(2) == "1"
    val length = columns(3).toInt
    name -> Category(invoke, group, length)
  }

  /**
   * Splits `text` into maximal runs of consecutive characters that share the
   * same (category, morpheme) pair.
   *
   * @param text the text to split
   * @return the runs in order of appearance; concatenating their `str`
   *         fields yields `text`. Empty for empty input.
   */
  def splitCharSet(text: String): Seq[CharSet] = {
    val charsets = new mutable.ArrayBuffer[CharSet](text.length)
    if (text.nonEmpty) {
      var start = 0
      var curCategoryTerm: (Category, Morpheme) = getCategoryTerm(text.charAt(0))
      var idx = 1
      while (idx < text.length) {
        val categoryTerm = getCategoryTerm(text.charAt(idx))
        if (categoryTerm != curCategoryTerm) {
          // The category changed: close the run that ended just before idx.
          val charsetString = text.substring(start, idx)
          charsets.append(
            CharSet(charsetString, charsetString.length, curCategoryTerm._1, curCategoryTerm._2))
          start = idx
          curCategoryTerm = categoryTerm
        }
        idx += 1
      }
      // The final run always extends to the end of the text.
      val charsetString = text.substring(start, text.length)
      charsets.append(
        CharSet(charsetString, charsetString.length, curCategoryTerm._1, curCategoryTerm._2))
    }
    charsets
  }

  /**
   * Looks up the (category, morpheme) pair assigned to a character.
   *
   * @param ch the character (UTF-16 code unit) to classify
   */
  def getCategoryTerm(ch: Char): (Category, Morpheme) = {
    val idx = charset(ch)
    cateMorphemeIndex(idx)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy