All Downloads are FREE. Search and download functionalities are using the official Maven repository.

kr.bydelta.koala.eunjeon.helper.DictionaryExtractor.scala Maven / Gradle / Ivy

The newest version!
package kr.bydelta.koala.eunjeon.helper

import java.io.{BufferedInputStream, FileOutputStream, ObjectInputStream, ObjectOutputStream}
import java.nio.charset.Charset

import org.bitbucket.eunjeon.seunjeon._

import scala.reflect.io.{File, Path}

/**
  * Dictionary Extractor Object
  *
  * This object must be executed before the system is built: SEunjeon's bundled dictionaries cannot be
  * deserialized reliably across different Scala versions, so the dictionary objects are extracted here
  * and saved again in a different form.
  */
object DictionaryExtractor {
  /** Buffer size, in bytes, used when reading the bundled dictionary resources. */
  private val ReadBufferSize = 32 * 1024

  /**
    * Reads SEunjeon's term dictionary and dictionary mapper from the classpath and rewrites them
    * under `eunjeon/src/main/resources-2.12` as `term.dict` and `dict.map`.
    *
    * Output layout:
    *  - `term.dict`: an `Int` entry count, followed by each morpheme as written by `writeMorpheme`.
    *  - `dict.map`: an `Int` row count, followed by, for every row, an `Int` length and that many `Int`s.
    *
    * @param args Command-line arguments (unused).
    */
  def main(args: Array[String]): Unit = {
    val resources = Path("eunjeon") / "src" / "main" / "resources-2.12"
    println(resources.toAbsolute.toString())
    resources.createDirectory()

    // Read each dictionary completely (and release its input stream) before the output file is opened.
    val termDict = readResource(DictBuilder.TERM_DICT) { in =>
      in.readObject().asInstanceOf[Array[CompressedMorpheme]]
    }
    writeFile((resources / "term.dict").jfile) { out =>
      out.writeInt(termDict.length)
      termDict.foreach(morph => writeMorpheme(out, morph))
    }

    val dictMapper = readResource(DictBuilder.DICT_MAPPER) { in =>
      in.readObject().asInstanceOf[Array[Array[Int]]]
    }
    writeFile((resources / "dict.map").jfile) { out =>
      out.writeInt(dictMapper.length)
      dictMapper.foreach { row =>
        out.writeInt(row.length)
        row.foreach(out.writeInt)
      }
    }
  }

  /**
    * Runs `body` with `resource` and always closes the resource afterwards, even when `body` throws.
    * (`scala.util.Using` is not used because this file targets Scala 2.12.)
    */
  private def withResource[R <: AutoCloseable, A](resource: R)(body: R => A): A =
    try body(resource)
    finally resource.close()

  /**
    * Opens the classpath resource `name` (resolved relative to `LexiconDict`) as a buffered
    * `ObjectInputStream`, applies `body` to it and closes the stream.
    *
    * @throws java.io.FileNotFoundException if the resource is not on the classpath.
    */
  private def readResource[A](name: String)(body: ObjectInputStream => A): A = {
    val raw = classOf[LexiconDict].getResourceAsStream(name)
    // getResourceAsStream signals a missing resource with null; fail with a message that names it.
    if (raw == null)
      throw new java.io.FileNotFoundException(s"Dictionary resource not found on classpath: $name")

    // The outer guard releases the raw stream even if the ObjectInputStream constructor fails
    // while reading the serialization header. Closing the raw stream twice is harmless.
    withResource(raw) { _ =>
      withResource(new ObjectInputStream(new BufferedInputStream(raw, ReadBufferSize)))(body)
    }
  }

  /**
    * Creates (or truncates) `target`, wraps it in an `ObjectOutputStream`, applies `body` to it and
    * closes the stream, so that the file handle is released even when `body` throws.
    */
  private def writeFile(target: java.io.File)(body: ObjectOutputStream => Unit): Unit =
    withResource(new FileOutputStream(target)) { fileOut =>
      withResource(new ObjectOutputStream(fileOut))(body)
    }

  /**
    * Writes one morpheme to `termOut`.
    *
    * Field order (must stay in sync with `readMorpheme`): surface (object), left id (short),
    * right id (short), cost (int), feature (object), morpheme type id (byte), number of POS tags (int),
    * then one byte per POS tag id.
    *
    * @param termOut Stream to write to.
    * @param morph   Morpheme to serialize.
    */
  private def writeMorpheme(termOut: ObjectOutputStream, morph: CompressedMorpheme): Unit = {
    termOut.writeObject(morph.getSurface)
    termOut.writeShort(morph.getLeftId)
    termOut.writeShort(morph.getRightId)
    termOut.writeInt(morph.getCost)
    termOut.writeObject(morph.getFeature)
    termOut.writeByte(morph.getMType.id.toByte)
    val poses = morph.getPoses
    termOut.writeInt(poses.length)
    poses.foreach(pos => termOut.writeByte(pos.id.toByte))
  }

  /**
    * Reads one morpheme in the layout produced by `writeMorpheme`.
    *
    * @param in           Stream positioned at the start of a morpheme record.
    * @param needCompress When true, the result is wrapped in a `CompressedMorpheme`;
    *                     otherwise the `BasicMorpheme` is returned as is.
    * @return The morpheme that was read.
    */
  private[koala] def readMorpheme(in: ObjectInputStream, needCompress: Boolean): Morpheme = {
    val surface = in.readObject().asInstanceOf[String]
    val leftId = in.readShort()
    val rightId = in.readShort()
    val cost = in.readInt()
    val features = in.readObject().asInstanceOf[String]
    // NOTE(review): ids are stored as a signed byte, so an enum id above 127 would come back
    // negative here. Presumably all MorphemeType/Pos ids fit in 0..127 - confirm.
    val mType = MorphemeType(Byte.byte2int(in.readByte()))
    val posSz = in.readInt()
    val poses = (0 until posSz).map {
      _ => Pos(Byte.byte2int(in.readByte()))
    }.toArray

    val morph = BasicMorpheme(surface, leftId, rightId, cost, features, mType, poses)
    if (needCompress) new CompressedMorpheme(morph)
    else morph
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy