All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.bitbucket.eunjeon.seunjeon.DictBuilder.scala Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2015 youngho yu, yongwoon lee
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package org.bitbucket.eunjeon.seunjeon

import java.nio.file.StandardCopyOption._
import java.nio.file.{Files, Paths}

import scala.reflect.io.{File, Path}


object DictBuilder {

  val workingDirectory = System.getProperty("user.dir")
  val MECAB_KO_DIC_PATH = workingDirectory + "/mecab-ko-dic"
  val RESOURCE_PATH = workingDirectory + "/src/main/resources"
  val DICT_PATHNAME = "/dictionary"
  val DICT_PATH = DICT_PATHNAME

  val TERM_DICT_FILENAME = "/termDict.dat"
  val TERM_DICT = DICT_PATH + TERM_DICT_FILENAME
  val DICT_MAPPER_FILENAME = "/dictMapper.dat"
  val DICT_MAPPER = DICT_PATH + DICT_MAPPER_FILENAME
  val TERM_TRIE_FILENAME = "/trie.dat"
  val TERM_TRIE = DICT_PATH + TERM_TRIE_FILENAME
  val CONNECTION_COST_FILENAME = "/connection_cost.dat"
  val CONNECTION_COST = DICT_PATH + CONNECTION_COST_FILENAME

  val UNK_DEF_FILENAME = "/unk.def"
  val UNK_DEF = DICT_PATH + UNK_DEF_FILENAME

  val CHAR_DEF_FILENAME = "/char.def"

  val LEFT_ID_DEF_FILENAME = "/left-id.def"
  val LEFT_ID_DEF = DICT_PATH + LEFT_ID_DEF_FILENAME
  val RIGHT_ID_DEF_FILENAME = "/right-id.def"
  val RIGHT_ID_DEF = DICT_PATH + RIGHT_ID_DEF_FILENAME

  def main(args: Array[String]): Unit = {
    clear()

    copyUnkDef()
    copyLeftIdDef()
    copyRightIdDef()

    println("compiling lexicon dictionary...")
    buildLexiconDict()

    println("compiling connection-cost dictionary...")
    buildConnectionCostDict()

    println("complete")
  }

  private def copyUnkDef(): Unit = {
    copyDefFile(UNK_DEF_FILENAME)
  }

  private def copyLeftIdDef(): Unit = {
    copyDefFile(LEFT_ID_DEF_FILENAME)
  }

  private def copyRightIdDef(): Unit = {
    copyDefFile(RIGHT_ID_DEF_FILENAME)
  }

  private def copyDefFile(defFileName: String): Unit = {
    Files.copy(Paths.get(MECAB_KO_DIC_PATH + File.separator + defFileName),
      Paths.get(RESOURCE_PATH + File.separator +
        DICT_PATHNAME + File.separator + defFileName), REPLACE_EXISTING)
  }

  private def buildConnectionCostDict(): Unit = {
    val connectionCostDict = new ConnectionCostDict
    connectionCostDict.loadFromFile(MECAB_KO_DIC_PATH + File.separator + "matrix.def")
    connectionCostDict.save(RESOURCE_PATH  + File.separator +
      DICT_PATHNAME + File.separator + CONNECTION_COST_FILENAME)
    println("building connection cost dictionary OK. " +
      s"(${connectionCostDict.getDictionaryInfo()})")
  }

  private def buildLexiconDict(): Unit = {
    val dictPath = RESOURCE_PATH + File.separator + DICT_PATHNAME

    val lexiconDict = new LexiconDict
    lexiconDict.loadFromDir(MECAB_KO_DIC_PATH)
    lexiconDict.save(dictPath + File.separator + TERM_DICT_FILENAME,
                     dictPath + File.separator + DICT_MAPPER_FILENAME,
                     dictPath + File.separator + TERM_TRIE_FILENAME)
    println("building LexiconDict OK. " +
      s"(${lexiconDict.getDictionaryInfo()})")
  }

  private def clear(): Unit = {
    val dictPath = RESOURCE_PATH + File.separator + DICT_PATHNAME

    Path(dictPath).deleteRecursively()
    new java.io.File(dictPath).mkdirs()
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy