
/**
* Copyright 2015 youngho yu, yongwoon lee
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package org.bitbucket.eunjeon.seunjeon

import java.io._
import java.util.regex.Pattern

import com.typesafe.scalalogging.Logger
import org.slf4j.LoggerFactory

import scala.collection.mutable
import scala.io.Source
import scala.util.Try
import scala.util.matching.Regex
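
/**
 * Helpers for building NNG (general noun) morphemes from user-dictionary
 * surfaces. A "+" inside a surface marks a compound-noun boundary; an escaped
 * "\+" stands for a literal plus sign.
 */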
object LexiconDict {
  val compoundDelimiter = "+"
  // Split on "+" only when it is not preceded by a backslash.
  val compoundDelimiterRegex: Pattern = Pattern.compile("(?<!\\\\)[+]")

  def buildNNGTerm(surface: String, cost: Int): Morpheme = {
    val surfaces = compoundDelimiterRegex.split(surface)
    val escapedSurfaces = surfaces.map(_.replace("\\" + compoundDelimiter, compoundDelimiter))
    val lastChar = escapedSurfaces.last.last
    val jongsung = if (isHangul(lastChar) && hasJongsung(lastChar)) "T" else "F"
    val compositionFeature =
      if (escapedSurfaces.length >= 2) escapedSurfaces.map(_ + "/NNG/*").mkString("+") else "*"
    val morphemeType = if (surfaces.length >= 2) "Compound" else "*"
    val feature = Array("NNG", "*", jongsung, surfaces.mkString("+"), morphemeType, "*", "*", compositionFeature)
    Morpheme(
      escapedSurfaces.mkString(""),
      NngUtil.nngLeftId,
      NngUtil.nngRightId,
      cost,
      wrapRefArray(feature),
      MorphemeType(feature),
      wrapRefArray(Pos.poses(feature)))
  }

  /** True if the Hangul syllable `ch` ends with a jongsung (final consonant). */
  private def hasJongsung(ch: Char): Boolean =
    ((ch - 0xAC00) % 0x001C) != 0

  /** True if `ch` is a Hangul syllable, Hangul jamo, or compatibility jamo. */
  private def isHangul(ch: Char): Boolean =
    (0xAC00 <= ch && ch <= 0xD7A3) ||
      (0x1100 <= ch && ch <= 0x11FF) ||
      (0x3130 <= ch && ch <= 0x318F)
}
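
/**
 * Runtime lexicon. `trie` maps a surface form to an entry of `dictMapper`,
 * which lists the indices of every `termDict` morpheme sharing that surface.
 *
 * A minimal usage sketch ("덕후,-100" is a hypothetical user-dictionary line):
 *
 * {{{
 * val dict = new LexiconDict().loadFromString("덕후,-100")
 * val morphemes = dict.commonPrefixSearch("덕후들")
 * }}}
 */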
class LexiconDict {
  val logger = Logger(LoggerFactory.getLogger(classOf[LexiconDict].getName))

  var termDict: Array[Morpheme] = null
  var dictMapper: Array[Array[Int]] = null
  var trie: DoubleArrayTrie = null

  def getDictionaryInfo(): String = {
    s"termSize = ${termDict.length} mapper size = ${dictMapper.length}"
  }
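
  // Loaders: build the dictionary from a single CSV file, from every *.csv
  // file in a directory, or from an in-memory string of CSV lines.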
  def loadFromFile(file: String): LexiconDict = {
    val iterator = Source.fromFile(file, "utf-8").getLines()
    loadFromIterator(iterator)
  }

  def loadFromDir(dir: String): LexiconDict = {
    val r = new Regex(".+[.]csv")
    val files = new File(dir).listFiles.filter(f => r.findFirstIn(f.getName).isDefined)
    val totalIterator: Iterator[String] = files.map(f => Source.fromFile(f, "utf-8").getLines()).reduceLeft(_ ++ _)
    loadFromIterator(totalIterator)
  }

  def loadFromString(str: String): LexiconDict = {
    val iterator = str.stripMargin.split("\n").toIterator
    loadFromIterator(iterator)
  }
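
  // Split on commas that sit outside double-quoted fields, then strip the
  // surrounding quotes, e.g. csvParse("\"a,b\",c") == List("a,b", "c").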
  def csvParse(str: String): List[String] =
    str.split(",(?=([^\"]*\"[^\"]*\")*(?![^\"]*\"))").toList
      .map(_.replaceFirst("^\"", "").replaceFirst("\"$", ""))
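
  // Parse CSV lines into morphemes (lines that fail to parse are dropped),
  // then build the index over surfaces sorted lexicographically.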
  def loadFromIterator(iterator: Iterator[String]): LexiconDict = {
    val startTime = System.nanoTime()
    val parsedLine: Seq[Try[Morpheme]] =
      iterator.dropWhile(_.startsWith("#")).
        map(_.replaceAll(" ", "")).
        map(csvParse).
        map { x =>
          Try {
            x match {
              // surface only, e.g. "단어": cost is derived from the surface length
              case List(surface) =>
                LexiconDict.buildNNGTerm(surface, 1000 - (surface.length * 100))
              // surface with an explicit cost, e.g. "단어,-100"
              case List(surface, cost) =>
                LexiconDict.buildNNGTerm(surface, cost.toShort)
              // full dictionary entry: surface,leftId,rightId,cost,feature...
              case List(surface, leftId, rightId, cost, feature @ _*) =>
                Morpheme(surface,
                  leftId.toShort,
                  rightId.toShort,
                  cost.toShort,
                  wrapRefArray(feature.toArray),
                  MorphemeType(feature),
                  wrapRefArray(Pos.poses(feature)))
            }
          }
        }.toSeq
    val morphemes: Seq[Morpheme] = parsedLine.filter(_.isSuccess).map(_.get)
    val elapsedTime = (System.nanoTime() - startTime) / (1000 * 1000)
    logger.info(s"csv parsing is completed. ($elapsedTime ms)")
    build(morphemes.sortBy(_.surface))
  }
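
  // Wire up the three runtime structures: the term array, the
  // surface-to-indices mapper, and the double-array trie.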
  private def build(sortedTerms: Seq[Morpheme]): LexiconDict = {
    termDict = sortedTerms.toArray
    val startTime = System.nanoTime()
    val surfaceIndexDict = buildSurfaceIndexDict(sortedTerms)
    dictMapper = surfaceIndexDict.map(_._2)
    val elapsedTime = (System.nanoTime() - startTime) / (1000 * 1000)
    logger.info(s"terms & mapper building is completed. ($elapsedTime ms)")

    trie = buildTrie(surfaceIndexDict)
    this
  }
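
  // Trie keys are the distinct surfaces; each value is that surface's index
  // into dictMapper.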
  def buildTrie(dict: Array[(String, Array[Int])]): DoubleArrayTrie = {
    var startTime = System.nanoTime()
    val trieBuilder = DoubleArrayTrieBuilder()
    for (idx <- dict.indices) {
      trieBuilder.add(dict(idx)._1, idx)
    }
    var elapsedTime = (System.nanoTime() - startTime) / (1000 * 1000)
    logger.info(s"added to trie builder ($elapsedTime ms)")

    startTime = System.nanoTime()
    val doubleArrayTrie = trieBuilder.build()
    elapsedTime = (System.nanoTime() - startTime) / (1000 * 1000)
    logger.info(s"double-array trie building is completed. ($elapsedTime ms)")
    doubleArrayTrie
  }
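
  // Group consecutive terms sharing a surface form (the input must be sorted
  // by surface), e.g. surfaces ["가", "가", "나"] yield
  // [("가", Array(0, 1)), ("나", Array(2))].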
  def buildSurfaceIndexDict(sortedTerms: Seq[Morpheme]): Array[(String, Array[Int])] = {
    val groupedTerms: mutable.ListBuffer[(String, Array[Int])] = mutable.ListBuffer()
    if (sortedTerms.isEmpty) {
      return groupedTerms.toArray
    }

    var curIndices: Array[Int] = null
    var preSurface: String = null
    sortedTerms.view.zipWithIndex.foreach { case (term: Morpheme, idx) =>
      if (preSurface != term.surface) {
        if (preSurface != null) {
          groupedTerms.append((preSurface, curIndices))
        }
        curIndices = Array[Int]()
      }
      curIndices = curIndices :+ idx
      preSurface = term.surface
    }
    groupedTerms.append((preSurface, curIndices))
    groupedTerms.toArray
  }
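
  // Every morpheme whose surface is a prefix of `keyword`, e.g. searching
  // "단어장" returns the entries stored under "단", "단어" and "단어장",
  // insofar as those surfaces exist in the dictionary.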
  def commonPrefixSearch(keyword: String): Seq[Morpheme] = {
    trie.commonPrefixSearch(keyword).flatMap(dictMapper(_).map(termDict(_)))
  }
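
  // Persist the three structures: termDict and dictMapper via Java
  // serialization, the trie in its own binary format.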
  def save(termDictPath: String, dictMapperPath: String, triePath: String): Unit = {
    val termDictStore = new ObjectOutputStream(
      new BufferedOutputStream(new FileOutputStream(termDictPath), 16 * 1024))
    termDictStore.writeObject(termDict)
    termDictStore.close()

    val dictMapperStore = new ObjectOutputStream(
      new BufferedOutputStream(new FileOutputStream(dictMapperPath), 16 * 1024))
    dictMapperStore.writeObject(dictMapper)
    dictMapperStore.close()

    trie.write(new java.io.File(triePath))
  }

  def load(): LexiconDict = {
    val termDictStream = classOf[LexiconDict].getResourceAsStream(DictBuilder.TERM_DICT)
    val dictMapperStream = classOf[LexiconDict].getResourceAsStream(DictBuilder.DICT_MAPPER)
    val trieStream = classOf[LexiconDict].getResourceAsStream(DictBuilder.TERM_TRIE)
    load(termDictStream, dictMapperStream, trieStream)
    this
  }

  def load(termDictPath: String = DictBuilder.TERM_DICT,
           dictMapperPath: String = DictBuilder.DICT_MAPPER,
           lexiconTriePath: String = DictBuilder.TERM_TRIE): Unit = {
    val termDictStream = new FileInputStream(termDictPath)
    val dictMapperStream = new FileInputStream(dictMapperPath)
    val trieStream = new FileInputStream(lexiconTriePath)
    load(termDictStream, dictMapperStream, trieStream)
  }
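
  // Deserialize the three structures from the given streams.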
  private def load(termDictStream: InputStream,
                   dictMapperStream: InputStream,
                   trieStream: InputStream): Unit = {
    // FIXME: dictionary loading slowed from 3 s to 9 s after posid was added.
    var startTime = System.nanoTime()
    val termDictIn = new ObjectInputStream(new BufferedInputStream(termDictStream, 16 * 1024))
    termDict = termDictIn.readObject().asInstanceOf[Array[Morpheme]]
    termDictIn.close()
    var elapsedTime = (System.nanoTime() - startTime) / (1000 * 1000)
    logger.info(s"terms loading is completed. ($elapsedTime ms)")

    startTime = System.nanoTime()
    val dictMapperIn = new ObjectInputStream(new BufferedInputStream(dictMapperStream, 16 * 1024))
    dictMapper = dictMapperIn.readObject().asInstanceOf[Array[Array[Int]]]
    dictMapperIn.close()
    elapsedTime = (System.nanoTime() - startTime) / (1000 * 1000)
    logger.info(s"mapper loading is completed. ($elapsedTime ms)")

    startTime = System.nanoTime()
    trie = DoubleArrayTrie(trieStream)
    elapsedTime = (System.nanoTime() - startTime) / (1000 * 1000)
    logger.info(s"double-array trie loading is completed. ($elapsedTime ms)")
  }
}