com.mayabot.nlp.segment.reader.StopWordDict.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mynlp Show documentation
Show all versions of mynlp Show documentation
Maya Nlp subproject :mynlp
package com.mayabot.nlp.segment.reader
import com.mayabot.nlp.MynlpEnv
import com.mayabot.nlp.Mynlps
import com.mayabot.nlp.collection.dat.DoubleArrayTrieMap
import com.mayabot.nlp.injector.ImplementedBy
import com.mayabot.nlp.injector.Singleton
import java.util.*
/** Resource name of the stop word list; [SystemStopWordDict] passes it to `env.tryLoadResource`. */
const val StopWordDictPath = "stopwords.txt"
/**
* Stop word dictionary interface.
*
* The `ImplementedBy` annotation below names [SystemStopWordDict] as the default implementation
* (the original note says Guice injects it by default).
*
* In the implementations in this file, words changed through [add] / [remove] are only
* staged; [contains] reflects them after [commit] has been called.
*
* @author jimichan
*/
@ImplementedBy(SystemStopWordDict::class)
interface StopWordDict {
/** Returns whether [word] is a stop word. */
fun contains(word: String): Boolean
/** Adds [word] to the dictionary. */
fun add(word: String)
/** Removes [word] from the dictionary. */
fun remove(word: String)
/** Applies pending [add] / [remove] changes to the structure that [contains] queries. */
fun commit()
}
/**
 * Stop word dictionary backed by a double-array trie (DAT).
 *
 * Stop words can be added and removed at runtime. Edits are staged in an internal set and
 * only become visible to [contains] after [commit] rebuilds the trie.
 *
 * @param set initial stop words; the set is copied, so later changes to it are not observed
 * @author jimichan
 */
class DefaultStopWordDict(set: Set<String>) : StopWordDict {

    /** Staged words; [commit] compiles this set into [dat]. */
    private val stopWordSet = HashSet(set)

    /**
     * Committed lookup structure. It starts as a one-entry placeholder trie; [isEmpty] keeps
     * that placeholder (or a stale trie) from being consulted while no words are committed.
     */
    private var dat = DoubleArrayTrieMap(
        TreeMap<String, Boolean>().apply { put("This Is Empty Flag", true) })

    /** True when the most recent [commit] found no words, i.e. [dat] must not be queried. */
    private var isEmpty = false

    init {
        commit()
    }

    /** Rebuilds the trie from the staged words, or marks the dictionary as empty. */
    override fun commit() {
        if (stopWordSet.isEmpty()) {
            // The trie is not rebuilt for an empty set (presumably DoubleArrayTrieMap cannot be
            // built from an empty map - TODO confirm); the flag makes contains() ignore it.
            isEmpty = true
            return
        }
        isEmpty = false
        val treeMap = TreeMap<String, Boolean>()
        stopWordSet.forEach { treeMap[it] = true }
        dat = DoubleArrayTrieMap(treeMap)
    }

    /** Stages [word] for addition; visible to [contains] after the next [commit]. */
    override fun add(word: String) {
        stopWordSet.add(word)
    }

    /** Stages [word] for removal; visible to [contains] after the next [commit]. */
    override fun remove(word: String) {
        stopWordSet.remove(word)
    }

    /**
     * Returns whether [word] is in the committed dictionary.
     *
     * An empty dictionary always answers `false`: without the [isEmpty] guard the placeholder
     * key, or words from a trie that was committed before everything was removed, would match.
     */
    override fun contains(word: String): Boolean = !isEmpty && dat.containsKey(word)
}
/**
 * Stop word dictionary whose initial content is loaded from the [StopWordDictPath] resource
 * of the given environment.
 *
 * Every [StopWordDict] operation is forwarded to an internal [DefaultStopWordDict]. If the
 * resource is absent or loading throws, the dictionary starts out empty.
 *
 * @property env environment used to look up the stop word resource
 * @author jimichan
 */
@Singleton
class SystemStopWordDict constructor(val env: MynlpEnv) : StopWordDict {

    private val stopDict = DefaultStopWordDict(loadStopword())

    override fun contains(word: String): Boolean = stopDict.contains(word)

    override fun commit() {
        stopDict.commit()
    }

    override fun add(word: String) {
        stopDict.add(word)
    }

    override fun remove(word: String) {
        stopDict.remove(word)
    }

    /**
     * Reads the stop word resource, one word per line, trimming whitespace and dropping
     * blank lines.
     *
     * @return the loaded words, or an empty set when the resource is missing or unreadable
     */
    private fun loadStopword(): Set<String> {
        return try {
            val resource = env.tryLoadResource(StopWordDictPath) ?: return emptySet()
            // useLines closes the reader once the lines have been consumed.
            resource.inputStream().bufferedReader().useLines { lines ->
                lines.map { it.trim() }.filter { it.isNotEmpty() }.toSet()
            }
        } catch (e: Exception) {
            // Best effort: a broken resource must not prevent start-up, but say what failed.
            Mynlps.logger.error("Failed to load stop word dictionary from $StopWordDictPath", e)
            emptySet()
        }
    }
}