com.mayabot.nlp.segment.plugins.pos.PosPerceptronDef.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mynlp Show documentation
Show all versions of mynlp Show documentation
Maya Nlp subproject :mynlp
package com.mayabot.nlp.segment.plugins.pos
import com.mayabot.nlp.common.FastStringBuilder
import com.mayabot.nlp.perceptron.EvaluateFunction
import com.mayabot.nlp.perceptron.PerceptronDefinition
import com.mayabot.nlp.perceptron.PerceptronModel
import com.mayabot.nlp.segment.Nature
import com.mayabot.nlp.utils.CharNormUtils
import com.mayabot.nlp.utils.Characters
/**
 * Holder for the default part-of-speech label set.
 */
object InnerPos {
    /**
     * Names of every [Nature] constant except `newWord`, `begin` and `end`,
     * sorted in natural string order.
     */
    val natures = Nature.values()
        .filterNot { it == Nature.newWord || it == Nature.begin || it == Nature.end }
        .map { it.name }
        .sorted()
        .toTypedArray()
}
/**
 * Perceptron definition for part-of-speech tagging.
 *
 * Annotated corpus format: space-separated `word/label` tokens,
 * e.g. `word1/label word2/label`.
 *
 * NOTE(review): the type arguments on [PerceptronDefinition] and on the overridden
 * signatures are inferred from how the values are used inside this class (words and
 * labels are handled as `String`, sentences as `List<String>`) — confirm against the
 * interface declaration.
 *
 * @property labels the label array returned by [labels]; defaults to [InnerPos.natures].
 */
class PosPerceptronDef
@JvmOverloads
constructor(
    val labels: Array<String> = InnerPos.natures
) : PerceptronDefinition<String, String, List<String>> {

    /** Returns the label array supplied at construction time. */
    override fun labels(): Array<String> {
        return labels
    }

    /** Returns the constant 50. */
    override fun featureMaxSize() = 50

    /**
     * Parses one line of annotated text of the form `word1/flag word2/flag`.
     *
     * The line is split on single spaces and each token on `/`. Tokens that do not
     * split into exactly two parts (no slash, or more than one slash) are skipped.
     *
     * @param text one annotated line.
     * @return the `(word, label)` pairs in their original order.
     */
    override fun parseAnnotateText(text: String): List<Pair<String, String>> {
        return text.split(' ').mapNotNull { token ->
            val parts = token.split('/')
            if (parts.size == 2) parts[0] to parts[1] else null
        }
    }

    /** Returns [list] unchanged: the input sequence is the word list itself. */
    override fun inputList2InputSeq(list: List<String>): List<String> {
        return list
    }

    // Not referenced anywhere inside this class.
    private val CHAR_BEGIN = "_B_"
    private val CHAR_END = "_E_"

    /**
     * Emits the features for the word at [position]. Each feature is written into
     * [buffer] and then published by calling [emit].
     *
     * Features, in emission order:
     * 1. the previous word followed by `☺` (if there is a usable previous word);
     * 2. the current word itself, emitted twice;
     * 3. the next word followed by `♂` (if there is a usable next word);
     * 4. for words of length >= 2, 3 and 4: the first 1, 2 and 3 characters followed
     *    by `★`, and the last 1, 2 and 3 characters (last character first) followed
     *    by `✆`.
     *
     * A neighbouring word that is a single punctuation character or a single space
     * is treated as absent.
     *
     * @param sentence the words of the sentence.
     * @param size number of words to consider; the next word is looked up only when
     *   `position < size - 1`.
     * @param position index of the current word in [sentence].
     * @param buffer scratch builder that holds the feature being emitted.
     * @param emit callback invoked once per feature after [buffer] has been filled.
     */
    override fun featureFunction(
        sentence: List<String>,
        size: Int,
        position: Int,
        buffer: FastStringBuilder,
        emit: () -> Unit
    ) {
        val preWord = if (position > 0) contextWordOrNull(sentence[position - 1]) else null
        val curWord = sentence[position]
        val nextWord = if (position < size - 1) contextWordOrNull(sentence[position + 1]) else null

        if (preWord != null) {
            buffer.clear()
            buffer.append(preWord)
            buffer.append('☺')
            emit()
        }

        // Emit the current-word feature twice: it is considered the more important feature.
        buffer.clear()
        buffer.append(curWord)
        emit()
        emit()

        if (nextWord != null) {
            buffer.clear()
            buffer.append(nextWord)
            buffer.append('♂')
            emit()
        }

        // Prefix (★) and suffix (✆) features; longer affixes need longer words.
        val length = curWord.length
        if (length < 2) return

        val last = length - 1
        val c1 = curWord[0]
        val l1 = curWord[last]
        buffer.set2(c1, '★')
        emit()
        buffer.set2(l1, '✆')
        emit()
        if (length < 3) return

        val c2 = curWord[1]
        val l2 = curWord[last - 1]
        buffer.set3(c1, c2, '★')
        emit()
        buffer.set3(l1, l2, '✆')
        emit()
        if (length < 4) return

        val c3 = curWord[2]
        val l3 = curWord[last - 2]
        buffer.set4(c1, c2, c3, '★')
        emit()
        buffer.set4(l1, l2, l3, '✆')
        emit()
    }

    /**
     * Returns [word] when it is usable as a context feature, or `null` when it is a
     * single punctuation character or a single space (punctuation is considered
     * unrelated to part of speech).
     */
    private fun contextWordOrNull(word: String): String? {
        if (word.length == 1) {
            val c = word[0]
            if (Characters.isPunctuation(c) || c == ' ') {
                return null
            }
        }
        return word
    }

    /** Always returns `null`: this definition supplies no evaluate function. */
    override fun evaluateFunction(perceptron: PerceptronModel): EvaluateFunction? {
        return null
    }

    /** Returns a new list with every word passed through [CharNormUtils.convert]. */
    override fun preProcessInputSequence(input: List<String>): List<String> {
        return input.map { CharNormUtils.convert(it) }
    }
}